mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 05:50:24 +08:00
53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
'''
|
|
#Author, date: Godwin Ani, 9th- Feb - 2023.
|
|
#Dependencies: Python3, Biopython, NumPy, pandas, tqdm
|
|
#Intent: Exports the length of each DNA sequence, counting only A, C, G and T bases (gaps and missing data are excluded), to a csv file.
|
|
#Inputs: A folder named 'Seq_length' containing DNA fasta files.
|
|
#Outputs: One csv spreadsheet per fasta file, saved in 'Seq_length/csv', listing the length of each sequence and the average sequence length for that file.
|
|
#Example: python SeqLenToCsv_v1.0.py
|
|
'''
|
|
|
|
|
|
import os, sys, re
|
|
from Bio import SeqIO
|
|
from Bio.Seq import Seq
|
|
import numpy as np
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
|
|
'''Ensure that the DNA fasta files (names ending in ".fasta") are stored in a folder named "Seq_length" in the directory the script is run from. This folder is created automatically if it does not exist.
|
|
The results are saved into a sub-folder named "csv" inside the "Seq_length" folder; this sub-folder is also created automatically if it does not exist. '''
|
|
|
|
|
|
def Seq_length_to_csv(input_dir='Seq_length'):
    """Write one CSV of ungapped sequence lengths per FASTA file in ``input_dir``.

    Every entry in ``input_dir`` whose name ends with '.fasta' is parsed with
    Biopython. For each record, only the characters A, C, G and T (upper or
    lower case) are counted, so gaps, missing data and any other symbols do
    not contribute to the reported length.

    The table for each FASTA file is written to
    ``<input_dir>/csv/<FASTA base name>.csv`` with a 1-based row index and
    the columns 'Name of sequence', 'Length of sequence' and
    'Average sequence length' (the per-file mean rounded to two decimals,
    repeated on every row).

    Both ``input_dir`` and its 'csv' sub-folder are created when missing.

    Args:
        input_dir: Folder holding the FASTA files. Defaults to 'Seq_length',
            resolved relative to the current working directory.
    """
    output_dir = os.path.join(input_dir, 'csv')
    # makedirs also creates input_dir itself when it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    fasta_files = [
        entry for entry in os.listdir(input_dir) if entry.endswith('.fasta')
    ]

    for fasta_file in tqdm(fasta_files):
        names = []
        lengths = []
        for record in SeqIO.parse(os.path.join(input_dir, fasta_file), "fasta"):
            names.append(record.id)
            # Count unambiguous bases only, in either case.
            lengths.append(sum(record.seq.count(base) for base in "ACGTacgt"))

        # A file without records has no mean; skip np.mean([]) and its
        # RuntimeWarning and use NaN directly.
        mean_length = round(np.mean(lengths), 2) if lengths else np.nan

        table = pd.DataFrame({
            'Name of sequence': names,
            'Length of sequence': lengths,
            'Average sequence length': mean_length,
        })
        table.index += 1  # number rows from 1 instead of 0

        # splitext removes only the extension. Splitting on the text 'fasta'
        # left a trailing dot ('sample..csv') and truncated names that
        # contain 'fasta' elsewhere.
        csv_name = os.path.splitext(fasta_file)[0] + '.csv'
        table.to_csv(os.path.join(output_dir, csv_name))
|
|
|
|
|
|
# Run the export as soon as this file is executed; there is no __main__ guard,
# so importing the module triggers it as well.
Seq_length_to_csv()
|
|
|
|
|