"""
Professor L. Katz and Godwin Ani
9th-Feb-2023
Seq_length_to_csv is a program that exports the length of DNA sequences,
excluding gaps and missing data, to a csv file.

Ensure that DNA fasta files are stored in a root directory folder named
"Seq_length". This folder will automatically be created if none exists.
The result of the process is saved into a sub-folder within the "Seq_length"
folder named "csv", which is created automatically if none exists.
"""

import os
import re
import sys

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from tqdm import tqdm


def _ungapped_length(seq):
    """Return the number of A, C, G and T characters in *seq*, ignoring case.

    Every other character (gaps such as "-", missing data such as "?" or "N",
    and any other symbol) is not counted.
    """
    text = str(seq).upper()
    return sum(text.count(base) for base in "ACGT")


def Seq_length_to_csv(input_dir="Seq_length"):
    """Write one csv of per-sequence lengths for every ``.fasta`` file in *input_dir*.

    For each file ``<stem>.fasta`` found directly inside *input_dir*, a file
    ``<input_dir>/csv/<stem>.csv`` is written with one row per sequence and
    three columns: the record id, the count of A/C/G/T characters in the
    sequence (case-insensitive), and the mean of those counts over the whole
    file rounded to two decimals (the same value repeated on every row).

    Args:
        input_dir: Folder holding the fasta files. Both this folder and its
            "csv" sub-folder are created if they do not exist.

    Returns:
        None. Results are written to disk.
    """
    csv_dir = os.path.join(input_dir, "csv")
    # Creates input_dir as well when it is missing; no error if both exist.
    os.makedirs(csv_dir, exist_ok=True)

    # Sorted so that files are processed in a reproducible order.
    fasta_files = sorted(
        entry for entry in os.listdir(input_dir) if entry.endswith(".fasta")
    )

    for fasta_name in tqdm(fasta_files):
        names = []
        lengths = []
        for record in SeqIO.parse(os.path.join(input_dir, fasta_name), "fasta"):
            names.append(record.id)
            lengths.append(_ungapped_length(record.seq))

        # A file with no records has no meaningful mean; use NaN explicitly
        # rather than letting numpy warn about the mean of an empty array.
        mean_length = round(np.mean(lengths), 2) if lengths else float("nan")

        data = {
            'Name of sequence': names,
            'Length of sequence': lengths,
            'Average sequence length': mean_length,
        }
        df = pd.DataFrame(data)
        df.index += 1  # number the rows from 1 in the exported csv

        # splitext drops only the final ".fasta"; splitting on the text
        # "fasta" left a trailing dot ("name..csv") and truncated stems that
        # themselves contain "fasta".
        csv_name = os.path.splitext(fasta_name)[0] + '.csv'
        df.to_csv(os.path.join(csv_dir, csv_name))


Seq_length_to_csv()