SeqLengthToCsv

Seq_length_to_csv is a program that exports the length of DNA sequences excluding gaps and missing data to a csv file.
2026-02-10 22:30:24 +08:00 · 2023-02-09 18:41:01 +01:00 · 2023-02-09 18:41:01 +01:00 · 9f86dbe040
commit 9f86dbe040
parent 341c9a487b
1 changed files with 48 additions and 0 deletions
--- a/Utilities/for_fastas/SeqLengthToCsv_Feb9.py
+++ b/Utilities/for_fastas/SeqLengthToCsv_Feb9.py
@ -0,0 +1,48 @@
+'''
+Professor L. Katz and Godwin Ani
+9th-Feb-2023
+Seq_length_to_csv is a program that exports the length of DNA sequences excluding gaps and missing data to a csv file.
+'''
+
+import os, sys, re
+from Bio import SeqIO
+from Bio.Seq import Seq
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+'''Ensure that DNA fasta files are stored in a root directory folder named "Seq_length". This folder will automatically be created if none exists.
+	The result of the process is saved into a sub-folder within the "Seq_length" folder named "csv" which is created automatically if none exists. '''
+	
+	
+def Seq_length_to_csv():
+    if not os.path.isdir('Seq_length'):
+        os.mkdir('Seq_length')
+    
+    if not os.path.isdir('Seq_length/csv/'):
+        os.mkdir('Seq_length/csv/')
+        
+    list = []
+    for file in os.listdir('Seq_length'):
+        if file.endswith('.fasta'):
+            list.append(file)
+            
+    for x in tqdm(list):
+        name =[]
+        seq_length = []
+        file_name = x.split('fasta')[0] + '.csv'
+        for x in SeqIO.parse('Seq_length/' + x, "fasta"):
+            name.append(x.id)
+            seq = x.seq
+            seq_length.append(seq.count("A") + seq.count("a") + seq.count("T") + seq.count("t") + seq.count("G") + seq.count("g") + seq.count("C") + seq.count("c"))
+            a = np.array([seq_length])
+            mean_length = round(np.mean(a), 2)
+            data = {'Name of sequence' : name, 'Length of sequence' : seq_length, 'Average sequence length' : mean_length}
+            df = pd.DataFrame(data)
+            df.index += 1
+            df.to_csv('Seq_length/csv/' + file_name)
+
+
+Seq_length_to_csv()
+
+