SeqLengthToCsv

Seq_length_to_csv is a program that exports the length of DNA sequences excluding gaps and missing data to a csv file.
This commit is contained in:
Godwin Ani 2023-02-09 18:41:01 +01:00 committed by GitHub
parent 341c9a487b
commit 9f86dbe040
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -0,0 +1,48 @@
'''
Professor L. Katz and Godwin Ani
9th-Feb-2023
Seq_length_to_csv is a program that exports the length of DNA sequences excluding gaps and missing data to a csv file.
'''
import os, sys, re
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np
import pandas as pd
from tqdm import tqdm
'''Place the DNA FASTA files (file names ending in ".fasta") in a folder named "Seq_length" in the current working directory. This folder is created automatically if it does not exist.
The results are saved as csv files in a sub-folder named "csv" inside the "Seq_length" folder, which is also created automatically if it does not exist.'''
def Seq_length_to_csv():
    '''Write one csv of ungapped sequence lengths per FASTA file in "Seq_length".

    Every entry in the "Seq_length" folder whose name ends in ".fasta" is
    parsed as FASTA. For each record, only the characters A, C, G and T
    (upper or lower case) are counted, so gaps, missing data and any other
    symbols do not contribute to the length.

    The csv for "<name>.fasta" is written to "Seq_length/csv/<name>.csv".
    It has one row per sequence (row index starting at 1) holding the record
    id, its counted length, and the mean length over the whole file rounded
    to two decimals (the same value repeated on every row).

    Both folders are created if they do not already exist. Takes no
    arguments and returns None.
    '''
    input_dir = 'Seq_length'
    output_dir = os.path.join(input_dir, 'csv')
    # makedirs creates the input folder and the csv sub-folder in one call
    # and is a no-op when they are already there.
    os.makedirs(output_dir, exist_ok=True)

    fasta_files = [
        entry for entry in os.listdir(input_dir) if entry.endswith('.fasta')
    ]

    for fasta_name in tqdm(fasta_files):
        names = []
        seq_lengths = []
        for record in SeqIO.parse(os.path.join(input_dir, fasta_name), "fasta"):
            names.append(record.id)
            # Count unambiguous nucleotides only, in either case.
            seq_lengths.append(
                sum(record.seq.count(base) for base in 'ACGTacgt')
            )

        # A FASTA file with no records has nothing to average; use NaN
        # directly instead of letting np.mean warn about an empty slice.
        mean_length = round(np.mean(seq_lengths), 2) if seq_lengths else np.nan

        data = {
            'Name of sequence': names,
            'Length of sequence': seq_lengths,
            'Average sequence length': mean_length,
        }
        df = pd.DataFrame(data)
        df.index += 1  # number the rows from 1 rather than 0

        # splitext strips only the final ".fasta". Splitting on the text
        # 'fasta' left a trailing dot ("name..csv") and truncated any file
        # name that contained "fasta" earlier in the name.
        csv_name = os.path.splitext(fasta_name)[0] + '.csv'
        df.to_csv(os.path.join(output_dir, csv_name))
# Run the export when this file is executed. There is no __main__ guard, so
# this call also runs if the file is imported as a module.
Seq_length_to_csv()