EukPhylo/Utilities/for_fastas/SeqLenToCsv_v1.0.py

'''
#Author, date: Godwin Ani, 9th- Feb - 2023.
#Dependencies: Python3, Biopython, NumPy, pandas, tqdm
#Intent: Exports the length of DNA sequences excluding gaps and missing data to a csv file.
#Inputs: A folder named 'Seq_length' containing DNA fasta files.
#Outputs: One csv file per fasta file, saved in 'Seq_length/csv', listing the length of each sequence and the average sequence length.
#Example: python SeqLenToCsv_v1.0.py
'''


import os, sys, re
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np
import pandas as pd
from tqdm import tqdm

'''Ensure that the DNA fasta files (with the ".fasta" extension) are stored in a folder named "Seq_length" in the current working directory. This folder is created automatically if it does not exist.
The results are saved in a sub-folder of "Seq_length" named "csv", which is also created automatically if it does not exist. '''


def Seq_length_to_csv():
    '''Export the ungapped length of every sequence in each fasta file to a csv file.

    Reads every file ending in ".fasta" from the "Seq_length" folder in the
    current working directory and writes one csv file per fasta file into
    "Seq_length/csv". Both folders are created if they do not exist.

    The length of a sequence is the number of A, C, G and T characters
    (upper or lower case) it contains; gaps, N and every other symbol are
    not counted. Each csv file has a 1-based index and the columns
    'Name of sequence', 'Length of sequence' and 'Average sequence length'
    (the mean length over the file, rounded to 2 decimals and repeated on
    every row).

    The csv file is named after the fasta file with its extension replaced,
    e.g. "genes.fasta" -> "genes.csv". A fasta file that contains no records
    produces no csv file.
    '''
    input_dir = 'Seq_length'
    output_dir = os.path.join(input_dir, 'csv')
    # makedirs creates the input folder and the csv sub-folder in one call
    # and does not fail when they already exist.
    os.makedirs(output_dir, exist_ok=True)

    fasta_files = [f for f in os.listdir(input_dir) if f.endswith('.fasta')]

    for fasta_file in tqdm(fasta_files):
        names = []
        lengths = []
        for record in SeqIO.parse(os.path.join(input_dir, fasta_file), "fasta"):
            names.append(record.id)
            seq = record.seq
            # Count only unambiguous nucleotides so gaps and missing data are excluded.
            lengths.append(sum(seq.count(base) for base in 'ACGTacgt'))

        # Nothing to report for a file without records; skipping it also
        # avoids taking the mean of an empty list.
        if not names:
            continue

        mean_length = round(np.mean(lengths), 2)
        df = pd.DataFrame({
            'Name of sequence': names,
            'Length of sequence': lengths,
            'Average sequence length': mean_length,
        })
        df.index += 1  # number the rows from 1 instead of 0

        # splitext removes only the trailing ".fasta", so the name keeps any
        # earlier "fasta" substring and does not end up with a double dot.
        csv_name = os.path.splitext(fasta_file)[0] + '.csv'
        # Write once per fasta file, after all records have been read.
        df.to_csv(os.path.join(output_dir, csv_name))


# Run the export as soon as this file is executed; the function takes no arguments.
Seq_length_to_csv()