EukPhylo/Utilities/for_MSAs/Gappiness.py
2024-06-03 16:09:29 -04:00

86 lines
4.1 KiB
Python

#Author, date: Godwin Ani, last updated Nov 7th 2023
#Motivation: Produce gappyness stats for paralogs in an alignment.
#Intent: To use the gappyness stats in filtering paralogs.
#Inputs: A folder of alignment files
#Outputs: A spreadsheet of the gappyness stats
#Example: python Gappiness.py --alignment /Path/to/alignments --code ten digit code
#Dependencies
import os, sys, re
from Bio import SeqIO
from tqdm import tqdm
import itertools
import pandas as pd
import argparse
parser = argparse.ArgumentParser(
prog = 'gappiness assessment script',
description = "Updated November 7, 2023 by Godwin Ani.")
parser.add_argument('-a', '--alignment', help = 'The alignment files folder')
parser.add_argument('-c', '--code', help = 'The ten digit code for gappiness assessment')
args = parser.parse_args()
def faralog_gaps():
df = pd.DataFrame()
#looping through the fasta files in the folder.
for file in tqdm(os.listdir(args.alignment)):
if file.endswith('.fasta'):
#creating empty lists to store the id, sequence, and splitted sequences of the fasta files.
name =[]
seq = []
seq_len = []
outer = []
total = []
split = []
#reading the fasta files with Biopython (looping each sequence in a file and populating the empty name and seq lists).
for x in SeqIO.parse(args.alignment + '/' + file, "fasta"):
if (args.code != None and x.id.startswith(args.code)) or args.code == None:
name.append(x.id)
seq.append(x.seq)
seq_len.append(len(x.seq))
total.append(x.seq.count('-'))
#looping through the seq list, converting the Biopython seq object to string, and splitting it with all possible bases to get a list of gaps.
for x in seq:
split.append(re.split(r'[A-Z]', str(x)))
#looping through the gap lists and summing the first splitted and last splitted gaps in the sequence to get the terminal gaps length.
for x in split:
outer.append(len(x[0] + x[-1]))
internal = [total - outer for total, outer in zip(total, outer)]
data = {'Sequence': name, 'Terminal_gaps': outer, 'Internal_gaps':internal,'Aln_length':seq_len}
df1 = pd.DataFrame(data)
df1['Total'] = df1['Terminal_gaps'] + df1['Internal_gaps']
df1['Total'] = df1['Total']/df1['Aln_length']
df1['OG'] = df1['Sequence'].str[-10:]
df1['Avg_terminal_gaps_for_OG'] = df1['Terminal_gaps'].mean()
df1['Avg_internal_gaps_for_OG'] = df1['Internal_gaps'].mean()
df1['#term/avgTerm'] = df1['Terminal_gaps']/df1['Avg_terminal_gaps_for_OG']
df1['#int/avgint'] = df1['Internal_gaps']/df1['Avg_internal_gaps_for_OG']
best = df1[df1.Total == df1.Total.min()]
best_terminal_gaps = best['Terminal_gaps'].iloc[0]
if best_terminal_gaps == 0:
best_terminal_gaps += 1
best_internal_gaps = best['Internal_gaps'].iloc[0]
if best_internal_gaps == 0:
best_internal_gaps += 1
df1['Seq_term/best_seq_term'] = df1['Terminal_gaps']/best_terminal_gaps
df1['Seq_int/best_seq_int'] = df1['Internal_gaps']/best_internal_gaps
df1['Seq_term/avgTerm'] = df1['Terminal_gaps']/df1['Avg_terminal_gaps_for_OG']
df1['Seq_int/avgInt'] = df1['Internal_gaps']/df1['Avg_internal_gaps_for_OG']
cond = best['Total'].iloc[0]
df1['Best_seq'] = ['Yes' if x == cond else 'No' for x in df1['Total']]
df = pd.concat([df,df1])
df = df[['OG', 'Sequence', 'Best_seq', 'Terminal_gaps', 'Avg_terminal_gaps_for_OG','Seq_term/best_seq_term','Seq_term/avgTerm','Internal_gaps','Avg_internal_gaps_for_OG','Seq_int/best_seq_int','Seq_int/avgInt','Aln_length', 'Total']]
df = df.round(3)
df.to_csv(args.code + '.csv', index = False)
faralog_gaps()