#Professor L. Katz and Godwin Ani
#9th- Feb- 2023
#A clustering program that accepts and validates users input

import os
import re
import subprocess
import sys

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws the progress bar; without it, iterate the files plainly.
    def tqdm(iterable):
        return iterable


# Folder scanned for *.fasta inputs and folder receiving the CD-HIT output.
_INPUT_DIR = 'to_cluster'
_OUTPUT_DIR = 'Clustered'

# Exactly "0." followed by two ASCII digits, e.g. 0.95.
_FRACTION_RE = re.compile(r'0\.[0-9]{2}')

# Normalised user choice -> (program, identity prompt, identity error,
#                            overlap prompt, overlap error).
_SEQUENCE_TYPES = {
    'aa': (
        'cd-hit',
        'Amino Acids Sequence Identity Threshold (e.g. 0.99, 0.95)? : ',
        'ERROR! Use format 0.## for Amino acids sequence identity threshold.',
        'Amino Acids Alignment Overlap Value (e.g. 0.67, 0.75)? : ',
        'ERROR! Use format 0.## for Amino acids sequence alignment overlap value',
    ),
    'dna': (
        'cd-hit-est',
        'DNA Sequence Identity Threshold (e.g. 0.99, 0.95)? : ',
        'ERROR! Use format 0.## for DNA sequence identity threshold.',
        'DNA Sequence Alignment Overlap Value (e.g. 0.67, 0.75)? : ',
        'ERROR! Use format 0.## for DNA sequence alignment overlap value.',
    ),
}


def _parse_fraction(text):
    '''Return text as a float when it has the exact form 0.##, otherwise None.

    Surrounding whitespace is ignored. A strict pattern is used instead of
    int()/float() parsing so that inputs such as "-0.99" or "0.e5" are rejected.
    '''
    text = text.strip()
    if _FRACTION_RE.fullmatch(text):
        return float(text)
    return None


def _prompt_fraction(prompt, error_message):
    '''Keep asking with prompt until the answer has the form 0.##; return it as a float.

    error_message is printed after every rejected answer.
    '''
    while True:
        value = _parse_fraction(input(prompt))
        if value is not None:
            return value
        print(error_message)


def _run_cd_hit(program, identity, overlap):
    '''Run program on every .fasta file in "to_cluster", then rename the .clstr results.

    identity is passed to the program as -c and overlap as -aS; "-d 0" is
    passed through unchanged from the original command line.
    '''
    for name in tqdm(os.listdir(_INPUT_DIR)):
        if not name.endswith('.fasta'):
            continue
        # An argument list (no shell) keeps odd file names from being split
        # or interpreted as shell syntax.
        command = [
            program,
            '-i', os.path.join(_INPUT_DIR, name),
            '-o', os.path.join(_OUTPUT_DIR, name),
            '-c', str(identity),
            '-d', '0',
            '-aS', str(overlap),
        ]
        try:
            result = subprocess.run(command)
        except FileNotFoundError:
            print('ERROR! Could not find the program "%s". Is CD-HIT installed and on PATH?' % program,
                  file=sys.stderr)
            break
        if result.returncode != 0:
            print('WARNING! %s exited with status %d for %s' % (program, result.returncode, name),
                  file=sys.stderr)

    #Renaming the result of the clustering.
    # NOTE(review): the name is cut at the literal text 'FILE'; a name without
    # it keeps its full original name (including ".clstr") in front of
    # "Clustered.txt" -- confirm this is the intended naming.
    for name in os.listdir(_OUTPUT_DIR):
        if name.endswith('.clstr'):
            os.rename(os.path.join(_OUTPUT_DIR, name),
                      os.path.join(_OUTPUT_DIR, name.split('FILE')[0] + 'Clustered.txt'))


def cluster():
    '''Cluster the amino acid or DNA fasta files found in the "to_cluster" folder.

    The user is asked whether the sequences are amino acids (AA) or DNA, then
    for a sequence identity threshold and an alignment overlap value, both in
    the form 0.##. Amino acids are clustered with cd-hit and DNA with
    cd-hit-est. The answer to the AA/DNA question is matched ignoring case and
    surrounding whitespace.

    Fasta files to be clustered must be stored in a folder named "to_cluster"
    in the current working directory; results are written to a folder named
    "Clustered". Both folders are created automatically if they do not exist.
    '''
    os.makedirs(_INPUT_DIR, exist_ok=True)
    os.makedirs(_OUTPUT_DIR, exist_ok=True)

    # Prompts for user input and dispatches on the sequence type.
    choice = input('Are you clustering Amino Acids or DNA? (AA or DNA) : ').strip().lower()
    settings = _SEQUENCE_TYPES.get(choice)
    if settings is None:
        print('Sorry. This program can only cluster Amino Acids and DNA')
        return

    program, identity_prompt, identity_error, overlap_prompt, overlap_error = settings
    identity = _prompt_fraction(identity_prompt, identity_error)
    overlap = _prompt_fraction(overlap_prompt, overlap_error)
    _run_cd_hit(program, identity, overlap)


if __name__ == '__main__':
    cluster()