diff --git a/Utilities/for_fastas/Cluster_v2.0.py b/Utilities/for_fastas/Cluster_v2.0.py index f03fd8d..c63a9e1 100644 --- a/Utilities/for_fastas/Cluster_v2.0.py +++ b/Utilities/for_fastas/Cluster_v2.0.py @@ -1,101 +1,66 @@ -#Professor L. Katz and Godwin Ani -#9th- Feb- 2023 -#A clustering program that accepts and validates users input +''' +#Author, date: Godwin Ani and Laura Katz, 9th- Feb - 2023. +#Dependencies: Python3, CD-Hit +#Inputs: A folder of containing Amino acid or DNA fasta files. +#Outputs: A folder of clustered files. +#Example: python Cluster_v2.0.py --type dna --identity 0.95 --overlap 0.67 --input input_folder_dna --output output_folder_dna +''' -import os, sys, re + + +import os +import argparse from tqdm import tqdm +import subprocess +def input_validation(value, error_message): + try: + integer, fractional = value.split('.') + value = float(value) + if int(integer) == 0 and len(fractional) == 2: + return value + except ValueError: + pass + print(error_message) + exit(1) -def cluster(): - ''' This function takes an amino acid or DNA sequence stored in a fasta format and clusters it. -It uses two nested functions (cluster_AA and cluster_DNA) to perform this operation. -Ensure that fasta files to be clustered are stored in a root directory folder named "to_cluster". This folder will automatically be created if none exists. -The result of the clustering process is saved into a root directory folder named "Clustered" which is created automatically if none exists. ''' - - if not os.path.isdir('to_cluster'): - os.mkdir('to_cluster') - - if not os.path.isdir('Clustered'): - os.mkdir('Clustered') - # Nested function for amino acids clustering. - def cluster_AA(): - #input validation for the sequence identity threshold. - while True: - try: - val1 =input( 'Amino Acids Sequence Identity Threshold (e.g. 0.99, 0.95)? : ') - integer, fractional = val1.split('.') - val1 = float(val1) - if int(integer)== 0 and len(fractional) == 2: - break - except ValueError: - pass - print('ERROR! Use format 0.## for Amino acids sequence identity threshold.') - - #Input validation for the overlap value. - while True: - try: - val2 =input( 'Amino Acids Alignment Overlap Value (e.g. 0.67, 0.75)? : ') - integer, fractional = val2.split('.') - val2 = float(val2) - if int(integer)== 0 and len(fractional) == 2: - break - except ValueError: - pass - print('ERROR! Use format 0.## for Amino acids sequence alignment overlap value') - - #Selects amino acids fasta files in "to_cluster" folder and clusters them with CD-HIT. - for file in tqdm(os.listdir('to_cluster')): - if file.endswith('.fasta'): - os.system('cd-hit -i to_cluster/' + file + ' -o Clustered/' + file + ' -c %s -d 0 -aS %s' %( val1, val2)) - #Renaming the result of the clustering. - for file in os.listdir('Clustered'): - if file.endswith('.clstr'): - os.rename('Clustered/' + file, 'Clustered/' + file.split('FILE')[0] + 'Clustered.txt') +def cluster_sequences(program, threshold, overlap, input_folder, output_folder): + for file in tqdm(os.listdir(input_folder)): + if file.endswith('.fasta'): + subprocess.run([f'{program}', '-i', f'{input_folder}/{file}', '-o', f'{output_folder}/{file}', '-c', f'{threshold}', '-d', '0', '-aS', f'{overlap}']) + for file in os.listdir(output_folder): + if file.endswith('.clstr'): + os.rename(f'{output_folder}/{file}', f'{output_folder}/{file.split("FILE")[0]}Clustered.txt') - #Nested function for DNA clustering. - def cluster_DNA(): - #Input validation for the sequence identity threshold. - while True: - try: - val1 =input( 'DNA Sequence Identity Threshold (e.g. 0.99, 0.95)? : ') - integer, fractional = val1.split('.') - val1 = float(val1) - if int(integer)== 0 and len(fractional) == 2: - break - except ValueError: - pass - print('ERROR! Use format 0.## for DNA sequence identity threshold.') - #Input validation for the overlap value. - while True: - try: - val2 =input( 'DNA Sequence Alignment Overlap Value (e.g. 0.67, 0.75)? : ') - integer, fractional = val2.split('.') - val2 = float(val2) - if int(integer)== 0 and len(fractional) == 2: - break - except ValueError: - pass - print('ERROR! Use format 0.## for DNA sequence alignment overlap value.') - - #Selects DNA fasta files in "to_cluster" folder and clusters them with CD-HIT. - for file in tqdm(os.listdir('to_cluster')): - if file.endswith('.fasta'): - os.system('cd-hit-est -i to_cluster/' + file + ' -o Clustered/' + file + ' -c %s -d 0 -aS %s' %( val1, val2)) +def main(): + parser = argparse.ArgumentParser(description='Cluster amino acid or DNA sequences using CD-HIT.') + parser.add_argument('--type', choices=['aa', 'dna'], required=True, help='Type of sequences (aa for Amino Acids, dna for DNA)') + parser.add_argument('--identity', type=str, required=True, help='Sequence Identity Threshold (e.g., 0.99, 0.95)') + parser.add_argument('--overlap', type=str, required=True, help='Sequence Alignment Overlap Value (e.g., 0.67, 0.75)') + parser.add_argument('--input', type=str, required=True, help='Input folder containing sequences in fasta format') + parser.add_argument('--output', type=str, required=True, help='Output folder for clustered sequences') - #Renaming the result of the clustering. - for file in os.listdir('Clustered'): - if file.endswith('.clstr'): - os.rename('Clustered/' + file, 'Clustered/' + file.split('FILE')[0] + 'Clustered.txt') + args = parser.parse_args() - # Prompts for user input and function call. - choice_1 = input('Are you clustering Amino Acids or DNA? (AA or DNA) : ') - if choice_1 in ['AA', 'Aa', 'aa']: - cluster_AA() - elif choice_1 in ['DNA', 'Dna', 'dna']: - cluster_DNA() + if not os.path.isdir(args.input): + print(f'Error: Input folder "{args.input}" does not exist.') + exit(1) + + if not os.path.isdir(args.output): + os.mkdir(args.output) + + if args.type == 'aa': + threshold = input_validation(args.threshold, 'ERROR! Use format 0.## for Amino acids sequence identity threshold.') + overlap = input_validation(args.overlap, 'ERROR! Use format 0.## for Amino acids sequence alignment overlap value.') + cluster_sequences('cd-hit', threshold, overlap, args.input, args.output) + elif args.type == 'dna': + threshold = input_validation(args.threshold, 'ERROR! Use format 0.## for DNA sequence identity threshold.') + overlap = input_validation(args.overlap, 'ERROR! Use format 0.## for DNA sequence alignment overlap value.') + cluster_sequences('cd-hit-est', threshold, overlap, args.input, args.output) else: - print('Sorry. This program can only cluster Amino Acids and DNA') - -cluster() + print('Invalid sequence type. Choose "aa" for Amino Acids or "dna" for DNA.') + exit(1) +if __name__ == "__main__": + main()