Update Cluster_v2.0.py

2026-02-11 00:50:25 +08:00 · 2024-01-18 09:29:46 -05:00 · 2024-01-18 09:29:46 -05:00 · e8afd97a16
commit e8afd97a16
parent ff9a419c4e
1 changed files with 56 additions and 91 deletions
--- a/Utilities/for_fastas/Cluster_v2.0.py
+++ b/Utilities/for_fastas/Cluster_v2.0.py
@ -1,101 +1,66 @@
-#Professor L. Katz and Godwin Ani
-#9th- Feb- 2023
-#A clustering program that accepts and validates users input
+'''
+#Author, date: Godwin Ani and Laura Katz, 9th- Feb - 2023.
+#Dependencies: Python3, CD-Hit
+#Inputs: A folder of containing  Amino acid or DNA fasta files.
+#Outputs: A folder of clustered files.
+#Example: python Cluster_v2.0.py --type dna --identity 0.95 --overlap 0.67 --input input_folder_dna --output output_folder_dna
+'''

-import os, sys, re
+
+
+import os
+import argparse
 from tqdm import tqdm
+import subprocess

+def input_validation(value, error_message):
+    try:
+        integer, fractional = value.split('.')
+        value = float(value)
+        if int(integer) == 0 and len(fractional) == 2:
+            return value
+    except ValueError:
+        pass
+    print(error_message)
+    exit(1)

-def cluster():
-    ''' This function takes an amino acid or DNA sequence stored in a fasta format and clusters it.
-It uses two nested functions (cluster_AA and cluster_DNA) to perform this operation.
-Ensure that fasta files to be clustered are stored in a root directory folder named "to_cluster". This folder will automatically be created if none exists.
-The result of the clustering process is saved into a root directory folder named "Clustered" which is created automatically if none exists. '''
-  
-    if not os.path.isdir('to_cluster'):
-        os.mkdir('to_cluster')
-        
-    if not os.path.isdir('Clustered'):
-        os.mkdir('Clustered')
-    # Nested function for amino acids clustering.    
-    def cluster_AA():
-    #input validation for the sequence identity threshold.
-        while True:
-            try:
-                val1 =input( 'Amino Acids Sequence Identity Threshold (e.g. 0.99, 0.95)? : ')
-                integer, fractional = val1.split('.')
-                val1 = float(val1)
-                if int(integer)== 0 and len(fractional) == 2:
-                    break
-            except ValueError:
-                pass
-            print('ERROR! Use format 0.## for Amino acids sequence identity threshold.')
- 
-      #Input validation for the overlap value.
-        while True:
-            try:
-                val2 =input( 'Amino Acids Alignment Overlap Value (e.g. 0.67, 0.75)? : ')
-                integer, fractional = val2.split('.')
-                val2 = float(val2)
-                if int(integer)== 0 and len(fractional) == 2:
-                    break
-            except ValueError:
-                pass
-            print('ERROR! Use format 0.## for Amino acids sequence alignment overlap value')
-            
-  #Selects amino acids fasta files in "to_cluster" folder and clusters them with CD-HIT.
-        for file in tqdm(os.listdir('to_cluster')):
-            if file.endswith('.fasta'):
-                os.system('cd-hit -i to_cluster/' + file + ' -o Clustered/' + file + ' -c %s -d 0 -aS %s' %( val1, val2))
-     #Renaming the result of the clustering.
-        for file in os.listdir('Clustered'):
-            if file.endswith('.clstr'):
-                os.rename('Clustered/' + file, 'Clustered/' + file.split('FILE')[0] + 'Clustered.txt')
+def cluster_sequences(program, threshold, overlap, input_folder, output_folder):
+    for file in tqdm(os.listdir(input_folder)):
+        if file.endswith('.fasta'):
+            subprocess.run([f'{program}', '-i', f'{input_folder}/{file}', '-o', f'{output_folder}/{file}', '-c', f'{threshold}', '-d', '0', '-aS', f'{overlap}'])

+    for file in os.listdir(output_folder):
+        if file.endswith('.clstr'):
+            os.rename(f'{output_folder}/{file}', f'{output_folder}/{file.split("FILE")[0]}Clustered.txt')

-    #Nested function for DNA clustering.
-    def cluster_DNA():
-        #Input validation for the sequence identity threshold.
-        while True:
-            try:
-                val1 =input( 'DNA Sequence Identity Threshold (e.g. 0.99, 0.95)? : ')
-                integer, fractional = val1.split('.')
-                val1 = float(val1)
-                if int(integer)== 0 and len(fractional) == 2:
-                    break
-            except ValueError:
-                pass
-            print('ERROR! Use format 0.## for DNA sequence identity threshold.')
-      #Input validation for the overlap value. 
-        while True:
-            try:
-                val2 =input( 'DNA Sequence Alignment Overlap Value (e.g. 0.67, 0.75)? : ')
-                integer, fractional = val2.split('.')
-                val2 = float(val2)
-                if int(integer)== 0 and len(fractional) == 2:
-                    break
-            except ValueError:
-                pass
-            print('ERROR! Use format 0.## for DNA sequence alignment overlap value.')
-            
-            #Selects DNA fasta files in "to_cluster" folder and clusters them with CD-HIT.
-        for file in tqdm(os.listdir('to_cluster')):
-            if file.endswith('.fasta'):
-                os.system('cd-hit-est -i to_cluster/' + file + ' -o Clustered/' + file + ' -c %s -d 0 -aS %s' %( val1, val2))
+def main():
+    parser = argparse.ArgumentParser(description='Cluster amino acid or DNA sequences using CD-HIT.')
+    parser.add_argument('--type', choices=['aa', 'dna'], required=True, help='Type of sequences (aa for Amino Acids, dna for DNA)')
+    parser.add_argument('--identity', type=str, required=True, help='Sequence Identity Threshold (e.g., 0.99, 0.95)')
+    parser.add_argument('--overlap', type=str, required=True, help='Sequence Alignment Overlap Value (e.g., 0.67, 0.75)')
+    parser.add_argument('--input', type=str, required=True, help='Input folder containing sequences in fasta format')
+    parser.add_argument('--output', type=str, required=True, help='Output folder for clustered sequences')

-        #Renaming the result of the clustering.
-        for file in os.listdir('Clustered'):
-            if file.endswith('.clstr'):
-                os.rename('Clustered/' + file, 'Clustered/' + file.split('FILE')[0] + 'Clustered.txt')
+    args = parser.parse_args()

-    # Prompts for user input and function call.
-    choice_1 = input('Are you clustering Amino Acids or DNA? (AA or DNA) : ')
-    if choice_1 in ['AA', 'Aa', 'aa']:
-        cluster_AA()
-    elif choice_1 in ['DNA', 'Dna', 'dna']:
-        cluster_DNA()
+    if not os.path.isdir(args.input):
+        print(f'Error: Input folder "{args.input}" does not exist.')
+        exit(1)
+
+    if not os.path.isdir(args.output):
+        os.mkdir(args.output)
+
+    if args.type == 'aa':
+        threshold = input_validation(args.threshold, 'ERROR! Use format 0.## for Amino acids sequence identity threshold.')
+        overlap = input_validation(args.overlap, 'ERROR! Use format 0.## for Amino acids sequence alignment overlap value.')
+        cluster_sequences('cd-hit', threshold, overlap, args.input, args.output)
+    elif args.type == 'dna':
+        threshold = input_validation(args.threshold, 'ERROR! Use format 0.## for DNA sequence identity threshold.')
+        overlap = input_validation(args.overlap, 'ERROR! Use format 0.## for DNA sequence alignment overlap value.')
+        cluster_sequences('cd-hit-est', threshold, overlap, args.input, args.output)
    else:
-        print('Sorry. This program can only cluster Amino Acids and DNA')
-
-cluster()
+        print('Invalid sequence type. Choose "aa" for Amino Acids or "dna" for DNA.')
+        exit(1)

+if __name__ == "__main__":
+    main()