From 552e9589525c304d4015df52ee5f2cbf44b12caa Mon Sep 17 00:00:00 2001
From: Katzlab <lkatz@smith.edu>
Date: Sun, 9 Jun 2024 12:44:26 -0400
Subject: [PATCH] Update CountTaxonOccurence.py

updated to include summary of major and minor clades, and to allow target minor clade list
---
 Utilities/for_trees/CountTaxonOccurence.py | 164 ++++++++++++++-------
 1 file changed, 107 insertions(+), 57 deletions(-)
diff --git a/Utilities/for_trees/CountTaxonOccurence.py b/Utilities/for_trees/CountTaxonOccurence.py
index db29a3c..9022d71 100644
--- a/Utilities/for_trees/CountTaxonOccurence.py
+++ b/Utilities/for_trees/CountTaxonOccurence.py
@@ -1,13 +1,15 @@
 '''
-Author: Auden Cote-L'Heureux
-Last updated: 10/24/23
-Motivation: Count the number of occurences of each taxa in each OG in a post guidance file
+Author: Auden Cote-L'Heureux, Laura Katz and ChatGPT
+Last updated: June 9th, 2024
+Motivation: Count the number of occurrences of each taxa in each OG in a post guidance file
 Dependencies: Bio python, os, sys
-Inputs: Directory of postguidance files
-Outputs: CSV file tallying all the counts of taxa in each OG file
-Command line: python CountTaxonOccurence.py --input <dir of postguidance files>
+Inputs: Directory of postguidance files 
+Optional: use the --minor flag and include a file named focal_minors.txt in same folder (do not put file name in command line). This file should be csv of targets (Am_tu, Sr_rh, Sr_ci)
+Outputs: CSV file tallying all the counts of taxa in each OG file plus minor and major clade tallies
+Command line: python3 CountTaxonOccurence_faster_minor.py  --input <dir of postguidance files> --minor
 '''
 
+
 import os
 import sys
 from Bio import SeqIO
@@ -15,60 +17,108 @@ import argparse
 
 
 def get_args():
-
-	parser = argparse.ArgumentParser(
-		prog = 'Taxon occurence counting script',
-		description = "Updated Oct 24th, 2023 by Auden Cote-L'Heureux."
-	)
-
-	parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to the folder containing the aligned/unaligned fasta files')
-	args = parser.parse_args()
-		
-	if(args.input.endswith('/')):
-		args.input = args.input[:-1]
-		
-	if(not os.path.isdir(args.input)):
-		print('\nThe input folder (--input) could not be found. Make sure you have given the correct path.\n')
-		exit()
-				
-	return args.input
+    parser = argparse.ArgumentParser(
+        prog = 'Taxon occurrence counting script',
+        description = "Updated June 9, 2024"
+    )
+    parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to the folder containing the aligned/unaligned fasta files')
+    parser.add_argument('--minor', action='store_true', help = 'Flag to use focal minor clades from focal_minors.txt')
+    args = parser.parse_args()
+        
+    if(args.input.endswith('/')):
+        args.input = args.input[:-1]
+        
+    if(not os.path.isdir(args.input)):
+        print('\nThe input folder (--input) could not be found. Make sure you have given the correct path.\n')
+        exit()
+                
+    return args.input, args.minor
 
 
-def count_tips(in_dir):
+def count_tips(in_dir, use_focal_minors):
+    
+    focal_minors = []
+    if use_focal_minors:
+        try:
+            with open('focal_minors.txt', 'r') as f:
+                focal_minors = f.read().strip().split(',')
+                focal_minors = [minor.strip() for minor in focal_minors]
+        except FileNotFoundError:
+            print('A file called focal_minors.txt must be included in the folder with your script. This file should have a csv of target minor clades such as "Am_tu, Sr_ci, Sr_rh"')
+            exit()
+    
+    count_data = {}
+    major_clades = set()
+    minor_clades = set()
+    
+    for file in os.listdir(in_dir):
+        if file.split('.')[-1] in ('fasta', 'fas', 'faa', 'fna'):
+            fname = os.path.join(in_dir, file)
+            
+            count_data[file] = {}
+            
+            # Use SeqIO.index for faster access
+            seq_index = SeqIO.index(fname, "fasta")
+            tips = [seq_index[record_id].id[:10] for record_id in seq_index]
+            
+            for tip in tips:
+                major_clade = tip[:2]
+                minor_clade = tip[:5]
+                
+                if use_focal_minors:
+                    if minor_clade not in focal_minors:
+                        continue
+                
+                major_clades.add(major_clade)
+                minor_clades.add(minor_clade)
+                
+                if tip not in count_data[file]:
+                    count_data[file][tip] = 0
+                count_data[file][tip] += 1
+    
+    if use_focal_minors:
+        # Filter major and minor clades based on focal minors
+        major_clades = sorted({minor[:2] for minor in focal_minors})
+        minor_clades = sorted(focal_minors)
+    else:
+        major_clades = sorted(major_clades)
+        minor_clades = sorted(minor_clades)
+    
+    taxa = sorted({tax for file_data in count_data.values() for tax in file_data})
+    
+    # Generate output file name based on input folder name
+    folder_name = os.path.basename(in_dir)
+    output_file_name = folder_name + "_TaxonOccurrence.csv"
+    
+    with open(output_file_name, 'w') as o:
+        o.write(',' + ','.join(major_clades + minor_clades + taxa) + '\n')
+        for file in count_data:
+            o.write(file)
+            
+            major_clade_counts = {clade: 0 for clade in major_clades}
+            minor_clade_counts = {clade: 0 for clade in minor_clades}
+            
+            for tax, count in count_data[file].items():
+                major_clade_counts[tax[:2]] += count
+                minor_clade_counts[tax[:5]] += count
+            
+            for clade in major_clades:
+                o.write(',' + str(major_clade_counts[clade]))
+                
+            for clade in minor_clades:
+                o.write(',' + str(minor_clade_counts[clade]))
+            
+            for tax in taxa:
+                if tax in count_data[file]:
+                    o.write(',' + str(count_data[file][tax]))
+                else:
+                    o.write(',0')
+            o.write('\n')
+
 
-	count_data = { }
-	for file in os.listdir(in_dir):
-		if file.split('.')[-1] in ('fasta', 'fas', 'faa', 'fna'):
-			fname = in_dir + '/' + file
-			
-			count_data.update({ file : { } })
-			tips = [record.id[:10] for record in SeqIO.parse(in_dir+'/'+file, 'fasta')]
-					
-			for tip in tips:
-				tip = tip.strip()
-				if(tip[:10] not in count_data[file]):
-					count_data[file].update({ tip[:10] : 0 })
-				count_data[file][tip[:10]] += 1
-				
-	taxa = sorted(list(dict.fromkeys([tax for file in count_data for tax in count_data[file]])))
-				
-	with open('TaxonOccurrence.csv', 'w') as o:
-		o.write(',' + ','.join(taxa) + '\n')
-		for file in count_data:
-			o.write(file)
-			for tax in taxa:
-				if(tax in count_data[file]):
-					o.write(',' + str(count_data[file][tax]))
-				else:
-					o.write(',0')
-			o.write('\n')	
-	
-	
 def main():
-
-	in_dir = get_args()
-
-	count_tips(in_dir)
-	
+    in_dir, use_focal_minors = get_args()
+    count_tips(in_dir, use_focal_minors)
+    
 
 main()