From 552e9589525c304d4015df52ee5f2cbf44b12caa Mon Sep 17 00:00:00 2001 From: Katzlab Date: Sun, 9 Jun 2024 12:44:26 -0400 Subject: [PATCH] Update CountTaxonOccurence.py updated to include summary of major and minor clades, and to allow target minor clade list --- Utilities/for_trees/CountTaxonOccurence.py | 164 ++++++++++++++------- 1 file changed, 107 insertions(+), 57 deletions(-) diff --git a/Utilities/for_trees/CountTaxonOccurence.py b/Utilities/for_trees/CountTaxonOccurence.py index db29a3c..9022d71 100644 --- a/Utilities/for_trees/CountTaxonOccurence.py +++ b/Utilities/for_trees/CountTaxonOccurence.py @@ -1,13 +1,15 @@ ''' -Author: Auden Cote-L'Heureux -Last updated: 10/24/23 -Motivation: Count the number of occurences of each taxa in each OG in a post guidance file +Author: Auden Cote-L'Heureux, Laura Katz and ChatGPT +Last updated: June 9th, 2024 +Motivation: Count the number of occurrences of each taxa in each OG in a post guidance file Dependencies: Bio python, os, sys -Inputs: Directory of postguidance files -Outputs: CSV file tallying all the counts of taxa in each OG file -Command line: python CountTaxonOccurence.py --input +Inputs: Directory of postguidance files +Optional: use the --minor flag and include a file named focal_minors.txt in same folder (do not put file name in command line). This file should be csv of targets (Am_tu, Sr_rh, Sr_ci) +Outputs: CSV file tallying all the counts of taxa in each OG file plus minor and major clade tallies +Command line: python3 CountTaxonOccurence_faster_minor.py --input --minor ''' + import os import sys from Bio import SeqIO @@ -15,60 +17,108 @@ import argparse def get_args(): - - parser = argparse.ArgumentParser( - prog = 'Taxon occurence counting script', - description = "Updated Oct 24th, 2023 by Auden Cote-L'Heureux." - ) - - parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to the folder containing the aligned/unaligned fasta files') - args = parser.parse_args() - - if(args.input.endswith('/')): - args.input = args.input[:-1] - - if(not os.path.isdir(args.input)): - print('\nThe input folder (--input) could not be found. Make sure you have given the correct path.\n') - exit() - - return args.input + parser = argparse.ArgumentParser( + prog = 'Taxon occurrence counting script', + description = "Updated June 9, 2024" + ) + parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to the folder containing the aligned/unaligned fasta files') + parser.add_argument('--minor', action='store_true', help = 'Flag to use focal minor clades from focal_minors.txt') + args = parser.parse_args() + + if(args.input.endswith('/')): + args.input = args.input[:-1] + + if(not os.path.isdir(args.input)): + print('\nThe input folder (--input) could not be found. Make sure you have given the correct path.\n') + exit() + + return args.input, args.minor -def count_tips(in_dir): +def count_tips(in_dir, use_focal_minors): + + focal_minors = [] + if use_focal_minors: + try: + with open('focal_minors.txt', 'r') as f: + focal_minors = f.read().strip().split(',') + focal_minors = [minor.strip() for minor in focal_minors] + except FileNotFoundError: + print('A file called focal_minors.txt must be included in the folder with your script. This file should have a csv of target minor clades such as "Am_tu, Sr_ci, Sr_rh"') + exit() + + count_data = {} + major_clades = set() + minor_clades = set() + + for file in os.listdir(in_dir): + if file.split('.')[-1] in ('fasta', 'fas', 'faa', 'fna'): + fname = os.path.join(in_dir, file) + + count_data[file] = {} + + # Use SeqIO.index for faster access + seq_index = SeqIO.index(fname, "fasta") + tips = [seq_index[record_id].id[:10] for record_id in seq_index] + + for tip in tips: + major_clade = tip[:2] + minor_clade = tip[:5] + + if use_focal_minors: + if minor_clade not in focal_minors: + continue + + major_clades.add(major_clade) + minor_clades.add(minor_clade) + + if tip not in count_data[file]: + count_data[file][tip] = 0 + count_data[file][tip] += 1 + + if use_focal_minors: + # Filter major and minor clades based on focal minors + major_clades = sorted({minor[:2] for minor in focal_minors}) + minor_clades = sorted(focal_minors) + else: + major_clades = sorted(major_clades) + minor_clades = sorted(minor_clades) + + taxa = sorted({tax for file_data in count_data.values() for tax in file_data}) + + # Generate output file name based on input folder name + folder_name = os.path.basename(in_dir) + output_file_name = folder_name + "_TaxonOccurrence.csv" + + with open(output_file_name, 'w') as o: + o.write(',' + ','.join(major_clades + minor_clades + taxa) + '\n') + for file in count_data: + o.write(file) + + major_clade_counts = {clade: 0 for clade in major_clades} + minor_clade_counts = {clade: 0 for clade in minor_clades} + + for tax, count in count_data[file].items(): + major_clade_counts[tax[:2]] += count + minor_clade_counts[tax[:5]] += count + + for clade in major_clades: + o.write(',' + str(major_clade_counts[clade])) + + for clade in minor_clades: + o.write(',' + str(minor_clade_counts[clade])) + + for tax in taxa: + if tax in count_data[file]: + o.write(',' + str(count_data[file][tax])) + else: + o.write(',0') + o.write('\n') + - count_data = { } - for file in os.listdir(in_dir): - if file.split('.')[-1] in ('fasta', 'fas', 'faa', 'fna'): - fname = in_dir + '/' + file - - count_data.update({ file : { } }) - tips = [record.id[:10] for record in SeqIO.parse(in_dir+'/'+file, 'fasta')] - - for tip in tips: - tip = tip.strip() - if(tip[:10] not in count_data[file]): - count_data[file].update({ tip[:10] : 0 }) - count_data[file][tip[:10]] += 1 - - taxa = sorted(list(dict.fromkeys([tax for file in count_data for tax in count_data[file]]))) - - with open('TaxonOccurrence.csv', 'w') as o: - o.write(',' + ','.join(taxa) + '\n') - for file in count_data: - o.write(file) - for tax in taxa: - if(tax in count_data[file]): - o.write(',' + str(count_data[file][tax])) - else: - o.write(',0') - o.write('\n') - - def main(): - - in_dir = get_args() - - count_tips(in_dir) - + in_dir, use_focal_minors = get_args() + count_tips(in_dir, use_focal_minors) + main()