"""Count how often each taxon occurs in each OG file of a post-Guidance folder.

Author: Auden Cote-L'Heureux
Last updated: 10/24/23

Motivation: Count the number of occurrences of each taxon in each OG in a
    post-Guidance file.
Dependencies: Biopython (plus argparse, collections, csv, os, sys from the
    standard library)
Inputs: Directory of post-Guidance files
Outputs: CSV file (TaxonOccurrence.csv, written to the current working
    directory) tallying the counts of each taxon in each OG file
Command line: python CountTaxonOccurence_v2.0.py --input <folder>
"""

import argparse
import csv
import os
import sys
from collections import Counter

from Bio import SeqIO

# Files are selected by the text after their last '.' (case-sensitive).
FASTA_EXTENSIONS = ('fasta', 'fas', 'faa', 'fna')

# Number of leading characters of a sequence ID that are used as the taxon key.
TAXON_CODE_LENGTH = 10

OUTPUT_FILE = 'TaxonOccurrence.csv'


def get_args():
    """Parse the command line and return the validated input folder.

    Returns:
        str: The value of ``--input`` with trailing ``/`` characters removed.
        A value made up only of slashes (e.g. ``/``) is returned unchanged.

    Raises:
        SystemExit: With status 1 if the folder does not exist; argparse
            itself exits with status 2 when ``--input`` is missing.
    """
    parser = argparse.ArgumentParser(
        prog='Taxon occurence counting script',
        description="Updated Oct 24th, 2023 by Auden Cote-L'Heureux.",
    )
    parser.add_argument(
        '-i', '--input', type=str, required=True,
        help='Path to the folder containing the aligned/unaligned fasta files',
    )
    args = parser.parse_args()

    # Drop trailing slashes, but fall back to the raw value so that the
    # filesystem root ("/") is not reduced to an empty string.
    in_dir = args.input.rstrip('/') or args.input

    if not os.path.isdir(in_dir):
        # sys.exit(str) writes the message to stderr and exits with status 1,
        # so callers and shell pipelines can detect the failure.
        sys.exit('\nThe input folder (--input) could not be found. '
                 'Make sure you have given the correct path.\n')

    return in_dir


def count_tips(in_dir):
    """Tally sequence-ID prefixes per FASTA file and write them to a CSV.

    Every entry of ``in_dir`` whose name ends in one of ``FASTA_EXTENSIONS``
    is parsed as FASTA. The first ``TAXON_CODE_LENGTH`` characters of each
    record ID are used as the taxon key, and the number of records per key
    is counted for each file.

    The result is written to ``OUTPUT_FILE`` in the current working
    directory: a header row with an empty first cell followed by the sorted
    taxon keys, then one row per input file (sorted by file name) holding
    the file name and the count for each taxon (0 where absent).

    Args:
        in_dir (str): Path of the folder to scan (not searched recursively).

    Returns:
        None
    """
    count_data = {}
    # os.listdir returns entries in arbitrary order; sort them so that the
    # row order of the output is reproducible between runs.
    for file in sorted(os.listdir(in_dir)):
        if file.split('.')[-1] not in FASTA_EXTENSIONS:
            continue
        path = os.path.join(in_dir, file)
        count_data[file] = Counter(
            record.id[:TAXON_CODE_LENGTH]
            for record in SeqIO.parse(path, 'fasta')
        )

    taxa = sorted({tax for counts in count_data.values() for tax in counts})

    # csv.writer quotes any file name or taxon key that contains a comma or
    # quote character. The file is opened in ordinary text mode with a '\n'
    # line terminator so line endings stay what plain write('\n') produced.
    with open(OUTPUT_FILE, 'w') as o:
        writer = csv.writer(o, lineterminator='\n')
        writer.writerow([''] + taxa)
        for file, counts in count_data.items():
            # Counter returns 0 for keys it has not seen.
            writer.writerow([file] + [counts[tax] for tax in taxa])


def main():
    """Run the script: read the input folder from the CLI and write the CSV."""
    in_dir = get_args()
    count_tips(in_dir)


# Guarded so that importing this module does not parse sys.argv or run the
# count as a side effect.
if __name__ == '__main__':
    main()