EukPhylo/Utilities/for_taxonomy/get_unique_taxa.py
2024-01-12 11:19:16 -05:00

75 lines
2.7 KiB
Python

'''
#Author, date: Elinor Sterner Jan-26-2023, updated Feb-12-2023.
#Intent: To get the unique taxa from a taxonomic classification.
#Dependencies: Python3
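#Inputs: Text file named 'all_taxa.txt' in the working directory, one taxonomy per line. Make sure each taxonomic level is separated with '; ' (semicolon space).
#Outputs: Folder 'unique_taxon_lists' containing one CSV of unique taxa per major clade (named after the first level of the taxonomy), plus 'all_unique_terms.csv' listing every unique taxon. The per-clade CSVs are appended to, so if you run the script multiple times, DELETE THE PREVIOUS OUTPUT.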
#Example: python get_unique_taxa.py
'''
import os
from pathlib import Path
def trim_taxonomy(name):
    """Strip the genus/species levels from one taxonomy line.

    `name` is one raw line of the input file, with levels separated by
    '; '. Returns the trimmed taxonomy string, or None when the line should
    be skipped.

    Rules:
      * 1-2 levels: kept whole (we don't want to lose any taxonomic
        information), unless the line has no ';' at all (ex 'no taxID'),
        in which case it is skipped.
      * Last level contains a space (looks like 'Genus species'): drop
        species and genus when there are more than 3 levels, otherwise drop
        the species only.
      * Last level has no space (no species name): drop the last level when
        there are more than 3 levels, otherwise keep the line whole.

    Lines that are kept whole are returned unchanged, trailing newline
    included; callers strip each term afterwards.
    """
    levels = name.split('; ')
    if len(levels) <= 2:
        return name if ';' in name else None
    if ' ' in levels[-1]:  # last level has a species name
        n_drop = 2 if len(levels) > 3 else 1
        return '; '.join(levels[:-n_drop])
    if len(levels) > 3:  # no species name: remove the last level only
        return '; '.join(levels[:-1])
    return name


def collect_unique_terms(taxonomies):
    """Map every unique taxonomic term to its major clade.

    `taxonomies` is an iterable of '; '-separated taxonomy strings. Each
    term is whitespace-stripped and used as a dictionary key, so duplicates
    collapse automatically. The value is the (stripped) first level of the
    taxonomy the term was last seen in. Terms that are empty after
    stripping (ex a line ending in '; ') are ignored so that no blank rows
    reach the output.
    """
    unique_taxonomies = {}
    for taxonomy in taxonomies:
        levels = taxonomy.split('; ')
        major_clade = levels[0].strip()
        for level in levels:
            term = level.strip()
            if term:
                unique_taxonomies[term] = major_clade
    return unique_taxonomies


def write_unique_taxa(unique_taxonomies, out_dir):
    """Write the unique terms to CSV files inside `out_dir`.

    One '<major clade>.csv' is written per major clade, plus
    'all_unique_terms.csv' holding every term. The per-clade files are
    opened in append mode, as before, so output from an earlier run must be
    deleted first; 'all_unique_terms.csv' is overwritten.
    """
    out_dir = Path(out_dir)

    # Group first so each clade file is opened (and properly closed) once,
    # instead of being reopened for every single term.
    terms_by_clade = {}
    for term, major_clade in unique_taxonomies.items():
        terms_by_clade.setdefault(major_clade, []).append(term)

    for major_clade, terms in terms_by_clade.items():
        with open(out_dir / f'{major_clade}.csv', 'a') as clade_file:
            clade_file.writelines(f'{term}\n' for term in terms)

    with open(out_dir / 'all_unique_terms.csv', 'w') as all_file:
        all_file.writelines(f'{term}\n' for term in unique_taxonomies)


def main(taxa_file='all_taxa.txt', out_dir='unique_taxon_lists'):
    """Read `taxa_file`, trim and uniquify its taxonomies, write the CSVs.

    `taxa_file` is a text file with one taxonomy per line and `out_dir` is
    the folder that receives the CSVs (created if missing). The defaults
    are the file and folder names the script has always used.
    """
    Path(out_dir).mkdir(parents=True, exist_ok=True)  # makes output folder

    with open(taxa_file) as taxa:
        trimmed = [trim_taxonomy(line) for line in taxa]
    to_uniquify = [taxonomy for taxonomy in trimmed if taxonomy is not None]

    unique_taxonomies = collect_unique_terms(to_uniquify)
    print(f'You have {len(unique_taxonomies)} unique taxonomies\n\n')

    write_unique_taxa(unique_taxonomies, out_dir)


if __name__ == '__main__':
    main()