# EukPhylo/Utilities/for_taxonomy/get_unique_taxa.py

import os
from pathlib import Path

'''
Written by Elinor 1/26, updated 2/12

Input: all_taxa.txt, a text file with one taxonomy per line. Make sure each taxonomic level is separated
with '; ' (semicolon space) or the script will not parse the names correctly.

This cuts off the genus (and species if there is one), uniquifies the terms, and writes them to
unique_taxon_lists/, one file per major clade (the first word of the taxonomy), plus
all_unique_terms.csv containing every term.

Output files are rewritten from scratch on every run, so re-running the script does not create duplicate
lines. Files left over from an earlier run on a different input are not deleted.

'''

INPUT_FILE = 'all_taxa.txt'
OUTPUT_DIR = 'unique_taxon_lists'
SEPARATOR = '; '


def trim_taxonomy(line):
	'''Return one taxonomy line with its genus/species removed, or None if the line is skipped.

	Rules (levels are counted after splitting on '; '):
	  * 1 level: kept only if it still contains a ';' (drops entries such as 'no taxID').
	  * 2 levels: kept whole so no taxonomic information is lost.
	  * 3 levels: the last level is dropped only if it looks like a species (contains a space).
	  * 4+ levels: genus and species are dropped if the last level contains a space,
	    otherwise only the last level (the genus) is dropped.
	'''
	taxonomy = line.rstrip('\n')#only the newline is removed so the species check sees the same text as before
	levels = taxonomy.split(SEPARATOR)

	if len(levels) <= 2:
		return taxonomy if ';' in taxonomy else None

	has_species = ' ' in levels[-1]
	if len(levels) == 3:
		kept = levels[:-1] if has_species else levels
	else:
		kept = levels[:-2] if has_species else levels[:-1]
	return SEPARATOR.join(kept)


def collect_unique_terms(taxonomies):
	'''Map every distinct term to its major clade (the first term of the taxonomy it came from).

	Terms are whitespace-stripped. If a term occurs under several major clades, the clade of the
	last taxonomy containing it wins; terms keep the order in which they were first seen.
	'''
	term_to_clade = {}
	for taxonomy in taxonomies:
		terms = taxonomy.split(SEPARATOR)
		major_clade = terms[0].strip()
		for term in terms:
			term_to_clade[term.strip()] = major_clade
	return term_to_clade


def write_term_lists(term_to_clade, output_dir=OUTPUT_DIR):
	'''Write one '<major clade>.csv' per clade plus 'all_unique_terms.csv' into output_dir.

	Every file is opened in write mode inside a context manager, so each run replaces the previous
	content and every handle is closed. output_dir is created if it does not exist.
	'''
	out = Path(output_dir)
	out.mkdir(parents=True, exist_ok=True)#makes output folder

	#group first so each clade file is opened exactly once
	terms_by_clade = {}
	for term, clade in term_to_clade.items():
		terms_by_clade.setdefault(clade, []).append(term)

	for clade, terms in terms_by_clade.items():
		with open(out / f'{clade}.csv', 'w') as clade_file:
			clade_file.writelines(f'{term}\n' for term in terms)

	with open(out / 'all_unique_terms.csv', 'w') as all_terms_file:
		all_terms_file.writelines(f'{term}\n' for term in term_to_clade)


def main():
	'''Read all_taxa.txt, trim each taxonomy, report the number of unique terms and write the lists.'''
	with open(INPUT_FILE) as taxa_file:
		trimmed = [trim_taxonomy(line) for line in taxa_file]
	taxonomies = [taxonomy for taxonomy in trimmed if taxonomy is not None]

	term_to_clade = collect_unique_terms(taxonomies)
	print(f'You have {len(term_to_clade)} unique taxonomies\n\n')

	write_term_lists(term_to_clade)


if __name__ == '__main__':
	main()