EukPhylo/Utilities/for_taxonomy/get_unique_taxa.py
2023-04-07 15:45:43 -04:00

80 lines
2.8 KiB
Python

import os
from pathlib import Path
'''
Written by Elinor 1/26, updated 2/12
Input: text file of taxonomies (all_taxa.txt), one taxonomy per line. Make sure each taxonomic level is separated
with '; ' (semicolon space) or the script will not parse the names correctly.
This cuts off the genus (and the species, if there is one), uniquifies the terms and writes them out to files named
after the first word of the taxonomy (the major clade), plus one file that lists every unique term.
Output files are rewritten from scratch on every run, so re-running the script does not create duplicate lines.
'''

INPUT_FILE = 'all_taxa.txt'
OUTPUT_DIR = 'unique_taxon_lists'
SEPARATOR = '; '


def _trim_taxonomy(line):
    '''
    Remove the genus/species levels from one taxonomy line.

    line: a single taxonomy with its line ending already removed.
    Returns the shortened taxonomy string, or None when the line holds no
    taxonomy at all (no ';' anywhere, e.g. 'no taxID' or a blank line).
    '''
    levels = line.split(SEPARATOR)
    depth = len(levels)

    # Short taxonomies are kept whole so no taxonomic information is lost.
    if depth <= 2:
        return line if ';' in line else None

    # A space inside the last level means it is a "Genus species" binomial.
    has_species = ' ' in levels[-1]
    if has_species:
        # Deep taxonomies lose species and genus; 3-level ones lose only the last level.
        drop = 2 if depth > 3 else 1
    else:
        # Without a species name only the genus is removed, and only from deep taxonomies.
        drop = 1 if depth > 3 else 0
    return SEPARATOR.join(levels[:depth - drop])


def _index_terms(taxonomies):
    '''
    Map every unique taxonomic term to its major clade (the first level of its taxonomy).

    Dict keys uniquify the terms. A term seen under several major clades keeps
    its first position in the ordering but takes the clade seen last.
    '''
    clade_of = {}
    for taxonomy in taxonomies:
        levels = taxonomy.split(SEPARATOR)
        major_clade = levels[0].strip()
        for level in levels:
            term = level.strip()
            if term:  # a dangling separator would otherwise yield a blank output line
                clade_of[term] = major_clade
    return clade_of


def _write_term_files(clade_of, output_dir):
    '''
    Write one <major clade>.csv per clade plus all_unique_terms.csv into output_dir.

    Terms are grouped first so each file is opened exactly once, in write mode,
    and is always closed by the context manager.
    '''
    terms_by_clade = {}
    for term, major_clade in clade_of.items():
        terms_by_clade.setdefault(major_clade, []).append(term)

    for major_clade, terms in terms_by_clade.items():
        with open(output_dir / f'{major_clade}.csv', 'w') as handle:
            handle.writelines(f'{term}\n' for term in terms)

    with open(output_dir / 'all_unique_terms.csv', 'w') as handle:
        handle.writelines(f'{term}\n' for term in clade_of)


def main(input_path=INPUT_FILE, output_dir=OUTPUT_DIR):
    '''
    Read the taxonomy list, trim genus/species, and write the unique terms by major clade.

    input_path: text file with one '; '-separated taxonomy per line.
    output_dir: folder that receives the .csv files (created if missing).
    Raises FileNotFoundError (from open) if input_path does not exist.
    '''
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)  # makes output folder

    with open(input_path) as source:
        trimmed = [_trim_taxonomy(line.rstrip('\n')) for line in source]
    to_uniquify = [taxonomy for taxonomy in trimmed if taxonomy is not None]

    unique_taxonomies = _index_terms(to_uniquify)
    print(f'You have {len(unique_taxonomies)} unique taxonomies\n\n')

    _write_term_files(unique_taxonomies, output_dir)


if __name__ == '__main__':
    main()