From 88155175f7bdba3ef83df7cbd5af4c79b228d0ce Mon Sep 17 00:00:00 2001 From: ElinorSterner <86856150+ElinorSterner@users.noreply.github.com> Date: Fri, 7 Apr 2023 15:45:43 -0400 Subject: [PATCH] Add files via upload --- Utilities/for_taxonomy/get_unique_taxa.py | 79 +++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 Utilities/for_taxonomy/get_unique_taxa.py diff --git a/Utilities/for_taxonomy/get_unique_taxa.py b/Utilities/for_taxonomy/get_unique_taxa.py new file mode 100644 index 0000000..57fea2a --- /dev/null +++ b/Utilities/for_taxonomy/get_unique_taxa.py @@ -0,0 +1,79 @@ +import os +from pathlib import Path + +''' +Written by Elinor 1/26, updated 2/12 + +Input: text file of taxonomies. make sure each taxonomic level is separated with '; ' (semicolon space) or the +script will not parse the names right + +This cuts off the genus (and species if there is one), uniquifies the list and writes them out to files by the first +word of the taxonomy + +WARNING: if you run the script multiple times, DELETE THE PREVIOUS OUTPUT. this is because it appends lines to the +end of files so you will have many duplicates + +''' + +Path(f'unique_taxon_lists').mkdir(parents=True, exist_ok=True)#makes output folder + +with open ('all_taxa.txt') as t: + names = t.readlines() + to_uniquify = [] + skipped = [] + + for name in names: #iterate through each line of txt file + + + #Adding short names to keep + if len(name.split('; ')) <= 2: + if ';' in name:#this removes things that are only one word (ex 'no taxID') + to_uniquify.append(name)#add short taxonomies to list to uniquify because we don't want to remove any taxonomic information + + + #for names over 2 parts + else: + if ' ' in name.split('; ')[-1]:#check if it has a species name + if len(name.split('; ')) > 3:#remove sp and genus from taxonomies with over 3 parts + short_tax = name.split('; ')[:-2] + to_uniquify.append('; '.join(short_tax)) + + #if short, only remove -1 + if len(name.split('; ')) < 4:#remove sp only from taxa with 3 or fewer parts + short_tax = name.split('; ')[:-1] + to_uniquify.append('; '.join(short_tax)) + + #for names without species names + else: #if there is no species name, remove genus only + if len(name.split('; ')) > 3:#remove sp and genus from taxonomies with over 3 parts + short_tax = name.split('; ')[:-1] + to_uniquify.append('; '.join(short_tax)) + else: + to_uniquify.append(name) + +unique_taxonomies = {}#initialize dictionary +count = 0 +for i in to_uniquify:#iterate through names that have species removed + item = i.split('; ')#divide up each word in the line of the taxonomy + for name in item:#iterate through words in each + count+=1#count number of words total + unique_taxonomies.update({name.strip(): item[0]})#write to dictionary with key as word (so they automatically get uniquified) and value = MC + + + +print(f'You have {len(unique_taxonomies)} unique taxonomies\n\n') + + +#write to csvs by major clade +for taxa, mc in unique_taxonomies.items(): + + file = open(f'unique_taxon_lists/{mc.strip()}.csv', 'a+')#open/create a file with the name of the major clade of each taxonomy + file.write(f'{taxa}\n')#write each taxonomy to the file named by its major clade + file.close#close the file i dont know what happens if you dont do this but its probably bad + +with open('unique_taxon_lists/all_unique_terms.csv', 'w') as o: + for taxa, mc in unique_taxonomies.items(): + o.write(f'{taxa}\n') + + +