From 88155175f7bdba3ef83df7cbd5af4c79b228d0ce Mon Sep 17 00:00:00 2001
From: ElinorSterner <86856150+ElinorSterner@users.noreply.github.com>
Date: Fri, 7 Apr 2023 15:45:43 -0400
Subject: [PATCH] Add files via upload

---
 Utilities/for_taxonomy/get_unique_taxa.py | 79 +++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 Utilities/for_taxonomy/get_unique_taxa.py

diff --git a/Utilities/for_taxonomy/get_unique_taxa.py b/Utilities/for_taxonomy/get_unique_taxa.py
new file mode 100644
index 0000000..57fea2a
--- /dev/null
+++ b/Utilities/for_taxonomy/get_unique_taxa.py
@@ -0,0 +1,79 @@
+import os
+from pathlib import Path
+
+'''
+Written by Elinor 1/26, updated 2/12
+
+Input: a text file of taxonomies named 'all_taxa.txt', one taxonomy per line. Make sure each taxonomic level is
+separated with '; ' (semicolon then space), or the script will not parse the names correctly.
+
+This cuts off the genus (and the species, if there is one), uniquifies the resulting terms, and writes them out to
+CSV files in 'unique_taxon_lists/', one file per first level (major clade) of the taxonomy.
+
+WARNING: if you run the script multiple times, DELETE THE PREVIOUS OUTPUT first. The script appends lines to the
+end of the existing per-clade files, so rerunning without cleaning up leaves you with many duplicates.
+
+'''
+
+# Create the output folder; exist_ok=True means no error if it already exists.
+Path('unique_taxon_lists').mkdir(parents=True, exist_ok=True)
+
+# Trim each taxonomy in all_taxa.txt and collect the results in to_uniquify.
+# Levels must be separated by '; ' (semicolon then space). Rules:
+#   1-2 levels         -> kept whole (skipped if the line has no ';')
+#   exactly 3 levels   -> last level dropped only if it has a species name
+#   more than 3 levels -> last two levels dropped if there is a species name,
+#                         otherwise only the last level
+to_uniquify = []
+with open('all_taxa.txt') as t:
+	for name in t:  # one taxonomy per line
+		parts = name.split('; ')  # split once and reuse below
+
+		# Short taxonomies are kept whole so no taxonomic information is lost;
+		# lines with no ';' at all (ex 'no taxID') are skipped.
+		if len(parts) <= 2:
+			if ';' in name:
+				to_uniquify.append(name)
+			continue
+
+		# A space inside the last level is taken to mean a species name. Strip
+		# first so stray trailing whitespace before the newline is never
+		# mistaken for the space in front of a species name.
+		has_species = ' ' in parts[-1].strip()
+
+		if len(parts) > 3:
+			drop = 2 if has_species else 1
+			to_uniquify.append('; '.join(parts[:-drop]))
+		elif has_species:
+			to_uniquify.append('; '.join(parts[:-1]))
+		else:
+			# Exactly 3 levels and no species name: nothing is removed, the
+			# line is kept as read (newline included).
+			to_uniquify.append(name)
+
+# Map every taxonomic term to a major clade (the first level of a taxonomy
+# it appears in). Dictionary keys are unique, so repeated terms collapse to
+# one entry; a term seen again takes the clade of its latest taxonomy.
+unique_taxonomies = {}
+for taxonomy in to_uniquify:  # names that have already been trimmed
+	levels = taxonomy.split('; ')  # one entry per taxonomic level
+	major_clade = levels[0]
+	for level in levels:
+		unique_taxonomies[level.strip()] = major_clade
+
+print(f'You have {len(unique_taxonomies)} unique taxonomies\n\n')
+
+
+# Group terms by major clade so each clade file is opened once, not per term.
+terms_by_clade = {}
+for taxa, mc in unique_taxonomies.items():
+	terms_by_clade.setdefault(mc.strip(), []).append(taxa)
+# Append mode is kept on purpose: stale output must be deleted before a rerun.
+for mc, terms in terms_by_clade.items():
+	with open(f'unique_taxon_lists/{mc}.csv', 'a+') as clade_file:  # 'with' really closes it
+		clade_file.writelines(f'{taxa}\n' for taxa in terms)
+with open('unique_taxon_lists/all_unique_terms.csv', 'w') as o:
+	o.writelines(f'{taxa}\n' for taxa in unique_taxonomies)
+
+	
+