mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 05:10:28 +08:00
Add files via upload
This commit is contained in:
parent
d685f5ecec
commit
88155175f7
79
Utilities/for_taxonomy/get_unique_taxa.py
Normal file
79
Utilities/for_taxonomy/get_unique_taxa.py
Normal file
@ -0,0 +1,79 @@
|
||||
'''
Written by Elinor 1/26, updated 2/12

Input: text file of taxonomies (all_taxa.txt, read from the current working
directory). Make sure each taxonomic level is separated with '; ' (semicolon
space) or the script will not parse the names right.

This cuts off the genus (and species if there is one), uniquifies the list of
taxonomic terms and writes them out to files named by the first word of the
taxonomy (one csv per first word, inside unique_taxon_lists/). Every unique
term is also written to unique_taxon_lists/all_unique_terms.csv.

Re-running the script is safe: each output file is rewritten from scratch, so
lines are no longer duplicated when previous output is still present. Files
left over from an earlier run whose first word does not occur in the new input
are not touched, so delete the output folder if you need a clean result.
'''

import os
from pathlib import Path

SEPARATOR = '; '  # delimiter expected between taxonomic levels
INPUT_FILE = 'all_taxa.txt'
OUTPUT_DIR = 'unique_taxon_lists'


def trim_taxonomy(line):
    '''Return one taxonomy line with its genus/species levels cut off.

    line: one raw line of the input file (a trailing newline is allowed).

    Returns the shortened taxonomy as a string, or None when the line holds
    no ';' at all (ex 'no taxID', blank lines) and must be skipped.

    Rules, by number of '; '-separated levels:
      * 1 or 2 levels: kept whole, so no taxonomic information is lost
      * 3 levels: the last level is dropped only if it contains a space
        (i.e. it looks like 'Genus species'); otherwise kept whole
      * 4+ levels: the last two levels are dropped if the last one contains
        a space, otherwise only the last level is dropped
    '''
    name = line.rstrip('\n')
    levels = name.split(SEPARATOR)  # split once instead of once per check

    if len(levels) <= 2:
        # this removes things that are only one word (ex 'no taxID')
        return name if ';' in name else None

    has_species = ' ' in levels[-1]
    if has_species:
        kept = levels[:-2] if len(levels) > 3 else levels[:-1]
    elif len(levels) > 3:
        kept = levels[:-1]
    else:
        kept = levels
    return SEPARATOR.join(kept)


def collect_terms(taxonomies):
    '''Map every unique taxonomic term to the first word of its taxonomy.

    taxonomies: iterable of '; '-separated taxonomy strings.

    Returns a dict {term: first word}, in first-seen order of the terms.
    Using the term as key is what uniquifies them. When a term occurs under
    several first words, the last taxonomy seen decides its value.
    '''
    terms = {}
    for taxonomy in taxonomies:
        levels = taxonomy.split(SEPARATOR)
        clade = levels[0].strip()
        for level in levels:
            terms[level.strip()] = clade
    return terms


def group_by_clade(terms):
    '''Invert {term: clade} into {clade: [terms]}, keeping insertion order.'''
    grouped = {}
    for term, clade in terms.items():
        grouped.setdefault(clade, []).append(term)
    return grouped


def main():
    '''Read all_taxa.txt and write the unique-term csv files.'''
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)  # makes output folder

    with open(INPUT_FILE) as handle:
        trimmed = [trim_taxonomy(line) for line in handle]

    terms = collect_terms(tax for tax in trimmed if tax is not None)

    print(f'You have {len(terms)} unique taxonomies\n\n')

    # write to csvs by major clade; each file is opened once in 'w' mode and
    # closed by the with-block, so re-runs never append duplicate lines
    for clade, clade_terms in group_by_clade(terms).items():
        with open(f'{OUTPUT_DIR}/{clade}.csv', 'w') as out:
            out.writelines(f'{term}\n' for term in clade_terms)

    with open(f'{OUTPUT_DIR}/all_unique_terms.csv', 'w') as out:
        out.writelines(f'{term}\n' for term in terms)


if __name__ == '__main__':
    main()
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user