mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 07:30:24 +08:00
80 lines
2.8 KiB
Python
80 lines
2.8 KiB
Python
import os
|
|
from pathlib import Path
|
|
|
|
'''
Written by Elinor 1/26, updated 2/12

Input: text file of taxonomies (one per line). Make sure each taxonomic level
is separated with '; ' (semicolon space) or the script will not parse the
names right.

This cuts off the genus (and species if there is one), uniquifies the
remaining terms and writes them out to files named by the first word of the
taxonomy (the major clade). A combined list of every unique term is written to
all_unique_terms.csv in the same folder.

Re-running is safe: every output file is rewritten from scratch on each run,
so lines are no longer duplicated. Files for clades that are absent from the
new input are NOT removed; delete the output folder if the input changed.
'''

INPUT_FILE = 'all_taxa.txt'
OUTPUT_DIR = 'unique_taxon_lists'
SEPARATOR = '; '


def trim_taxonomy(line):
    '''Return one taxonomy line with its genus/species levels removed.

    line -- one raw line of the input file (a trailing newline is allowed).

    Returns the shortened taxonomy string, the line unchanged when it is too
    short to trim, or None when the line has no ';' at all (single entries
    such as 'no taxID' are discarded).
    '''
    levels = line.split(SEPARATOR)

    # One or two levels: keep everything so no taxonomic information is lost,
    # but drop entries that are not taxonomies at all.
    if len(levels) <= 2:
        return line if ';' in line else None

    # A space inside the last level means it is a binomial (species) name.
    has_species = ' ' in levels[-1]
    is_long = len(levels) > 3

    if has_species:
        # Long taxonomies lose species and genus; three-level ones lose only
        # the species.
        drop = 2 if is_long else 1
    else:
        # Without a species name only the genus goes, and only when more than
        # three levels are present.
        drop = 1 if is_long else 0

    if drop == 0:
        return line
    return SEPARATOR.join(levels[:-drop])


def index_terms(taxonomies):
    '''Map every unique taxonomic term to its major clade.

    taxonomies -- iterable of (already trimmed) taxonomy strings.

    Returns a dict of term -> first level of the taxonomy it was seen in, both
    stripped of surrounding whitespace. Keys keep first-seen order; a term
    found under several clades is assigned to the last one encountered.
    '''
    term_to_clade = {}
    for taxonomy in taxonomies:
        levels = taxonomy.split(SEPARATOR)
        clade = levels[0].strip()
        for level in levels:
            term_to_clade[level.strip()] = clade
    return term_to_clade


def main(input_path=INPUT_FILE, output_dir=OUTPUT_DIR):
    '''Read the taxonomy list and write the unique-term CSV files.

    input_path -- text file with one taxonomy per line.
    output_dir -- folder for the output files (created if missing).

    Writes <output_dir>/<major clade>.csv for every major clade and
    <output_dir>/all_unique_terms.csv, one term per line, and returns the
    term -> major clade dictionary.
    '''
    Path(output_dir).mkdir(parents=True, exist_ok=True)  # makes output folder

    with open(input_path) as handle:
        trimmed = [tax for tax in map(trim_taxonomy, handle) if tax is not None]

    unique_taxonomies = index_terms(trimmed)
    print(f'You have {len(unique_taxonomies)} unique taxonomies\n\n')

    # Group terms per major clade first so each file is opened exactly once
    # and written in 'w' mode; appending is what used to create duplicates.
    by_clade = {}
    for term, clade in unique_taxonomies.items():
        by_clade.setdefault(clade, []).append(term)

    # write to csvs by major clade
    for clade, terms in by_clade.items():
        with open(f'{output_dir}/{clade}.csv', 'w') as out:
            out.writelines(f'{term}\n' for term in terms)

    with open(f'{output_dir}/all_unique_terms.csv', 'w') as out:
        out.writelines(f'{term}\n' for term in unique_taxonomies)

    return unique_taxonomies


if __name__ == '__main__':
    main()
|
|
|
|
|
|
|