mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 03:10:24 +08:00
Add files via upload
This commit is contained in:
parent
182615bf75
commit
2d66ff52d3
133
Utilities/for_taxonomy/Query_SRA_egs.py
Normal file
133
Utilities/for_taxonomy/Query_SRA_egs.py
Normal file
@ -0,0 +1,133 @@
|
||||
'''
|
||||
Modified by Elinor 2/13 to grab recent assemblies (since 2020) and GCA codes. Input is folder 'unique_taxon_lists' with files
|
||||
of keywords by major clade (separated by new lines). Put -t (transcriptome, SRA db) or -g (genome, assembly db) in the command
|
||||
line to specify data type
|
||||
|
||||
restrictions:
|
||||
All data is since 2020
|
||||
SRA excludes entries with 'Amplicon' in the description, and outputs the experiment and sequencing technology.
|
||||
|
||||
example command line: python Query_SRA_egs.py -t
|
||||
|
||||
Output: file of species, IDs, and GCA or SRR codes AND a file with uniquified codes. Updated 3.21.23 to output sequencing machine and type of experiment from SRA
|
||||
|
||||
'''
|
||||
|
||||
from Bio import Entrez
|
||||
from Bio import SeqIO
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def get_args():
|
||||
|
||||
Entrez.email = "@smith.edu"#CHANGE UR EMAIL
|
||||
Entrez.tool = "Biopython_NCBI_Entrez_downloads.ipynb"
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print(f'enter -t or -g in command line to choose genomes (-g) or transcriptomes (-t)')
|
||||
if '-t' in sys.argv:
|
||||
data_type = False
|
||||
elif '-g' in sys.argv:
|
||||
data_type = True
|
||||
|
||||
with open('RecentIDs.csv', 'w') as o:#starts output file and writes header
|
||||
o.write('major clade, keyword, species, ID, experiment, sequencing technology, GCA/SRR,\n')
|
||||
|
||||
get_keywords(data_type)
|
||||
|
||||
|
||||
def get_keywords(data_type):
|
||||
|
||||
for file in os.listdir('unique_taxon_lists'):
|
||||
if file.endswith('_unique.csv'):#put name of file to look at here. or only .csv to look at all of them
|
||||
with open(f'unique_taxon_lists/{file}', 'r') as lines:#read each file
|
||||
mc = file.split("_unique.csv")[0]
|
||||
print(f'Searching taxonomic names in {mc}\n\n')
|
||||
|
||||
for line in lines.readlines():#iterate file
|
||||
keyword = line.strip()#keyword for genbank search is each word in the files
|
||||
|
||||
if data_type == False:
|
||||
fetch_SRA(mc, keyword)
|
||||
if data_type == True:
|
||||
fetch_CDS(mc, keyword)
|
||||
|
||||
write_unique_codes()
|
||||
|
||||
|
||||
def fetch_CDS(mc, keyword):#searches your keywords in the assembly database
|
||||
print(f'\nGrabbing recent CDSs')
|
||||
all_stuff = []#initiate list, will put genbank codes into this
|
||||
|
||||
#get IDs of assemblies for keyword since 2020. Returns multiple IDs
|
||||
handle = Entrez.esearch(db="assembly", term=keyword + "[Organism:exp]" + "2020 [SeqReleaseDate]:3000", retmax=100)
|
||||
id_record = Entrez.read(handle)
|
||||
print(f'There are {len(id_record["IdList"])} assemblies labeled as {keyword} in genbank since 2020\nFetching IDs and GCAs\n')
|
||||
|
||||
#Iterate through list of IDs given above, seach for their associated GCAs. Only one GCA for each ID, and each corresponds to 1 individual sequenced
|
||||
for tax_id in id_record['IdList']:
|
||||
handle = Entrez.esummary(db="assembly", id=tax_id, retmode="text")
|
||||
gca_records = Entrez.read(handle, validate=False)
|
||||
handle.close()
|
||||
|
||||
#parse the output (its really awful. pythonic turduken: dict(list(str(dict))) type deal)
|
||||
for record in gca_records['DocumentSummarySet']['DocumentSummary']:
|
||||
sp = record['Organism']
|
||||
gca=record['AssemblyAccession']
|
||||
stuff = f'{mc}, {keyword}, {sp},{tax_id}, , ,{gca}'
|
||||
all_stuff.append(stuff)
|
||||
|
||||
write_to_csv(all_stuff)#send this new info to be added to output sheet
|
||||
|
||||
|
||||
def fetch_SRA(mc, keyword):#searches your keywords in the SRA db
|
||||
|
||||
all_stuff = []
|
||||
print(f'\nGrabbing recent SRRs')
|
||||
# get IDs from taxonomies
|
||||
handle = Entrez.esearch(db="sra", term=keyword + "[Organism:exp]"+ " 2020:2023[PDAT]", retmax=100)
|
||||
id_record = Entrez.read(handle, validate = False)
|
||||
print(f'There are {len(id_record["IdList"])} SRAs labeled as {keyword} in genbank since 2020\nFetching SRAs\n')
|
||||
|
||||
|
||||
#get SRRs for taxonomy
|
||||
for rec in id_record['IdList']:#iterates through all of the IDs for the taxonomy
|
||||
tax_id = rec
|
||||
handle = Entrez.esummary(db="sra", id=tax_id)
|
||||
srr_records = Entrez.read(handle)#parse genbank info
|
||||
|
||||
#parse out all information needed from genbank info
|
||||
sp = srr_records[0]['ExpXml'].split('ScientificName="')[1].split('"')[0]#extract species from genbank info
|
||||
srr = srr_records[0]['Runs'].split('"')[1]#extract srr from genbank info
|
||||
seq_type = srr_records[0]['ExpXml'].split('<LIBRARY_STRATEGY>')[1].split('</LIBRARY_STRATEGY>')[0]#parse to "library_strategy" parameter to check if its amplicon
|
||||
machine = srr_records[0]['ExpXml'].split('<Platform instrument_model="')[1].split('">')[0]#get the type of sequencing machine used
|
||||
if 'AMPLICON' not in seq_type:
|
||||
stuff = f'{mc}, {keyword}, {sp}, {tax_id}, {seq_type}, {machine}, {srr},'#write to comma separated string
|
||||
all_stuff.append(stuff)
|
||||
|
||||
|
||||
write_to_csv(all_stuff)
|
||||
|
||||
def write_to_csv(data):#writes the output from fetch_SRA or fetch_CDS to a csv
|
||||
with open('RecentIDs.csv', 'a+') as o:
|
||||
for i in data:
|
||||
o.write(f'{i}\n')
|
||||
|
||||
def write_unique_codes():#uniquify the list of IDs that the scipt grabbed. since we are searching all taxanomic levels, we query many repeats so this removes them.
|
||||
|
||||
#Writing unique files
|
||||
with open('RecentIDs.csv', 'r') as o:# read file of all data
|
||||
taxa = o.readlines()
|
||||
print(f'\nThere are {len(taxa)} codes before uniquifying\n\n')
|
||||
unique_lines = {line.split(', ')[-1] : line.split(', ')[0:-1] for line in taxa}#makes dictionary of SRR/GCA:other info to uniquify the codes
|
||||
print(f'\nYou have {len(unique_lines)} unique codes... writing them to unique_taxa.csv')
|
||||
|
||||
with open ('unique_taxa.csv', 'w') as o:#start csv of unique codes
|
||||
for gca, other in unique_lines.items():#parse uniquified dictionary
|
||||
o.write(f'{(", ").join(other)}, {gca}')#write out (use join to convert the list containing other info to a string)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
get_args()
|
||||
Loading…
x
Reference in New Issue
Block a user