''' Modified by Elinor 2/13 to grab recent assemblies (since 2020) and GCA codes.
Input is the folder 'unique_taxon_lists' containing files of keywords by major clade (separated by newlines).
Put -t (transcriptome, SRA db) or -g (genome, assembly db) on the command line to specify data type
restrictions: all data is from 2020 onward, and SRA excludes entries with 'AMPLICON' in the description
and outputs the experiment and sequencing technology.
Example command line: python Query_SRA_egs.py -t
Output: a file of species, IDs, and GCA or SRR codes, AND a file with uniquified codes.
Updated 3.21.23 to output the sequencing machine and type of experiment from SRA.
'''
from Bio import Entrez
import os
import sys


def get_args():
    Entrez.email = "@smith.edu"  # CHANGE: put your own email here before running
    Entrez.tool = "Biopython_NCBI_Entrez_downloads.ipynb"
    if '-t' in sys.argv:
        data_type = False
    elif '-g' in sys.argv:
        data_type = True
    else:
        sys.exit('Enter -t or -g on the command line to choose genomes (-g) or transcriptomes (-t)')
    with open('RecentIDs.csv', 'w') as o:  # start the output file and write the header
        o.write('major clade, keyword, species, ID, experiment, sequencing technology, GCA/SRR,\n')
    get_keywords(data_type)


def get_keywords(data_type):
    for file in os.listdir('unique_taxon_lists'):
        if file.endswith('_unique.csv'):  # change the suffix here (e.g. to '.csv') to look at other/all files
            with open(f'unique_taxon_lists/{file}', 'r') as lines:  # read each file
                mc = file.split('_unique.csv')[0]
                print(f'Searching taxonomic names in {mc}\n\n')
                for line in lines.readlines():  # iterate over the file
                    keyword = line.strip()  # each line in the file is a keyword for the GenBank search
                    if data_type == False:
                        fetch_SRA(mc, keyword)
                    if data_type == True:
                        fetch_CDS(mc, keyword)
    write_unique_codes()


def fetch_CDS(mc, keyword):  # searches your keywords in the assembly database
    print('\nGrabbing recent CDSs')
    all_stuff = []  # GCA codes and their metadata collect in this list
    # Get the IDs of assemblies for this keyword since 2020; returns multiple IDs
    handle = Entrez.esearch(db="assembly", term=keyword + "[Organism:exp] AND 2020:3000[SeqReleaseDate]", retmax=100)
    id_record = Entrez.read(handle)
    handle.close()
    print(f'There are {len(id_record["IdList"])} assemblies labeled as {keyword} in GenBank since 2020\nFetching IDs and GCAs\n')
    # Iterate through the list of IDs given above and search for their associated GCAs.
    # There is only one GCA per ID, and each corresponds to one sequenced individual.
    for tax_id in id_record['IdList']:
        handle = Entrez.esummary(db="assembly", id=tax_id, retmode="text")
        gca_records = Entrez.read(handle, validate=False)
        handle.close()
        # Parse the output (it's really awful: a pythonic turducken, dict(list(str(dict))) type deal)
        for record in gca_records['DocumentSummarySet']['DocumentSummary']:
            sp = record['Organism']
            gca = record['AssemblyAccession']
            stuff = f'{mc}, {keyword}, {sp}, {tax_id}, , , {gca},'
            all_stuff.append(stuff)
    write_to_csv(all_stuff)  # send this new info to be added to the output sheet


def fetch_SRA(mc, keyword):  # searches your keywords in the SRA db
    all_stuff = []
    print('\nGrabbing recent SRRs')
    # Get IDs from taxonomies
    handle = Entrez.esearch(db="sra", term=keyword + "[Organism:exp] AND 2020:2023[PDAT]", retmax=100)
    id_record = Entrez.read(handle, validate=False)
    handle.close()
    print(f'There are {len(id_record["IdList"])} SRAs labeled as {keyword} in GenBank since 2020\nFetching SRAs\n')
    # Get the SRRs for the taxonomy
    for tax_id in id_record['IdList']:  # iterate through all of the IDs for the taxonomy
        handle = Entrez.esummary(db="sra", id=tax_id)
        srr_records = Entrez.read(handle)  # parse the GenBank info
        handle.close()
        # Parse everything needed out of the GenBank info; ExpXml is a raw XML string,
        # so each field is pulled out by splitting on the tags around it
        sp = srr_records[0]['ExpXml'].split('ScientificName="')[1].split('"')[0]  # extract the species
        srr = srr_records[0]['Runs'].split('"')[1]  # extract the SRR run accession
        seq_type = srr_records[0]['ExpXml'].split('<LIBRARY_STRATEGY>')[1].split('</LIBRARY_STRATEGY>')[0]  # library strategy, used to check if it's amplicon
        machine = srr_records[0]['ExpXml'].split('<Platform instrument_model="')[1].split('">')[0]  # the type of sequencing machine used
        if 'AMPLICON' not in seq_type:
            stuff = f'{mc}, {keyword}, {sp}, {tax_id}, {seq_type}, {machine}, {srr},'  # write to a comma-separated string
            all_stuff.append(stuff)
    write_to_csv(all_stuff)
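# The string splits in fetch_SRA are brittle: ExpXml is just XML, so a sturdier
# parse is possible with the standard library. The sketch below is illustrative
# only (it is not wired into the script) and assumes the usual layout of the SRA
# esummary ExpXml field: an <Organism ScientificName="..."> element, a
# <LIBRARY_STRATEGY> element, and a <Platform instrument_model="..."> element.
import xml.etree.ElementTree as ET


def parse_expxml(expxml):
    # ExpXml is an XML fragment with several top-level elements, so wrap it in a
    # single root element before parsing
    root = ET.fromstring(f'<root>{expxml}</root>')
    sp = root.find('.//Organism').get('ScientificName')         # species name
    seq_type = root.find('.//LIBRARY_STRATEGY').text            # e.g. 'RNA-Seq' or 'AMPLICON'
    machine = root.find('.//Platform').get('instrument_model')  # sequencing machine
    return sp, seq_type, machine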
def write_to_csv(data):  # appends the output from fetch_SRA or fetch_CDS to the csv
    with open('RecentIDs.csv', 'a') as o:
        for i in data:
            o.write(f'{i}\n')


def write_unique_codes():
    # Uniquify the list of IDs that the script grabbed. Since we search all
    # taxonomic levels, we query many repeats, and this removes them.
    with open('RecentIDs.csv', 'r') as o:  # read the file of all data
        taxa = o.readlines()
    print(f'\nThere are {len(taxa)} codes before uniquifying\n\n')
    # Dictionary of SRR/GCA -> other info; keying on the code uniquifies it
    unique_lines = {line.strip().split(', ')[-1]: line.strip().split(', ')[:-1] for line in taxa}
    print(f'\nYou have {len(unique_lines)} unique codes... writing them to unique_taxa.csv')
    with open('unique_taxa.csv', 'w') as o:  # start the csv of unique codes
        for gca, other in unique_lines.items():  # walk the uniquified dictionary
            o.write(f'{", ".join(other)}, {gca}\n')  # join converts the list of other info back into a string


if __name__ == '__main__':
    get_args()
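# A note on the dedup in write_unique_codes above: because every taxonomic level
# is searched, the same accession can come back under several keywords (e.g. a
# run found under both a family and a genus). Keying the dictionary on the
# accession keeps exactly one row per code. With these two hypothetical rows:
#
#   Angiosperms, Fabaceae, Pisum sativum, 123, RNA-Seq, Illumina NovaSeq 6000, SRR0000001,
#   Angiosperms, Pisum, Pisum sativum, 123, RNA-Seq, Illumina NovaSeq 6000, SRR0000001,
#
# both map to the key 'SRR0000001,' and only the later row is written out.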