diff --git a/Utilities/For_Assemblies/Trim_Reads.py b/Utilities/For_Assemblies/Trim_Reads.py new file mode 100644 index 0000000..2d159eb --- /dev/null +++ b/Utilities/For_Assemblies/Trim_Reads.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +#Author, date: Giulia Magri Ribeiro updated from Xyrus Maurer-Alcala and Ying Yan; June 11 2025 +#Motivation: Trim adaptors from reads and quality trimming before Assembly +#Intent: clean up reads +#Dependencies: biopython and bbmap folder +#Inputs:parameters.txt, fastq.gz forward and reverse reads +#Outputs:trimmed reads in ToAssemble folder +#Example: python3 Trim_Reads.py parameter.txt YourEmailAddress + + +from Bio import SeqIO +import sys,os +import time + +#------------------------------ Checks the Input Arguments ------------------------------# + +if len(sys.argv) == 1: + print ('\n\nThis script will remove Adapters, do quality trimming and length trimming on given score and assembly from your raw reads') + print ('\n\nChecking the overall quality and reads size on FastQC is recommended\n\n') + print ('Example Usage:\n\n\t' + 'katzlab$ python3 Trim_Reads.py parameter.txt YourEmailAddress\n\n') + print ('\t\tQuestions/Comments? Email Giulia (author) at gribeiro@smith.edu\n\n') + sys.exit() + + +if len(sys.argv) != 3: + print ('\n\nDouble check that you have added all the necessary command-line inputs! (see usage below for an example)\n\n') + print ('Example Usage:\n\n\t' + 'katzlab$ python3 Trim_Reads.py parameter.txt YourEmailAddress\n\n') + print ('Please also check that you have a parameter.txt (tab separated values) file which should contain your current filename, new filename, score of quality trimming and minimum length (see an example below)\n\n') + print ('parameter.txt example:\n\n\t' + 'XKATZ_20161110_K00134_IL100076423_S41_L005\tLKH001_Spirostomum\t24\t100\n\tXKATZ_20161110_K00134_IL100076416_S17_L005\tLKH002_Loxodes\t28\t100\n') + sys.exit() + +elif len(sys.argv) == 3: + parameter_file = sys.argv[1] + mailaddress = sys.argv[2] + if os.path.isdir('ToAssemble/') != True: + os.system('mkdir ToAssemble') + +### takes your downloaded data and renames the file so that it has taxonomic information in the filename +def rename(code): + for filename in os.listdir(os.curdir): + if filename.endswith('.fastq.gz'): + ### check name code here for forward reads + if '_1.' in filename: + cur_name = filename.split('_1.')[0] + new_name = code[cur_name] + print(cur_name, new_name) + os.system('mv ' + filename + ' ' + new_name + '_FwdPE.fastq.gz') +### Make a folder for each taxon that you are doing an assembly for ... this will be useful later (might as well do it early on!) + os.system('mkdir '+ new_name) + ### check name code here for Reverse reads + elif '_2.' in filename: + cur_name2 = filename.split('_2.')[0] + new_name2 = code[cur_name2] + print(cur_name2, new_name2) + os.system('mv ' + filename + ' ' + new_name2 + '_RevPE.fastq.gz') + elif '_FwdPE.fastq.gz' in filename: + sample_prefix = filename.split('_FwdPE')[0] + os.system(f"mkdir -p {sample_prefix}") + + +### Uses the adapters.fa file in the bbtools resources folder (and BBDuK) to remove adapter sequences -- update if necessary +### Uses BBDuK to quality trim reads so the average is q24 and the min length is 100 -- adjust if needed ... flags will be added eventually +def QualityTrim(qtrim, minlen): + for filename in os.listdir(os.curdir): + if 'FwdPE' in filename: + new_name = filename.split('_FwdPE')[0] + qscore = qtrim[new_name] + lscore = minlen[new_name] + qtrimcmd = '_q'+qscore+'_minlen'+lscore + log_file = filename.split('_Fwd')[0] + '/' + filename.split('_Fwd')[0] + qtrimcmd + '_bbduk.log' + os.system('./bbmap/bbduk.sh -Xmx20g in1=./' + filename + ' in2=./' + filename.replace('Fwd','Rev') + ' out1=ToAssemble/'+filename.replace('FwdPE','FPE'+qtrimcmd) + ' out2=ToAssemble/' + filename.split('Fwd')[0]+'RPE'+qtrimcmd+'.fastq.gz qtrim=rl trimq='+qscore+' minlen='+lscore+' mink=11 k=23 hdist=1 ktrim=r ref=bbmap/resources/adapters.fa stats=' + filename.split('_Fwd')[0] +'/'+ filename.split('_Fwd')[0] + qtrimcmd + '_Stats.txt overwrite=true'+ ' > ' + log_file + ' 2>&1') + + +### Calls on rnaSPAdes to do the transcriptome assembly on the quality trimmed files. +def rnaSPAdesAssembly(): + for filename in os.listdir(os.curdir+'/ToAssemble'): +# if 'LKH' in filename: + if 'FPE_q' in filename: + os.system('python rnaSPAdes-0.1.1/bin/rnaspades.py -m 26 -k 21,33,55,77 --min-complete-transcript 300 -1 ToAssemble/' + filename + ' -2 ToAssemble/' + filename.replace('FPE','RPE')+' -o ' + filename.split('_FPE')[0] + '/; echo "Finished assembling ' + filename.split('_FPE')[0] + '" | mail -s "Finished Transcriptome Assembly ' + (time.strftime("%d/%m/%y")) + '" ' + mailaddress) > out.txt + + +def main(): + code = {} + qtrim = {} + minlen = {} + for line in open(parameter_file,'r'): + code[line.split('\t')[0]] = line.split('\t')[1].split('\n')[0] + qtrim[line.split('\t')[1]] = line.split('\t')[2].split('\n')[0] + minlen[line.split('\t')[1]] = line.split('\t')[3].split('\n')[0] + rename(code) + QualityTrim(qtrim, minlen) +# rnaSPAdesAssembly() +main() \ No newline at end of file