Add files via upload

2026-02-10 16:40:26 +08:00 · 2025-06-11 10:17:50 -04:00 · 2025-06-11 10:17:50 -04:00 · fdf525d433
commit fdf525d433
parent 8ccde9800a
1 changed files with 94 additions and 0 deletions
--- a/Utilities/For_Assemblies/Trim_Reads.py
+++ b/Utilities/For_Assemblies/Trim_Reads.py
@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+#Author, date: Giulia Magri Ribeiro updated from Xyrus Maurer-Alcala and Ying Yan; June 11 2025
+#Motivation: Trim adaptors from reads and quality trimming before Assembly
+#Intent: clean up reads
+#Dependencies: biopython and bbmap folder
+#Inputs:parameters.txt, fastq.gz forward and reverse reads
+#Outputs:trimmed reads in ToAssemble folder
+#Example: python3 Trim_Reads.py parameter.txt YourEmailAddress
+
+
+from Bio import SeqIO
+import sys,os
+import time
+
+#------------------------------ Checks the Input Arguments ------------------------------#
+
+if len(sys.argv) == 1:
+	print ('\n\nThis script will remove Adapters, do quality trimming and length trimming on given score and assembly from your raw reads')
+	print ('\n\nChecking the overall quality and reads size on FastQC is recommended\n\n')
+	print ('Example Usage:\n\n\t' + 'katzlab$ python3 Trim_Reads.py parameter.txt YourEmailAddress\n\n')
+	print ('\t\tQuestions/Comments? Email Giulia (author) at gribeiro@smith.edu\n\n')
+	sys.exit()
+
+
+if len(sys.argv) != 3:
+	print ('\n\nDouble check that you have added all the necessary command-line inputs! (see usage below for an example)\n\n')
+	print  ('Example Usage:\n\n\t' + 'katzlab$ python3 Trim_Reads.py parameter.txt YourEmailAddress\n\n')	
+	print ('Please also check that you have a parameter.txt (tab separated values) file which should contain your current filename, new filename, score of quality trimming and minimum length (see an example below)\n\n')
+	print  ('parameter.txt example:\n\n\t' + 'XKATZ_20161110_K00134_IL100076423_S41_L005\tLKH001_Spirostomum\t24\t100\n\tXKATZ_20161110_K00134_IL100076416_S17_L005\tLKH002_Loxodes\t28\t100\n')
+	sys.exit()
+
+elif len(sys.argv) == 3:
+	parameter_file = sys.argv[1]
+	mailaddress = sys.argv[2]
+	if os.path.isdir('ToAssemble/') != True:	
+		os.system('mkdir ToAssemble')
+	
+### takes your downloaded data and renames the file so that it has taxonomic information in the filename
+def rename(code):
+	for filename in os.listdir(os.curdir):
+		if filename.endswith('.fastq.gz'):
+		### check name code here for forward reads
+			if '_1.' in filename:
+				cur_name = filename.split('_1.')[0]
+				new_name = code[cur_name]
+				print(cur_name, new_name)
+				os.system('mv ' + filename + ' ' + new_name + '_FwdPE.fastq.gz')
+### Make a folder for each taxon that you are doing an assembly for ... this will be useful later (might as well do it early on!)				
+				os.system('mkdir '+ new_name)
+		### check name code here for Reverse reads
+			elif '_2.' in filename:
+				cur_name2 = filename.split('_2.')[0]
+				new_name2 = code[cur_name2]
+				print(cur_name2, new_name2)
+				os.system('mv ' + filename + ' ' + new_name2 + '_RevPE.fastq.gz')
+			elif '_FwdPE.fastq.gz' in filename:
+				sample_prefix = filename.split('_FwdPE')[0]
+				os.system(f"mkdir -p {sample_prefix}")
+
+
+### Uses the adapters.fa file in the bbtools resources folder (and BBDuK) to remove adapter sequences -- update if necessary
+### Uses BBDuK to quality trim reads so the average is q24 and the min length is 100 -- adjust if needed ... flags will be added eventually			
+def QualityTrim(qtrim, minlen):
+	for filename in os.listdir(os.curdir):
+		if 'FwdPE' in filename:
+			new_name = filename.split('_FwdPE')[0]
+			qscore = qtrim[new_name]
+			lscore = minlen[new_name]
+			qtrimcmd = '_q'+qscore+'_minlen'+lscore
+			log_file = filename.split('_Fwd')[0] + '/' + filename.split('_Fwd')[0] + qtrimcmd + '_bbduk.log'
+			os.system('./bbmap/bbduk.sh -Xmx20g in1=./' + filename + ' in2=./' + filename.replace('Fwd','Rev') + ' out1=ToAssemble/'+filename.replace('FwdPE','FPE'+qtrimcmd) + ' out2=ToAssemble/' + filename.split('Fwd')[0]+'RPE'+qtrimcmd+'.fastq.gz qtrim=rl trimq='+qscore+' minlen='+lscore+' mink=11 k=23 hdist=1 ktrim=r ref=bbmap/resources/adapters.fa stats=' + filename.split('_Fwd')[0] +'/'+ filename.split('_Fwd')[0] + qtrimcmd + '_Stats.txt overwrite=true'+ ' > ' + log_file + ' 2>&1')	
+			
+
+### Calls on rnaSPAdes to do the transcriptome assembly on the quality trimmed files.				
+def rnaSPAdesAssembly():
+	for filename in os.listdir(os.curdir+'/ToAssemble'):						
+#		if 'LKH' in filename:
+		if 'FPE_q' in filename:
+			os.system('python rnaSPAdes-0.1.1/bin/rnaspades.py -m 26 -k 21,33,55,77 --min-complete-transcript 300 -1 ToAssemble/' + filename + ' -2 ToAssemble/' + filename.replace('FPE','RPE')+' -o ' + filename.split('_FPE')[0] + '/; echo "Finished assembling ' + filename.split('_FPE')[0] + '" | mail -s "Finished Transcriptome Assembly ' + (time.strftime("%d/%m/%y")) + '" ' + mailaddress) > out.txt
+	
+
+def main():
+	code = {}
+	qtrim = {}
+	minlen = {}
+	for line in open(parameter_file,'r'):
+		code[line.split('\t')[0]] = line.split('\t')[1].split('\n')[0]
+		qtrim[line.split('\t')[1]] = line.split('\t')[2].split('\n')[0]
+		minlen[line.split('\t')[1]] = line.split('\t')[3].split('\n')[0]
+	rename(code)
+	QualityTrim(qtrim, minlen)
+#	rnaSPAdesAssembly()
+main()