From 3e01100304c85e4f9b4bc451c8f6b57642f02531 Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Fri, 29 Sep 2023 13:19:23 -0400 Subject: [PATCH] Delete PTL1/Transcriptomes/Scripts/1b_XSpeciesContaminationAgnes.py --- .../Scripts/1b_XSpeciesContaminationAgnes.py | 153 ------------------ 1 file changed, 153 deletions(-) delete mode 100644 PTL1/Transcriptomes/Scripts/1b_XSpeciesContaminationAgnes.py diff --git a/PTL1/Transcriptomes/Scripts/1b_XSpeciesContaminationAgnes.py b/PTL1/Transcriptomes/Scripts/1b_XSpeciesContaminationAgnes.py deleted file mode 100644 index bf697ed..0000000 --- a/PTL1/Transcriptomes/Scripts/1b_XSpeciesContaminationAgnes.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/python3 - -__author__ = 'Jean-David Grattepanche' -__version__ = 'ACL fixed sequence naming issue Feb 23, 2022' -__email__ = 'jeandavid.grattepanche@gmail.com' - - - -import sys -import os -import re -import time -import string -import os.path -from Bio import SeqIO -from sys import argv -listtaxa=[] -toosim = 0.99 -seqcoverage = 0.7 - -def merge_files(folder, minlen, conspecific_names): - mergefile = open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','w+') - print("MERGE following files") - for taxafile in os.listdir(folder): - if taxafile[0] != ".": - listtaxa.append(taxafile.split('.' + str(minlen) + 'bp')[0]) - - for line2 in SeqIO.parse(folder+'/'+taxafile, 'fasta'): - if int(len(str(line2.seq))) >= int(minlen): - mergefile.write('>'+taxafile.split('.' + str(minlen) + 'bp')[0] + '_' + line2.description + '\n' + str(line2.seq) + '\n') - else: - print(line2, " is too short") - mergefile.close() - - sort_cluster(folder, listtaxa, minlen, conspecific_names) - - -def sort_cluster(folder, listtaxa, minlen, conspecific_names): - if not os.path.exists('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/'): - os.makedirs('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/') - - fastalist = []; fastadict= {} - conspecific_names_dict = { line.split('\t')[0] : line.split('\t')[1].strip() for line in open(conspecific_names) } - - print('CREATE a dictionnary of sequences') - for record in SeqIO.parse(open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','r'),'fasta'): - if record.id[:10] not in conspecific_names_dict: - print('\nError in cross-plate contamination assessment: the ten-digit code ' + record.id[:10] + ' is not found in the conspecific names file. Please check that this file is correct and try again.\n') - quit() - - IDL = record.description, int(record.description.split('_Cov')[1].replace('\n','')) - fastalist.append(IDL) - fastadict[record.description] = record.seq - - print("CLUSTER sequences that overlap at least 70%") - os.system('vsearch --cluster_fast ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta --strand both --query_cov '+str(seqcoverage)+' --id '+str(toosim) +' --uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc --threads 60' ) - - #input2 = open('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc','r') - #input2 = open('/Output_PostClusterBackup/clusteringresults_vsearch/results_forclustering.uc','r') - cluster_output = '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc' - out2 = open('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','w+') - out3 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas','w+') - out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','w+') - print("CREATE a dictionary with clustering results") - clustdict= {}; clustlist = []; allseq = []; clustline = {}; list= []; i=0; j=0 - for row2 in open(cluster_output, 'r'): - if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) < 2: # keep all unique sequences - out2.write('>'+row2.split('\t')[8] + '\n' + str(fastadict[row2.split('\t')[8]])+ '\n') - if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) > 1: # create another dictionary -# print("create dico: ", row2.split('\t')[8]) - clustdict.setdefault(row2.split('\t')[8], [row2.split('\t')[8]]) - clustlist.append(row2.split('\t')[8]) - - for row3 in open(cluster_output, 'r'): - if row3.split('\t')[0] == 'H': -# print("add dico: ", row3.split('\t')[9], row3.split('\t')[8]) - clustdict[row3.split('\t')[9].replace('\n','')].append(row3.split('\t')[8].replace('\n','')) - clustline[row3.split('\t')[8].replace('\n','')] = row3.replace('\n','') - clustline[row3.split('\t')[9].replace('\n','')] = row3.replace('\n','') - - print("PARSE the clusters: keep seed sequences (highest coverage) for each cluster") - for clust in clustlist: - list = sorted(clustdict[clust], reverse = True, key=lambda x: int(x.split('_Cov')[1])) - master = list[0] - Covmaster = int(list[0].split('_Cov')[1]) - master8dig = ('_').join(list[0].split('_')[0:3])[:-2] - for seq in list: - clustered = seq.replace('\n','') - Covclustered = int(clustered.split('_Cov')[1]) - clustered8dig = ('_').join(clustered.split('_')[0:3])[:-2] -# print(master8dig, Covmaster, '//', clustered8dig, Covclustered) - if float(Covmaster/Covclustered) < 10: - out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n') - i +=1 - elif conspecific_names_dict[master[:10]] == conspecific_names_dict[clustered[:10]]: - out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n') - i +=1 - elif Covclustered >= 50: - out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n') - i +=1 - else: - j +=1 - out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','a') - out3.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n') - print(clustline[clustered],'\t' , master ) - out4.write(clustline[clustered]+ '\t' + master + '\n') - out4.close() - - - print('there are ', str(i),' sequences kept and ',str(j),' sequences removed') - - out2.close() - out3.close() - - splittaxa(folder, listtaxa, minlen) - -def splittaxa(folder, listtaxa, minlen): - for taxa in listtaxa: - tax_sf_path = '/'.join(folder.split('/')[:-1]) + '/' + taxa + '/SizeFiltered/' - os.system('mv ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta' + ' ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.preXPlate.fasta') - - with open(tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta','w') as o: - for kept in SeqIO.parse('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','fasta'): - if taxa in kept.description: - o.write('>' + kept.description.replace(taxa + '_', '') + '\n' + str(kept.seq) + '\n') - - os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/') - os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/') - os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/') - os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/') - -def main(): - - script, folder, minlen, conspecific_names = argv - merge_files(folder, minlen, conspecific_names) - -main() - - - - - - - - - - - - - - - -