Mirror of http://43.156.76.180:8026/YuuMJ/EukPhylo.git (synced 2025-12-27 12:00:25 +08:00)

Commit 7da3de77e4 (parent 165a95fa9d): Add files via upload
@@ -54,18 +54,20 @@ def aa_comp_lengths(args, gcodes):
             for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file, 'fasta'):
                 r2g_lengths.update({ rec.id : len(str(rec.seq)) * 3 })

-                fymink = 0; garp = 0; other = 0; total = 0
+                fymink = 0; garp = 0; other = 0; total = 0; x = 0
                 for char in str(rec.seq):
-                    if char in 'FYMINK':
+                    if char in 'FYMINKfymink':
                         fymink += 1
-                    elif char in 'GARP':
+                    elif char in 'GARPgarp':
                         garp += 1
+                    elif char in 'Xx':
+                        x += 1
                     else:
                         other += 1

                     total += 1

-                aa_comp.update({ rec.id : { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total } })
+                aa_comp.update({ rec.id : { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total, 'X' : x} })

                 recid_by_contig_n.update({ rec.id.split('Contig_')[-1].split('_')[0] : rec.id })

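This hunk makes the per-residue tally case-insensitive and moves ambiguous X residues out of the Other bin, reporting them as a raw count. A minimal standalone sketch of the same counting logic, using an invented sequence in place of a ReadyToGo record:

# Sketch of the revised tally; the sequence below is made up for illustration.
seq = 'MFYKXgarpQQxI'
fymink = garp = other = total = x = 0
for char in seq:
    if char in 'FYMINKfymink':
        fymink += 1
    elif char in 'GARPgarp':
        garp += 1
    elif char in 'Xx':
        x += 1
    else:
        other += 1
    total += 1

# FYMINK/GARP/Other are fractions of all residues (X still counts toward total),
# while 'X' is stored as a raw count, matching the updated aa_comp entry.
print({ 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total, 'X' : x })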
@@ -91,16 +93,14 @@ def get_nuc_comp(args, gcodes):
     nuc_comp = { }
     for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD')]):
         if file.endswith('.fasta') and file[:10] in gcodes:
-            cub_out = CUB.CalcRefFasta(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, gcodes[file[:10]])[0]
+            cub_out = CUB.CalcRefFasta(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, gcodes[file[:10]].lower())[0]
             for k in cub_out:
                 nuc_comp.update({ k : cub_out[k] })

     return nuc_comp


-def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr):
+def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr, og_mean_lens):

-    og_mean_lens = hook_lens(args)
-
     if not os.path.isdir(args.input + '/PerSequenceStatSummaries'):
         os.mkdir(args.input + '/PerSequenceStatSummaries')
@@ -109,7 +109,7 @@ def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id

     for taxon in taxa:
         with open(args.input + '/PerSequenceStatSummaries/' + taxon + '.csv', 'w') as o:
-            o.write('Sequence,Taxon,OG,Transcript,TranscriptLength,CDSLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA\n')
+            o.write('Sequence,Taxon,OG,Transcript,TranscriptLength,CDSLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA,N.Xs\n')
             for rec in nuc_comp:
                 if rec[:10] == taxon:
                     o.write(rec + ',' + rec[:10] + ',' + rec[-10:])
@@ -126,15 +126,15 @@ def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id
                     ENc = [str(v.expENc), str(v.obsENc_6F), str(v.obsENc_No6F), str(v.SunENc_6F),str(v.SunENc_No6F)]
                     o.write(',' + ','.join([str(v.amb_cdn)] + gcs + ENc))

-                    o.write(',' + str(aa_comp[rec]['FYMINK']) + ',' + str(aa_comp[rec]['GARP']) + ',' + str(aa_comp[rec]['Other']) + '\n')
+                    o.write(',' + str(aa_comp[rec]['FYMINK']) + ',' + str(aa_comp[rec]['GARP']) + ',' + str(aa_comp[rec]['Other']) + ',' + str(aa_comp[rec]['X']) + '\n')


-def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):
+def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes, og_mean_lens):

     taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))

     with open(args.input + '/PerTaxonSummary.csv', 'w') as o:
-        o.write('Taxon,TranscriptsInput,Median_GCTranscripts,IQR_GCTranscripts,Median_LenTranscripts,IRQ_LenTranscripts,SeqsR2G,OGsR2G,Median_GC3R2G,IQR_GC3R2G,Median_ENcR2G,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,GeneticCode\n')
+        o.write('Taxon,TranscriptsInput,Median_GCTranscripts,5-95Perc_GCTranscripts,Median_LenTranscripts,IQR_LenTranscripts,SeqsR2G,OGsR2G,Median_GC3R2G,5Perc_GC3R2G,95Perc_GC3R2G,5-95Perc_GC3R2G,Median_ENcR2G,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,Prop.G1.5_OGAvg,Prop.L0.5_OGAvg,GeneticCode\n')

         for taxon in taxa:
             try:
@@ -149,7 +149,7 @@ def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):

                 transcript_gcs = sorted(transcript_gcs)
                 o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.5)]))
-                o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.75)] - transcript_gcs[floor(len(transcripts)*0.25)]))
+                o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.95)] - transcript_gcs[floor(len(transcripts)*0.05)]))

                 transcript_lens = sorted([len(transcript) for transcript in transcripts])
                 o.write(',' + str(transcript_lens[floor(len(transcripts)*0.5)]))
@@ -162,7 +162,9 @@ def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):

                 r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
                 o.write(',' + str(r2g_gc3s[floor(len(r2g_ntds)*0.5)]))
-                o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.75)] - r2g_gc3s[floor(len(r2g_gc3s)*0.25)]))
+                o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.05)]))
+                o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.95)]))
+                o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.95)] - r2g_gc3s[floor(len(r2g_gc3s)*0.05)]))

                 r2g_encs = sorted([seq.obsENc_6F for seq in r2g_ntds])
                 o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.5)]))
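Here, and in the transcript GC column above, the interquartile range is replaced by 5th/95th percentile statistics. A small sketch of the index arithmetic with made-up GC3 values; floor(len*q) indexing on the sorted list is the same approximate-percentile convention the script already uses for its medians (with only 10 values the 5th percentile lands on the smallest element):

from math import floor

# Made-up GC3 values standing in for [seq.gc4F for seq in r2g_ntds].
r2g_gc3s = sorted([0.31, 0.42, 0.44, 0.47, 0.55, 0.58, 0.61, 0.63, 0.70, 0.72])

median = r2g_gc3s[floor(len(r2g_gc3s) * 0.5)]
p05 = r2g_gc3s[floor(len(r2g_gc3s) * 0.05)]
p95 = r2g_gc3s[floor(len(r2g_gc3s) * 0.95)]

# The per-taxon table now reports p05 and p95 individually plus their spread
# (p95 - p05) in place of the former IQR column.
print(median, p05, p95, p95 - p05)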
@@ -172,6 +174,11 @@ def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):
                 o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.5)]))
                 o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.75)] - tax_r2g_lens[floor(len(tax_r2g_lens)*0.25)]))

+                prop_len_g = len([seq for seq in r2g_lengths if seq[:10] == taxon and r2g_lengths[seq] > 4.5 * og_mean_lens[seq[-10:]]])/len(tax_r2g_lens)
+                prop_len_l = len([seq for seq in r2g_lengths if seq[:10] == taxon and r2g_lengths[seq] < 1.5 * og_mean_lens[seq[-10:]]])/len(tax_r2g_lens)
+
+                o.write(',' + str(prop_len_g) + ',' + str(prop_len_l))
+
                 o.write(',' + gcodes[taxon] + '\n')
             except:
                 pass
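The two new proportions compare each ReadyToGo CDS length for the taxon against the mean length of its OG in the Hook database (og_mean_lens from hook_lens), using the 4.5x and 1.5x cutoffs in the code above. A sketch with hypothetical ids and lengths; the dictionaries, taxon code, and OG codes are invented, but the 10-character prefix/suffix slicing follows seq[:10] and seq[-10:] in the diff:

# Hypothetical inputs: CDS lengths keyed by sequence id, and mean per-OG lengths.
taxon = 'Op_me_hsap'
r2g_lengths = {
    'Op_me_hsap_c1_OG6_100001': 5400,
    'Op_me_hsap_c2_OG6_100002': 900,
    'Op_me_hsap_c3_OG6_100003': 1500,
}
og_mean_lens = { 'OG6_100001': 1000, 'OG6_100002': 1000, 'OG6_100003': 1200 }

tax_r2g_lens = [l for s, l in r2g_lengths.items() if s[:10] == taxon]

# Fraction of this taxon's sequences much longer (> 4.5x) than their OG's mean
# length in Hook, and fraction shorter than 1.5x that mean.
prop_len_g = len([s for s in r2g_lengths if s[:10] == taxon and r2g_lengths[s] > 4.5 * og_mean_lens[s[-10:]]]) / len(tax_r2g_lens)
prop_len_l = len([s for s in r2g_lengths if s[:10] == taxon and r2g_lengths[s] < 1.5 * og_mean_lens[s[-10:]]]) / len(tax_r2g_lens)
print(prop_len_g, prop_len_l)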
@@ -243,9 +250,10 @@ if __name__ == "__main__":

     aa_comp, transcripts, r2g_lengths, transcript_id_corr = aa_comp_lengths(args, gcodes)
     nuc_comp = get_nuc_comp(args, gcodes)
+    og_mean_lens = hook_lens(args)

-    per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes)
-    per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr)
+    per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes, og_mean_lens)
+    per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr, og_mean_lens)

     if args.r2g_jf:
         r2g_jf(args, nuc_comp, gcodes)