58 lines
5.1 KiB
Bash
58 lines
5.1 KiB
Bash
###从NCBI下载基因组(DNA)/转录组(RNA)测序数据到获得预测基因的处理步骤(以Nyctotherus_ovalis物种为例,见/home/Data_01/yizhenzhen06/public_share/Saccharomyces_cerevisiae_DNA。如果没有注明仅基因组数据要做,其他步骤基因组和转录组步骤一样。直接写在本文件里的美丽表示可以直接终端运行指令,其他指令要提交pbs任务运行。
|
||
|
||
blastn -db /home/Data_01/share_databases/ncbi_genome_refseq_bac_old/ncbi_genome_refseq_bac_old \
|
||
-query /home/Data_01/yizhenzhen06/public_share/Saccharomyces_cerevisiae_DNA/2_assembly/spades/scaffolds.fasta \
|
||
-out 1_Sacc_cerevisiae_dec_bac.tab \
|
||
-outfmt "6 qseqid qlen sseqid slen pident qcovs qstart qend sstart send mismatch gapopen evalue bitscore" \
|
||
-evalue 1e-5 -max_target_seqs 1 -num_threads X
|
||
|
||
sed -i '1i\tquery ID\tquery sequence length\tsubject ID\tsubject qequence length\tpercent identities\tcoverage\tqstart\tqend\tsstart\tsend\tmismatch\tgapopen\tevalue\tbitscores ' 1_Sacc_cerevisiae_dec_bac.tab
|
||
#去细菌基因组污染,两条指令放一个脚本运行(注意检查数据库路径)
|
||
|
||
awk '{ if ($5 > 80 && $6 > 70) print $1 }' 1_Sacc_cerevisiae_dec_bac.tab > 1_Sacc_cerevisiae_dec_bac.id #提取第5列相似度大于80%且覆盖率大于70%的序列ID(这一步建议需要下载到本地手动检查输出的id条数是否和Sacc_cerevisiae_dec_bac.tab(手动excel筛选)文件的第5列相似度大于80%且覆盖率大于70%的序列条数一致)
|
||
seqkit grep -v -f Sacc_cerevisiae_dec_bac.id ../2_assembly/trinity_out_dir/Trinity.fasta > 1_Sacc_cerevisiae_dec_bac.fasta #从组装结果文件中删去细菌污染序列
|
||
|
||
####################################################################################################
|
||
|
||
blastn -db /home/Data_01/share_databases/human_genome_new/GCA_021234545.1_HCC1395BL_v1.0_genomic_db \
|
||
-query 1_Sacc_cerevisiae_dec_bac.fasta \
|
||
-out 2_Sacc_cerevisiae_dec_bac_hum.tab \
|
||
-outfmt "6 qseqid qlen sseqid slen pident qcovs qstart qend sstart send mismatch gapopen evalue bitscore" \
|
||
-evalue 1e-5 -max_target_seqs 1 -num_threads X
|
||
|
||
sed -i '1i\tquery ID\tquery sequence length\tsubject ID\tsubject qequence length\tpercent identities\tcoverage\tqstart\tqend\tsstart\tsend\tmismatch\tgapopen\tevalue\tbitscores ' 2_Sacc_cerevisiae_dec_bac_hum.tab
|
||
#去人类基因组污染,两条指令放一个脚本运行(注意检查数据库路径)
|
||
|
||
awk '{ if ($5 > 80 && $6 > 70) print $1 }' 2_Sacc_cerevisiae_dec_bac_hum.tab > 2_Sacc_cerevisiae_dec_bac_hum.id #提取第5列相似度大于80%且覆盖率大于70%的序列ID(这一步建议需要下载到本地手动检查输出的id条数是否和2_Sacc_cerevisiae_dec_bac_hum.tab(手动excel筛选)文件的第5列相似度大于80%且覆盖率大于70%的序列条数一致)
|
||
seqkit grep -v -f 2_Sacc_cerevisiae_dec_bac_hum.id 1_Sacc_cerevisiae_dec_bac.fasta > 2_Sacc_cerevisiae_dec_bac_hum.fasta #从组装结果文件中删去人类基因组污染序列
|
||
|
||
####################################################################################################
|
||
|
||
blastn -db /home/Data_01/share_databases/human_genome_new/GCF_000001405.39_GRCh38.p13_rna_db \
|
||
-query 2_Sacc_cerevisiae_dec_bac_hum.fasta \
|
||
-out 3_Sacc_cerevisiae_dec_bac_hum_geno_rna.tab \
|
||
-outfmt "6 qseqid qlen sseqid slen pident qcovs qstart qend sstart send mismatch gapopen evalue bitscore" \
|
||
-evalue 1e-5 -max_target_seqs 1 -num_threads X
|
||
|
||
sed -i '1i\tquery ID\tquery sequence length\tsubject ID\tsubject qequence length\tpercent identities\tcoverage\tqstart\tqend\tsstart\tsend\tmismatch\tgapopen\tevalue\tbitscores ' 3_Sacc_cerevisiae_dec_bac_hum_geno_rna.tab
|
||
#去人类转录组污染,两条指令放一个脚本运行(注意检查数据库路径)
|
||
|
||
awk '{ if ($5 > 80 && $6 > 70) print $1 }' 3_Sacc_cerevisiae_dec_bac_hum_geno_rna.tab > 3_Sacc_cerevisiae_dec_bac_hum_geno_rna.id #提取第5列相似度大于80%且覆盖率大于70%的序列ID(这一步需要下载到本地手动检查输出的id条数是否和Nyct_bac_human_genome_blastn.tab(excel筛选)文件的第5列相似度大于80%且覆盖率大于70%的序列条数一致)
|
||
seqkit grep -v -f 3_Sacc_cerevisiae_dec_bac_hum_geno_rna.id 2_Sacc_cerevisiae_dec_bac_hum.fasta > 3_Sacc_cerevisiae_dec_bac_hum_geno_rna.fasta #从组装结果文件中删去人类转录组污染序列
|
||
|
||
####################################################################################################
|
||
|
||
seqkit seq -n -i ../2_assembly/spades/scaffolds.fasta > 4_Sacc_cerevisiae_dec_all.txt
|
||
#提取序列ID,仅基因组数据要做
|
||
awk '{if($1~"_0.")print$1}' 4_Sacc_cerevisiae_dec_all.txt > 4_Sacc_cerevisiae_dec_all_cov1.id
|
||
#提取coverage小于1的序列ID,仅基因组数据要做
|
||
seqkit grep -v -f 4_Sacc_cerevisiae_dec_all_cov1.id 4_Sacc_cerevisiae_dec_all.fasta > 4_Sacc_cerevisiae_dec_all_cov1.fasta
|
||
#去除coverage小于1的序列,仅基因组数据要做
|
||
|
||
####################################################################################################
|
||
|
||
cd-hit-est \
|
||
-i 4_Sacc_cerevisiae_dec_all_cov1.fasta \
|
||
-o 5_Sacc_cerevisiae_dec_all_cov1_cdhit.fasta \
|
||
-c 0.9 -n 8 -d 0 -M 32000 -T 48
|
||
#去冗余(核酸序列) |