updating header in 7b_SummaryStats.py

This commit is contained in:
Auden Cote-L'Heureux 2025-01-19 11:04:02 -05:00 committed by GitHub
parent de566bf546
commit 3ac37547b7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -2,13 +2,13 @@
# Author: Auden Cote-L'Heureux
# This script produces both taxon- and sequence-level statistics to describe the ReadyToGo files
# output by PhyloToL Part 1, as well as some OG-level information from the Hook (OG reference)
# output by EukPhylo Part 1, as well as some OG-level information from the Hook (OG reference)
# database and the original input assembled transcripts. It relies on the utility script CUB.py
# to calculate composition statistics (GC content, Effective Number of Codons, etc.). Both sequence
# level and taxon-level stats are summarized in tab-separated outputs written to the Output folder.
# This script requires that the OG reference database is available as an amino acid fasta file
# in the Databases/db_OG folder with the same file name as the .dmnd file used in script 3. This script
# is intended to be run as part of the PhyloToL 6 Part 1 pipeline using the script wrapper.py.
# is intended to be run as part of the EukPhylo Part 1 pipeline using the script wrapper.py.
import os, sys
import argparse
@@ -31,7 +31,7 @@ def get_args():
description = "Updated March 31th, 2023 by Auden Cote-L'Heureux"
)
parser.add_argument('-i', '--input', type = str, required = True, help = 'Input path to the "Output" folder produced by PhyloToL Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.')
parser.add_argument('-i', '--input', type = str, required = True, help = 'Input path to the "Output" folder produced by EukPhylo Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.')
parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')
parser.add_argument('-r', '--r2g_jf', action = 'store_true', help = 'Create ReadyToGo files filtered to only include sequences between the 25th and 75th percentile of silent-site GC content. Please be aware that these are not necessarily the correct or non-contaminant sequences; examine the GC3xENc plots carefully before using these data.')