diff --git a/workflow/Snakefile b/workflow/Snakefile index 0e6ab69..4cb5a01 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,6 +16,8 @@ rule create_database: log_level=config['log_level'] conda: "envs/create_database.yml" log: "logs/create_database/create_database.log" + benchmark: + "benchmarks/create_database.benchmark.txt" shell: """ python workflow/scripts/create_database.py \ @@ -37,6 +39,8 @@ rule map_opentol: log_level=config['log_level'] conda: "envs/map_opentol.yml" log: "logs/map_opentol/map_opentol.log" + benchmark: + "benchmarks/map_opentol.benchmark.txt" shell: """ python workflow/scripts/map_opentol.py \ @@ -58,6 +62,8 @@ rule megatree_loader: output: 'results/databases/megatree_loader.ok' conda: "envs/megatree_loader.yml" log: "logs/megatree_loader/megatree_loader.log" + benchmark: + "benchmarks/megatree_loader.benchmark.txt" params: db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]), tempdb = temp("results/databases/opentree_nodes.db"), @@ -92,6 +98,8 @@ rule family_fasta: chunks=config["nfamilies"], conda: "envs/family_fasta.yml" log: "logs/family_fasta/family_fasta.log" + benchmark: + "benchmarks/family_fasta.benchmark.txt" shell: """ python workflow/scripts/family_fasta.py \ @@ -119,6 +127,8 @@ rule makeblastdb: blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}' conda: "envs/blast.yml" log: "logs/makeblastdb/makeblastdb.log" + benchmark: + "benchmarks/makeblastdb.benchmark.txt" shell: """ sh workflow/scripts/makeblastdb.sh \ @@ -139,6 +149,8 @@ rule get_outgroups: blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}' conda: "envs/blast.yml" log: "logs/get_outgroups/get_outgroups-{scatteritem}.log" + benchmark: + "benchmarks/get_outgroups/get_outgroups-{scatteritem}.benchmark.txt" shell: """ sh workflow/scripts/get_outgroups.sh {params.blastdb} {params.outgroups} {input.unaligned} {output} 2> {log} @@ -157,6 +169,8 @@ rule family_constraint: db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]) conda: "envs/family_constraint.yml" log: "logs/family_constraint/family_constraint-{scatteritem}.log" + benchmark: + "benchmarks/family_constraint/family_constraint-{scatteritem}.benchmark.txt" shell: """ python workflow/scripts/family_constraint.py \ @@ -180,6 +194,8 @@ rule msa_hmm: db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]) conda: "envs/msa_hmm.yml" log: "logs/msa_hmm/msa_hmm-{scatteritem}.log" + benchmark: + "benchmarks/msa_hmm/msa_hmm-{scatteritem}.benchmark.txt" shell: """ python workflow/scripts/msa_hmm.py \ @@ -209,6 +225,8 @@ rule prep_raxml: log_level=config['log_level'] conda: "envs/prep_raxml.yml" log: "logs/prep_raxml/prep_raxml-{scatteritem}.log" + benchmark: + "benchmarks/prep_raxml/prep_raxml-{scatteritem}.benchmark.txt" shell: """ python workflow/scripts/prep_raxml.py \ @@ -233,28 +251,34 @@ rule run_raxml: model = config['model'], num_outgroups= config['outgroups'] log: "logs/run_raxml/run_raxml-{scatteritem}.log" + benchmark: + "benchmarks/run_raxml/run_raxml-{scatteritem}.benchmark.txt" conda: "envs/raxml.yml" shell: """ - OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',') - if [ -s {input.tree} ]; then - set -e - raxml-ng \ - --redo \ - --msa {input.alignment} \ - --outgroup $OG \ - --model {params.model} \ - --tree-constraint {input.tree} \ - --search > {log} 2>&1 \ - || \ - raxml-ng \ - --redo \ - --msa {input.alignment} \ - --outgroup $OG \ - --model {params.model} \ - --search >> {log} 2>&1 + if [ -s {input.alignment} ]; then + OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',') + if [ -s {input.tree} ]; then + set -e + raxml-ng \ + --redo \ + --msa {input.alignment} \ + --outgroup $OG \ + --model {params.model} \ + --tree-constraint {input.tree} \ + --search1 > {log} 2>&1 \ + || \ + raxml-ng \ + --redo \ + --msa {input.alignment} \ + --outgroup $OG \ + --model {params.model} \ + --search1 >> {log} 2>&1 + else + raxml-ng --msa {input.alignment} --outgroup $OG --model {params.model} --search --redo > {log} 2>&1 + fi else - raxml-ng --msa {input.alignment} --outgroup $OG --model {params.model} --search --redo > {log} 2>&1 + touch {output.tree} fi """ @@ -272,16 +296,22 @@ rule reroot_raxml_output: log_level = config['log_level'], num_outgroups = config['outgroups'] log: "logs/reroot_raxml_output/reroot_raxml_output-{scatteritem}.log" + benchmark: + "benchmarks/reroot_raxml_output/reroot_raxml_output-{scatteritem}.benchmark.txt" conda: "envs/reroot_backbone.yml" shell: """ - OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',') - python workflow/scripts/reroot_backbone.py \ - -i {input.tree} \ - -c {input.constraint} \ - -o {output.outtree} \ - -g $OG \ - -v {params.log_level} 2> {log} + if [ -s {input.alignment} ]; then + OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',') + python workflow/scripts/reroot_backbone.py \ + -i {input.tree} \ + -c {input.constraint} \ + -o {output.outtree} \ + -g $OG \ + -v {params.log_level} 2> {log} + else + touch {output.outtree} + fi """ # In principle, exemplar choosing can be performed using one of three ways: @@ -299,15 +329,21 @@ rule choose_exemplars: log_level = config['log_level'], strategy = 'median' log: "logs/choose_exemplars/choose_exemplars-{scatteritem}.log" + benchmark: + "benchmarks/choose_exemplars/choose_exemplars-{scatteritem}.benchmark.txt" conda: "envs/choose_exemplars.yaml" shell: """ - python workflow/scripts/choose_exemplars.py \ - -v {params.log_level} \ - -t {input.tree} \ - -i {input.alignment} \ - -s {params.strategy} \ - -o {output} 2> {log} + if [ -s {input.alignment} ]; then + python workflow/scripts/choose_exemplars.py \ + -v {params.log_level} \ + -t {input.tree} \ + -i {input.alignment} \ + -s {params.strategy} \ + -o {output} 2> {log} + else + touch {output} + fi """ # When aligning the family-level subsets, different families may have different indel @@ -366,6 +402,8 @@ rule run_raxml_backbone: params: model = config['model'] log: "logs/run_raxml_backbone/run_raxml_backbone.log" + benchmark: + "benchmarks/run_raxml_backbone.benchmark.txt" conda: "envs/raxml.yml" shell: """ @@ -391,6 +429,8 @@ rule reroot_backbone: params: log_level = config['log_level'] log: "logs/reroot_backbone/reroot_backbone.log" + benchmark: + "benchmarks/reroot_backbone.benchmark.txt" conda: "envs/reroot_backbone.yml" shell: """ @@ -413,6 +453,8 @@ rule graft_clades: log_level = config['log_level'], nfamilies = config["nfamilies"] log: "logs/graft_clades/graft_clades.log" + benchmark: + "benchmarks/graft_clades.benchmark.txt" conda: "envs/graft_clades.yml" shell: """ diff --git a/workflow/scripts/backbone_constraint.py b/workflow/scripts/backbone_constraint.py index 807c925..31b3788 100644 --- a/workflow/scripts/backbone_constraint.py +++ b/workflow/scripts/backbone_constraint.py @@ -71,19 +71,19 @@ def process_exemplars(exemplar_files, conn): process_ids = [] for record in SeqIO.parse(exemplar_file, 'fasta'): process_ids.append(record.id) - - # Get an OTT ID for the process IDs in the list - dict_of_lists = get_ott_ids(process_ids, conn) - if dict_of_lists: - pids_for_ott.update(dict_of_lists) - else: - - # The process IDs have no family that occurs in the OpenTree. In two cases now this - # occurred because the focal exemplar file consisted entirely of extinct taxa - # (i.e. an entire extinct family, such as Megaladapidae). The least bad thing to do - # with these is to remove them from the backbone. - logger.warning(f'Input file with unconstrained sequences (maybe extinct?): {exemplar_file}') - extinct_pids.extend(process_ids) + if not process_ids == []: + # Get an OTT ID for the process IDs in the list + dict_of_lists = get_ott_ids(process_ids, conn) + if dict_of_lists: + pids_for_ott.update(dict_of_lists) + else: + + # The process IDs have no family that occurs in the OpenTree. In two cases now this + # occurred because the focal exemplar file consisted entirely of extinct taxa + # (i.e. an entire extinct family, such as Megaladapidae). The least bad thing to do + # with these is to remove them from the backbone. + logger.warning(f'Input file with unconstrained sequences (maybe extinct?): {exemplar_file}') + extinct_pids.extend(process_ids) return pids_for_ott, extinct_pids diff --git a/workflow/scripts/choose_exemplars.py b/workflow/scripts/choose_exemplars.py index d35e5f7..898239e 100644 --- a/workflow/scripts/choose_exemplars.py +++ b/workflow/scripts/choose_exemplars.py @@ -1,105 +1,106 @@ -import util -import argparse -import dendropy -import statistics - -from Bio import SeqIO - - -def pick_tips(tree, strategy): - """ - Picks two exemplar tips, either the two tallest, the two shallowest, or the two closest to the median - :param tree: Dendropy tree - :param strategy: How to pick exemplars - :return: - """ - logger.info(f'Going to pick exemplars from {tree} following {strategy} strategy') - - # List of exemplars to return - representatives = [] - - # Get root and its immediate children - root = tree.seed_node - children = root.child_nodes() - if len(children) == 2: - for child in children: - dists = [] - - # Calculate distances to root from all leaves - for tip in child.leaf_nodes(): - dist = tip.distance_from_root() - dists.append({'dist': dist, 'id': tip.taxon.label}) - - # Get shallowest tip - dists = sorted(dists, key=lambda x: x['dist']) - if str(strategy).capitalize().startswith('S'): - representatives.append(dists[0]['id']) - - # Get tallest tip - elif str(strategy).capitalize().startswith('T'): - representatives.append(dists[-1]['id']) - - # Get median tip - else: - median_dist = statistics.median([d['dist'] for d in dists]) - closest = min(dists, key=lambda x: abs(x['dist'] - median_dist)) - representatives.append(closest['id']) - - else: - logger.warning(f'Ingroup root in {tree} is not bifurcating, will approximate rooting') - return None - logger.debug(representatives) - return representatives - - -def write_sequences(inaln, outaln, subset): - """ - Appends the specified subset of sequences from the input alignment to the output - :param inaln: - :param outaln: - :param subset: - :return: - """ - logger.info(f'Going to write {len(subset)} sequences to {outaln}') - with open(outaln, 'a') as outfh: - for record in SeqIO.parse(inaln, "fasta"): - if record.id in subset: - outfh.write(f'>{record.id}\n') - outfh.write(f'{record.seq}\n') - - -if __name__ == "__main__": - - # Define command line arguments - parser = argparse.ArgumentParser(description='Required command line arguments.') - parser.add_argument('-t', '--tree', required=True, help='Input Newick tree') - parser.add_argument('-i', '--inaln', required=True, help='Input aligned FASTA file') - parser.add_argument('-o', '--outaln', required=True, help="Output FASTA alignment") - parser.add_argument('-s', '--strategy', required=True, help='Tip picking strategy: [t]all, [s]hallow, [m]edian') - parser.add_argument('-v', '--verbosity', required=True, help='Log level (e.g. DEBUG)') - args = parser.parse_args() - - # Configure logger - logger = util.get_formatted_logger('choose_exemplars', args.verbosity) - - # Read newick, get list of tips - tree = dendropy.Tree.get( - path=args.tree, - schema="newick", - rooting='default-rooted' - ) - tips = [] - for node in tree.preorder_node_iter(): - if node.is_leaf(): - tips.append(node.taxon.label) - - # Include the whole file if fewer than 3 ingroup tips - if len(tips) < 3: - logger.info(f'Infile {args.tree} has fewer than 3 tips, will include all') - write_sequences(args.inaln, args.outaln, tips) - - # Pick exemplars by strategy - else: - exemplars = pick_tips(tree, args.strategy) - write_sequences(args.inaln, args.outaln, exemplars) - +import util +import argparse +import dendropy +import statistics + +from Bio import SeqIO + + +def pick_tips(tree, strategy): + """ + Picks two exemplar tips, either the two tallest, the two shallowest, or the two closest to the median + :param tree: Dendropy tree + :param strategy: How to pick exemplars + :return: + """ + logger.info(f'Going to pick exemplars from {tree} following {strategy} strategy') + + # List of exemplars to return + representatives = [] + + # Get root and its immediate children + root = tree.seed_node + children = root.child_nodes() + if len(children) == 2: + for child in children: + dists = [] + + # Calculate distances to root from all leaves + for tip in child.leaf_nodes(): + dist = tip.distance_from_root() + dists.append({'dist': dist, 'id': tip.taxon.label}) + + # Get shallowest tip + dists = sorted(dists, key=lambda x: x['dist']) + if str(strategy).capitalize().startswith('S'): + representatives.append(dists[0]['id']) + + # Get tallest tip + elif str(strategy).capitalize().startswith('T'): + representatives.append(dists[-1]['id']) + + # Get median tip + else: + median_dist = statistics.median([d['dist'] for d in dists]) + closest = min(dists, key=lambda x: abs(x['dist'] - median_dist)) + representatives.append(closest['id']) + + else: + logger.warning(f'Ingroup root in {tree} is not bifurcating, will approximate rooting') + return None + logger.debug(representatives) + return representatives + + +def write_sequences(inaln, outaln, subset): + """ + Appends the specified subset of sequences from the input alignment to the output + :param inaln: + :param outaln: + :param subset: + :return: + """ + if subset == None: + subset = [] + logger.info(f'Going to write {len(subset)} sequences to {outaln}') + with open(outaln, 'a') as outfh: + for record in SeqIO.parse(inaln, "fasta"): + if record.id in subset: + outfh.write(f'>{record.id}\n') + outfh.write(f'{record.seq}\n') + + +if __name__ == "__main__": + + # Define command line arguments + parser = argparse.ArgumentParser(description='Required command line arguments.') + parser.add_argument('-t', '--tree', required=True, help='Input Newick tree') + parser.add_argument('-i', '--inaln', required=True, help='Input aligned FASTA file') + parser.add_argument('-o', '--outaln', required=True, help="Output FASTA alignment") + parser.add_argument('-s', '--strategy', required=True, help='Tip picking strategy: [t]all, [s]hallow, [m]edian') + parser.add_argument('-v', '--verbosity', required=True, help='Log level (e.g. DEBUG)') + args = parser.parse_args() + + # Configure logger + logger = util.get_formatted_logger('choose_exemplars', args.verbosity) + + # Read newick, get list of tips + tree = dendropy.Tree.get( + path=args.tree, + schema="newick", + rooting='default-rooted' + ) + tips = [] + for node in tree.preorder_node_iter(): + if node.is_leaf(): + tips.append(node.taxon.label) + + # Include the whole file if fewer than 3 ingroup tips + if len(tips) < 3: + logger.info(f'Infile {args.tree} has fewer than 3 tips, will include all') + write_sequences(args.inaln, args.outaln, tips) + + # Pick exemplars by strategy + else: + exemplars = pick_tips(tree, args.strategy) + write_sequences(args.inaln, args.outaln, exemplars) diff --git a/workflow/scripts/create_database.py b/workflow/scripts/create_database.py index 54501c0..28b4757 100644 --- a/workflow/scripts/create_database.py +++ b/workflow/scripts/create_database.py @@ -63,6 +63,11 @@ def create_barcode_table(tsv_file, table_name, add_keys): create_table_statement = f'CREATE TABLE IF NOT EXISTS {table_name} (' # Add each column header as a TEXT type column + column_headers[column_headers.index('COLLECTORS')] = 'COLLECTORS2' # double column names in the curated BOLD + column_headers[column_headers.index('COLLECTION_DATE')] = 'COLLECTION_DATE2' + column_headers[column_headers.index('COUNTRY')] = 'COUNTRY2' + column_headers[column_headers.index('SITE')] = 'SITE2' + column_headers[column_headers.index('COORD')] = 'COORD2' for i, header in enumerate(column_headers): if not add_keys and i == len(column_headers) - 1: create_table_statement += f'"{header}" TEXT);' diff --git a/workflow/scripts/graft_clades.py b/workflow/scripts/graft_clades.py index 5d45c71..e92b543 100644 --- a/workflow/scripts/graft_clades.py +++ b/workflow/scripts/graft_clades.py @@ -42,7 +42,7 @@ def read_tree(filename, rooting='default-rooted', schema='newick'): for line in file: clean_line = line.strip() extinct.append(clean_line) - + logger.info(f"extinct: {extinct}") # Read the backbone tree as a dendropy object, calculate distances to root, and get its leaves backbone = read_tree(args.tree) backbone.calc_node_root_distances() @@ -56,12 +56,14 @@ def read_tree(filename, rooting='default-rooted', schema='newick'): # Peprocess the focal family tree subfolder = f'{i}-of-{args.nfamilies}' subtree_file = os.path.join(base_folder, subfolder, 'aligned.fa.raxml.bestTree.rooted') - subtree = read_tree(subtree_file) + try: + subtree = read_tree(subtree_file) + except: + continue subtree.calc_node_root_distances() # Get the tip labels of the subtree subtree_leaf_labels = set([leaf.taxon.label for leaf in subtree.leaf_nodes()]) - # See if this needs to be skipped if subtree_leaf_labels.intersection(extinct): logger.warning(f'Skipping {subfolder} as it intersects with extinct exemplars') @@ -73,6 +75,9 @@ def read_tree(filename, rooting='default-rooted', schema='newick'): for label in subtree_leaf_labels: if label in backbone_leaf_labels: intersection.add(label) + if len(intersection) == 0: + logger.warning(f' all labels have been removed from the backbone') + continue # Find the mrca, calculate distance between exemplars logger.info(f'Intersection {intersection}') @@ -80,6 +85,7 @@ def read_tree(filename, rooting='default-rooted', schema='newick'): bbdist = 0 stdist = 0 for label in intersection: + logger.info(f"{backbone} + {label}") bbleaf = backbone.find_node_with_taxon_label(label) bbdist += (bbleaf.root_distance - mrca.root_distance) stleaf = subtree.find_node_with_taxon_label(label) @@ -99,4 +105,4 @@ def read_tree(filename, rooting='default-rooted', schema='newick'): for child in subtree.seed_node.child_nodes(): mrca.add_child(child) - backbone.write(path=args.out, schema="newick") + backbone.write(path=args.out, schema="newick") \ No newline at end of file diff --git a/workflow/scripts/msa_hmm.py b/workflow/scripts/msa_hmm.py index 658254e..5532270 100644 --- a/workflow/scripts/msa_hmm.py +++ b/workflow/scripts/msa_hmm.py @@ -70,7 +70,11 @@ def align_write(sequences, outfile, conn): # Align with hmmalign, capture and parse output as phylip run(['hmmalign', '--trim', '-o', f'{outfile}.tmp2', '--outformat', 'phylip', hmmfile, f'{outfile}.tmp1']) - aligned = read_alignment(f'{outfile}.tmp2', 'phylip') + try: + aligned = read_alignment(f'{outfile}.tmp2', 'phylip') + except ValueError: + logger.info("No sequences found.") + aligned = [] os.remove(f'{outfile}.tmp1') os.remove(f'{outfile}.tmp2') diff --git a/workflow/scripts/prep_raxml.py b/workflow/scripts/prep_raxml.py index 71ad216..aee8ac6 100644 --- a/workflow/scripts/prep_raxml.py +++ b/workflow/scripts/prep_raxml.py @@ -18,7 +18,11 @@ def make_constraint(intree, outtree, processmap): :return: """ logger.info(f"Going to create constraint tree from {intree} to {outtree}") - tree = read_newick(intree, 'newick') + try: + tree = read_newick(intree, 'newick') + except ValueError: + logger.info("No trees made for this family.") + return # Map opentol_id to process_id, possibly adding tips if there are multiple process_ids # for this opentol_id (which means there are multiple BINs in this species) @@ -107,7 +111,12 @@ def make_mapping(aln, conn): # Read input file, exit if it is empty logger.info(f'Going to read FASTA file {args.inaln}') infile = os.path.realpath(os.path.abspath(args.inaln)) - alignment = read_alignment(infile, 'fasta') + try: + alignment = read_alignment(infile, 'fasta') + except: + logger.info("No records in the family.") + alignment = [] + open(args.outtree, 'a') # Write Newick tree using process_id as labels, grafting subtended split BINS and outgroups logger.info(f'Going to expand OpenToL constraint to subtended process IDs')