# Source code for gretel.cmd

import argparse
import numpy as np
import sys
import os

from . import gretel
from . import util

# Package version string; reported to the user via the `--version` flag.
__version__ = "0.0.94"

def main():
    """Gretel: A metagenomic haplotyper.

    Command line entry point. Parses arguments, builds the Hansel evidence
    matrix from a BAM/VCF pair for one contig window, repeatedly extracts
    likely haplotype paths, and writes ``out.fasta`` (full haplotypes),
    ``snp.fasta`` (SNP alleles only) and ``gretel.crumbs`` (path summary)
    to the output directory. Exits non-zero if any SNP is not bridged to a
    neighbour by at least one read.
    """
    parser = argparse.ArgumentParser(description="Gretel: A metagenomic haplotyper.")
    parser.add_argument("bam")
    parser.add_argument("vcf")
    parser.add_argument("contig")
    parser.add_argument("-s", "--start", type=int, default=1, help="1-indexed included start base position [default: 1]")
    parser.add_argument("-e", "--end", type=int, default=-1, help="1-indexed included end base position [default: reference length]")
    #parser.add_argument("-l", "--lorder", type=int, default=0, help="Order of markov chain to predict next nucleotide [default calculated from read data]")
    parser.add_argument("-p", "--paths", type=int, default=100, help="Maximum number of paths to generate [default:100]")
    parser.add_argument("--master", default=None, help="Master sequence (will be used to fill in homogeneous gaps in haplotypes, otherwise --gapchar)")

    #TODO Use something other than N? Should probably be a valid IUPAC
    parser.add_argument("--gapchar", default="N", help="Character to fill homogeneous gaps in haplotypes if no --master [default N]")
    parser.add_argument("--delchar", default="", help="Character to output in haplotype for deletion (eg. -) [default is blank]")
    parser.add_argument("--quiet", default=False, action='store_true', help="Don't output anything other than a single summary line.")
    #parser.add_argument("--sentinels", default=False, action='store_true', help="Add additional sentinels for read ends [default:False][EXPERIMENTAL]")
    parser.add_argument("-o", "--out", default=".", help="Output directory [default .]")
    parser.add_argument("-@", "--threads", type=int, default=1, help="Number of BAM iterators [default 1]")
    parser.add_argument("--debugreads", type=str, default="", help="A newline delimited list of read names to output debug data when parsing the BAM")
    parser.add_argument("--debugpos", type=str, default="", help="A newline delimited list of 1-indexed genomic positions to output debug data when parsing the BAM")
    parser.add_argument("--debughpos", type=str, default=",", help="A comma delimited list of 1-indexed SNP positions to output debug data when predicting haplotypes")
    parser.add_argument("--dumpmatrix", type=str, default=None, help="Location to dump the Hansel matrix to disk")
    parser.add_argument("--dumpsnps", type=str, default=None, help="Location to dump the SNP positions to disk")
    parser.add_argument("--pepper", action="store_true", help="enable a more permissive pileup by setting the pysam pileup stepper to 'all', instead of 'samtools'.\nNote that this will allow improper pairs.")
    parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
    ARGS = parser.parse_args()

    # SNP indices (1-indexed) to trace while predicting haplotypes; tokens
    # that are not integers are ignored (the "," default yields no entries)
    debug_hpos = []
    if ARGS.debughpos:
        for x in ARGS.debughpos.split(","):
            try:
                debug_hpos.append(int(x))
            except ValueError:
                pass

    # Default the window end to the contig's reference length
    if ARGS.end == -1:
        ARGS.end = util.get_ref_len_from_bam(ARGS.bam, ARGS.contig)
        sys.stderr.write("[NOTE] Setting end_pos to %d\n" % ARGS.end)

    # Optional newline-delimited files naming reads / genomic positions to
    # emit debug data for while parsing the BAM
    debug_reads = set()
    if ARGS.debugreads:
        with open(ARGS.debugreads) as debug_fofn:
            for line in debug_fofn:
                debug_reads.add(line.strip())

    debug_pos = set()
    if ARGS.debugpos:
        with open(ARGS.debugpos) as debug_fofn:
            for line in debug_fofn:
                debug_pos.add(int(line.strip()))

    VCF_h = util.process_vcf(ARGS.vcf, ARGS.contig, ARGS.start, ARGS.end)
    if ARGS.dumpsnps:
        # One row per SNP: SNP number, genomic position, position relative to --start
        with open(ARGS.dumpsnps, 'w') as snp_fh:
            for k in sorted(VCF_h["snp_fwd"].keys()):
                snp_fh.write("%d\t%d\t%d\n" % (VCF_h["snp_fwd"][k]+1, k, k-ARGS.start+1))

    # Could we optimise for lower triangle by collapsing one of the dimensions
    # such that Z[m][n][i][j] == Z[m][n][i + ((j-1)*(j))/2]
    hansel = util.load_from_bam(ARGS.bam, ARGS.contig, ARGS.start, ARGS.end, VCF_h,
            n_threads=ARGS.threads,
            debug_reads=debug_reads,
            debug_pos=debug_pos,
            stepper="all" if ARGS.pepper else "samtools")
    # Keep a pristine copy: `hansel` is reweighted after each recovered path
    original_hansel = hansel.copy()

    if ARGS.dumpmatrix:
        hansel.save_hansel_dump(ARGS.dumpmatrix)

    # Check if there is a gap in the matrix: every SNP must share a read with
    # at least one neighbour, otherwise no path can cross it
    for i in range(0, VCF_h["N"]+1):
        marginal = hansel.get_counts_at(i)
        if i > 0:
            snp_rev = VCF_h["snp_rev"][i-1]
        else:
            snp_rev = 0
        if marginal.get("total", 0) == 0:
            sys.stderr.write('''[FAIL] Unable to recover pairwise evidence concerning SNP #%d at position %d
Gretel needs every SNP to appear on a read with at least one other SNP, at least once.
There is no read in your data set that bridges SNP #%d with any of its neighbours.
* If you are trying to run Gretel along an entire contig or genome, please note that this is not the recommended usage for Gretel, as it was intended to uncover the variation in a metahaplome: the set of haplotypes for a specific gene. See our pre-print https://doi.org/10.1101/223404 for more information
Consider running a prediction tool such as `prokka` on your assembly or reference and using the CDS regions in the GFF for corresponding genes of interest to uncover haplotypes with Gretel instead.
* If you are already doing this, consider calling for SNPs more aggressively. We use `snpper` (https://github.com/SamStudio8/gretel-test/blob/master/snpper.py) to determine any site in a BAM that has at least one read in disagreement with the reference as a SNP.
Although this introduces noise from alignment and sequence error, Gretel is fairly robust. Importantly, this naive calling method will likely close gaps between SNPs and permit recovery.
* Finally, consider that the gaps are indicative that your reads do not support one or more parts of your assembly or reference. You could try and find or construct a more suitable reference, or reduce the size of the recovery window.
Sorry :(\n''' % (i, snp_rev, i))
            sys.exit(1)

    PATHS = {}

    # Spew out exciting information about the SNPs
    if not ARGS.quiet:
        print ("i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot")
        last_rev = 0
        for i in range(0, VCF_h["N"]+1):
            marginal = hansel.get_counts_at(i)
            marginal = {str(x): marginal[x] for x in marginal}
            snp_rev = 0
            if i > 0:
                snp_rev = VCF_h["snp_rev"][i-1]
            print ("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
                i,
                snp_rev,
                snp_rev - last_rev,
                marginal.get("A", 0),
                marginal.get("C", 0),
                marginal.get("G", 0),
                marginal.get("T", 0),
                marginal.get("N", 0),
                marginal.get("-", 0),
                marginal.get("_", 0),
                marginal.get("total", 0),
            ))
            last_rev = snp_rev

    # Make some genes: repeatedly draw a likely path through the matrix,
    # then reweight (remove) the evidence it consumed so that the next
    # iteration can discover an alternative haplotype
    SPINS = ARGS.paths
    for i in range(0, SPINS):
        init_path, init_prob, init_min = gretel.generate_path(VCF_h["N"], hansel, original_hansel, debug_hpos=debug_hpos)
        if init_path is None:
            # No further paths can be recovered
            break
        current_path = init_path

        # Clamp the reweight ratio: removing less than 1% of the evidence
        # would make negligible progress between iterations
        MIN_REMOVE = 0.01 # 1%
        if init_min < MIN_REMOVE:
            sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE))
            init_min = MIN_REMOVE
        rw_magnitude = gretel.reweight_hansel_from_path(hansel, init_path, init_min)

        #TODO Horribly inefficient.
        current_path_str = "".join(str(x) for x in current_path)
        if current_path_str not in PATHS:
            PATHS[current_path_str] = {
                "hp_current": [],
                "hp_original": [],
                "i": [],
                "i_0": i,  # iteration at which this path was first seen
                "n": 0,
                "magnitude": 0,
                "hansel_path": current_path,
            }
        PATHS[current_path_str]["n"] += 1
        PATHS[current_path_str]["i"].append(i)
        PATHS[current_path_str]["magnitude"] += rw_magnitude
        PATHS[current_path_str]["hp_current"].append(init_prob["hp_current"])
        PATHS[current_path_str]["hp_original"].append(init_prob["hp_original"])

    # Output FASTA: haplotypes over the full window (out.fasta) and the SNP
    # alleles alone (snp.fasta)
    dirn = ARGS.out + "/"

    if ARGS.master:
        master_fa = util.load_fasta(ARGS.master)
        master_seq = master_fa.fetch(master_fa.references[0])
    else:
        # Placeholder backbone; blanks become --gapchar on output
        master_seq = [' '] * ARGS.end

    with open(dirn+"out.fasta", "w") as fasta_out_fh, open(dirn+"snp.fasta", "w") as hfasta_out_fh:
        for p in sorted(PATHS, key=lambda x: PATHS[x]["i_0"]):
            p = PATHS[p]
            path = p["hansel_path"]
            i = p["i_0"]
            seq = list(master_seq[:])
            # Overlay each predicted allele onto the backbone (path[0] is a sentinel)
            for j, mallele in enumerate(path[1:]):
                snp_pos_on_master = VCF_h["snp_rev"][j]
                try:
                    if mallele == hansel.symbols_d["-"]:
                        # It's a deletion, don't print a SNP
                        seq[snp_pos_on_master-1] = ARGS.delchar
                    else:
                        seq[snp_pos_on_master-1] = mallele
                except IndexError:
                    print (path, len(seq), snp_pos_on_master-1)
                    sys.exit(1)

            # Coerce HanselSymbols to str
            to_write = "".join(str(x) for x in seq[ARGS.start-1 : ARGS.end])
            if not ARGS.master:
                to_write = to_write.replace(' ', ARGS.gapchar)
            fasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
            fasta_out_fh.write("%s\n" % to_write)
            hfasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
            hfasta_out_fh.write("%s\n" % "".join(str(x) for x in path[1:]))

    #TODO datetime, n_obs, n_slices, avg_obs_len, L, n_paths, n_avg_loglik
    with open(dirn+"gretel.crumbs", "w") as crumb_file:
        crumb_file.write("# %d\t%d\t%d\t%.2f\n" % (
            VCF_h["N"],
            hansel.n_crumbs,
            hansel.n_slices,
            hansel.L,
        ))
        for p in sorted(PATHS, key=lambda x: PATHS[x]["hp_current"][0], reverse=True):
            p = PATHS[p]
            crumb_file.write("%d\t%d\t%s\t%s\t%.2f\n" % (
                p["i_0"],
                p["n"],
                ",".join(["%.2f" % x for x in p["hp_current"]]),
                ",".join(["%.2f" % x for x in p["hp_original"]]),
                p["magnitude"],
            ))