import argparse
import numpy as np
import sys
import os
from . import gretel
from . import util
__version__ = "0.0.94"
[docs]def main():
"""Gretel: A metagenomic haplotyper."""
parser = argparse.ArgumentParser(description="Gretel: A metagenomic haplotyper.")
parser.add_argument("bam")
parser.add_argument("vcf")
parser.add_argument("contig")
parser.add_argument("-s", "--start", type=int, default=1, help="1-indexed included start base position [default: 1]")
parser.add_argument("-e", "--end", type=int, default=-1, help="1-indexed inlcuded end base position [default: reference length]")
#parser.add_argument("-l", "--lorder", type=int, default=0, help="Order of markov chain to predict next nucleotide [default calculated from read data]")
parser.add_argument("-p", "--paths", type=int, default=100, help="Maximum number of paths to generate [default:100]")
parser.add_argument("--master", default=None, help="Master sequence (will be used to fill in homogeneous gaps in haplotypes, otherwise --gapchar)") #TODO Use something other than N? Should probably be a valid IUPAC
parser.add_argument("--gapchar", default="N", help="Character to fill homogeneous gaps in haplotypes if no --master [default N]")
parser.add_argument("--delchar", default="", help="Character to output in haplotype for deletion (eg. -) [default is blank]")
parser.add_argument("--quiet", default=False, action='store_true', help="Don't output anything other than a single summary line.")
#parser.add_argument("--sentinels", default=False, action='store_true', help="Add additional sentinels for read ends [default:False][EXPERIMENTAL]")
parser.add_argument("-o", "--out", default=".", help="Output directory [default .]")
parser.add_argument("-@", "--threads", type=int, default=1, help="Number of BAM iterators [default 1]")
parser.add_argument("--debugreads", type=str, default="", help="A newline delimited list of read names to output debug data when parsing the BAM")
parser.add_argument("--debugpos", type=str, default="", help="A newline delimited list of 1-indexed genomic positions to output debug data when parsing the BAM")
parser.add_argument("--debughpos", type=str, default=",", help="A comma delimited list of 1-indexed SNP positions to output debug data when predicting haplotypes")
parser.add_argument("--dumpmatrix", type=str, default=None, help="Location to dump the Hansel matrix to disk")
parser.add_argument("--dumpsnps", type=str, default=None, help="Location to dump the SNP positions to disk")
parser.add_argument("--pepper", action="store_true", help="enable a more permissive pileup by setting the pysam pileup stepper to 'all', instead of 'samtools'.\nNote that this will allow improper pairs.")
parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
ARGS = parser.parse_args()
debug_hpos = []
if ARGS.debughpos:
for x in ARGS.debughpos.split(","):
try:
debug_hpos.append( int(x) )
except:
pass
if ARGS.end == -1:
ARGS.end = util.get_ref_len_from_bam(ARGS.bam, ARGS.contig)
sys.stderr.write("[NOTE] Setting end_pos to %d" % ARGS.end)
debug_reads = set([])
if ARGS.debugreads:
debug_fofn = open(ARGS.debugreads)
for line in debug_fofn:
debug_reads.add(line.strip())
debug_pos = set([])
if ARGS.debugpos:
debug_fofn = open(ARGS.debugpos)
for line in debug_fofn:
debug_pos.add(int(line.strip()))
VCF_h = util.process_vcf(ARGS.vcf, ARGS.contig, ARGS.start, ARGS.end)
if ARGS.dumpsnps:
snp_fh = open(ARGS.dumpsnps, 'w')
for k in sorted(VCF_h["snp_fwd"].keys()):
snp_fh.write("%d\t%d\t%d\n" % (VCF_h["snp_fwd"][k]+1, k, k-ARGS.start+1))
snp_fh.close()
# Could we optimise for lower triangle by collapsing one of the dimensions
# such that Z[m][n][i][j] == Z[m][n][i + ((j-1)*(j))/2]
hansel = util.load_from_bam(ARGS.bam, ARGS.contig, ARGS.start, ARGS.end, VCF_h, n_threads=ARGS.threads, debug_reads=debug_reads, debug_pos=debug_pos, stepper="all" if ARGS.pepper else "samtools")
original_hansel = hansel.copy()
if ARGS.dumpmatrix:
hansel.save_hansel_dump(ARGS.dumpmatrix)
# Check if there is a gap in the matrix
for i in range(0, VCF_h["N"]+1):
marginal = hansel.get_counts_at(i)
if i > 0:
snp_rev = VCF_h["snp_rev"][i-1]
else:
snp_rev = 0
if marginal.get("total", 0) == 0:
sys.stderr.write('''[FAIL] Unable to recover pairwise evidence concerning SNP #%d at position %d
Gretel needs every SNP to appear on a read with at least one other SNP, at least once.
There is no read in your data set that bridges SNP #%d with any of its neighbours.
* If you are trying to run Gretel along an entire contig or genome, please note that
this is not the recommended usage for Gretel, as it was intended to uncover the
variation in a metahaplome: the set of haplotypes for a specific gene.
See our pre-print https://doi.org/10.1101/223404 for more information
Consider running a prediction tool such as `prokka` on your assembly or reference
and using the CDS regions in the GFF for corresponding genes of interest to
uncover haplotypes with Gretel instead.
* If you are already doing this, consider calling for SNPs more aggressively.
We use `snpper` (https://github.com/SamStudio8/gretel-test/blob/master/snpper.py)
to determine any site in a BAM that has at least one read in disagreement with
the reference as a SNP. Although this introduces noise from alignment and sequence
error, Gretel is fairly robust. Importantly, this naive calling method will
likely close gaps between SNPs and permit recovery.
* Finally, consider that the gaps are indicative that your reads do not support
one or more parts of your assembly or reference. You could try and find or construct
a more suitable reference, or reduce the size of the recovery window.
Sorry :(\n''' % (i, snp_rev, i))
sys.exit(1)
PATHS = {}
# Spew out exciting information about the SNPs
if not ARGS.quiet:
print ("i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot")
last_rev = 0
for i in range(0, VCF_h["N"]+1):
marginal = hansel.get_counts_at(i)
marginal = {str(x): marginal[x] for x in marginal}
snp_rev = 0
if i > 0:
snp_rev = VCF_h["snp_rev"][i-1]
print ("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
i,
snp_rev,
snp_rev - last_rev,
marginal.get("A", 0),
marginal.get("C", 0),
marginal.get("G", 0),
marginal.get("T", 0),
marginal.get("N", 0),
marginal.get("-", 0),
marginal.get("_", 0),
marginal.get("total", 0),
))
last_rev = snp_rev
# Make some genes
SPINS = ARGS.paths
ongoing_mag = 0
for i in range(0, SPINS):
init_path, init_prob, init_min = gretel.generate_path(VCF_h["N"], hansel, original_hansel, debug_hpos=debug_hpos)
if init_path == None:
break
current_path = init_path
MIN_REMOVE = 0.01 # 1%
if init_min < MIN_REMOVE:
sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE))
init_min = MIN_REMOVE
rw_magnitude = gretel.reweight_hansel_from_path(hansel, init_path, init_min)
#TODO Horribly inefficient.
current_path_str = "".join([str(x) for x in current_path])
if current_path_str not in PATHS:
PATHS[current_path_str] = {
"hp_current": [],
"hp_original": [],
"i": [],
"i_0": i,
"n": 0,
"magnitude": 0,
"hansel_path": current_path,
}
PATHS[current_path_str]["n"] += 1
PATHS[current_path_str]["i"].append(i)
PATHS[current_path_str]["magnitude"] += rw_magnitude
PATHS[current_path_str]["hp_current"].append(init_prob["hp_current"])
PATHS[current_path_str]["hp_original"].append(init_prob["hp_original"])
# Output FASTA
dirn = ARGS.out + "/"
fasta_out_fh = open(dirn+"out.fasta", "w")
hfasta_out_fh = open(dirn+"snp.fasta", "w")
if ARGS.master:
master_fa = util.load_fasta(ARGS.master)
master_seq = master_fa.fetch(master_fa.references[0])
else:
master_seq = [' '] * ARGS.end
for p in sorted(PATHS, key=lambda x: PATHS[x]["i_0"]):
p = PATHS[p]
path = p["hansel_path"]
i = p["i_0"]
seq = list(master_seq[:])
for j, mallele in enumerate(path[1:]):
snp_pos_on_master = VCF_h["snp_rev"][j]
try:
if mallele == hansel.symbols_d["-"]:
# It's a deletion, don't print a SNP
seq[snp_pos_on_master-1] = ARGS.delchar
else:
seq[snp_pos_on_master-1] = mallele
except IndexError:
print (path, len(seq), snp_pos_on_master-1)
sys.exit(1)
# Coerce HanselSymbols to str
to_write = "".join(str(x) for x in seq[ARGS.start-1 : ARGS.end])
if not ARGS.master:
to_write = to_write.replace(' ', ARGS.gapchar)
fasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
fasta_out_fh.write("%s\n" % to_write)
hfasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
hfasta_out_fh.write("%s\n" % "".join([str(x) for x in path[1:]]))
fasta_out_fh.close()
hfasta_out_fh.close()
#TODO datetime, n_obs, n_slices, avg_obs_len, L, n_paths, n_avg_loglik
crumb_file = open(dirn+"gretel.crumbs", "w")
crumb_file.write("# %d\t%d\t%d\t%.2f\n" % (
VCF_h["N"],
hansel.n_crumbs,
hansel.n_slices,
hansel.L,
))
for p in sorted(PATHS, key=lambda x: PATHS[x]["hp_current"][0], reverse=True):
p = PATHS[p]
crumb_file.write("%d\t%d\t%s\t%s\t%.2f\n" % (
p["i_0"],
p["n"],
",".join(["%.2f" % x for x in p["hp_current"]]),
",".join(["%.2f" % x for x in p["hp_original"]]),
p["magnitude"],
))