Source code for gretel.cmd

import argparse
import numpy as np
import sys

import gretel
import util

[docs]def main(): """Gretel: A metagenomic haplotyper.""" parser = argparse.ArgumentParser(description="Gretel: A metagenomic haplotyper.") parser.add_argument("bam") parser.add_argument("vcf") parser.add_argument("contig") parser.add_argument("-s", "--start", type=int, default=1, help="1-indexed start base position [default: 1]") parser.add_argument("-e", "--end", type=int, default=-1, help="1-indexed end base position [default: contig end]") parser.add_argument("-l", "--lorder", type=int, default=0, help="Order of markov chain to predict next nucleotide [default:1]") parser.add_argument("-p", "--paths", type=int, default=100, help="Maximum number of paths to generate [default:100]") parser.add_argument("--master", default=None, help="Master sequence if available (required to generate out.fasta)") parser.add_argument("--quiet", default=False, action='store_true', help="Don't output anything other than a single summary line.") parser.add_argument("--sentinels", default=False, action='store_true', help="Add additional sentinels for read ends [default:False][EXPERIMENTAL]") parser.add_argument("-o", "--out", default=".", help="Output directory [default .]") parser.add_argument("-@", "--threads", type=int, default=1, help="Number of BAM iterators [default 1]") ARGS = parser.parse_args() VCF_h = gretel.process_vcf(ARGS.vcf, ARGS.contig, ARGS.start, ARGS.end) BAM_h = gretel.process_bam(VCF_h, ARGS.bam, ARGS.contig, ARGS.start, ARGS.end, ARGS.lorder, ARGS.sentinels, ARGS.threads) #print "[META] #CONTIG", ARGS.contig #print "[META] #SNPS", VCF_h["N"] #print "[META] #READS", BAM_h["N"] PATHS = [] PATH_PROBS = [] PATH_PROBS_UW = [] PATH_FALLS = [] # Spew out exciting information about the SNPs all_marginals = { "A": [], "C": [], "G": [], "T": [], "N": [], "-": [], "_": [], "total": [], } if not ARGS.quiet: print "i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot" last_rev = 0 for i in range(0, VCF_h["N"]+1): marginal = BAM_h["read_support"].get_counts_at(i) snp_rev = 0 if i > 0: snp_rev = VCF_h["snp_rev"][i-1] print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % ( i, snp_rev, snp_rev - last_rev, marginal.get("A", 0), marginal.get("C", 0), marginal.get("G", 0), marginal.get("T", 0), marginal.get("N", 0), marginal.get("-", 0), marginal.get("_", 0), marginal.get("total", 0), ) all_marginals["A"].append(marginal.get("A", 0)) all_marginals["C"].append(marginal.get("C", 0)) all_marginals["G"].append(marginal.get("G", 0)) all_marginals["T"].append(marginal.get("T", 0)) all_marginals["N"].append(marginal.get("N", 0)) all_marginals["-"].append(marginal.get("-", 0)) all_marginals["_"].append(marginal.get("_", 0)) all_marginals["total"].append( marginal.get("total", 0) ) last_rev = snp_rev # Make some genes SPINS = ARGS.paths ongoing_mag = 0 for i in range(0, SPINS): init_path, init_prob, init_min = gretel.generate_path(VCF_h["N"], BAM_h["read_support"], BAM_h["read_support_o"]) if init_path == None: break current_path = init_path MIN_REMOVE = 0.01 # 1% if init_min < MIN_REMOVE: sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE)) init_min = MIN_REMOVE rw_magnitude = gretel.reweight_hansel_from_path(BAM_h["read_support"], init_path, init_min) #TODO Horribly inefficient. if current_path in PATHS: continue else: ongoing_mag += rw_magnitude PATHS.append(current_path) PATH_PROBS.append(init_prob["weighted"]) PATH_PROBS_UW.append(init_prob["unweighted"]) PATH_FALLS.append(ongoing_mag) ongoing_mag = 0 ongoing_mag += rw_magnitude # Make some pretty pictures dirn = ARGS.out + "/" if ARGS.master: master_fa = util.load_fasta(ARGS.master) master_seq = master_fa.fetch(master_fa.references[0]) fasta_out_fh = open(dirn+"out.fasta", "w") for i, path in enumerate(PATHS): seq = list(master_seq[:]) for j, mallele in enumerate(path[1:]): snp_pos_on_master = VCF_h["snp_rev"][j] try: if mallele == "-": # It's a deletion, don't print a SNP seq[snp_pos_on_master-1] = "" else: seq[snp_pos_on_master-1] = mallele except IndexError: print path, len(seq), snp_pos_on_master-1 sys.exit(1) fasta_out_fh.write(">%d__%.2f\n" % (i, PATH_PROBS[i])) fasta_out_fh.write("%s\n" % "".join(seq[ARGS.start-1 : ARGS.end])) fasta_out_fh.close() else: fasta_out_fh = open(dirn+"out.fasta", "w") for i, path in enumerate(PATHS): fasta_out_fh.write(">%d__%.2f\n" % (i, PATH_PROBS[i])) fasta_out_fh.write("%s\n" % "".join("".join(path[1:]))) fasta_out_fh.close() #TODO datetime, n_obs, n_slices, avg_obs_len, L, n_paths, n_avg_loglik crumb_file = open(dirn+"gretel.crumbs", "w") crumb_file.write("# %d\t%d\t%d\t%.2f\t%d\t%.2f\t%.2f\t%.2f\n" % ( VCF_h["N"], BAM_h["read_support"].n_crumbs, BAM_h["read_support"].n_slices, BAM_h["meta"]["L"], BAM_h["read_support"].L, np.mean(PATH_PROBS), np.mean(PATH_PROBS_UW), np.mean(PATH_FALLS), )) for p in range(len(PATHS)): crumb_file.write("%d\t%.2f\t%.2f\t%.2f\n" % ( p, PATH_PROBS[p], PATH_PROBS_UW[p], PATH_FALLS[p] ))