diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/LongReadProgramGroup.java b/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/LongReadProgramGroup.java
new file mode 100644
index 00000000000..88946feecfb
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/LongReadProgramGroup.java
@@ -0,0 +1,16 @@
+package org.broadinstitute.hellbender.cmdline.programgroups;
+
+import org.broadinstitute.barclay.argparser.CommandLineProgramGroup;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+
+/**
+ * Program group for tools that are designed for use with long reads.
+ * @author Jonn Smith
+ */
+public final class LongReadProgramGroup implements CommandLineProgramGroup {
+    @Override
+    public String getName() { return HelpConstants.DOC_CAT_LONGREAD; }
+
+    // Fixed: previously returned DOC_CAT_LONGREAD (the group *name*) for the
+    // description as well.  Assumes HelpConstants defines DOC_CAT_LONGREAD_SUMMARY
+    // alongside DOC_CAT_LONGREAD, per the convention of the other program groups
+    // -- TODO confirm the constant name.
+    @Override
+    public String getDescription() { return HelpConstants.DOC_CAT_LONGREAD_SUMMARY; }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ComputeLongReadMetrics.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ComputeLongReadMetrics.java
new file mode 100644
index 00000000000..3255ccdd66d
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ComputeLongReadMetrics.java
@@ -0,0 +1,307 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import com.google.common.base.Joiner;
+import htsjdk.samtools.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.LongReadProgramGroup;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.ReadWalker;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.PrintStream;
+import java.util.*;
+
+/**
+ * Compute simple metrics on long read data
+ *
Input
+ *
+ * - An unaligned or aligned BAM file
+ *
+ *
+ * Outputs
+ *
+ * - A table with metrics
+ *
+ *
+ * Usage Example
+ *
+ * gatk ComputeLongReadMetrics \
+ * -I my.bam \
+ * -O metrics
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+ summary = "Compute simple metrics on long read data",
+ oneLineSummary = "Compute simple metrics on long read data",
+ programGroup = LongReadProgramGroup.class
+)
+public final class ComputeLongReadMetrics extends ReadWalker {
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="Write output to this file prefix")
+ public String OUTPUT;
+
+ @Argument(fullName = "roundLengthsToNearest", shortName="rn", doc="Round read lengths to nearest ")
+ public Integer ROUND_TO_NEAREST = 100;
+
+ private long num_reads = 0;
+ private long yield = 0;
+ private long yieldTimesPasses = 0;
+ private final Map prlHist = new TreeMap<>();
+ private final Map prlYieldHist = new TreeMap<>();
+ private final Map rlHist = new TreeMap<>();
+ private final Map rlYieldHist = new TreeMap<>();
+ private final Map zmwNumHist = new TreeMap<>();
+ private final Map npHist = new TreeMap<>();
+ private final Map rangeGapHist = new TreeMap<>();
+
+ private final List polymeraseReadLengths = new ArrayList<>();
+ private final List readLengths = new ArrayList<>();
+
+ private long lastZmw = -1;
+ private int prl = 0;
+ private int lastRangeStop = 0;
+
+ @Override
+ public void apply( GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) {
+ SAMRecord sr = read.convertToSAMRecord(this.getHeaderForReads());
+
+ if (!sr.isSecondaryOrSupplementary() || sr.getReadUnmappedFlag()) {
+ int np = sr.hasAttribute("np") ? sr.getIntegerAttribute("np") : 1;
+ int readLength = sr.getReadLength();
+ long zmw = lastZmw + 1;
+
+ int range_start = 0;
+ int range_stop = 0;
+
+ if (sr.getReadName().contains("/")) {
+ String[] pieces = sr.getReadName().split("/");
+ if (pieces.length == 3) {
+ zmw = Long.parseLong(pieces[1]);
+
+ if (pieces[2].contains("_")) {
+ String[] range = pieces[2].split("_");
+ range_start = Integer.parseInt(range[0]);
+ range_stop = Integer.parseInt(range[1]);
+ }
+ }
+ }
+
+ num_reads++;
+ yield += readLength;
+ yieldTimesPasses += (long) readLength * np;
+
+ increment(rlHist, roundToNearest(readLength, ROUND_TO_NEAREST));
+ increment(zmwNumHist, zmw);
+ increment(npHist, np);
+
+ add(rlYieldHist, roundToNearest(readLength, ROUND_TO_NEAREST), (long) readLength);
+
+ if ( lastRangeStop != 0 && lastZmw == zmw) {
+ increment(rangeGapHist, range_start - lastRangeStop);
+ }
+
+ readLengths.add(readLength);
+
+ if ( lastZmw != -1 && lastZmw != zmw) {
+ increment(prlHist, roundToNearest(prl, ROUND_TO_NEAREST));
+ add(prlYieldHist, roundToNearest(prl, ROUND_TO_NEAREST), (long) prl);
+
+ polymeraseReadLengths.add(prl);
+
+ prl = 0;
+ }
+
+ prl = (zmw == lastZmw) ? prl + readLength : readLength;
+
+ lastZmw = zmw;
+ lastRangeStop = range_stop;
+ }
+ }
+
+ private int roundToNearest(int num, int nearest) {
+ return ((num + (nearest - 1)) / nearest) * nearest;
+ }
+
+ private void increment(Map h, Long k) {
+ if (!h.containsKey(k)) {
+ h.put(k, 1L);
+ } else {
+ h.put(k, h.get(k) + 1L);
+ }
+ }
+
+ private void increment(Map h, Integer k) {
+ if (!h.containsKey(k)) {
+ h.put(k, 1L);
+ } else {
+ h.put(k, h.get(k) + 1L);
+ }
+ }
+
+ private void add(Map h, Integer k, Long v) {
+ if (!h.containsKey(k)) {
+ h.put(k, v);
+ } else {
+ h.put(k, h.get(k) + v);
+ }
+ }
+
+ private int NX(List l, int X) {
+ double sum = 0.0;
+ for (int i : l) { sum += i; }
+
+ double h = (((double) X)/100.0)*sum;
+
+ double cs = 0.0;
+ int last = l.get(0);
+ for (int i : l) {
+ cs += i;
+ if (cs > h) {
+ break;
+ }
+
+ last = i;
+ }
+
+ return last;
+ }
+
+ @Override
+ public void closeTool() {
+ readLengths.sort(Collections.reverseOrder());
+ polymeraseReadLengths.sort(Collections.reverseOrder());
+
+ final PrintStream polymeraseReadLengthCountsOut;
+ final PrintStream polymeraseReadLengthHistOut;
+ final PrintStream polymeraseReadLengthYieldHistOut;
+ final PrintStream polymeraseReadLengthNxOut;
+
+ final PrintStream readLengthCountsOut;
+ final PrintStream readLengthHistOut;
+ final PrintStream readLengthYieldHistOut;
+ final PrintStream readLengthNxOut;
+
+ final PrintStream zmwHistOut;
+ final PrintStream npHistOut;
+ final PrintStream rangeGapHistOut;
+
+ // TODO: add data for number of passes vs. read length boxplot
+
+ try {
+ polymeraseReadLengthCountsOut = new PrintStream(new FileOutputStream(OUTPUT + ".prl_counts.txt"));
+ polymeraseReadLengthHistOut = new PrintStream(new FileOutputStream(OUTPUT + ".prl_hist.txt"));
+ polymeraseReadLengthYieldHistOut = new PrintStream(new FileOutputStream(OUTPUT + ".prl_yield_hist.txt"));
+ polymeraseReadLengthNxOut = new PrintStream(new FileOutputStream(OUTPUT + ".prl_nx.txt"));
+
+ readLengthCountsOut = new PrintStream(new FileOutputStream(OUTPUT + ".rl_counts.txt"));
+ readLengthHistOut = new PrintStream(new FileOutputStream(OUTPUT + ".rl_hist.txt"));
+ readLengthYieldHistOut = new PrintStream(new FileOutputStream(OUTPUT + ".rl_yield_hist.txt"));
+ readLengthNxOut = new PrintStream(new FileOutputStream(OUTPUT + ".rl_nx.txt"));
+
+ zmwHistOut = new PrintStream(new FileOutputStream(OUTPUT + ".zmw_hist.txt"));
+ npHistOut = new PrintStream(new FileOutputStream(OUTPUT + ".np_hist.txt"));
+ rangeGapHistOut = new PrintStream(new FileOutputStream(OUTPUT + ".range_gap_hist.txt"));
+ } catch (FileNotFoundException e) {
+ throw new GATKException("Couldn't open output file.");
+ }
+
+ final Map zmw_hist = new TreeMap<>();
+ for (long zmw : zmwNumHist.keySet()) {
+ increment(zmw_hist, zmwNumHist.get(zmw));
+ }
+
+ final Double polymerase_read_length_mean = polymeraseReadLengths.stream().mapToInt(Integer::intValue).average().getAsDouble();
+
+ polymeraseReadLengthCountsOut.println(Joiner.on("\t").join( "num_polymerase_reads", "polymerase_read_yield_bp", "polymerase_read_yield_bp_times_passes", "polymerase_read_min_length_bp", "polymerase_read_max_length_bp", "polymerase_read_mean_length_bp", "polymerase_read_sd_length_bp" ));
+ writeReadStatsToFile(polymeraseReadLengthCountsOut, polymerase_read_length_mean, polymeraseReadLengths);
+
+ polymeraseReadLengthHistOut.println("polymerase_read_length\tlength_bp");
+ for (int i : prlHist.keySet()) {
+ polymeraseReadLengthHistOut.println(i + "\t" + prlHist.get(i));
+ }
+
+ polymeraseReadLengthYieldHistOut.println("polymerase_read_length\tyield_bp");
+ for (int i : prlYieldHist.keySet()) {
+ polymeraseReadLengthYieldHistOut.println(i + "\t" + prlYieldHist.get(i));
+ }
+
+ polymeraseReadLengthNxOut.println("NX\tvalue");
+ readLengthNxOut.println("NX\tvalue");
+ for (int X = 1; X < 100; X++) {
+ polymeraseReadLengthNxOut.println(X + "\t" + NX(polymeraseReadLengths, X));
+ readLengthNxOut.println(X + "\t" + NX(readLengths, X));
+ }
+
+ final Double read_length_mean = readLengths.stream().mapToInt(Integer::intValue).average().getAsDouble();
+
+ readLengthCountsOut.println(Joiner.on("\t").join( "num_reads", "read_yield_bp", "read_yield_bp_times_passes", "read_min_length_bp", "read_max_length_bp", "read_mean_length_bp", "read_sd_length_bp" ));
+ writeReadStatsToFile(readLengthCountsOut, read_length_mean, readLengths);
+
+ readLengthHistOut.println("read_length_bp\tcount");
+ for (int i : rlHist.keySet()) {
+ readLengthHistOut.println(i + "\t" + rlHist.get(i));
+ }
+
+ readLengthYieldHistOut.println("read_length\tyield_bp");
+ for (int i : rlYieldHist.keySet()) {
+ readLengthYieldHistOut.println(i + "\t" + rlYieldHist.get(i));
+ }
+
+ zmwHistOut.println("zmw_count\tcount");
+ for (long i : zmw_hist.keySet()) {
+ zmwHistOut.println(i + "\t" + zmw_hist.get(i));
+ }
+
+ npHistOut.println("np\tcount");
+ for (int i : npHist.keySet()) {
+ npHistOut.println(i + "\t" + npHist.get(i));
+ }
+
+ rangeGapHistOut.println("range_gap\tcount");
+ for (int i : rangeGapHist.keySet()) {
+ rangeGapHistOut.println(i + "\t" + rangeGapHist.get(i));
+ }
+
+ polymeraseReadLengthCountsOut.close();
+ polymeraseReadLengthHistOut.close();
+ polymeraseReadLengthYieldHistOut.close();
+ polymeraseReadLengthNxOut.close();
+ readLengthCountsOut.close();
+ readLengthHistOut.close();
+ readLengthYieldHistOut.close();
+ readLengthNxOut.close();
+ zmwHistOut.close();
+ npHistOut.close();
+ rangeGapHistOut.close();
+ }
+
+ /**
+ * Write the read length stats to a file
+ * @param outStream The output stream to which to write.
+ * @param readLengthMean The mean read length.
+ * @param readLengthList The list of read lengths.
+ */
+ private void writeReadStatsToFile(final PrintStream outStream, final Double readLengthMean, final List readLengthList) {
+ outStream.println(Joiner.on("\t").join(
+ readLengthList.size(),
+ readLengthList.stream().mapToInt(Integer::intValue).sum(),
+ yieldTimesPasses,
+ readLengthList.stream().min(Integer::compareTo).get(),
+ readLengthList.stream().max(Integer::compareTo).get(),
+ readLengthMean,
+ Math.sqrt(readLengthList.stream()
+ .map((integer) -> Math.pow(integer - readLengthMean, 2))
+ .mapToDouble(Double::doubleValue).sum() / (readLengthList.size() - 1))
+ ));
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/CountIndels.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/CountIndels.java
new file mode 100644
index 00000000000..e02ee2a4f73
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/CountIndels.java
@@ -0,0 +1,282 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import com.google.common.base.Joiner;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.LongReadProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Count indels within specified intervals and report per-read and per-interval statistics.
+ *
+ * Input
+ *
+ * - A single aligned BAM file
+ * - Intervals to examine
+ *
+ * Outputs
+ *
+ * - A per-read table and a per-interval table of insertion/deletion statistics
+ *
+ * Usage Example
+ *
+ * gatk CountIndels \
+ *   -I input.bam \
+ *   -L region.bed \
+ *   -O per_read.txt \
+ *   -OI per_interval.txt
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+    summary = "Count indels",
+    oneLineSummary = "Count indels",
+    programGroup = LongReadProgramGroup.class
+)
+public final class CountIndels extends IntervalWalker {
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+              shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+              doc="Path to which per-read information should be written")
+    public String perReadPathName;
+
+    @Argument(fullName = "outputIntervalInfo",
+              shortName = "OI",
+              doc="Path to which per-interval information should be written")
+    public String perIntervalPathName;
+
+    /** One indel observation: a single I or D cigar element from one read. */
+    private static class DataEntry {
+        private final String readName;
+        private final boolean isNegativeStrand;
+        private final CigarOperator type;
+        private final int refPos;
+        private final int alleleLength;
+        private final String allele;  // inserted bases, or "." for a deletion
+
+        public DataEntry(String readName, boolean isNegativeStrand, CigarOperator type, int refPos, int alleleLength, String allele) {
+            this.readName = readName;
+            this.isNegativeStrand = isNegativeStrand;
+            this.type = type;
+            this.refPos = refPos;
+            this.alleleLength = alleleLength;
+            this.allele = allele;
+        }
+
+        public String getReadName() { return readName; }
+        public boolean isNegativeStrand() { return isNegativeStrand; }
+        public CigarOperator getType() { return type; }
+        public int getRefPos() { return refPos; }
+        public int getAlleleLength() { return alleleLength; }
+        public String getAllele() { return allele; }
+
+        @Override
+        public String toString() {
+            return "DataEntry{" +
+                    "readName='" + readName + '\'' +
+                    ", isNegativeStrand=" + isNegativeStrand +
+                    ", type=" + type +
+                    ", refPos=" + refPos +
+                    ", alleleLength=" + alleleLength +
+                    ", allele='" + allele + '\'' +
+                    '}';
+        }
+    }
+
+    private SimpleInterval curInterval = null;
+    // Indel observations collected for the current interval (generic parameter restored).
+    private Set<DataEntry> d = null;
+
+    BufferedWriter readStatsWriter, intervalStatsWriter;
+
+    @Override
+    public void onTraversalStart() {
+        try {
+            readStatsWriter = new BufferedWriter(new FileWriter(perReadPathName));
+            readStatsWriter.write(Joiner.on("\t").join(
+                "interval",
+                "readName",
+                "isNegativeStrand",
+                "lengthOnReference",
+                "numDeletedBases",
+                "numInsertedBases",
+                "delLengths",
+                "insLengths",
+                "insAlleles"
+            ) + "\n");
+
+            intervalStatsWriter = new BufferedWriter(new FileWriter(perIntervalPathName));
+            // Columns 10-11 fixed: the header previously repeated
+            // "allInsAllelesFwd"/"allInsAllelesRev" and never named the insertion
+            // length columns that the data rows now carry.
+            intervalStatsWriter.write(Joiner.on("\t").join(
+                "interval",
+                "numReads",
+                "lengthOnReference",
+                "allNumDeletedBasesFwd",
+                "allNumDeletedBasesRev",
+                "allNumInsertedBasesFwd",
+                "allNumInsertedBasesRev",
+                "allDelLengthsFwd",
+                "allDelLengthsRev",
+                "allInsLengthsFwd",
+                "allInsLengthsRev",
+                "allInsAllelesFwd",
+                "allInsAllelesRev"
+            ) + "\n");
+        } catch (IOException e) {
+            // Preserve the cause (previously dropped).
+            throw new GATKException(e.getMessage(), e);
+        }
+    }
+
+    @Override
+    public void apply(SimpleInterval interval, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+        // Flush the previous interval's data when a new interval begins.
+        if (curInterval == null || interval.getStart() != curInterval.getStart()) {
+            if (d != null) {
+                summarizeInterval(curInterval, d);
+            }
+
+            curInterval = interval;
+            d = new HashSet<>();
+
+            logger.info("{}", interval);
+        }
+
+        for (GATKRead read : readsContext) {
+            // Convert once per read, not once per cigar element.
+            final SAMRecord sr = read.convertToSAMRecord(getHeaderForReads());
+
+            int curOffset = 1;  // 1-based read position of the current cigar element
+            for (CigarElement ce : read.getCigar()) {
+                int opOffset = curOffset;
+                int refOffset = 0;
+                if (ce.getOperator().isIndel()) {
+                    // Indels are anchored to the preceding aligned base.
+                    opOffset = curOffset - 1;
+                    if (ce.getOperator().equals(CigarOperator.D)) {
+                        refOffset = 1;
+                    }
+                }
+                int refPos = sr.getReferencePositionAtReadPosition(opOffset) + refOffset;
+
+                int alleleLength = ce.getLength();
+
+                // Keep indels longer than 3bp that fall within 10bp of the interval.
+                if (ce.getOperator().isIndel() && alleleLength > 3 && interval.overlapsWithMargin(new SimpleInterval(interval.getContig(), refPos, refPos + alleleLength), 10)) {
+                    String allele = ce.getOperator().equals(CigarOperator.D) ? "." : read.getBasesString().substring(curOffset - 1, curOffset - 1 + alleleLength);
+                    d.add(new DataEntry(read.getName(), read.isReverseStrand(), ce.getOperator(), refPos, alleleLength, allele));
+                }
+
+                curOffset += ce.getOperator().consumesReadBases() ? ce.getLength() : 0;
+            }
+        }
+    }
+
+    /** Write per-read rows and one per-interval summary row for the collected observations. */
+    private void summarizeInterval(SimpleInterval interval, Set<DataEntry> d) {
+        if ( d == null || d.isEmpty() ) {
+            return;
+        }
+
+        // Group observations by read.  (The original kept a Map<String, DataEntry>,
+        // silently dropping all but the last indel of any read with several.)
+        final Map<String, List<DataEntry>> entriesByRead = new HashMap<>();
+        for (DataEntry de : d) {
+            entriesByRead.computeIfAbsent(de.getReadName(), k -> new ArrayList<>()).add(de);
+        }
+
+        int allNumDeletedBasesFwd = 0;
+        int allNumDeletedBasesRev = 0;
+        int allNumInsertedBasesFwd = 0;
+        int allNumInsertedBasesRev = 0;
+        List<Integer> allDelLengthsFwd = new ArrayList<>();
+        List<Integer> allDelLengthsRev = new ArrayList<>();
+        List<Integer> allInsLengthsFwd = new ArrayList<>();
+        List<Integer> allInsLengthsRev = new ArrayList<>();
+        List<String> allInsAllelesFwd = new ArrayList<>();
+        List<String> allInsAllelesRev = new ArrayList<>();
+
+        for (Map.Entry<String, List<DataEntry>> entry : entriesByRead.entrySet()) {
+            final String readName = entry.getKey();
+            int numDeletedBases = 0;
+            int numInsertedBases = 0;
+            boolean isNegativeStrand = false;
+            List<Integer> delLengths = new ArrayList<>();
+            List<Integer> insLengths = new ArrayList<>();
+            List<String> insAlleles = new ArrayList<>();
+
+            // Aggregate every indel this read contributed to the interval.
+            for (DataEntry de : entry.getValue()) {
+                numDeletedBases += de.getType().equals(CigarOperator.D) ? de.getAlleleLength() : 0;
+                numInsertedBases += de.getType().equals(CigarOperator.I) ? de.getAlleleLength() : 0;
+                isNegativeStrand = de.isNegativeStrand();
+
+                if (de.getType().equals(CigarOperator.D)) {
+                    delLengths.add(de.getAlleleLength());
+                } else {
+                    insLengths.add(de.getAlleleLength());
+                    insAlleles.add(de.getAllele());
+                }
+            }
+
+            if (isNegativeStrand) {
+                allNumDeletedBasesRev += numDeletedBases;
+                allNumInsertedBasesRev += numInsertedBases;
+                allDelLengthsRev.addAll(delLengths);
+                allInsLengthsRev.addAll(insLengths);
+                allInsAllelesRev.addAll(insAlleles);
+            } else {
+                allNumDeletedBasesFwd += numDeletedBases;
+                allNumInsertedBasesFwd += numInsertedBases;
+                allDelLengthsFwd.addAll(delLengths);
+                allInsLengthsFwd.addAll(insLengths);
+                allInsAllelesFwd.addAll(insAlleles);
+            }
+
+            try {
+                readStatsWriter.write(Joiner.on("\t").join(
+                    interval,
+                    readName,
+                    isNegativeStrand ? "-" : "+",
+                    interval.getLengthOnReference(),
+                    numDeletedBases,
+                    numInsertedBases,
+                    Joiner.on(",").join(delLengths),
+                    Joiner.on(",").join(insLengths),
+                    !insAlleles.isEmpty() ? Joiner.on(",").join(insAlleles) : "."
+                ) + "\n");
+            } catch (IOException e) {
+                throw new GATKException(e.getMessage(), e);
+            }
+        }
+
+        try {
+            // Columns 10-11 fixed to carry insertion lengths (previously the
+            // insertion alleles were written twice and the lengths never output).
+            intervalStatsWriter.write(Joiner.on("\t").join(
+                interval,
+                entriesByRead.size(),
+                interval.getLengthOnReference(),
+                allNumDeletedBasesFwd,
+                allNumDeletedBasesRev,
+                allNumInsertedBasesFwd,
+                allNumInsertedBasesRev,
+                Joiner.on(",").join(allDelLengthsFwd),
+                Joiner.on(",").join(allDelLengthsRev),
+                Joiner.on(",").join(allInsLengthsFwd),
+                Joiner.on(",").join(allInsLengthsRev),
+                !allInsAllelesFwd.isEmpty() ? Joiner.on(",").join(allInsAllelesFwd) : ".",
+                !allInsAllelesRev.isEmpty() ? Joiner.on(",").join(allInsAllelesRev) : "."
+            ) + "\n");
+        } catch (IOException e) {
+            throw new GATKException(e.getMessage(), e);
+        }
+    }
+
+    @Override
+    public void closeTool() {
+        // Emit stats for the final interval; summarizeInterval tolerates a null
+        // collection, so this is safe even when no intervals were traversed.
+        summarizeInterval(curInterval, d);
+
+        try {
+            if (readStatsWriter != null) { readStatsWriter.close(); }
+            if (intervalStatsWriter != null) { intervalStatsWriter.close(); }
+        } catch (IOException e) {
+            throw new GATKException(e.getMessage(), e);
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/QuicklyCountErrors.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/QuicklyCountErrors.java
new file mode 100644
index 00000000000..f61f4545680
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/QuicklyCountErrors.java
@@ -0,0 +1,93 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.LongReadProgramGroup;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.ReadWalker;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+
+/**
+ * Quickly count errors
+ *
+ * Input
+ *
+ * - A single BAM file
+ *
+ *
+ * Outputs
+ *
+ * - A collection of BAM files where care has been keep reads from the same ZMW in the same file
+ *
+ *
+ * Usage Example
+ * Quickly count errors
+ *
+ * gatk QuicklyCountErrors \
+ * -I input.bam \
+ * -O output.txt
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+ summary = "Quickly count errors",
+ oneLineSummary = "Quickly count errors",
+ programGroup = LongReadProgramGroup.class
+)
+public final class QuicklyCountErrors extends ReadWalker {
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ doc="Path to which variants should be written")
+ public String outPathName;
+ //final Path outPath = IOUtils.getPath(outPathName);
+
+ private long numMismatches = 0;
+ private long numInsertions = 0;
+ private long numDeletions = 0;
+ private long numBases = 0;
+
+ @Override
+ public void apply( GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) {
+ boolean xseen = false;
+ for (CigarElement ce : read.getCigarElements()) {
+ if (ce.getOperator().equals(CigarOperator.X)) {
+ xseen = true;
+ numMismatches++;
+ } else if (ce.getOperator().equals(CigarOperator.D)) {
+ numDeletions += ce.getLength();
+ } else if (ce.getOperator().equals(CigarOperator.I)) {
+ numInsertions += ce.getLength();
+ }
+ }
+
+ if (!xseen && read.hasAttribute("NM")) {
+ numMismatches += read.getAttributeAsInteger("NM");
+ }
+
+ numBases += read.getLength();
+ }
+
+ @Override
+ public void closeTool() {
+ try ( final PrintStream out = new PrintStream(outPathName) ) {
+ out.println(String.join(" ", "numMismatches", "numDeletions", "numInsertions", "numBases"));
+ out.println(String.join(" ", Long.valueOf(numMismatches).toString(), Long.valueOf(numDeletions).toString(), Long.valueOf(numInsertions).toString(), Long.valueOf(numBases).toString()));
+ } catch (final FileNotFoundException e) {
+ throw new UserException("Cannot open given outfile: " + outPathName, e);
+ }
+
+ logger.info("numMismatches: " + numMismatches + " numDeletions: " + numDeletions + " numInsertions: " + numInsertions + " numBases:" + numBases);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/QuicklyCountMismatches.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/QuicklyCountMismatches.java
new file mode 100644
index 00000000000..71d6f001838
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/QuicklyCountMismatches.java
@@ -0,0 +1,138 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.utils.pileup.PileupElement;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+import org.broadinstitute.hellbender.exceptions.UserException;
+
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.*;
+
+/**
+ * Quickly count errors
+ *
+ * Input
+ *
+ * - A single BAM file
+ *
+ *
+ * Outputs
+ *
+ * - A collection of BAM files where care has been keep reads from the same ZMW in the same file
+ *
+ *
+ * Usage Example
+ * Quickly count errors
+ *
+ * gatk QuicklyCountMismatches \
+ * -I input.bam \
+ * -O output.txt
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+ summary = "Quickly count errors",
+ oneLineSummary = "Quickly count errors",
+ programGroup = ReadDataManipulationProgramGroup.class
+)
+public final class QuicklyCountMismatches extends LocusWalker {
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ doc="Path to which variants should be written")
+ public String outPathName;
+
+ @Argument(fullName = "insHist",
+ shortName = "insHist",
+ doc="Path to which variants should be written")
+ public String insPathName;
+
+ @Argument(fullName = "delHist",
+ shortName = "delHist",
+ doc="Path to which variants should be written")
+ public String delPathName;
+
+ private final Map numMismatchesMap = new TreeMap<>();
+ private final Map numInsertionsMap = new TreeMap<>();
+ private final Map numDeletionsMap = new TreeMap<>();
+ private final Map numBasesMap = new TreeMap<>();
+
+ private final Map insHist = new TreeMap<>();
+ private final Map delHist = new TreeMap<>();
+
+ @Override
+ public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+ for (PileupElement pe : alignmentContext.getBasePileup()) {
+ int npr = (pe.getRead().hasAttribute("np")) ? pe.getRead().getAttributeAsInteger("np") : 0;
+
+ for (int np : Arrays.asList(-1, npr)) {
+ if (!numMismatchesMap.containsKey(np)) { numMismatchesMap.put(np, 0L); }
+ if (!numInsertionsMap.containsKey(np)) { numInsertionsMap.put(np, 0L); }
+ if (!numDeletionsMap.containsKey(np)) { numDeletionsMap.put(np, 0L); }
+ if (!numBasesMap.containsKey(np)) { numBasesMap.put(np, 0L); }
+
+ if (pe.isDeletion() && pe.atStartOfCurrentCigar()) {
+ int len = pe.getCurrentCigarElement().getLength();
+ if (!delHist.containsKey(len)) { delHist.put(len, 0); }
+ delHist.put(len, delHist.get(len) + 1);
+
+ if (len > 1) {
+ numDeletionsMap.put(np, numDeletionsMap.get(np) + 1L);
+ }
+
+ //logger.info("{} {} {} {} {}", pe.getRead(), pe.getCurrentCigarElement(), pe.getCurrentCigarOffset(), pe.getOffsetInCurrentCigar(), pe);
+ } else if (pe.isBeforeInsertion() && pe.getBasesOfImmediatelyFollowingInsertion() != null) {
+ int len = pe.getBasesOfImmediatelyFollowingInsertion().length();
+
+ if (!insHist.containsKey(len)) { insHist.put(len, 0); }
+ insHist.put(len, insHist.get(len) + 1);
+
+ if (len > 1) {
+ numInsertionsMap.put(np, numInsertionsMap.get(np) + 1L);
+ }
+ } else {
+ if (pe.getBase() != referenceContext.getBase()) {
+ numMismatchesMap.put(np, numMismatchesMap.get(np) + 1L);
+ }
+ }
+
+ numBasesMap.put(np, numBasesMap.get(np) + 1L);
+ }
+ }
+ }
+
+ @Override
+ public void closeTool() {
+ try (final PrintStream out = new PrintStream(outPathName)) {
+
+ for ( final int np : numMismatchesMap.keySet() ) {
+ out.println(String.join(" ", numMismatchesMap.get(np).toString(), numInsertionsMap.get(np).toString(), numDeletionsMap.get(np).toString(), numBasesMap.get(np).toString()));
+ }
+ } catch (final FileNotFoundException e) {
+ throw new UserException("Cannot open given outfile: " + outPathName, e);
+ }
+
+ try (PrintStream outIns = new PrintStream(insPathName)) {
+ for ( final int len : insHist.keySet() ) {
+ outIns.println(String.join(" ", Integer.valueOf(len).toString(), insHist.get(len).toString()));
+ }
+ } catch (final FileNotFoundException e) {
+ throw new UserException("Cannot open given outfile: " + insPathName, e);
+ }
+
+ try (final PrintStream outDel = new PrintStream(delPathName)) {
+ for ( final int len : delHist.keySet() ) {
+ outDel.println(String.join(" ", Integer.valueOf(len).toString(), delHist.get(len).toString()));
+ }
+ } catch (final FileNotFoundException e) {
+ throw new UserException("Cannot open given outfile: " + delPathName, e);
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/RecoverUncorrectedReads.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/RecoverUncorrectedReads.java
new file mode 100644
index 00000000000..fef857a9d4b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/RecoverUncorrectedReads.java
@@ -0,0 +1,116 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import htsjdk.samtools.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.engine.ReadWalker;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Recover uncorrected reads
+ *
+ * Input
+ *
+ * - The uncorrected (subread) BAM file
+ * - The CCS-corrected BAM file
+ *
+ * Outputs
+ *
+ * - A BAM file of remaining subreads that were not corrected by the CCS step
+ *
+ * Usage Example
+ *
+ * gatk RecoverUncorrectedReads \
+ *   -I input.bam \
+ *   -C ccs.bam \
+ *   -O output.bam
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+    summary = "Recover uncorrected reads",
+    oneLineSummary = "Recover uncorrected reads",
+    programGroup = ReadDataManipulationProgramGroup.class
+)
+public final class RecoverUncorrectedReads extends ReadWalker {
+    @Argument(fullName = "corrected",
+              shortName = "C",
+              doc="CCS-corrected reads")
+    public String corrected;
+
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+              shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+              doc="Write output to this file")
+    public String output;
+    private SAMFileGATKReadWriter outputWriter;
+
+    // "movie/zmw" prefixes of every read in the corrected BAM (generic parameter restored).
+    private final Set<String> correctedReadNames = new HashSet<>();
+    private int uncorrectedReads = 0;
+
+    @Override
+    public void onTraversalStart() {
+        final SamReaderFactory srf = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
+
+        // Collect the movie/zmw prefix of every corrected read.
+        try (final SamReader srs = srf.open(IOUtils.getPath(corrected))) {
+            for ( final SAMRecord sr : srs ) {
+                final String[] name = sr.getReadName().split("/");
+                // Guard against names without a movie/zmw structure (previously
+                // threw ArrayIndexOutOfBoundsException; apply() already guarded).
+                if (name.length >= 2) {
+                    correctedReadNames.add(name[0] + "/" + name[1]);
+                } else {
+                    correctedReadNames.add(sr.getReadName());
+                }
+            }
+        } catch (final IOException e) {
+            throw new UserException("Unable to open path: " + corrected, e);
+        }
+
+        outputWriter = createSAMWriter(new GATKPath(IOUtils.getPath(output).toUri().toString()), true);
+    }
+
+    @Override
+    public void apply( GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) {
+        final String[] name = read.getName().split("/");
+        if (name.length >= 2) {
+            final String readZmwName = name[0] + "/" + name[1];
+            if (!correctedReadNames.contains(readZmwName)) {
+                // This ZMW produced no corrected read; keep the subread.
+                outputWriter.addRead(read);
+                uncorrectedReads++;
+
+                logger.debug("uncorrected: {}", readZmwName);
+            } else {
+                logger.debug("corrected: {}", readZmwName);
+            }
+        } else {
+            // No ZMW component in the name; keep the read.
+            outputWriter.addRead(read);
+            uncorrectedReads++;
+
+            logger.debug("uncorrected: {}", read.getName());
+        }
+    }
+
+    @Override
+    public void closeTool() {
+        logger.info("corrected: {} ; uncorrected: {}", correctedReadNames.size(), uncorrectedReads);
+
+        if ( outputWriter != null ) {
+            outputWriter.close();
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/RepairLongReadBam.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/RepairLongReadBam.java
new file mode 100644
index 00000000000..c7052e140a8
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/RepairLongReadBam.java
@@ -0,0 +1,131 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import htsjdk.samtools.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.ReadWalker;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Restore annotations from unaligned BAM files that are discarded during conversion by samtools fastq
+ *
+ * Input
+ *
+ * - An unaligned BAM file
+ * - An aligned BAM file
+ *
+ *
+ * Outputs
+ *
+ * - An aligned BAM file with annotations restored
+ *
+ *
+ * Usage Example
+ * Restore annotations from unaligned BAM files that may be discarded during conversion by samtools fastq
+ *
+ * gatk RepairLongReadBam \
+ * -I unaligned.bam \
+ * -A aligned.bam \
+ * -O restored.bam
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+ summary = "Restore annotations from unaligned BAM files that are discarded during conversion by samtools fastq",
+ oneLineSummary = "Restore annotations from unaligned BAM files that are discarded during conversion by samtools fastq",
+ programGroup = ReadDataManipulationProgramGroup.class
+)
+public final class RepairLongReadBam extends ReadWalker {
+ @Argument(fullName = "aligned", shortName = "A", doc="aligned reads")
+ public String aligned;
+
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="Write output to this file")
+ public String output;
+
+ @Argument(fullName = "sort", shortName = "s", doc="Sort output", optional = true)
+ public boolean sort = false;
+
+ private SAMFileWriter writer;
+ private SAMRecordIterator it;
+ private SAMRecord currentAlignedRead;
+
+ @Override
+ public void onTraversalStart() {
+ if ( readArguments.getReadPathSpecifiers().size() != 1 ) {
+ throw new UserException("Specify a single unaligned BAM file to -I and a single aligned BAM file to -A");
+ }
+
+ SamReaderFactory srf = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
+ SamReader srs = srf.open(IOUtils.getPath(aligned));
+
+ if (getHeaderForReads() != null && getHeaderForReads().getReadGroups() != null && getHeaderForReads().getReadGroups().size() != 1) {
+ throw new UserException("One (and only one) read group per aligned/unaligned BAM file required");
+ }
+
+ it = srs.iterator();
+ currentAlignedRead = it.hasNext() ? it.next() : null;
+
+ writer = createWriter(output, srs.getFileHeader(), srs.getFileHeader().getSequenceDictionary(), sort);
+ }
+
+ @Override
+ public void apply( GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) {
+ SAMRecord unalignedRead = read.convertToSAMRecord(this.getHeaderForReads());
+
+ do {
+ if (currentAlignedRead != null) {
+ if (!unalignedRead.getReadName().equals(currentAlignedRead.getReadName())) {
+ throw new UserException(
+ "Expected aligned read to have name '" +
+ unalignedRead.getReadName() +
+ "', but saw '" +
+ currentAlignedRead.getReadName() +
+ "'. Are these unaligned and aligned versions of exactly the same data?"
+ );
+ } else {
+ List attrs = unalignedRead.getAttributes();
+ attrs.addAll(currentAlignedRead.getAttributes());
+
+ for (SAMRecord.SAMTagAndValue tv : attrs) {
+ currentAlignedRead.setAttribute(tv.tag, tv.value);
+ }
+
+ writer.addAlignment(currentAlignedRead);
+ }
+ }
+
+ currentAlignedRead = it.hasNext() ? it.next() : null;
+ } while (currentAlignedRead != null && unalignedRead.getReadName().equals(currentAlignedRead.getReadName()));
+ }
+
+ @Override
+ public void closeTool() {
+ if ( writer != null ) {
+ writer.close();
+ }
+ }
+
+ /**
+ * Creates SAMFileWriter
+ * @return A SAMFileWriter.
+ */
+ private SAMFileWriter createWriter(String out, SAMFileHeader sfh, SAMSequenceDictionary ssd, boolean sortOutput) {
+ sfh.setSortOrder(sortOutput ? SAMFileHeader.SortOrder.coordinate : SAMFileHeader.SortOrder.unsorted);
+ sfh.setSequenceDictionary(ssd);
+
+ return new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, !sortOutput, new File(out));
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ReplaceQuals.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ReplaceQuals.java
new file mode 100644
index 00000000000..67b27b2b212
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ReplaceQuals.java
@@ -0,0 +1,72 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+
+import java.util.Arrays;
+
+/**
+ * Replace all base qualities in a BAM with a fixed value
+ *
+ * Input
+ *
+ * - A single BAM file
+ *
+ *
+ * Outputs
+ *
+ * - A BAM file in which every base quality has been replaced with a fixed value
+ *
+ *
+ * Usage Example
+ * Replace all base qualities with a fixed value
+ *
+ * gatk ReplaceQuals \
+ *   -I input.bam \
+ *   -O output.bam
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+ summary = "Quickly replace read quals with fixed value (@).",
+ oneLineSummary = "Quickly replace read quals with fixed value (@).",
+ programGroup = ReadDataManipulationProgramGroup.class
+)
+public final class ReplaceQuals extends ReadWalker {
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+            doc="Write output to this file")
+    public String output;
+    private SAMFileGATKReadWriter outputWriter;
+
+    @Override
+    public void onTraversalStart() {
+        outputWriter = createSAMWriter(new GATKPath(IOUtils.getPath(output).toUri().toString()), true);
+    }
+
+    /**
+     * Overwrites every base quality of {@code read} with a fixed phred score of 40 and writes
+     * the modified read to the output.
+     *
+     * NOTE(review): the tool summary advertises '@' (which would be phred 31 in FASTQ encoding),
+     * but the code assigns phred 40 ('I') — confirm which is intended.
+     */
+    @Override
+    public void apply(GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) {
+        // getLength() avoids copying the base array (getBases()) just to read its length.
+        final byte[] quals = new byte[read.getLength()];
+        Arrays.fill(quals, (byte) 40);
+        read.setBaseQualities(quals);
+
+        outputWriter.addRead(read);
+    }
+
+    @Override
+    public void closeTool() {
+        if ( outputWriter != null ) {
+            outputWriter.close();
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ShardLongReads.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ShardLongReads.java
new file mode 100644
index 00000000000..7dd31ef9ebd
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/longreads/ShardLongReads.java
@@ -0,0 +1,127 @@
+package org.broadinstitute.hellbender.tools.walkers.longreads;
+
+import htsjdk.samtools.util.IOUtil;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.LongReadProgramGroup;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.ReadWalker;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
+
+import java.io.File;
+
+/**
+ * Shards long read BAMs, keeping reads from the same ZMW in the same file if ZMW information is present
+ *
+ * Input
+ *
+ * - A single BAM file
+ *
+ *
+ * Outputs
+ *
+ * - A collection of BAM files where care has been taken to keep reads from the same ZMW in the same file
+ *
+ *
+ * Usage Example
+ * Split reads in BAM file by sample name, read group and library name
+ *
+ * gatk ShardLongReads \
+ * -I input.bam \
+ * -O outputDirectory \
+ * --num-reads-per-split 10000
+ *
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+ summary = "Shards long read BAMs and keep subreads from the same ZMW in the same file if ZMW information is present",
+ oneLineSummary = "Shards long read BAMs and keep subreads from the same ZMW in the same file if ZMW information is present",
+ programGroup = LongReadProgramGroup.class
+)
+public final class ShardLongReads extends ReadWalker {
+ public static final String READS_PER_SPLIT_LONG_NAME = "num-reads-per-split";
+ public static final String READS_PER_SPLIT_SHORT_NAME = "nr";
+
+ @Argument(
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ doc = "The directory to output SAM/BAM/CRAM files."
+ )
+ public File OUTPUT_DIRECTORY = new File("");
+
+ @Argument(
+ fullName = READS_PER_SPLIT_LONG_NAME,
+ shortName = READS_PER_SPLIT_SHORT_NAME,
+ doc = "Approximate number of reads per split"
+ )
+ public int NUM_READS_PER_SPLIT = 10000;
+
+ private SAMFileGATKReadWriter writer;
+ private int shardIndex = 0;
+
+ private GATKRead lastRead = null;
+ private int numWrittenToShard = 0;
+
+ @Override
+ public void onTraversalStart() {
+ IOUtil.assertDirectoryIsWritable(OUTPUT_DIRECTORY);
+ if ( readArguments.getReadPathSpecifiers().size() != 1 ) {
+ throw new UserException("This tool only accepts a single SAM/BAM/CRAM as input");
+ }
+ writer = createWriter(shardIndex);
+ }
+
+ @Override
+ public void apply( GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) {
+ if (numWrittenToShard < NUM_READS_PER_SPLIT) {
+ writer.addRead(read);
+ } else {
+ int lastZmw = lastRead.hasAttribute("zm") ? lastRead.getAttributeAsInteger("zm") : 0;
+ int thisZmw = read.hasAttribute("zm") ? read.getAttributeAsInteger("zm") : 1;
+
+ if (lastZmw != thisZmw) {
+ writer.close();
+
+ shardIndex++;
+ numWrittenToShard = 0;
+ writer = createWriter(shardIndex);
+ }
+
+ writer.addRead(read);
+ }
+
+ lastRead = read;
+ numWrittenToShard++;
+ }
+
+ @Override
+ public void closeTool() {
+ if ( writer != null ) {
+ writer.close();
+ }
+ }
+
+ /**
+ * Creates SAMFileWriter instances for the read shards
+ * @param shardIndex index of shard
+ * @return A SAMFileWriter.
+ */
+ private SAMFileGATKReadWriter createWriter(final int shardIndex) {
+ final String base = readArguments.getReadPathSpecifiers().get(0).getBaseName().get();
+ final String extension = readArguments.getReadPathSpecifiers().get(0).getExtension().get();
+ final String key = String.format("%06d", shardIndex);
+
+ final GATKPath outFile = new GATKPath(IOUtil.toPath(new File(OUTPUT_DIRECTORY, base + "." + key + "." + extension)).toUri().toString());
+
+ return createSAMWriter(outFile, true);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/help/HelpConstants.java b/src/main/java/org/broadinstitute/hellbender/utils/help/HelpConstants.java
index e4a82809eee..4ede9572a4c 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/help/HelpConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/help/HelpConstants.java
@@ -43,6 +43,10 @@ public static String forumArticle(String article) {
public final static String DOC_CAT_CNV = "Copy Number Variant Discovery";
public final static String DOC_CAT_CNV_SUMMARY = "Tools that analyze read coverage to detect copy number variants.";
+ public final static String DOC_CAT_LONGREAD = "Long Read Tools";
+ public final static String DOC_CAT_LONGREAD_SUMMARY = "Tools that are designed for use with long reads.";
+
+
public final static String DOC_CAT_EXAMPLE = "Example Tools";
public final static String DOC_CAT_EXAMPLE_SUMMARY = "Example tools that show developers how to implement new tools";