use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.
the class ReadsSparkSource method getHeader.
/**
 * Loads the header using Hadoop-BAM.
 * @param filePath path to the bam.
 * @param referencePath Reference path or null if not available. Reference is required for CRAM files.
 * @return the header for the bam.
 */
public SAMFileHeader getHeader(final String filePath, final String referencePath) {
    // GCS case
    if (BucketUtils.isCloudStorageUrl(filePath)) {
        try (ReadsDataSource readsDataSource = new ReadsDataSource(IOUtils.getPath(filePath))) {
            return readsDataSource.getHeader();
        }
    }
    // local file or HDFS case
    try {
        Path path = new Path(filePath);
        FileSystem fs = path.getFileSystem(ctx.hadoopConfiguration());
        if (fs.isDirectory(path)) {
            FileStatus[] bamFiles = fs.listStatus(path, new PathFilter() {
                private static final long serialVersionUID = 1L;

                @Override
                public boolean accept(Path path) {
                    return path.getName().startsWith(HADOOP_PART_PREFIX);
                }
            });
            if (bamFiles.length == 0) {
                throw new UserException("No BAM files to load header from in: " + path);
            }
            // Hadoop-BAM writes the same header to each shard, so use the first one
            path = bamFiles[0].getPath();
        }
        setHadoopBAMConfigurationProperties(filePath, referencePath);
        return SAMHeaderReader.readSAMHeaderFrom(path, ctx.hadoopConfiguration());
    } catch (IOException | IllegalArgumentException e) {
        throw new UserException("Failed to read bam header from " + filePath + "\n Caused by:" + e.getMessage(), e);
    }
}
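For context: the GCS branch reads the header directly through ReadsDataSource, while the Hadoop branch resolves a sharded output directory to its first part file before delegating to Hadoop-BAM's SAMHeaderReader. A minimal sketch of calling this method follows; the Spark context setup, import paths, and file URLs are illustrative assumptions, not taken from the GATK source.

import htsjdk.samtools.SAMFileHeader;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.engine.spark.SparkContextFactory;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;

public final class GetHeaderSketch {
    public static void main(String[] args) {
        // Assumed way to obtain a local Spark context; any configured JavaSparkContext works.
        final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
        final ReadsSparkSource readsSource = new ReadsSparkSource(ctx);
        // Plain BAM: no reference is needed, so referencePath may be null (hypothetical path).
        final SAMFileHeader bamHeader = readsSource.getHeader("hdfs://namenode/data/sample.bam", null);
        // CRAM: a reference is required for decoding, so its path must be supplied (hypothetical paths).
        final SAMFileHeader cramHeader = readsSource.getHeader("gs://bucket/sample.cram", "gs://bucket/ref.fasta");
        System.out.println(bamHeader.getSortOrder() + " / " + cramHeader.getSortOrder());
    }
}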
use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.
the class HaplotypeCallerIntegrationTest method testBamoutProducesReasonablySizedOutput.
@Test
public void testBamoutProducesReasonablySizedOutput() {
    Utils.resetRandomGenerator();
    // We will test that when running with -bamout over the testInterval, we produce
    // a bam with a number of reads that is within 10% of what GATK3.5 produces with
    // -bamout over the same interval. This is just to test that we produce a reasonably-sized
    // bam for the region, not to validate the haplotypes, etc. We don't want
    // this test to fail unless there is a likely problem with -bamout itself (e.g., an empty
    // or truncated bam).
    final String testInterval = "20:10000000-10010000";
    final int gatk3BamoutNumReads = 5170;
    final File vcfOutput = createTempFile("testBamoutProducesReasonablySizedOutput", ".vcf");
    final File bamOutput = createTempFile("testBamoutProducesReasonablySizedOutput", ".bam");
    final String[] args = {
        "-I", NA12878_20_21_WGS_bam,
        "-R", b37_reference_20_21,
        "-L", testInterval,
        "-O", vcfOutput.getAbsolutePath(),
        "-bamout", bamOutput.getAbsolutePath(),
        "-pairHMM", "AVX_LOGLESS_CACHING",
        "-stand_call_conf", "30.0"
    };
    runCommandLine(args);
    try (final ReadsDataSource bamOutReadsSource = new ReadsDataSource(bamOutput.toPath())) {
        int actualBamoutNumReads = 0;
        for (final GATKRead read : bamOutReadsSource) {
            ++actualBamoutNumReads;
        }
        final int readCountDifference = Math.abs(actualBamoutNumReads - gatk3BamoutNumReads);
        Assert.assertTrue(((double) readCountDifference / gatk3BamoutNumReads) < 0.10,
                "-bamout produced a bam with over 10% fewer/more reads than expected");
    }
}
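Several of the usages on this page repeat the same idiom: open a BAM with ReadsDataSource in a try-with-resources block and count the GATKReads it yields. A small hypothetical helper capturing that pattern, built only from calls already shown in these snippets (the class and method names are ours, not GATK's):

import java.nio.file.Path;
import org.broadinstitute.hellbender.engine.ReadsDataSource;
import org.broadinstitute.hellbender.utils.read.GATKRead;

public final class ReadCounting {
    // Hypothetical helper: counts the reads in a BAM by iterating a ReadsDataSource.
    // try-with-resources guarantees the underlying reader is closed.
    public static int countReads(final Path bam) {
        int count = 0;
        try (final ReadsDataSource source = new ReadsDataSource(bam)) {
            for (final GATKRead read : source) {
                ++count;
            }
        }
        return count;
    }
}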
use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.
the class ReadCoordinateComparatorUnitTest method testComparatorOrderingMatchesHtsjdkFileOrdering.
/**
 * Tests that the ordering produced by ReadCoordinateComparator matches coordinate file ordering
 * as produced by htsjdk's {@link SAMRecordCoordinateComparator} for a representative selection of reads. Ignores
 * differences in tie-breaking done for reads with the same position -- just asserts that the reads are
 * coordinate-sorted according to htsjdk, including unmapped reads with and without an assigned position.
 */
@Test
public void testComparatorOrderingMatchesHtsjdkFileOrdering() throws IOException {
    // This file has unmapped reads that are set to the position of their mates, as well as
    // unmapped reads with no position -- the ordering check in the test below will fail if
    // our ordering of these reads relative to the mapped reads is not consistent with the
    // definition of coordinate sorting as defined in htsjdk.samtools.SAMRecordCoordinateComparator
    final String inputBam = publicTestDir + "org/broadinstitute/hellbender/utils/read/comparator_test_with_unmapped.bam";
    final List<GATKRead> reads = new ArrayList<>();
    SAMFileHeader header = null;
    try (final ReadsDataSource readsSource = new ReadsDataSource(IOUtils.getPath(inputBam))) {
        header = readsSource.getHeader();
        for (GATKRead read : readsSource) {
            reads.add(read);
        }
    }
    // Randomize ordering and then re-sort with the ReadCoordinateComparator
    Collections.shuffle(reads);
    Collections.sort(reads, new ReadCoordinateComparator(header));
    // Check ordering using SAMRecordCoordinateComparator.fileOrderCompare() instead of the full comparator,
    // to ignore meaningless differences in tie-breaking rules for reads with the same position.
    final SAMRecordCoordinateComparator samComparator = new SAMRecordCoordinateComparator();
    GATKRead previousRead = null;
    for (final GATKRead currentRead : reads) {
        if (previousRead != null) {
            Assert.assertTrue(samComparator.fileOrderCompare(previousRead.convertToSAMRecord(header), currentRead.convertToSAMRecord(header)) <= 0,
                    "Reads are out of order: " + previousRead + " and " + currentRead);
        }
        previousRead = currentRead;
    }
}
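The distinction the test relies on is that fileOrderCompare() considers only reference index and alignment start, while the full compare() additionally applies tie-breakers for reads at the same position. A hedged sketch of that pairwise check, assuming two GATKReads and a header are already in hand (the helper name is ours):

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecordCoordinateComparator;
import org.broadinstitute.hellbender.utils.read.GATKRead;

public final class CoordinateOrderCheck {
    // Hypothetical helper: true if 'first' does not come after 'second' in coordinate
    // file order, deliberately ignoring same-position tie-breaking rules.
    public static boolean inFileOrder(final GATKRead first, final GATKRead second, final SAMFileHeader header) {
        final SAMRecordCoordinateComparator samComparator = new SAMRecordCoordinateComparator();
        // Convert to SAMRecords because the htsjdk comparator operates on SAMRecord, not GATKRead.
        return samComparator.fileOrderCompare(first.convertToSAMRecord(header), second.convertToSAMRecord(header)) <= 0;
    }
}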
use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.
the class ConvertHeaderlessHadoopBamShardToBamIntegrationTest method testConvertHeaderlessHadoopBamShardToBam.
@Test
public void testConvertHeaderlessHadoopBamShardToBam() {
    final File bamShard = new File(publicTestDir + "org/broadinstitute/hellbender/utils/spark/reads_data_source_test1.bam.headerless.part-r-00000");
    final File output = createTempFile("testConvertHeaderlessHadoopBamShardToBam", ".bam");
    final File headerSource = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
    final int expectedReadCount = 11;
    List<String> args = Arrays.asList(
        "--bamShard", bamShard.getAbsolutePath(),
        "--bamWithHeader", headerSource.getAbsolutePath(),
        "-O", output.getAbsolutePath());
    runCommandLine(args);
    int actualCount = 0;
    try (final ReadsDataSource readsSource = new ReadsDataSource(output.toPath())) {
        for (final GATKRead read : readsSource) {
            ++actualCount;
        }
    }
    Assert.assertEquals(actualCount, expectedReadCount, "Wrong number of reads in final BAM file");
}
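A natural companion check, not part of the test above, would be to confirm that the converted shard actually picked up the donor's header. A sketch using only the ReadsDataSource calls already shown in this section (class and method names are ours):

import java.io.File;
import org.broadinstitute.hellbender.engine.ReadsDataSource;

public final class HeaderCheckSketch {
    // Hypothetical extra assertion: the converted BAM should carry the
    // sequence dictionary of the header-donor BAM.
    public static boolean sameDictionary(final File convertedBam, final File headerSource) {
        try (final ReadsDataSource converted = new ReadsDataSource(convertedBam.toPath());
             final ReadsDataSource donor = new ReadsDataSource(headerSource.toPath())) {
            return converted.getHeader().getSequenceDictionary()
                    .equals(donor.getHeader().getSequenceDictionary());
        }
    }
}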
use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.
the class MarkDuplicatesSparkIntegrationTest method testMarkDuplicatesSparkIntegrationTestLocal.
@Test(groups = "spark", dataProvider = "md")
public void testMarkDuplicatesSparkIntegrationTestLocal(final File input, final long totalExpected, final long dupsExpected, Map<String, List<String>> metricsExpected) throws IOException {
    ArgumentsBuilder args = new ArgumentsBuilder();
    args.add("--" + StandardArgumentDefinitions.INPUT_LONG_NAME);
    args.add(input.getPath());
    args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME);
    File outputFile = createTempFile("markdups", ".bam");
    outputFile.delete();
    args.add(outputFile.getAbsolutePath());
    args.add("--METRICS_FILE");
    File metricsFile = createTempFile("markdups_metrics", ".txt");
    args.add(metricsFile.getAbsolutePath());
    runCommandLine(args.getArgsArray());
    Assert.assertTrue(outputFile.exists(), "Can't find expected MarkDuplicates output file at " + outputFile.getAbsolutePath());
    int totalReads = 0;
    int duplicateReads = 0;
    try (final ReadsDataSource outputReads = new ReadsDataSource(outputFile.toPath())) {
        for (GATKRead read : outputReads) {
            ++totalReads;
            if (read.isDuplicate()) {
                ++duplicateReads;
            }
        }
    }
    Assert.assertEquals(totalReads, totalExpected, "Wrong number of reads in output BAM");
    Assert.assertEquals(duplicateReads, dupsExpected, "Wrong number of duplicate reads in output BAM");
    final MetricsFile<DuplicationMetrics, Comparable<?>> metricsOutput = new MetricsFile<>();
    try {
        metricsOutput.read(new FileReader(metricsFile));
    } catch (final FileNotFoundException ex) {
        System.err.println("Metrics file not found: " + ex);
    }
    final List<DuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream()
            .filter(metric -> metric.UNPAIRED_READS_EXAMINED != 0L ||
                    metric.READ_PAIRS_EXAMINED != 0L ||
                    metric.UNMAPPED_READS != 0L ||
                    metric.UNPAIRED_READ_DUPLICATES != 0L ||
                    metric.READ_PAIR_DUPLICATES != 0L ||
                    metric.READ_PAIR_OPTICAL_DUPLICATES != 0L ||
                    (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) ||
                    (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L))
            .collect(Collectors.toList());
    Assert.assertEquals(nonEmptyMetrics.size(), metricsExpected.size(), "Wrong number of metrics with non-zero fields.");
    for (int i = 0; i < nonEmptyMetrics.size(); i++) {
        final DuplicationMetrics observedMetrics = nonEmptyMetrics.get(i);
        List<?> expectedList = metricsExpected.get(observedMetrics.LIBRARY);
        Assert.assertNotNull(expectedList, "Unexpected library found: " + observedMetrics.LIBRARY);
        Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedList.get(0));
        Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedList.get(1));
        Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedList.get(2));
        Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedList.get(3));
        Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedList.get(4));
        Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedList.get(5));
        Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedList.get(6));
        // ESTIMATED_LIBRARY_SIZE can be null in the observed metrics, so we work around it
        // by passing in an 'expected 0L' and only comparing to it if the actual value is non-null
        if (observedMetrics.ESTIMATED_LIBRARY_SIZE != null && (Long) expectedList.get(7) != 0L) {
            Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedList.get(7));
        }
    }
}
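The metrics half of the test uses Picard's MetricsFile API via read(Reader) and getMetrics(). A stripped-down sketch of just that parsing step, assuming a MarkDuplicates metrics file already on disk; the file path is hypothetical and the DuplicationMetrics import path may differ across Picard/GATK versions:

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import htsjdk.samtools.metrics.MetricsFile;
import picard.sam.DuplicationMetrics;

public final class MetricsReadSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; point this at a real MarkDuplicates metrics file.
        final File metricsFile = new File("markdups_metrics.txt");
        final MetricsFile<DuplicationMetrics, Comparable<?>> metricsOutput = new MetricsFile<>();
        try (final FileReader reader = new FileReader(metricsFile)) {
            metricsOutput.read(reader);
        }
        // One DuplicationMetrics row per library; PERCENT_DUPLICATION and
        // ESTIMATED_LIBRARY_SIZE may be null or NaN for empty libraries.
        final List<DuplicationMetrics> metrics = metricsOutput.getMetrics();
        for (final DuplicationMetrics m : metrics) {
            System.out.println(m.LIBRARY + "\t" + m.READ_PAIRS_EXAMINED + "\t" + m.PERCENT_DUPLICATION);
        }
    }
}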