Search in sources :

Example 6 with ReadsDataSource

use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.

the class ReadsSparkSource method getHeader.

/**
     * Loads the header using Hadoop-BAM.
     * @param filePath path to the bam.
     * @param referencePath Reference path or null if not available. Reference is required for CRAM files.
     * @return the header for the bam.
     */
public SAMFileHeader getHeader(final String filePath, final String referencePath) {
    // GCS case
    if (BucketUtils.isCloudStorageUrl(filePath)) {
        try (ReadsDataSource readsDataSource = new ReadsDataSource(IOUtils.getPath(filePath))) {
            return readsDataSource.getHeader();
        }
    }
    // local file or HDFs case
    try {
        Path path = new Path(filePath);
        FileSystem fs = path.getFileSystem(ctx.hadoopConfiguration());
        if (fs.isDirectory(path)) {
            FileStatus[] bamFiles = fs.listStatus(path, new PathFilter() {

                private static final long serialVersionUID = 1L;

                @Override
                public boolean accept(Path path) {
                    return path.getName().startsWith(HADOOP_PART_PREFIX);
                }
            });
            if (bamFiles.length == 0) {
                throw new UserException("No BAM files to load header from in: " + path);
            }
            // Hadoop-BAM writes the same header to each shard, so use the first one
            path = bamFiles[0].getPath();
        }
        setHadoopBAMConfigurationProperties(filePath, referencePath);
        return SAMHeaderReader.readSAMHeaderFrom(path, ctx.hadoopConfiguration());
    } catch (IOException | IllegalArgumentException e) {
        throw new UserException("Failed to read bam header from " + filePath + "\n Caused by:" + e.getMessage(), e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) ReadsDataSource(org.broadinstitute.hellbender.engine.ReadsDataSource) UserException(org.broadinstitute.hellbender.exceptions.UserException) IOException(java.io.IOException)

Example 7 with ReadsDataSource

use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.

the class HaplotypeCallerIntegrationTest method testBamoutProducesReasonablySizedOutput.

@Test
public void testBamoutProducesReasonablySizedOutput() {
    Utils.resetRandomGenerator();
    // We will test that when running with -bamout over the testInterval, we produce
    // a bam with a number of reads that is within 10% of what GATK3.5 produces with
    // -bamout over the same interval. This is just to test that we produce a reasonably-sized
    // bam for the region, not to validate the haplotypes, etc. We don't want
    // this test to fail unless there is a likely problem with -bamout itself (eg., empty
    // or truncated bam).
    final String testInterval = "20:10000000-10010000";
    final int gatk3BamoutNumReads = 5170;
    final File vcfOutput = createTempFile("testBamoutProducesReasonablySizedOutput", ".vcf");
    final File bamOutput = createTempFile("testBamoutProducesReasonablySizedOutput", ".bam");
    final String[] args = { "-I", NA12878_20_21_WGS_bam, "-R", b37_reference_20_21, "-L", testInterval, "-O", vcfOutput.getAbsolutePath(), "-bamout", bamOutput.getAbsolutePath(), "-pairHMM", "AVX_LOGLESS_CACHING", "-stand_call_conf", "30.0" };
    runCommandLine(args);
    try (final ReadsDataSource bamOutReadsSource = new ReadsDataSource(bamOutput.toPath())) {
        int actualBamoutNumReads = 0;
        for (final GATKRead read : bamOutReadsSource) {
            ++actualBamoutNumReads;
        }
        final int readCountDifference = Math.abs(actualBamoutNumReads - gatk3BamoutNumReads);
        Assert.assertTrue(((double) readCountDifference / gatk3BamoutNumReads) < 0.10, "-bamout produced a bam with over 10% fewer/more reads than expected");
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadsDataSource(org.broadinstitute.hellbender.engine.ReadsDataSource) File(java.io.File) Test(org.testng.annotations.Test) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest)

Example 8 with ReadsDataSource

use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.

the class ReadCoordinateComparatorUnitTest method testComparatorOrderingMatchesHtsjdkFileOrdering.

/**
     * Tests that the ordering produced by ReadCoordinateComparator matches coordinate file ordering
     * as produced by htsjdk's {@link SAMRecordCoordinateComparator} for a representative selection of reads. Ignores
     * differences in tie-breaking done for reads with the same position -- just asserts that the reads are
     * coordinate-sorted according to htsjdk, including unmapped reads with and without an assigned position.
     */
@Test
public void testComparatorOrderingMatchesHtsjdkFileOrdering() throws IOException {
    // This file has unmapped reads that are set to the position of their mates, as well as
    // unmapped reads with no position -- the ordering check in the test below will fail if
    // our ordering of these reads relative to the mapped reads is not consistent with the
    // definition of coordinate sorting as defined in htsjdk.samtools.SAMRecordCoordinateComparator
    final String inputBam = publicTestDir + "org/broadinstitute/hellbender/utils/read/comparator_test_with_unmapped.bam";
    final List<GATKRead> reads = new ArrayList<>();
    SAMFileHeader header = null;
    try (final ReadsDataSource readsSource = new ReadsDataSource(IOUtils.getPath(inputBam))) {
        header = readsSource.getHeader();
        for (GATKRead read : readsSource) {
            reads.add(read);
        }
    }
    // Randomize ordering and then re-sort with the ReadCoordinateComparator
    Collections.shuffle(reads);
    Collections.sort(reads, new ReadCoordinateComparator(header));
    // Check ordering using SAMRecordCoordinateComparator.fileOrderCompare() instead of the full comparator,
    // to ignore meaningless differences in tie-breaking rules for reads with the same position.
    final SAMRecordCoordinateComparator samComparator = new SAMRecordCoordinateComparator();
    GATKRead previousRead = null;
    for (final GATKRead currentRead : reads) {
        if (previousRead != null) {
            Assert.assertTrue(samComparator.fileOrderCompare(previousRead.convertToSAMRecord(header), currentRead.convertToSAMRecord(header)) <= 0, "Reads are out of order: " + previousRead + " and " + currentRead);
        }
        previousRead = currentRead;
    }
}
Also used : ArrayList(java.util.ArrayList) ReadsDataSource(org.broadinstitute.hellbender.engine.ReadsDataSource) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 9 with ReadsDataSource

use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.

the class ConvertHeaderlessHadoopBamShardToBamIntegrationTest method testConvertHeaderlessHadoopBamShardToBam.

@Test
public void testConvertHeaderlessHadoopBamShardToBam() {
    final File bamShard = new File(publicTestDir + "org/broadinstitute/hellbender/utils/spark/reads_data_source_test1.bam.headerless.part-r-00000");
    final File output = createTempFile("testConvertHeaderlessHadoopBamShardToBam", ".bam");
    final File headerSource = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
    final int expectedReadCount = 11;
    List<String> args = Arrays.asList("--bamShard", bamShard.getAbsolutePath(), "--bamWithHeader", headerSource.getAbsolutePath(), "-O", output.getAbsolutePath());
    runCommandLine(args);
    int actualCount = 0;
    try (final ReadsDataSource readsSource = new ReadsDataSource(output.toPath())) {
        for (final GATKRead read : readsSource) {
            ++actualCount;
        }
    }
    Assert.assertEquals(actualCount, expectedReadCount, "Wrong number of reads in final BAM file");
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadsDataSource(org.broadinstitute.hellbender.engine.ReadsDataSource) File(java.io.File) Test(org.testng.annotations.Test) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest)

Example 10 with ReadsDataSource

use of org.broadinstitute.hellbender.engine.ReadsDataSource in project gatk by broadinstitute.

the class MarkDuplicatesSparkIntegrationTest method testMarkDuplicatesSparkIntegrationTestLocal.

@Test(groups = "spark", dataProvider = "md")
public void testMarkDuplicatesSparkIntegrationTestLocal(final File input, final long totalExpected, final long dupsExpected, Map<String, List<String>> metricsExpected) throws IOException {
    ArgumentsBuilder args = new ArgumentsBuilder();
    args.add("--" + StandardArgumentDefinitions.INPUT_LONG_NAME);
    args.add(input.getPath());
    args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME);
    File outputFile = createTempFile("markdups", ".bam");
    outputFile.delete();
    args.add(outputFile.getAbsolutePath());
    args.add("--METRICS_FILE");
    File metricsFile = createTempFile("markdups_metrics", ".txt");
    args.add(metricsFile.getAbsolutePath());
    runCommandLine(args.getArgsArray());
    Assert.assertTrue(outputFile.exists(), "Can't find expected MarkDuplicates output file at " + outputFile.getAbsolutePath());
    int totalReads = 0;
    int duplicateReads = 0;
    try (final ReadsDataSource outputReads = new ReadsDataSource(outputFile.toPath())) {
        for (GATKRead read : outputReads) {
            ++totalReads;
            if (read.isDuplicate()) {
                ++duplicateReads;
            }
        }
    }
    Assert.assertEquals(totalReads, totalExpected, "Wrong number of reads in output BAM");
    Assert.assertEquals(duplicateReads, dupsExpected, "Wrong number of duplicate reads in output BAM");
    final MetricsFile<DuplicationMetrics, Comparable<?>> metricsOutput = new MetricsFile<>();
    try {
        metricsOutput.read(new FileReader(metricsFile));
    } catch (final FileNotFoundException ex) {
        System.err.println("Metrics file not found: " + ex);
    }
    final List<DuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream().filter(metric -> metric.UNPAIRED_READS_EXAMINED != 0L || metric.READ_PAIRS_EXAMINED != 0L || metric.UNMAPPED_READS != 0L || metric.UNPAIRED_READ_DUPLICATES != 0L || metric.READ_PAIR_DUPLICATES != 0L || metric.READ_PAIR_OPTICAL_DUPLICATES != 0L || (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) || (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L)).collect(Collectors.toList());
    Assert.assertEquals(nonEmptyMetrics.size(), metricsExpected.size(), "Wrong number of metrics with non-zero fields.");
    for (int i = 0; i < nonEmptyMetrics.size(); i++) {
        final DuplicationMetrics observedMetrics = nonEmptyMetrics.get(i);
        List<?> expectedList = metricsExpected.get(observedMetrics.LIBRARY);
        Assert.assertNotNull(expectedList, "Unexpected library found: " + observedMetrics.LIBRARY);
        Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedList.get(0));
        Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedList.get(1));
        Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedList.get(2));
        Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedList.get(3));
        Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedList.get(4));
        Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedList.get(5));
        Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedList.get(6));
        //so we work around it by passing in an 'expected 0L' and only comparing to it if the actual value is non-null
        if (observedMetrics.ESTIMATED_LIBRARY_SIZE != null && (Long) expectedList.get(7) != 0L) {
            Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedList.get(7));
        }
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) MetricsFile(htsjdk.samtools.metrics.MetricsFile) ReadsDataSource(org.broadinstitute.hellbender.engine.ReadsDataSource) MarkDuplicatesSparkTester(org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesSparkTester) MarkDuplicatesIntegrationTest(org.broadinstitute.hellbender.tools.picard.sam.markduplicates.MarkDuplicatesIntegrationTest) CommandLineProgram(org.broadinstitute.hellbender.cmdline.CommandLineProgram) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) DataProvider(org.testng.annotations.DataProvider) ImmutableMap(com.google.common.collect.ImmutableMap) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) ArgumentsBuilder(org.broadinstitute.hellbender.utils.test.ArgumentsBuilder) Test(org.testng.annotations.Test) IOException(java.io.IOException) MetricsFile(htsjdk.samtools.metrics.MetricsFile) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) MarkDuplicatesSpark(org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark) AbstractMarkDuplicatesTester(org.broadinstitute.hellbender.utils.test.testers.AbstractMarkDuplicatesTester) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Assert(org.testng.Assert) Map(java.util.Map) FileReader(java.io.FileReader) AbstractMarkDuplicatesCommandLineProgramTest(org.broadinstitute.hellbender.utils.test.testers.AbstractMarkDuplicatesCommandLineProgramTest) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) FileNotFoundException(java.io.FileNotFoundException) ArgumentsBuilder(org.broadinstitute.hellbender.utils.test.ArgumentsBuilder) FileReader(java.io.FileReader) ReadsDataSource(org.broadinstitute.hellbender.engine.ReadsDataSource) MetricsFile(htsjdk.samtools.metrics.MetricsFile) File(java.io.File) MarkDuplicatesIntegrationTest(org.broadinstitute.hellbender.tools.picard.sam.markduplicates.MarkDuplicatesIntegrationTest) Test(org.testng.annotations.Test) AbstractMarkDuplicatesCommandLineProgramTest(org.broadinstitute.hellbender.utils.test.testers.AbstractMarkDuplicatesCommandLineProgramTest)

Aggregations

ReadsDataSource (org.broadinstitute.hellbender.engine.ReadsDataSource)10 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)8 File (java.io.File)7 Test (org.testng.annotations.Test)7 CommandLineProgramTest (org.broadinstitute.hellbender.CommandLineProgramTest)4 IOException (java.io.IOException)3 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)3 UserException (org.broadinstitute.hellbender.exceptions.UserException)2 ArgumentsBuilder (org.broadinstitute.hellbender.utils.test.ArgumentsBuilder)2 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 SAMFileHeader (htsjdk.samtools.SAMFileHeader)1 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)1 MetricsFile (htsjdk.samtools.metrics.MetricsFile)1 FileNotFoundException (java.io.FileNotFoundException)1 FileReader (java.io.FileReader)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Map (java.util.Map)1 Collectors (java.util.stream.Collectors)1