Search in sources :

Example 1 with GATKException

use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.

the class ParallelCopyGCSDirectoryIntoHDFSSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    if (!BucketUtils.isCloudStorageUrl(inputGCSPath)) {
        throw new UserException("Input path " + inputGCSPath + " is not a GCS URI");
    }
    if (!BucketUtils.isHadoopUrl(outputHDFSDirectory)) {
        throw new UserException("Output directory " + outputHDFSDirectory + " is not an HDFS URI");
    }
    final String inputGCSPathFinal = inputGCSPath;
    final String outputDirectoryFinal = outputHDFSDirectory;
    org.apache.hadoop.fs.Path outputHdfsDirectoryPath = new org.apache.hadoop.fs.Path(outputHDFSDirectory);
    try (FileSystem fs = outputHdfsDirectoryPath.getFileSystem(new Configuration())) {
        if (fs.exists(outputHdfsDirectoryPath)) {
            throw new UserException("Specified output directory " + outputHdfsDirectoryPath + " already exists. Please specify a new directory name.");
        }
        fs.mkdirs(outputHdfsDirectoryPath);
        final long chunkSize = getChunkSize(fs);
        final List<Path> gcsNIOPaths = getGCSFilePathsToCopy(inputGCSPathFinal);
        List<Tuple2<String, Integer>> chunkList = setupChunks(chunkSize, gcsNIOPaths);
        if (chunkList.size() == 0) {
            logger.info("no files found to copy");
            return;
        }
        final JavaPairRDD<String, Integer> chunkRDD = ctx.parallelizePairs(chunkList, chunkList.size());
        final JavaPairRDD<String, Tuple2<Integer, String>> chunkMappingRDD = chunkRDD.mapToPair(p -> new Tuple2<>(p._1(), readChunkToHdfs(p._1(), chunkSize, p._2(), outputDirectoryFinal)));
        final Map<String, Iterable<Tuple2<Integer, String>>> chunksByFilePath = chunkMappingRDD.groupByKey().collectAsMap();
        concatenateChunks(outputDirectoryFinal, fs, gcsNIOPaths, chunksByFilePath);
    } catch (NoSuchFileException e) {
        throw new UserException("Could not locate input path " + e.getFile() + ". If you are trying to copy an entire directory, please include a trailing slash on your path.");
    } catch (IOException e) {
        throw new GATKException(e.getMessage(), e);
    }
}
Also used : Path(java.nio.file.Path) Configuration(org.apache.hadoop.conf.Configuration) NoSuchFileException(java.nio.file.NoSuchFileException) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) UserException(org.broadinstitute.hellbender.exceptions.UserException) GATKException(org.broadinstitute.hellbender.exceptions.GATKException)

Example 2 with GATKException

use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.

the class HDF5PCACoveragePoN method renderPoNTargets.

private static List<Target> renderPoNTargets(final String[][] values, final List<String> targetNamesToRender, final HDF5File reader) {
    if (values.length != targetNamesToRender.size()) {
        throw new GATKException(String.format("Wrong number of elements in the targets recovered " + "from file '%s': number of targets found in file (%d) != number of target names (%d)", reader.getFile(), values.length, targetNamesToRender.size()));
    }
    final int numTargetCols = (int) reader.readDouble(NUM_TARGET_COLUMNS_PATH);
    final List<Target> result = new ArrayList<>(values.length);
    for (int i = 0; i < values.length; i++) {
        if (values[i].length != numTargetCols) {
            throw new GATKException(String.format("Wrong number of column elements in the targets recovered " + "from file '%s': number of columns found in file (%d) != number of target columns (%d)", reader.getFile(), values[i].length, numTargetCols));
        }
        result.add(new Target(targetNamesToRender.get(i), new SimpleInterval(values[i][0], Integer.parseInt(values[i][1]), Integer.parseInt(values[i][2]))));
    }
    return result;
}
Also used : Target(org.broadinstitute.hellbender.tools.exome.Target) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) GATKException(org.broadinstitute.hellbender.exceptions.GATKException)

Example 3 with GATKException

use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.

the class PathSeqFilterSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    //Filter secondary/supplementary reads and reads that fail the vendor quality check
    final JavaRDD<GATKRead> primaryReads = reads.filter(read -> !(read.isSecondaryAlignment() || read.failsVendorQualityCheck() || read.isSupplementaryAlignment()));
    logger.info("Loaded " + reads.count() + " reads");
    //Mark and filter optical duplicates
    final OpticalDuplicateFinder finder = new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null);
    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(primaryReads, getHeaderForReads(), MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, getRecommendedNumReducers());
    final JavaRDD<GATKRead> markedFilteredReads = markedReads.filter(new ReadFilterSparkifier(new MarkedOpticalDuplicateReadFilter()));
    logger.info("Reads remaining after de-duplication: " + markedFilteredReads.count());
    //Apply DUST masking
    final JavaRDD<GATKRead> readsDUSTMasked = markedFilteredReads.map(new ReadTransformerSparkifier(new DUSTReadTransformer(DUST_MASK, DUST_W, DUST_T)));
    //Apply base quality hard clipping
    final JavaRDD<GATKRead> readsClipped = readsDUSTMasked.map(new ReadTransformerSparkifier(new BaseQualityClipReadTransformer(READ_TRIM_THRESH)));
    //Filter reads with less than MIN_READ_LENGTH bases
    final JavaRDD<GATKRead> readsLengthFiltered = readsClipped.filter(new ReadFilterSparkifier(new ReadLengthReadFilter(MIN_READ_LENGTH, Integer.MAX_VALUE)));
    logger.info("Reads remaining after clipping: " + readsLengthFiltered.count());
    //Change low-quality bases to 'N'
    final JavaRDD<GATKRead> readsBQFiltered = readsLengthFiltered.map(new ReadTransformerSparkifier(new BaseQualityReadTransformer(QUAL_PHRED_THRESH)));
    //Filter reads with too many 'N's
    final JavaRDD<GATKRead> readsAmbigFiltered = readsBQFiltered.filter(new ReadFilterSparkifier(new AmbiguousBaseReadFilter(FRAC_N_THRESHOLD)));
    logger.info("Reads remaining after ambiguous base filtering: " + readsAmbigFiltered.count());
    //Load Kmer hopscotch set and filter reads containing > 0 matching kmers
    final JavaRDD<GATKRead> readsKmerFiltered = doKmerFiltering(ctx, readsAmbigFiltered);
    logger.info("Reads remaining after kmer filtering: " + readsKmerFiltered.count());
    //Filter unpaired reads
    final JavaRDD<GATKRead> readsFilteredPaired = retainPairs(readsKmerFiltered);
    logger.info("Reads remaining after unpaired filtering: " + readsFilteredPaired.count());
    //BWA filtering against user-specified host organism reference
    header = getHeaderForReads();
    final JavaRDD<GATKRead> readsAligned = doHostBWA(ctx, header, readsFilteredPaired);
    //Get unmapped reads (note these always come in pairs)
    //TODO: retain read pairs by alignment score instead of flags
    final JavaRDD<GATKRead> readsNonHost = readsAligned.filter(read -> read.isUnmapped() && read.mateIsUnmapped());
    logger.info("Reads remaining after BWA filtering: " + readsFilteredPaired.count());
    //TODO: repeat BWA with seed size 11
    //Write output
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);
    try {
        ReadsSparkSink.writeReads(ctx, OUTPUT_PATH, null, readsNonHost, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE);
    } catch (final IOException e) {
        throw new GATKException("Unable to write bam", e);
    }
}
Also used : BaseQualityReadTransformer(org.broadinstitute.hellbender.transformers.BaseQualityReadTransformer) OpticalDuplicateFinder(org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder) IOException(java.io.IOException) ReadTransformerSparkifier(org.broadinstitute.hellbender.tools.spark.utils.ReadTransformerSparkifier) ReadFilterSparkifier(org.broadinstitute.hellbender.tools.spark.utils.ReadFilterSparkifier) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) DUSTReadTransformer(org.broadinstitute.hellbender.transformers.DUSTReadTransformer) BaseQualityClipReadTransformer(org.broadinstitute.hellbender.transformers.BaseQualityClipReadTransformer)

Example 4 with GATKException

use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.

the class AlignAssembledContigsSpark method decodeStringAsSimpleInterval.

@VisibleForTesting
public static SimpleInterval decodeStringAsSimpleInterval(final String text) {
    try {
        if (!text.startsWith("CTG="))
            throw new IllegalArgumentException(text);
        int stop1 = text.indexOf("START=");
        final String ctg = text.substring(4, stop1);
        int stop2 = text.indexOf("END=", stop1);
        final int start = Integer.valueOf(text.substring(stop1 + 6, stop2));
        final int end = Integer.valueOf(text.substring(stop2 + 4, text.length()));
        return new SimpleInterval(ctg, start, end);
    } catch (final Exception ex) {
        throw new GATKException("unexpected format in supposedly formatted text for a SimpleInterval: " + text, ex);
    }
}
Also used : SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 5 with GATKException

use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.

the class BAQ method calcBAQFromHMM.

// we need to pad ref by at least the bandwidth / 2 on either side
@SuppressWarnings("fallthrough")
public BAQCalculationResult calcBAQFromHMM(GATKRead read, byte[] ref, int refOffset) {
    // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
    Pair<Integer, Integer> queryRange = calculateQueryRange(read);
    // read has Ns, or is completely clipped away
    if (queryRange == null)
        return null;
    int queryStart = queryRange.getLeft();
    int queryEnd = queryRange.getRight();
    BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getBases(), read.getBaseQualities(), queryStart, queryEnd);
    // cap quals
    int readI = 0, refI = 0;
    for (CigarElement elt : read.getCigarElements()) {
        int l = elt.getLength();
        switch(elt.getOperator()) {
            case // cannot handle these
            N:
                return null;
            case H:
            case // ignore pads and hard clips
            P:
                break;
            // move the reference too, in addition to I
            case S:
                refI += l;
            case I:
                // todo -- is it really the case that we want to treat I and S the same?
                for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i];
                readI += l;
                break;
            case D:
                refI += l;
                break;
            case M:
            case EQ:
            case //all three operators are equivalent here.
            X:
                for (int i = readI; i < readI + l; i++) {
                    int expectedPos = refI - refOffset + (i - readI);
                    baqResult.bq[i] = capBaseByBAQ(baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
                }
                readI += l;
                refI += l;
                break;
            default:
                throw new GATKException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getName());
        }
    }
    if (// odd cigar string
    readI != read.getLength())
        System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);
    return baqResult;
}
Also used : GATKException(org.broadinstitute.hellbender.exceptions.GATKException) CigarElement(htsjdk.samtools.CigarElement)

Aggregations

GATKException (org.broadinstitute.hellbender.exceptions.GATKException)96 IOException (java.io.IOException)19 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)13 CigarElement (htsjdk.samtools.CigarElement)12 ArrayList (java.util.ArrayList)10 UserException (org.broadinstitute.hellbender.exceptions.UserException)10 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)8 Cigar (htsjdk.samtools.Cigar)7 File (java.io.File)6 SAMFileHeader (htsjdk.samtools.SAMFileHeader)5 OutputStream (java.io.OutputStream)5 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)5 VisibleForTesting (com.google.common.annotations.VisibleForTesting)4 Utils (org.broadinstitute.hellbender.utils.Utils)4 Tuple2 (scala.Tuple2)4 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)3 BufferedOutputStream (java.io.BufferedOutputStream)3 InputStream (java.io.InputStream)3 BigInteger (java.math.BigInteger)3 java.util (java.util)3