use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class ParallelCopyGCSDirectoryIntoHDFSSpark method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
    if (!BucketUtils.isCloudStorageUrl(inputGCSPath)) {
        throw new UserException("Input path " + inputGCSPath + " is not a GCS URI");
    }
    if (!BucketUtils.isHadoopUrl(outputHDFSDirectory)) {
        throw new UserException("Output directory " + outputHDFSDirectory + " is not an HDFS URI");
    }
    final String inputGCSPathFinal = inputGCSPath;
    final String outputDirectoryFinal = outputHDFSDirectory;
    org.apache.hadoop.fs.Path outputHdfsDirectoryPath = new org.apache.hadoop.fs.Path(outputHDFSDirectory);
    try (FileSystem fs = outputHdfsDirectoryPath.getFileSystem(new Configuration())) {
        if (fs.exists(outputHdfsDirectoryPath)) {
            throw new UserException("Specified output directory " + outputHdfsDirectoryPath + " already exists. Please specify a new directory name.");
        }
        fs.mkdirs(outputHdfsDirectoryPath);
        final long chunkSize = getChunkSize(fs);
        final List<Path> gcsNIOPaths = getGCSFilePathsToCopy(inputGCSPathFinal);
        List<Tuple2<String, Integer>> chunkList = setupChunks(chunkSize, gcsNIOPaths);
        if (chunkList.size() == 0) {
            logger.info("no files found to copy");
            return;
        }
        // one partition per chunk, so each chunk is copied by its own Spark task
        final JavaPairRDD<String, Integer> chunkRDD = ctx.parallelizePairs(chunkList, chunkList.size());
        // copy each chunk into HDFS, keyed by the source file path
        final JavaPairRDD<String, Tuple2<Integer, String>> chunkMappingRDD =
                chunkRDD.mapToPair(p -> new Tuple2<>(p._1(), readChunkToHdfs(p._1(), chunkSize, p._2(), outputDirectoryFinal)));
        final Map<String, Iterable<Tuple2<Integer, String>>> chunksByFilePath = chunkMappingRDD.groupByKey().collectAsMap();
        // stitch the per-chunk HDFS files back together, one output file per input file
        concatenateChunks(outputDirectoryFinal, fs, gcsNIOPaths, chunksByFilePath);
    } catch (NoSuchFileException e) {
        throw new UserException("Could not locate input path " + e.getFile() + ". If you are trying to copy an entire directory, please include a trailing slash on your path.");
    } catch (IOException e) {
        throw new GATKException(e.getMessage(), e);
    }
}
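The method translates predictable user mistakes (non-GCS input, non-HDFS or pre-existing output directory) into UserException and wraps unexpected I/O failures in GATKException. Below is a minimal standalone sketch of that same fail-fast/wrap pattern, assuming the Hadoop FileSystem client is on the classpath; the helper name requireNewHdfsDirectory, the simple prefix check, and the plain IllegalArgumentException/RuntimeException stand-ins for UserException/GATKException are assumptions for illustration, not GATK API.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class HdfsOutputCheck {
    // Hypothetical helper: fail fast on bad arguments, wrap unexpected I/O failures.
    static void requireNewHdfsDirectory(final String uri) {
        if (!uri.startsWith("hdfs://")) {
            // user error: analogous to throwing UserException in the tool above
            throw new IllegalArgumentException("Output directory " + uri + " is not an HDFS URI");
        }
        final Path dir = new Path(uri);
        try (FileSystem fs = dir.getFileSystem(new Configuration())) {
            if (fs.exists(dir)) {
                throw new IllegalArgumentException("Output directory " + uri + " already exists");
            }
            fs.mkdirs(dir);
        } catch (final IOException e) {
            // unexpected failure: analogous to wrapping in GATKException
            throw new RuntimeException(e.getMessage(), e);
        }
    }
}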
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class HDF5PCACoveragePoN method renderPoNTargets.
private static List<Target> renderPoNTargets(final String[][] values, final List<String> targetNamesToRender, final HDF5File reader) {
    if (values.length != targetNamesToRender.size()) {
        throw new GATKException(String.format("Wrong number of elements in the targets recovered " + "from file '%s': number of targets found in file (%d) != number of target names (%d)", reader.getFile(), values.length, targetNamesToRender.size()));
    }
    final int numTargetCols = (int) reader.readDouble(NUM_TARGET_COLUMNS_PATH);
    final List<Target> result = new ArrayList<>(values.length);
    for (int i = 0; i < values.length; i++) {
        if (values[i].length != numTargetCols) {
            throw new GATKException(String.format("Wrong number of column elements in the targets recovered " + "from file '%s': number of columns found in file (%d) != number of target columns (%d)", reader.getFile(), values[i].length, numTargetCols));
        }
        result.add(new Target(targetNamesToRender.get(i), new SimpleInterval(values[i][0], Integer.parseInt(values[i][1]), Integer.parseInt(values[i][2]))));
    }
    return result;
}
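For context, the values matrix read from the HDF5 file is expected to hold one row per target with three columns: contig, start, and end, which the loop turns into Target objects. A small self-contained sketch of that row layout follows; the table values and target names are hypothetical.

import java.util.Arrays;
import java.util.List;

public final class TargetTableSketch {
    public static void main(final String[] args) {
        // Hypothetical target table: one row per target, columns are {contig, start, end}
        final String[][] values = { { "1", "1000", "1100" }, { "2", "5000", "5200" } };
        final List<String> targetNames = Arrays.asList("target_0", "target_1");
        for (int i = 0; i < values.length; i++) {
            // the loop above builds new Target(name, new SimpleInterval(contig, start, end)) from the same columns
            System.out.printf("%s -> %s:%d-%d%n", targetNames.get(i), values[i][0],
                    Integer.parseInt(values[i][1]), Integer.parseInt(values[i][2]));
        }
    }
}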
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class PathSeqFilterSpark method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    // Filter secondary/supplementary reads and reads that fail the vendor quality check
    final JavaRDD<GATKRead> primaryReads = reads.filter(read -> !(read.isSecondaryAlignment() || read.failsVendorQualityCheck() || read.isSupplementaryAlignment()));
    logger.info("Loaded " + reads.count() + " reads");
    // Mark and filter optical duplicates
    final OpticalDuplicateFinder finder = new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null);
    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(primaryReads, getHeaderForReads(), MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, getRecommendedNumReducers());
    final JavaRDD<GATKRead> markedFilteredReads = markedReads.filter(new ReadFilterSparkifier(new MarkedOpticalDuplicateReadFilter()));
    logger.info("Reads remaining after de-duplication: " + markedFilteredReads.count());
    // Apply DUST masking
    final JavaRDD<GATKRead> readsDUSTMasked = markedFilteredReads.map(new ReadTransformerSparkifier(new DUSTReadTransformer(DUST_MASK, DUST_W, DUST_T)));
    // Apply base quality hard clipping
    final JavaRDD<GATKRead> readsClipped = readsDUSTMasked.map(new ReadTransformerSparkifier(new BaseQualityClipReadTransformer(READ_TRIM_THRESH)));
    // Filter reads with fewer than MIN_READ_LENGTH bases
    final JavaRDD<GATKRead> readsLengthFiltered = readsClipped.filter(new ReadFilterSparkifier(new ReadLengthReadFilter(MIN_READ_LENGTH, Integer.MAX_VALUE)));
    logger.info("Reads remaining after clipping: " + readsLengthFiltered.count());
    // Change low-quality bases to 'N'
    final JavaRDD<GATKRead> readsBQFiltered = readsLengthFiltered.map(new ReadTransformerSparkifier(new BaseQualityReadTransformer(QUAL_PHRED_THRESH)));
    // Filter reads with too many 'N's
    final JavaRDD<GATKRead> readsAmbigFiltered = readsBQFiltered.filter(new ReadFilterSparkifier(new AmbiguousBaseReadFilter(FRAC_N_THRESHOLD)));
    logger.info("Reads remaining after ambiguous base filtering: " + readsAmbigFiltered.count());
    // Load k-mer hopscotch set and filter reads containing > 0 matching k-mers
    final JavaRDD<GATKRead> readsKmerFiltered = doKmerFiltering(ctx, readsAmbigFiltered);
    logger.info("Reads remaining after kmer filtering: " + readsKmerFiltered.count());
    // Filter unpaired reads
    final JavaRDD<GATKRead> readsFilteredPaired = retainPairs(readsKmerFiltered);
    logger.info("Reads remaining after unpaired filtering: " + readsFilteredPaired.count());
    // BWA filtering against user-specified host organism reference
    header = getHeaderForReads();
    final JavaRDD<GATKRead> readsAligned = doHostBWA(ctx, header, readsFilteredPaired);
    // Get unmapped reads (note these always come in pairs)
    // TODO: retain read pairs by alignment score instead of flags
    final JavaRDD<GATKRead> readsNonHost = readsAligned.filter(read -> read.isUnmapped() && read.mateIsUnmapped());
    logger.info("Reads remaining after BWA filtering: " + readsNonHost.count());
    // TODO: repeat BWA with seed size 11
    // Write output
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);
    try {
        ReadsSparkSink.writeReads(ctx, OUTPUT_PATH, null, readsNonHost, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE);
    } catch (final IOException e) {
        throw new GATKException("Unable to write bam", e);
    }
}
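Each stage above is an RDD transformation followed by a count() action for logging, so every logger.info(...) call triggers a Spark job over the accumulated lineage. A minimal, self-contained sketch of that filter-then-count pattern is shown below; the local master, the toy "reads", and the length predicate are assumptions for illustration and stand in for the GATKRead filters used by the tool.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public final class FilterCountSketch {
    public static void main(final String[] args) {
        final SparkConf conf = new SparkConf().setAppName("FilterCountSketch").setMaster("local[*]");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            final JavaRDD<String> reads = ctx.parallelize(Arrays.asList("ACGT", "NNNN", "ACGTACGT"));
            // keep only "reads" longer than 4 bases -- a stand-in for the read filters above
            final JavaRDD<String> kept = reads.filter(r -> r.length() > 4);
            System.out.println("Reads remaining after length filtering: " + kept.count());
        }
    }
}

Because each count forces evaluation, long pipelines like this recompute earlier stages for every logged count unless intermediate RDDs are cached.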
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class AlignAssembledContigsSpark method decodeStringAsSimpleInterval.
@VisibleForTesting
public static SimpleInterval decodeStringAsSimpleInterval(final String text) {
    try {
        if (!text.startsWith("CTG="))
            throw new IllegalArgumentException(text);
        int stop1 = text.indexOf("START=");
        final String ctg = text.substring(4, stop1);
        int stop2 = text.indexOf("END=", stop1);
        final int start = Integer.valueOf(text.substring(stop1 + 6, stop2));
        final int end = Integer.valueOf(text.substring(stop2 + 4, text.length()));
        return new SimpleInterval(ctg, start, end);
    } catch (final Exception ex) {
        throw new GATKException("unexpected format in supposedly formatted text for a SimpleInterval: " + text, ex);
    }
}
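From the parsing logic, the expected encoding is "CTG=" + contig + "START=" + start + "END=" + end. A short worked example of a call to the method above, with a hypothetical interval:

// Hypothetical encoded interval: "CTG=chr21START=27374100END=27374200"
// substring(4, indexOf("START="))                      -> "chr21"
// substring(indexOf("START=") + 6, indexOf("END="))    -> "27374100"
// substring(indexOf("END=") + 4)                       -> "27374200"
final SimpleInterval interval = decodeStringAsSimpleInterval("CTG=chr21START=27374100END=27374200");
// interval is chr21:27374100-27374200; any malformed input surfaces as a GATKException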
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class BAQ method calcBAQFromHMM.
// we need to pad ref by at least the bandwidth / 2 on either side
@SuppressWarnings("fallthrough")
public BAQCalculationResult calcBAQFromHMM(GATKRead read, byte[] ref, int refOffset) {
    // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
    Pair<Integer, Integer> queryRange = calculateQueryRange(read);
    // read has Ns, or is completely clipped away
    if (queryRange == null)
        return null;
    int queryStart = queryRange.getLeft();
    int queryEnd = queryRange.getRight();
    BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getBases(), read.getBaseQualities(), queryStart, queryEnd);
    // cap quals
    int readI = 0, refI = 0;
    for (CigarElement elt : read.getCigarElements()) {
        int l = elt.getLength();
        switch (elt.getOperator()) {
            case N: // cannot handle these
                return null;
            case H:
            case P: // ignore pads and hard clips
                break;
            case S:
                // move the reference too, in addition to I
                refI += l;
                // falls through: soft clips are treated like insertions below
            case I:
                // todo -- is it really the case that we want to treat I and S the same?
                for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i];
                readI += l;
                break;
            case D:
                refI += l;
                break;
            case M:
            case EQ:
            case X: // all three operators are equivalent here
                for (int i = readI; i < readI + l; i++) {
                    int expectedPos = refI - refOffset + (i - readI);
                    baqResult.bq[i] = capBaseByBAQ(baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
                }
                readI += l;
                refI += l;
                break;
            default:
                throw new GATKException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getName());
        }
    }
    if (readI != read.getLength()) // odd cigar string
        System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);
    return baqResult;
}
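The readI/refI bookkeeping in the switch mirrors which CIGAR operators consume read bases and which consume reference bases, with one deliberate deviation: soft clips (S) also advance refI here, per the comment in the code. A standalone sketch of the standard consumption rules using htsjdk follows; the CIGAR string is hypothetical.

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.TextCigarCodec;

public final class CigarWalkSketch {
    public static void main(final String[] args) {
        // Hypothetical CIGAR: 2 soft-clipped bases, 5 matches, 1 insertion, 3 deletions, 4 matches
        final Cigar cigar = TextCigarCodec.decode("2S5M1I3D4M");
        int readI = 0, refI = 0;
        for (final CigarElement elt : cigar.getCigarElements()) {
            final int l = elt.getLength();
            if (elt.getOperator().consumesReadBases()) readI += l;      // S, M, I, =, X advance the read
            if (elt.getOperator().consumesReferenceBases()) refI += l;  // M, D, N, =, X advance the reference
            System.out.printf("%d%s -> readI=%d, refI=%d%n", l, elt.getOperator().name(), readI, refI);
        }
    }
}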