Usage of htsjdk.samtools.SAMSequenceDictionary in the GATK project (Broad Institute): class PlotACNVResults, method doWork.
@Override
protected Object doWork() {
    checkRegularReadableUserFiles();
    // Get the sample name from the input files (a consistency check across inputs is performed).
    final String sampleName = getSampleName();
    // Load contig names and lengths from the sequence dictionary into a LinkedHashMap,
    // preserving the dictionary's contig order for plotting.
    final SAMSequenceDictionary sequenceDictionary = ReferenceUtils.loadFastaDictionary(sequenceDictionaryFile);
    Utils.validateArg(
            sequenceDictionary.getSequences().stream()
                    .map(SAMSequenceRecord::getSequenceName)
                    .noneMatch(n -> n.contains(CONTIG_DELIMITER)),
            String.format("Contig names cannot contain \"%s\".", CONTIG_DELIMITER));
    // Build the map explicitly rather than via Collectors.toMap: toMap's merge function receives
    // the two conflicting VALUES (lengths), so the original duplicate-contig message printed a
    // length instead of the contig name. Here we can report the offending contig correctly.
    final Map<String, Integer> contigLengthMap = new LinkedHashMap<>();
    for (final SAMSequenceRecord sequence : sequenceDictionary.getSequences()) {
        if (sequence.getSequenceLength() >= minContigLength) {
            if (contigLengthMap.put(sequence.getSequenceName(), sequence.getSequenceLength()) != null) {
                throw new IllegalArgumentException(String.format("Duplicate contig in sequence dictionary: %s", sequence.getSequenceName()));
            }
        }
    }
    Utils.validateArg(!contigLengthMap.isEmpty(),
            "There must be at least one contig above the threshold length in the sequence dictionary.");
    logger.info("Contigs above length threshold: " + contigLengthMap);
    // Check that contigs in the input files are present in the sequence dictionary
    // and that data points are valid given the contig lengths.
    validateContigs(contigLengthMap);
    // Generate the plots.
    final List<String> contigNames = new ArrayList<>(contigLengthMap.keySet());
    final List<Integer> contigLengths = new ArrayList<>(contigLengthMap.values());
    writeSegmentedAlleleFractionPlot(sampleName, contigNames, contigLengths);
    return "SUCCESS";
}
Usage of htsjdk.samtools.SAMSequenceDictionary in the GATK project (Broad Institute): class PlotSegmentedCopyRatio, method doWork.
@Override
protected Object doWork() {
    checkRegularReadableUserFiles();
    // Get the sample name from the input files (a consistency check across inputs is performed).
    final String sampleName = getSampleName();
    // Load contig names and lengths from the sequence dictionary into a LinkedHashMap,
    // preserving the dictionary's contig order for plotting.
    final SAMSequenceDictionary sequenceDictionary = ReferenceUtils.loadFastaDictionary(sequenceDictionaryFile);
    Utils.validateArg(
            sequenceDictionary.getSequences().stream()
                    .map(SAMSequenceRecord::getSequenceName)
                    .noneMatch(n -> n.contains(CONTIG_DELIMITER)),
            String.format("Contig names cannot contain \"%s\".", CONTIG_DELIMITER));
    // Build the map explicitly rather than via Collectors.toMap: toMap's merge function receives
    // the two conflicting VALUES (lengths), so the original duplicate-contig message printed a
    // length instead of the contig name. Here we can report the offending contig correctly.
    final Map<String, Integer> contigLengthMap = new LinkedHashMap<>();
    for (final SAMSequenceRecord sequence : sequenceDictionary.getSequences()) {
        if (sequence.getSequenceLength() >= minContigLength) {
            if (contigLengthMap.put(sequence.getSequenceName(), sequence.getSequenceLength()) != null) {
                throw new IllegalArgumentException(String.format("Duplicate contig in sequence dictionary: %s", sequence.getSequenceName()));
            }
        }
    }
    Utils.validateArg(!contigLengthMap.isEmpty(),
            "There must be at least one contig above the threshold length in the sequence dictionary.");
    logger.info("Contigs above length threshold: " + contigLengthMap);
    // Check that contigs in the input files are present in the sequence dictionary
    // and that data points are valid given the contig lengths.
    validateContigs(contigLengthMap);
    // Generate the plots.
    final List<String> contigNames = new ArrayList<>(contigLengthMap.keySet());
    final List<Integer> contigLengths = new ArrayList<>(contigLengthMap.values());
    writeSegmentedCopyRatioPlots(sampleName, contigNames, contigLengths);
    return "SUCCESS";
}
Usage of htsjdk.samtools.SAMSequenceDictionary in the GATK project (Broad Institute): class HaplotypeCallerSpark, method writeVariants.
/**
 * Writes the given variants to the configured output as a VCF.
 *
 * <p>This is currently going to be horribly slow and explosive on a full-size input, since it
 * performs a {@code collect()} of every variant onto the driver before sorting and writing.
 *
 * <p>This will be replaced by a parallel writer similar to what's done with
 * {@link org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink}.
 *
 * @param variants the called variants to write; collected to the driver and coordinate-sorted
 */
private void writeVariants(JavaRDD<VariantContext> variants) {
    // Pull every variant onto the driver -- memory-intensive for large inputs (see javadoc).
    final List<VariantContext> collectedVariants = variants.collect();
    final SAMSequenceDictionary referenceDictionary = getReferenceSequenceDictionary();
    // Sort by genomic coordinate (per the reference dictionary) so the output VCF is coordinate-ordered.
    final List<VariantContext> sortedVariants = collectedVariants.stream().sorted((o1, o2) -> IntervalUtils.compareLocatables(o1, o2, referenceDictionary)).collect(Collectors.toList());
    final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgs, getHeaderForReads(), new ReferenceMultiSourceAdapter(getReference(), getAuthHolder()));
    // try-with-resources ensures the writer is closed (and the file finalized) even on failure.
    try (final VariantContextWriter writer = hcEngine.makeVCFWriter(output, getBestAvailableSequenceDictionary())) {
        // NOTE(review): the writer is created with getBestAvailableSequenceDictionary() but the
        // header is written with the reads header's dictionary -- confirm these are meant to differ.
        hcEngine.writeHeader(writer, getHeaderForReads().getSequenceDictionary(), Collections.emptySet());
        sortedVariants.forEach(writer::add);
    }
}
Usage of htsjdk.samtools.SAMSequenceDictionary in the GATK project (Broad Institute): class ReadWalkerSpark, method getReads.
/**
 * Loads the reads, together with their corresponding reference bases and features, into a
 * {@link JavaRDD} covering the intervals specified for this traversal.
 *
 * <p>If no intervals were specified, every read is returned.
 *
 * @param ctx the Spark context to load into
 * @return all reads as a {@link JavaRDD}, bounded by intervals if specified
 */
public JavaRDD<ReadWalkerContext> getReads(JavaSparkContext ctx) {
    final SAMSequenceDictionary dictionary = getBestAvailableSequenceDictionary();
    // Fall back to one interval per reference contig when no intervals were requested.
    final List<SimpleInterval> traversalIntervals;
    if (hasIntervals()) {
        traversalIntervals = getIntervals();
    } else {
        traversalIntervals = IntervalUtils.getAllIntervalsForReference(dictionary);
    }
    // Shards are unpadded here; padding is only needed when fetching reference bases.
    final List<ShardBoundary> shardBoundaries = traversalIntervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, 0, dictionary).stream())
            .collect(Collectors.toList());
    final JavaRDD<Shard<GATKRead>> shardedReads =
            SparkSharder.shard(ctx, getReads(), GATKRead.class, dictionary, shardBoundaries, readShardSize, shuffle);
    // Broadcast the reference and feature sources only when they exist, so executors can reach them.
    final Broadcast<ReferenceMultiSource> referenceBroadcast = hasReference() ? ctx.broadcast(getReference()) : null;
    final Broadcast<FeatureManager> featureBroadcast = features != null ? ctx.broadcast(features) : null;
    return shardedReads.flatMap(getReadsFunction(referenceBroadcast, featureBroadcast, dictionary, readShardPadding));
}
Usage of htsjdk.samtools.SAMSequenceDictionary in the GATK project (Broad Institute): class MultiVariantDataSource, method validateAllSequenceDictionaries.
/**
 * Cross-validates the sequence dictionaries of all feature inputs against each other.
 *
 * <p>GATKTool only validates each individual feature dictionary against the reference
 * dictionary, so here we additionally ensure that any contig found in more than one input
 * dictionary has the same length (and md5, when both dictionaries carry a value for that
 * contig) everywhere it appears.
 */
private void validateAllSequenceDictionaries() {
    // Maps each contig name to the first data source whose dictionary declared it.
    final Map<String, FeatureDataSource<VariantContext>> firstSourceByContig = new HashMap<>();
    for (final FeatureDataSource<VariantContext> source : featureDataSources) {
        final SAMSequenceDictionary dictionary = source.getSequenceDictionary();
        if (dictionary == null) {
            // Best-effort: warn and keep going rather than failing the whole tool here.
            logger.warn("A sequence dictionary is required for each input when using multiple inputs, and one could not be obtained for feature input: " + source.getName() + ". The input may not exist or may not have a valid header");
            continue;
        }
        for (final SAMSequenceRecord sequence : dictionary.getSequences()) {
            final String sequenceName = sequence.getSequenceName();
            final FeatureDataSource<VariantContext> previousSource = firstSourceByContig.get(sequenceName);
            if (previousSource == null) {
                // First sighting of this contig; remember which input declared it.
                firstSourceByContig.put(sequenceName, source);
            } else {
                // Contig seen before: compare this record against the first declaration.
                final SAMSequenceDictionary previousDictionary = previousSource.getSequenceDictionary();
                final SAMSequenceRecord previousSequence = previousDictionary.getSequence(sequenceName);
                validateSequenceDictionaryRecords(source.getName(), dictionary, sequence, previousSource.getName(), previousDictionary, previousSequence);
            }
        }
    }
}
Aggregations