use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class HaplotypeCallerSpark method writeVariants.
/**
* WriteVariants, this is currently going to be horribly slow and explosive on a full size file since it performs a collect.
*
* This will be replaced by a parallel writer similar to what's done with {@link org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink}
*/
private void writeVariants(JavaRDD<VariantContext> variants) {
final List<VariantContext> collectedVariants = variants.collect();
final SAMSequenceDictionary referenceDictionary = getReferenceSequenceDictionary();
final List<VariantContext> sortedVariants = collectedVariants.stream().sorted((o1, o2) -> IntervalUtils.compareLocatables(o1, o2, referenceDictionary)).collect(Collectors.toList());
final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgs, getHeaderForReads(), new ReferenceMultiSourceAdapter(getReference(), getAuthHolder()));
try (final VariantContextWriter writer = hcEngine.makeVCFWriter(output, getBestAvailableSequenceDictionary())) {
hcEngine.writeHeader(writer, getHeaderForReads().getSequenceDictionary(), Collections.emptySet());
sortedVariants.forEach(writer::add);
}
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class ReadWalkerSpark method getReads.
/**
* Loads reads and the corresponding reference and features into a {@link JavaRDD} for the intervals specified.
*
* If no intervals were specified, returns all the reads.
*
* @return all reads as a {@link JavaRDD}, bounded by intervals if specified.
*/
public JavaRDD<ReadWalkerContext> getReads(JavaSparkContext ctx) {
SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
List<SimpleInterval> intervals = hasIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
// use unpadded shards (padding is only needed for reference bases)
final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, 0, sequenceDictionary).stream()).collect(Collectors.toList());
JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShards, readShardSize, shuffle);
Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
return shardedReads.flatMap(getReadsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, readShardPadding));
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class MultiVariantDataSource method validateAllSequenceDictionaries.
/**
* GATKTool only validates individual feature dictionaries against the reference dictionary, so cross-validate
* all of the dictionaries against each other here by ensuring that each contig found in any dictionary has the
* same length (and md5, when a value is present for that contig in both dictionaries) in every other dictionary
* in which its present.
*/
private void validateAllSequenceDictionaries() {
final Map<String, FeatureDataSource<VariantContext>> contigMap = new HashMap<>();
featureDataSources.forEach(ds -> {
final SAMSequenceDictionary dictionary = ds.getSequenceDictionary();
if (dictionary == null) {
logger.warn("A sequence dictionary is required for each input when using multiple inputs, and one could" + " not be obtained for feature input: " + ds.getName() + ". The input may not exist or may not have a valid header");
} else {
dictionary.getSequences().forEach(sourceSequence -> {
final String sourceSequenceName = sourceSequence.getSequenceName();
final FeatureDataSource<VariantContext> previousDataSource = contigMap.getOrDefault(sourceSequenceName, null);
if (previousDataSource != null) {
final SAMSequenceDictionary previousDictionary = previousDataSource.getSequenceDictionary();
final SAMSequenceRecord previousSequence = previousDictionary.getSequence(sourceSequenceName);
validateSequenceDictionaryRecords(ds.getName(), dictionary, sourceSequence, previousDataSource.getName(), previousDictionary, previousSequence);
} else {
contigMap.put(sourceSequenceName, ds);
}
});
}
});
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class ReorderSam method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
IOUtil.assertFileIsWritable(OUTPUT);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
SAMSequenceDictionary refDict = reference.getSequenceDictionary();
if (refDict == null) {
CloserUtil.close(in);
throw new UserException("No reference sequence dictionary found. Aborting. " + "You can create a sequence dictionary for the reference fasta using CreateSequenceDictionary.jar.");
}
printDictionary("SAM/BAM file", in.getFileHeader().getSequenceDictionary());
printDictionary("Reference", refDict);
Map<Integer, Integer> newOrder = buildSequenceDictionaryMap(refDict, in.getFileHeader().getSequenceDictionary());
// has to be after we create the newOrder
SAMFileHeader outHeader = ReadUtils.cloneSAMFileHeader(in.getFileHeader());
outHeader.setSequenceDictionary(refDict);
logger.info("Writing reads...");
if (in.hasIndex()) {
try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, true)) {
// write the reads in contig order
for (final SAMSequenceRecord contig : refDict.getSequences()) {
final SAMRecordIterator it = in.query(contig.getSequenceName(), 0, 0, false);
writeReads(out, it, newOrder, contig.getSequenceName());
}
// don't forget the unmapped reads
writeReads(out, in.queryUnmapped(), newOrder, "unmapped");
}
} else {
try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, false)) {
writeReads(out, in.iterator(), newOrder, "All reads");
}
}
// cleanup
CloserUtil.close(in);
return null;
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class MakeSitesOnlyVcf method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final VCFFileReader reader = new VCFFileReader(INPUT, false);
final VCFHeader inputVcfHeader = new VCFHeader(reader.getFileHeader().getMetaDataInInputOrder());
final SAMSequenceDictionary sequenceDictionary = inputVcfHeader.getSequenceDictionary();
if (CREATE_INDEX && sequenceDictionary == null) {
throw new UserException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
}
final ProgressLogger progress = new ProgressLogger(logger, 10000);
// Setup the site-only file writer
final VariantContextWriterBuilder builder = new VariantContextWriterBuilder().setOutputFile(OUTPUT).setReferenceDictionary(sequenceDictionary);
if (CREATE_INDEX)
builder.setOption(Options.INDEX_ON_THE_FLY);
else
builder.unsetOption(Options.INDEX_ON_THE_FLY);
try (final VariantContextWriter writer = builder.build()) {
final VCFHeader header = new VCFHeader(inputVcfHeader.getMetaDataInInputOrder(), SAMPLE);
writer.writeHeader(header);
// Go through the input, strip the records and write them to the output
final CloseableIterator<VariantContext> iterator = reader.iterator();
while (iterator.hasNext()) {
final VariantContext full = iterator.next();
final VariantContext site = subsetToSamplesWithOriginalAnnotations(full, SAMPLE);
writer.add(site);
progress.record(site.getContig(), site.getStart());
}
CloserUtil.close(iterator);
CloserUtil.close(reader);
}
return null;
}
Aggregations