use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class FindBadGenomicKmersSpark method runTool.
/** Get the list of high copy number kmers in the reference, and write them to a file. */
@Override
protected void runTool(final JavaSparkContext ctx) {
final SAMFileHeader hdr = getHeaderForReads();
SAMSequenceDictionary dict = null;
if (hdr != null)
dict = hdr.getSequenceDictionary();
final PipelineOptions options = getAuthenticatedGCSOptions();
final ReferenceMultiSource referenceMultiSource = getReference();
Collection<SVKmer> killList = findBadGenomicKmers(ctx, kSize, maxDUSTScore, referenceMultiSource, options, dict);
if (highCopyFastaFilename != null) {
killList = uniquify(killList, processFasta(kSize, maxDUSTScore, highCopyFastaFilename, options));
}
SVUtils.writeKmersFile(kSize, outputFile, killList);
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class SplitIntervals method onTraversalStart.
@Override
public void onTraversalStart() {
ParamUtils.isPositive(scatterCount, "scatter count must be > 0.");
if (!outputDir.exists() && !outputDir.mkdir()) {
throw new RuntimeIOException("Unable to create directory: " + outputDir.getAbsolutePath());
}
// in general dictionary will be from the reference, but using -I reads.bam or -F variants.vcf
// to use the sequence dict from a bam or vcf is also supported
final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
final List<SimpleInterval> intervals = hasIntervals() ? intervalArgumentCollection.getIntervals(sequenceDictionary) : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
final IntervalList intervalList = new IntervalList(sequenceDictionary);
intervals.stream().map(si -> new Interval(si.getContig(), si.getStart(), si.getEnd())).forEach(intervalList::add);
final IntervalListScatterer scatterer = new IntervalListScatterer(subdivisionMode);
final List<IntervalList> scattered = scatterer.scatter(intervalList, scatterCount, false);
final DecimalFormat formatter = new DecimalFormat("0000");
IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + "-scattered.intervals")));
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class SVUtils method getRefRDD.
/**
* Create an RDD from the reference sequences.
* The reference sequences are transformed into a single, large collection of byte arrays. The collection is then
* parallelized into an RDD.
* Each contig that exceeds a size given by REF_RECORD_LEN is broken into a series of REF_RECORD_LEN chunks with a
* K-1 base overlap between successive chunks. (I.e., for K=63, the last 62 bases in chunk n match the first 62
* bases in chunk n+1) so that we don't miss any kmers due to the chunking -- we can just kmerize each record
* independently.
*/
public static JavaRDD<byte[]> getRefRDD(final JavaSparkContext ctx, final int kSize, final ReferenceMultiSource ref, final PipelineOptions options, final SAMSequenceDictionary readsDict, final int ref_record_len, final int ref_records_per_partition) {
final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(readsDict);
if (dict == null)
throw new GATKException("No reference dictionary available");
final int effectiveRecLen = ref_record_len - kSize + 1;
final List<byte[]> sequenceChunks = new ArrayList<>();
for (final SAMSequenceRecord rec : dict.getSequences()) {
final String seqName = rec.getSequenceName();
final int seqLen = rec.getSequenceLength();
final SimpleInterval interval = new SimpleInterval(seqName, 1, seqLen);
try {
final byte[] bases = ref.getReferenceBases(options, interval).getBases();
for (int start = 0; start < seqLen; start += effectiveRecLen) {
sequenceChunks.add(Arrays.copyOfRange(bases, start, Math.min(start + ref_record_len, seqLen)));
}
} catch (final IOException ioe) {
throw new GATKException("Can't get reference sequence bases for " + interval, ioe);
}
}
return ctx.parallelize(sequenceChunks, sequenceChunks.size() / ref_records_per_partition + 1);
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class SVVCFWriter method writeVCF.
/**
* FASTA and Broadcast references are both required because 2bit Broadcast references currently order their
* sequence dictionaries in a scrambled order, see https://github.com/broadinstitute/gatk/issues/2037.
*/
public static void writeVCF(final PipelineOptions pipelineOptions, final String vcfFileName, final String fastaReference, final JavaRDD<VariantContext> variantContexts, final Logger logger) {
final SAMSequenceDictionary referenceSequenceDictionary = new ReferenceMultiSource(pipelineOptions, fastaReference, ReferenceWindowFunctions.IDENTITY_FUNCTION).getReferenceSequenceDictionary(null);
final List<VariantContext> sortedVariantsList = sortVariantsByCoordinate(variantContexts.collect(), referenceSequenceDictionary);
logNumOfVarByTypes(sortedVariantsList, logger);
writeVariants(pipelineOptions, vcfFileName, sortedVariantsList, referenceSequenceDictionary);
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class GenomeLocParserUnitTest method testParseUnknownSequenceLength.
@Test
public void testParseUnknownSequenceLength() {
SAMSequenceDictionary seqDict = new SAMSequenceDictionary();
seqDict.addSequence(new SAMSequenceRecord("1", SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH));
Assert.assertEquals(seqDict.getSequence("1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
GenomeLocParser myLocParser = new GenomeLocParser(seqDict);
GenomeLoc genomeLoc = myLocParser.parseGenomeLoc("1:1-99");
Assert.assertEquals(genomeLoc.getEnd(), 99);
}
Aggregations