use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class CollectJumpingLibraryMetrics method doWork.
/**
* Calculates the detailed statistics about the jumping library and then generates the results.
*/
@Override
protected Object doWork() {
for (File f : INPUT) {
IOUtil.assertFileIsReadable(f);
}
IOUtil.assertFileIsWritable(OUTPUT);
Histogram<Integer> innieHistogram = new Histogram<>();
Histogram<Integer> outieHistogram = new Histogram<>();
int fragments = 0;
int innies = 0;
int outies = 0;
int innieDupes = 0;
int outieDupes = 0;
int crossChromPairs = 0;
int superSized = 0;
int tandemPairs = 0;
double chimeraSizeMinimum = Math.max(getOutieMode(), (double) CHIMERA_KB_MIN);
for (File f : INPUT) {
SamReader reader = SamReaderFactory.makeDefault().open(f);
if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException("SAM file must " + f.getName() + " must be sorted in coordinate order");
}
for (SAMRecord sam : reader) {
// We're getting all our info from the first of each pair.
if (!sam.getFirstOfPairFlag()) {
continue;
}
// Ignore unmapped read pairs
if (sam.getReadUnmappedFlag()) {
if (!sam.getMateUnmappedFlag()) {
fragments++;
continue;
}
// If both ends are unmapped and we've hit unaligned reads we're done
if (sam.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
break;
}
continue;
}
if (sam.getMateUnmappedFlag()) {
fragments++;
continue;
}
// Ignore low-quality reads. If we don't have the mate mapping quality, assume it's OK
if ((sam.getAttribute(SAMTag.MQ.name()) != null && sam.getIntegerAttribute(SAMTag.MQ.name()) < MINIMUM_MAPPING_QUALITY) || sam.getMappingQuality() < MINIMUM_MAPPING_QUALITY) {
continue;
}
final int absInsertSize = Math.abs(sam.getInferredInsertSize());
if (absInsertSize > chimeraSizeMinimum) {
superSized++;
} else if (sam.getMateNegativeStrandFlag() == sam.getReadNegativeStrandFlag()) {
tandemPairs++;
} else if (!sam.getMateReferenceIndex().equals(sam.getReferenceIndex())) {
crossChromPairs++;
} else {
final PairOrientation pairOrientation = SamPairUtil.getPairOrientation(sam);
if (pairOrientation == PairOrientation.RF) {
outieHistogram.increment(absInsertSize);
outies++;
if (sam.getDuplicateReadFlag()) {
outieDupes++;
}
} else if (pairOrientation == PairOrientation.FR) {
innieHistogram.increment(absInsertSize);
innies++;
if (sam.getDuplicateReadFlag()) {
innieDupes++;
}
} else {
throw new IllegalStateException("This should never happen");
}
}
}
CloserUtil.close(reader);
}
MetricsFile<JumpingLibraryMetrics, Integer> metricsFile = getMetricsFile();
JumpingLibraryMetrics metrics = new JumpingLibraryMetrics();
metrics.JUMP_PAIRS = outies;
metrics.JUMP_DUPLICATE_PAIRS = outieDupes;
metrics.JUMP_DUPLICATE_PCT = outies != 0 ? outieDupes / (double) outies : 0;
metrics.JUMP_LIBRARY_SIZE = (outies > 0 && outieDupes > 0) ? DuplicationMetrics.estimateLibrarySize(outies, outies - outieDupes) : 0;
outieHistogram.trimByTailLimit(TAIL_LIMIT);
metrics.JUMP_MEAN_INSERT_SIZE = outieHistogram.getMean();
metrics.JUMP_STDEV_INSERT_SIZE = outieHistogram.getStandardDeviation();
metrics.NONJUMP_PAIRS = innies;
metrics.NONJUMP_DUPLICATE_PAIRS = innieDupes;
metrics.NONJUMP_DUPLICATE_PCT = innies != 0 ? innieDupes / (double) innies : 0;
metrics.NONJUMP_LIBRARY_SIZE = (innies > 0 && innieDupes > 0) ? DuplicationMetrics.estimateLibrarySize(innies, innies - innieDupes) : 0;
innieHistogram.trimByTailLimit(TAIL_LIMIT);
metrics.NONJUMP_MEAN_INSERT_SIZE = innieHistogram.getMean();
metrics.NONJUMP_STDEV_INSERT_SIZE = innieHistogram.getStandardDeviation();
metrics.CHIMERIC_PAIRS = crossChromPairs + superSized + tandemPairs;
metrics.FRAGMENTS = fragments;
double totalPairs = outies + innies + metrics.CHIMERIC_PAIRS;
metrics.PCT_JUMPS = totalPairs != 0 ? outies / totalPairs : 0;
metrics.PCT_NONJUMPS = totalPairs != 0 ? innies / totalPairs : 0;
metrics.PCT_CHIMERAS = totalPairs != 0 ? metrics.CHIMERIC_PAIRS / totalPairs : 0;
metricsFile.addMetric(metrics);
metricsFile.write(OUTPUT);
return null;
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class CollectOxoGMetrics method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
if (INTERVALS != null)
IOUtil.assertFileIsReadable(INTERVALS);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final SamReader in = SamReaderFactory.makeDefault().open(INPUT);
final Set<String> samples = new HashSet<>();
final Set<String> libraries = new HashSet<>();
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE));
libraries.add(getOrElse(rec.getLibrary(), UNKNOWN_LIBRARY));
}
// Setup the calculators
final Set<String> contexts = CONTEXTS.isEmpty() ? makeContextStrings(CONTEXT_SIZE) : CONTEXTS;
final ListMap<String, Calculator> calculators = new ListMap<>();
for (final String context : contexts) {
for (final String library : libraries) {
calculators.add(context, new Calculator(library, context));
}
}
// Load up dbSNP if available
logger.info("Loading dbSNP File: " + DB_SNP);
final DbSnpBitSetUtil dbSnp;
if (DB_SNP != null)
dbSnp = new DbSnpBitSetUtil(DB_SNP, in.getFileHeader().getSequenceDictionary());
else
dbSnp = null;
// Make an iterator that will filter out funny looking things
final SamLocusIterator iterator;
if (INTERVALS != null) {
final IntervalList intervals = IntervalList.fromFile(INTERVALS);
iterator = new SamLocusIterator(in, intervals.uniqued(), false);
} else {
iterator = new SamLocusIterator(in);
}
iterator.setEmitUncoveredLoci(false);
iterator.setMappingQualityScoreCutoff(MINIMUM_MAPPING_QUALITY);
final List<SamRecordFilter> filters = new ArrayList<>();
filters.add(new NotPrimaryAlignmentFilter());
filters.add(new DuplicateReadFilter());
if (MINIMUM_INSERT_SIZE > 0 || MAXIMUM_INSERT_SIZE > 0) {
filters.add(new InsertSizeFilter(MINIMUM_INSERT_SIZE, MAXIMUM_INSERT_SIZE));
}
iterator.setSamFilters(filters);
logger.info("Starting iteration.");
long nextLogTime = 0;
int sites = 0;
for (final SamLocusIterator.LocusInfo info : iterator) {
// Skip dbSNP sites
final String chrom = info.getSequenceName();
final int pos = info.getPosition();
final int index = pos - 1;
if (dbSnp != null && dbSnp.isDbSnpSite(chrom, pos))
continue;
// Skip sites at the end of chromosomes
final byte[] bases = refWalker.get(info.getSequenceIndex()).getBases();
if (pos < 3 || pos > bases.length - 3)
continue;
// Skip non C-G bases
final byte base = StringUtil.toUpperCase(bases[index]);
if (base != 'C' && base != 'G')
continue;
// Get the context string
final String context;
{
final String tmp = StringUtil.bytesToString(bases, index - CONTEXT_SIZE, 1 + (2 * CONTEXT_SIZE)).toUpperCase();
if (base == 'C')
context = tmp;
else
/* if G */
context = SequenceUtil.reverseComplement(tmp);
}
final List<Calculator> calculatorsForContext = calculators.get(context);
// happens if we get ambiguous bases in the reference
if (calculatorsForContext == null)
continue;
for (final Calculator calc : calculatorsForContext) calc.accept(info, base);
// See if we need to stop
if (++sites % 100 == 0) {
final long now = System.currentTimeMillis();
if (now > nextLogTime) {
logger.info("Visited " + sites + " sites of interest. Last site: " + chrom + ":" + pos);
nextLogTime = now + 60000;
}
}
if (sites >= STOP_AFTER)
break;
}
final MetricsFile<CpcgMetrics, Integer> file = getMetricsFile();
for (final List<Calculator> calcs : calculators.values()) {
for (final Calculator calc : calcs) {
final CpcgMetrics m = calc.finish();
m.SAMPLE_ALIAS = StringUtil.join(",", new ArrayList<>(samples));
file.addMetric(m);
}
}
file.write(OUTPUT);
CloserUtil.close(in);
return null;
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class BuildBamIndex method doWork.
/**
* Main method for the program. Checks that all input files are present and
* readable and that the output file can be written to. Then iterates through
* all the records generating a BAM Index, then writes the bai file.
*/
@Override
protected Object doWork() {
try {
inputUrl = new URL(INPUT);
} catch (MalformedURLException e) {
inputFile = new File(INPUT);
}
// set default output file - input-file.bai
if (OUTPUT == null) {
final String baseFileName;
if (inputUrl != null) {
final String path = inputUrl.getPath();
final int lastSlash = path.lastIndexOf('/');
baseFileName = path.substring(lastSlash + 1, path.length());
} else {
baseFileName = inputFile.getAbsolutePath();
}
if (baseFileName.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION)) {
final int index = baseFileName.lastIndexOf('.');
OUTPUT = new File(baseFileName.substring(0, index) + BAMIndex.BAMIndexSuffix);
} else {
OUTPUT = new File(baseFileName + BAMIndex.BAMIndexSuffix);
}
}
IOUtil.assertFileIsWritable(OUTPUT);
final SamReader bam;
if (inputUrl != null) {
// remote input
bam = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).disable(SamReaderFactory.Option.EAGERLY_DECODE).enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS).open(SamInputResource.of(inputUrl));
} else {
// input from a normal file
IOUtil.assertFileIsReadable(inputFile);
bam = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS).open(inputFile);
}
if (bam.type() != SamReader.Type.BAM_TYPE) {
throw new SAMException("Input file must be bam file, not sam file.");
}
if (!bam.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
throw new SAMException("Input bam file must be sorted by coordinates");
}
BAMIndexer.createIndex(bam, OUTPUT);
logger.info("Successfully wrote bam index file " + OUTPUT);
CloserUtil.close(bam);
return null;
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class CollectJumpingLibraryMetrics method getOutieMode.
/**
* Calculates the mode for outward-facing pairs, using the first SAMPLE_FOR_MODE
* outward-facing pairs found in INPUT
*/
private double getOutieMode() {
int samplePerFile = SAMPLE_FOR_MODE / INPUT.size();
Histogram<Integer> histo = new Histogram<>();
for (File f : INPUT) {
SamReader reader = SamReaderFactory.makeDefault().open(f);
int sampled = 0;
for (Iterator<SAMRecord> it = reader.iterator(); it.hasNext() && sampled < samplePerFile; ) {
SAMRecord sam = it.next();
if (!sam.getFirstOfPairFlag()) {
continue;
}
// If we get here we've hit the end of the aligned reads
if (sam.getReadUnmappedFlag() && sam.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
break;
} else if (sam.getReadUnmappedFlag() || sam.getMateUnmappedFlag()) {
continue;
} else {
if ((sam.getAttribute(SAMTag.MQ.name()) == null || sam.getIntegerAttribute(SAMTag.MQ.name()) >= MINIMUM_MAPPING_QUALITY) && sam.getMappingQuality() >= MINIMUM_MAPPING_QUALITY && sam.getMateNegativeStrandFlag() != sam.getReadNegativeStrandFlag() && sam.getMateReferenceIndex().equals(sam.getReferenceIndex())) {
if (SamPairUtil.getPairOrientation(sam) == PairOrientation.RF) {
histo.increment(Math.abs(sam.getInferredInsertSize()));
sampled++;
}
}
}
}
CloserUtil.close(reader);
}
return histo.size() > 0 ? histo.getMode() : 0;
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class GATKToolUnitTest method testBestSequenceDictionary_fromReads.
@Test
public void testBestSequenceDictionary_fromReads() throws Exception {
final GATKTool tool = new TestGATKToolWithReads();
final CommandLineParser clp = new CommandLineArgumentParser(tool);
final File bamFile = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
final String[] args = { "-I", bamFile.getCanonicalPath() };
clp.parseArguments(System.out, args);
tool.onStartup();
//read the dict back in and compare to bam dict
final SAMSequenceDictionary toolDict = tool.getBestAvailableSequenceDictionary();
try (final SamReader open = SamReaderFactory.makeDefault().open(bamFile)) {
final SAMSequenceDictionary bamDict = open.getFileHeader().getSequenceDictionary();
toolDict.assertSameDictionary(bamDict);
bamDict.assertSameDictionary(toolDict);
Assert.assertEquals(toolDict, bamDict);
}
}
Aggregations