use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.
the class ReadGroupCovariateUnitTest method testSingleRecord.
@Test
public void testSingleRecord() {
final String id = "MY.ID";
final String expected = "SAMPLE.1";
final ReadGroupCovariate covariate = new ReadGroupCovariate(new RecalibrationArgumentCollection(), Arrays.asList(expected));
SAMReadGroupRecord rg = new SAMReadGroupRecord(id);
rg.setPlatformUnit(expected);
runTest(rg, expected, covariate);
}
use of htsjdk.samtools.SAMReadGroupRecord in project gatk-protected by broadinstitute.
the class ReferenceConfidenceModelUnitTest method setUp.
@BeforeClass
public void setUp() {
header = ArtificialReadUtils.createArtificialSamHeader(1, 1, 1000);
rg = new SAMReadGroupRecord(RGID);
rg.setSample(sample);
header.addReadGroup(rg);
parser = new GenomeLocParser(header.getSequenceDictionary());
}
use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.
the class EstimateLibraryComplexity method doWork.
/**
* Method that does most of the work. Reads through the input BAM file and extracts the
* read sequences of each read pair and sorts them via a SortingCollection. Then traverses
* the sorted reads and looks at small groups at a time to find duplicates.
*/
@Override
protected Object doWork() {
for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
final int recordsRead = 0;
final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
// Loop through the input files and pick out the read sequences etc.
final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
for (final File f : INPUT) {
final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
readGroups.addAll(in.getFileHeader().getReadGroups());
for (final SAMRecord rec : in) {
if (!rec.getReadPairedFlag())
continue;
if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
continue;
}
PairedReadSequence prs = pendingByName.remove(rec.getReadName());
if (prs == null) {
// Make a new paired read object and add RG and physical location information to it
prs = new PairedReadSequence();
if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg != null)
prs.setReadGroup((short) readGroups.indexOf(rg));
}
pendingByName.put(rec.getReadName(), prs);
}
// Read passes quality check if both ends meet the mean quality criteria
final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
prs.qualityOk = prs.qualityOk && passesQualityCheck;
// Get the bases and restore them to their original orientation if necessary
final byte[] bases = rec.getReadBases();
if (rec.getReadNegativeStrandFlag())
SequenceUtil.reverseComplement(bases);
if (rec.getFirstOfPairFlag()) {
prs.read1 = bases;
} else {
prs.read2 = bases;
}
if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
sorter.add(prs);
}
progress.record(rec);
}
CloserUtil.close(in);
}
logger.info("Finished reading - moving on to scanning for duplicates.");
// Now go through the sorted reads and attempt to find duplicates
try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
int groupsProcessed = 0;
long lastLogTime = System.currentTimeMillis();
final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
while (iterator.hasNext()) {
// Get the next group and split it apart by library
final List<PairedReadSequence> group = getNextGroup(iterator);
if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
final PairedReadSequence prs = group.get(0);
logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
} else {
final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
// Now process the reads by library
for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
final String library = entry.getKey();
final List<PairedReadSequence> seqs = entry.getValue();
Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
if (duplicationHisto == null) {
duplicationHisto = new Histogram<>("duplication_group_count", library);
opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
duplicationHistosByLibrary.put(library, duplicationHisto);
opticalHistosByLibrary.put(library, opticalHisto);
}
// Figure out if any reads within this group are duplicates of one another
for (int i = 0; i < seqs.size(); ++i) {
final PairedReadSequence lhs = seqs.get(i);
if (lhs == null)
continue;
final List<PairedReadSequence> dupes = new ArrayList<>();
for (int j = i + 1; j < seqs.size(); ++j) {
final PairedReadSequence rhs = seqs.get(j);
if (rhs == null)
continue;
if (matches(lhs, rhs, MAX_DIFF_RATE)) {
dupes.add(rhs);
seqs.set(j, null);
}
}
if (!dupes.isEmpty()) {
dupes.add(lhs);
final int duplicateCount = dupes.size();
duplicationHisto.increment(duplicateCount);
final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
for (final boolean b : flags) {
if (b)
opticalHisto.increment(duplicateCount);
}
} else {
duplicationHisto.increment(1);
}
}
}
++groupsProcessed;
if (lastLogTime < System.currentTimeMillis() - 60000) {
logger.info("Processed " + groupsProcessed + " groups.");
lastLogTime = System.currentTimeMillis();
}
}
}
sorter.cleanup();
final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
for (final String library : duplicationHistosByLibrary.keySet()) {
final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
final DuplicationMetrics metrics = new DuplicationMetrics();
metrics.LIBRARY = library;
// Filter out any bins that have only a single entry in them and calcu
for (final Integer bin : duplicationHisto.keySet()) {
final double duplicateGroups = duplicationHisto.get(bin).getValue();
final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
if (duplicateGroups > 1) {
metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
}
}
metrics.calculateDerivedMetrics();
file.addMetric(metrics);
file.addHistogram(duplicationHisto);
}
file.write(OUTPUT);
}
return null;
}
use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.
the class RevertSam method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final boolean sanitizing = SANITIZE;
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SAMFileHeader inHeader = in.getFileHeader();
// If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
// groups have the same values.
final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
boolean allSampleAliasesIdentical = true;
boolean allLibraryNamesIdentical = true;
for (int i = 1; i < rgs.size(); i++) {
if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
allSampleAliasesIdentical = false;
}
if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
allLibraryNamesIdentical = false;
}
}
if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
throw new UserException("Read groups have multiple values for sample. " + "A value for SAMPLE_ALIAS cannot be supplied.");
}
if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
throw new UserException("Read groups have multiple values for library name. " + "A value for library name cannot be supplied.");
}
}
////////////////////////////////////////////////////////////////////////////
// Build the output writer with an appropriate header based on the options
////////////////////////////////////////////////////////////////////////////
final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
final SAMFileHeader outHeader = new SAMFileHeader();
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
if (SAMPLE_ALIAS != null) {
rg.setSample(SAMPLE_ALIAS);
}
if (LIBRARY_NAME != null) {
rg.setLibrary(LIBRARY_NAME);
}
outHeader.addReadGroup(rg);
}
outHeader.setSortOrder(SORT_ORDER);
if (!REMOVE_ALIGNMENT_INFORMATION) {
outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
outHeader.setProgramRecords(inHeader.getProgramRecords());
}
final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
////////////////////////////////////////////////////////////////////////////
// Build a sorting collection to use if we are sanitizing
////////////////////////////////////////////////////////////////////////////
final SortingCollection<SAMRecord> sorter;
if (sanitizing) {
sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
} else {
sorter = null;
}
final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
for (final SAMRecord rec : in) {
// Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary())
continue;
// Actually to the reverting of the remaining records
revertSamRecord(rec);
if (sanitizing)
sorter.add(rec);
else
out.addAlignment(rec);
progress.record(rec);
}
////////////////////////////////////////////////////////////////////////////
if (!sanitizing) {
out.close();
} else {
long total = 0, discarded = 0;
final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
// Figure out the quality score encoding scheme for each read group.
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SamRecordFilter filter = new SamRecordFilter() {
@Override
public boolean filterOut(final SAMRecord rec) {
return !rec.getReadGroup().getId().equals(rg.getId());
}
@Override
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
throw new UnsupportedOperationException();
}
};
readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
CloserUtil.close(reader);
}
for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
}
if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
return -1;
}
final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
readNameLoop: while (iterator.hasNext()) {
final List<SAMRecord> recs = fetchByReadName(iterator);
total += recs.size();
// Check that all the reads have bases and qualities of the same length
for (final SAMRecord rec : recs) {
if (rec.getReadBases().length != rec.getBaseQualities().length) {
logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
discarded += recs.size();
continue readNameLoop;
}
}
// Check that if the first read is marked as unpaired that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
discarded += recs.size();
continue readNameLoop;
}
// Check that if we have paired reads there is exactly one first of pair and one second of pair
if (recs.get(0).getReadPairedFlag()) {
int firsts = 0, seconds = 0, unpaired = 0;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag())
++unpaired;
if (rec.getFirstOfPairFlag())
++firsts;
if (rec.getSecondOfPairFlag())
++seconds;
}
if (unpaired > 0 || firsts != 1 || seconds != 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
discarded += recs.size();
continue readNameLoop;
}
}
// If we've made it this far spit the records into the output!
for (final SAMRecord rec : recs) {
// The only valid quality score encoding scheme is standard; if it's not standard, change it.
final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
if (!recordFormat.equals(FastqQualityFormat.Standard)) {
final byte[] quals = rec.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
}
rec.setBaseQualities(quals);
}
out.addAlignment(rec);
sanitizerProgress.record(rec);
}
}
out.close();
final double discardRate = discarded / (double) total;
final NumberFormat fmt = new DecimalFormat("0.000%");
logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
if (discarded / (double) total > MAX_DISCARD_FRACTION) {
throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
}
}
CloserUtil.close(in);
return null;
}
use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.
the class ReferenceConfidenceModelUnitTest method setUp.
@BeforeClass
public void setUp() {
header = ArtificialReadUtils.createArtificialSamHeader(1, 1, 1000);
rg = new SAMReadGroupRecord(RGID);
rg.setSample(sample);
header.addReadGroup(rg);
parser = new GenomeLocParser(header.getSequenceDictionary());
}
Aggregations