use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk-protected by broadinstitute.
the class HDF5AllelicPoNUtils method read.
/**
* Reads an {@link AllelicPanelOfNormals} from an {@link HDF5File}.
*/
static AllelicPanelOfNormals read(final HDF5File file) {
    if (file.isPresent(ALLELIC_PANEL_GROUP_NAME)) {
        //read global hyperparameters
        final double globalAlpha = file.readDouble(GLOBAL_ALPHA_PATH);
        final double globalBeta = file.readDouble(GLOBAL_BETA_PATH);
        final AllelicPanelOfNormals.HyperparameterValues globalHyperparameterValues = new AllelicPanelOfNormals.HyperparameterValues(globalAlpha, globalBeta);
        //read SNPs
        final List<SimpleInterval> snps = readSNPs(file);
        //read local hyperparameters
        final List<AllelicPanelOfNormals.HyperparameterValues> localHyperparameterValues = readLocalhyperparameterValues(file);
        //check number of SNPs and local hyperparameters match
        if (snps.size() != localHyperparameterValues.size()) {
            throw new GATKException(String.format("Wrong number of elements in the SNPs and local hyperparameters recovered from file '%s': %d != %d", file.getFile(), snps.size(), localHyperparameterValues.size()));
        }
        //create site-to-hyperparameter map
        final Map<SimpleInterval, AllelicPanelOfNormals.HyperparameterValues> siteToHyperparameterPairMap = new HashMap<>();
        for (int i = 0; i < snps.size(); i++) {
            final SimpleInterval site = snps.get(i);
            final AllelicPanelOfNormals.HyperparameterValues hyperparameterValues = localHyperparameterValues.get(i);
            if (siteToHyperparameterPairMap.containsKey(site)) {
                throw new UserException.BadInput("Allelic panel of normals contains duplicate sites.");
            } else {
                siteToHyperparameterPairMap.put(site, hyperparameterValues);
            }
        }
        return new AllelicPanelOfNormals(globalHyperparameterValues, siteToHyperparameterPairMap);
    }
    //if HDF5 file does not contain path for allelic PoN, return empty PoN
    return AllelicPanelOfNormals.EMPTY_PON;
}
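The duplicate-site guard above pairs two parallel lists into a map and refuses duplicate keys. A minimal standalone sketch (not GATK code) of the same pattern, where the String keys and Double values are hypothetical stand-ins for SimpleInterval and HyperparameterValues:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SiteMapDemo {
    public static void main(String[] args) {
        final List<String> sites = List.of("1:100", "1:200", "2:300");
        final List<Double> values = List.of(0.1, 0.2, 0.3);
        if (sites.size() != values.size()) {
            throw new IllegalStateException("sites and values differ in length");
        }
        final Map<String, Double> siteToValue = new HashMap<>();
        for (int i = 0; i < sites.size(); i++) {
            // put returns the previous mapping, so a non-null result signals a duplicate site
            if (siteToValue.put(sites.get(i), values.get(i)) != null) {
                throw new IllegalArgumentException("Duplicate site: " + sites.get(i));
            }
        }
        System.out.println(siteToValue);
    }
}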
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class IntervalArgumentCollection method parseIntervals.
private void parseIntervals(final GenomeLocParser genomeLocParser) {
    // return if no interval arguments at all
    if (!intervalsSpecified()) {
        throw new GATKException("Cannot call parseIntervals() without specifying either intervals to include or exclude.");
    }
    GenomeLocSortedSet includeSortedSet;
    if (getIntervalStrings().isEmpty()) {
        // the -L argument isn't specified, which means that -XL was, since we checked intervalsSpecified()
        // therefore we set the include set to be the entire reference territory
        includeSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(genomeLocParser.getSequenceDictionary());
    } else {
        try {
            includeSortedSet = IntervalUtils.loadIntervals(getIntervalStrings(), intervalSetRule, intervalMerging, intervalPadding, genomeLocParser);
        } catch (UserException.EmptyIntersection e) {
            throw new CommandLineException.BadArgumentValue("-L, --interval_set_rule", getIntervalStrings() + "," + intervalSetRule, "The specified intervals had an empty intersection");
        }
    }
    final GenomeLocSortedSet excludeSortedSet = IntervalUtils.loadIntervals(excludeIntervalStrings, IntervalSetRule.UNION, intervalMerging, intervalExclusionPadding, genomeLocParser);
    if (excludeSortedSet.contains(GenomeLoc.UNMAPPED)) {
        throw new UserException("-XL unmapped is not currently supported");
    }
    GenomeLocSortedSet intervals;
    // if no exclude arguments, can return the included set directly
    if (excludeSortedSet.isEmpty()) {
        intervals = includeSortedSet;
    } else {
        // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
        intervals = includeSortedSet.subtractRegions(excludeSortedSet);
        if (intervals.isEmpty()) {
            throw new CommandLineException.BadArgumentValue("-L,-XL", getIntervalStrings().toString() + ", " + excludeIntervalStrings.toString(), "The intervals specified for exclusion with -XL removed all territory specified by -L.");
        }
        // logging messages only printed when exclude (-XL) arguments are given
        final long toPruneSize = includeSortedSet.coveredSize();
        final long toExcludeSize = excludeSortedSet.coveredSize();
        final long intervalSize = intervals.coveredSize();
        logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
        logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
    }
    logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
    // Separate out requests for unmapped records from the rest of the intervals.
    boolean traverseUnmapped = false;
    if (intervals.contains(GenomeLoc.UNMAPPED)) {
        traverseUnmapped = true;
        intervals.remove(GenomeLoc.UNMAPPED);
    }
    traversalParameters = new TraversalParameters(IntervalUtils.convertGenomeLocsToSimpleIntervals(intervals.toList()), traverseUnmapped);
}
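The second logging call above reports the exclusion as a percentage via (excluded loci) / (0.01 * original loci), which is just (excluded / original) * 100. A standalone sketch (not GATK code) of that arithmetic with made-up interval sizes:

public class ReductionPercentDemo {
    public static void main(String[] args) {
        final long toPruneSize = 3_000_000L;   // loci covered by the include (-L) set; hypothetical value
        final long intervalSize = 2_400_000L;  // loci remaining after subtracting the exclude (-XL) set; hypothetical value
        final long excluded = toPruneSize - intervalSize;
        // dividing by 0.01 * total is the same as multiplying the ratio by 100
        final double percentReduction = excluded / (0.01 * toPruneSize);
        System.out.printf("Excluding %d loci from original intervals (%.2f%% reduction)%n", excluded, percentReduction);
        // prints: Excluding 600000 loci from original intervals (20.00% reduction)
    }
}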
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class AddContextDataToReadSparkOptimized method fillContext.
/**
* Given a shard that has reads and variants, query Google Genomics' Reference server and get reference info
* (including an extra margin on either side), and fill that and the correct variants into readContext.
*/
public static ContextShard fillContext(ReferenceMultiSource refSource, ContextShard shard) {
    if (null == shard)
        return null;
    // use the function to make sure we get the exact correct amount of reference bases
    int start = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;
    SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction = refSource.getReferenceWindowFunction();
    for (GATKRead r : shard.reads) {
        SimpleInterval readRefs = referenceWindowFunction.apply(r);
        start = Math.min(readRefs.getStart(), start);
        end = Math.max(readRefs.getEnd(), end);
    }
    if (start == Integer.MAX_VALUE) {
        // there are no reads in this shard, so we're going to remove it
        return null;
    }
    SimpleInterval refInterval = new SimpleInterval(shard.interval.getContig(), start, end);
    ReferenceBases refBases;
    try {
        refBases = refSource.getReferenceBases(null, refInterval);
    } catch (IOException x) {
        throw new GATKException("Unable to read the reference");
    }
    ArrayList<ReadContextData> readContext = new ArrayList<>();
    for (GATKRead r : shard.reads) {
        SimpleInterval readInterval = new SimpleInterval(r);
        List<GATKVariant> variantsOverlappingThisRead = shard.variantsOverlapping(readInterval);
        // we pass all the bases. That's better because this way it's just a shared
        // pointer instead of being an array copy. Downstream processing is fine with having
        // extra bases (it expects a few, actually).
        readContext.add(new ReadContextData(refBases, variantsOverlappingThisRead));
    }
    return shard.withReadContext(readContext);
}
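fillContext widens the reference query to the union of the per-read reference windows by tracking a running minimum start and maximum end, with Integer.MAX_VALUE doubling as an "empty shard" sentinel. A standalone sketch (not GATK code) of that idiom over hypothetical [start, end] windows:

public class WindowUnionDemo {
    public static void main(String[] args) {
        final int[][] windows = { { 120, 280 }, { 95, 250 }, { 300, 460 } };  // hypothetical per-read windows
        int start = Integer.MAX_VALUE;
        int end = Integer.MIN_VALUE;
        for (final int[] w : windows) {
            start = Math.min(w[0], start);
            end = Math.max(w[1], end);
        }
        if (start == Integer.MAX_VALUE) {
            // mirrors the early return for a shard with no reads
            System.out.println("no windows, nothing to fetch");
        } else {
            System.out.println("union window: " + start + "-" + end);  // prints 95-460
        }
    }
}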
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class RevertSam method doWork.
@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean sanitizing = SANITIZE;
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
    final SAMFileHeader inHeader = in.getFileHeader();
    // If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
    // groups have the same values.
    final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
    if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
        boolean allSampleAliasesIdentical = true;
        boolean allLibraryNamesIdentical = true;
        for (int i = 1; i < rgs.size(); i++) {
            if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
                allSampleAliasesIdentical = false;
            }
            if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
                allLibraryNamesIdentical = false;
            }
        }
        if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
            throw new UserException("Read groups have multiple values for sample. A value for SAMPLE_ALIAS cannot be supplied.");
        }
        if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
            throw new UserException("Read groups have multiple values for library name. A value for library name cannot be supplied.");
        }
    }
    ////////////////////////////////////////////////////////////////////////////
    // Build the output writer with an appropriate header based on the options
    ////////////////////////////////////////////////////////////////////////////
    final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
    final SAMFileHeader outHeader = new SAMFileHeader();
    for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
        if (SAMPLE_ALIAS != null) {
            rg.setSample(SAMPLE_ALIAS);
        }
        if (LIBRARY_NAME != null) {
            rg.setLibrary(LIBRARY_NAME);
        }
        outHeader.addReadGroup(rg);
    }
    outHeader.setSortOrder(SORT_ORDER);
    if (!REMOVE_ALIGNMENT_INFORMATION) {
        outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
        outHeader.setProgramRecords(inHeader.getProgramRecords());
    }
    final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
    ////////////////////////////////////////////////////////////////////////////
    // Build a sorting collection to use if we are sanitizing
    ////////////////////////////////////////////////////////////////////////////
    final SortingCollection<SAMRecord> sorter;
    if (sanitizing) {
        sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
    } else {
        sorter = null;
    }
    final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
    for (final SAMRecord rec : in) {
        // Weed out non-primary and supplementary reads, as we don't want duplicates in the reverted file!
        if (rec.isSecondaryOrSupplementary())
            continue;
        // Actually do the reverting of the remaining records
        revertSamRecord(rec);
        if (sanitizing)
            sorter.add(rec);
        else
            out.addAlignment(rec);
        progress.record(rec);
    }
    ////////////////////////////////////////////////////////////////////////////
    if (!sanitizing) {
        out.close();
    } else {
        long total = 0, discarded = 0;
        final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
        final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
        // Figure out the quality score encoding scheme for each read group.
        for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
            final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
            final SamRecordFilter filter = new SamRecordFilter() {
                @Override
                public boolean filterOut(final SAMRecord rec) {
                    return !rec.getReadGroup().getId().equals(rg.getId());
                }

                @Override
                public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                    throw new UnsupportedOperationException();
                }
            };
            readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
            CloserUtil.close(reader);
        }
        for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
            logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
        }
        if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
            logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
            return -1;
        }
        final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
        readNameLoop:
        while (iterator.hasNext()) {
            final List<SAMRecord> recs = fetchByReadName(iterator);
            total += recs.size();
            // Check that all the reads have bases and qualities of the same length
            for (final SAMRecord rec : recs) {
                if (rec.getReadBases().length != rec.getBaseQualities().length) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // Check that if the first read is marked as unpaired, there is in fact only one read
            if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
                logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
                discarded += recs.size();
                continue readNameLoop;
            }
            // Check that if we have paired reads there is exactly one first of pair and one second of pair
            if (recs.get(0).getReadPairedFlag()) {
                int firsts = 0, seconds = 0, unpaired = 0;
                for (final SAMRecord rec : recs) {
                    if (!rec.getReadPairedFlag())
                        ++unpaired;
                    if (rec.getFirstOfPairFlag())
                        ++firsts;
                    if (rec.getSecondOfPairFlag())
                        ++seconds;
                }
                if (unpaired > 0 || firsts != 1 || seconds != 1) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information is corrupt.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // If we've made it this far, spit the records into the output!
            for (final SAMRecord rec : recs) {
                // The only valid quality score encoding scheme is standard; if it's not standard, change it.
                final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
                if (!recordFormat.equals(FastqQualityFormat.Standard)) {
                    final byte[] quals = rec.getBaseQualities();
                    for (int i = 0; i < quals.length; i++) {
                        quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
                    }
                    rec.setBaseQualities(quals);
                }
                out.addAlignment(rec);
                sanitizerProgress.record(rec);
            }
        }
        out.close();
        final double discardRate = discarded / (double) total;
        final NumberFormat fmt = new DecimalFormat("0.000%");
        logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
        if (discardRate > MAX_DISCARD_FRACTION) {
            throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
        }
    }
    CloserUtil.close(in);
    return null;
}
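The sanitizing branch rescales non-standard qualities by subtracting SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND, i.e. the gap between the Illumina-1.3+ FASTQ offset (64) and the standard Phred+33 offset. A standalone sketch (not GATK or htsjdk code) of that adjustment; the offset constant and the sample qualities are assumptions for illustration only:

public class QualityRescaleDemo {
    // assumed to equal 64 - 33; stands in for SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND
    private static final int ILLUMINA_TO_PHRED_OFFSET = 31;

    public static void main(String[] args) {
        final byte[] quals = { 61, 65, 71 };  // hypothetical qualities decoded with the wrong (+64) offset
        for (int i = 0; i < quals.length; i++) {
            quals[i] -= ILLUMINA_TO_PHRED_OFFSET;
        }
        // now 30, 34, 40: plausible Phred-scaled base qualities
        System.out.println(java.util.Arrays.toString(quals));
    }
}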