use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.
the class CombineReadCounts method readCountFileReader.
/**
* Creates a read-count file reader given the input files and the expected target collection.
* @param file the input file.
* @param targets the expected targets in the input file.
* @return never {@code null}.
*/
private TableReader<ReadCountRecord> readCountFileReader(final File file, final TargetCollection<Target> targets) {
try {
return new TableReader<ReadCountRecord>(file) {
private boolean hasName;
private boolean hasCoordinates;
private int countColumnCount;
private int[] countColumnIndexes;
@Override
public void processColumns(final TableColumnCollection columns) {
hasCoordinates = columns.containsAll(TargetTableColumn.CONTIG.toString(), TargetTableColumn.START.toString(), TargetTableColumn.END.toString());
hasName = columns.contains(TargetTableColumn.NAME.toString());
if (!hasCoordinates && !hasName) {
throw formatException("header contain neither coordinates nor target name columns");
}
final List<String> countColumnNames = readCountColumnNames(columns);
countColumnCount = countColumnNames.size();
countColumnIndexes = new int[countColumnCount];
for (int i = 0; i < countColumnCount; i++) {
countColumnIndexes[i] = columns.indexOf(countColumnNames.get(i));
}
}
@Override
protected ReadCountRecord createRecord(final DataLine dataLine) {
final double[] counts = new double[countColumnCount];
final Target target = createTarget(dataLine);
for (int i = 0; i < counts.length; i++) {
counts[i] = dataLine.getDouble(countColumnIndexes[i]);
}
return new ReadCountRecord(target, counts);
}
/**
* Extracts the target object out of a data input line.
* @param dataLine the input data line.
* @return never {@code null}.
*/
private Target createTarget(final DataLine dataLine) {
if (hasName) {
final String name = dataLine.get(TargetTableColumn.NAME);
final Target target = targets.target(name);
final SimpleInterval interval = createInterval(dataLine);
if (target == null) {
return new Target(name, createInterval(dataLine));
} else if (interval != null && !interval.equals(target.getInterval())) {
throw new UserException.BadInput(String.format("invalid target '%s' coordinates: expected %s but found %s", name, target.getInterval(), createInterval(dataLine)));
} else {
return target;
}
} else {
// hasCoordinates must be true.
final SimpleInterval interval = createInterval(dataLine);
final Optional<Target> target = targets.targets(interval).stream().findAny();
if (!target.isPresent() || !target.get().getInterval().equals(interval)) {
throw formatException("target not found with coordinates " + interval);
}
return target.get();
}
}
/**
* Extract the interval out of a data line.
* @param dataLine the input data line.
* @return {@code null} if the interval cannot be determined from the input file alone.
*/
private SimpleInterval createInterval(final DataLine dataLine) {
if (hasCoordinates) {
return new SimpleInterval(dataLine.get(TargetTableColumn.CONTIG), dataLine.getInt(TargetTableColumn.START), dataLine.getInt(TargetTableColumn.END));
} else {
return null;
}
}
};
} catch (final IOException ex) {
throw new UserException.CouldNotReadInputFile(file, ex);
}
}
use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.
the class CalculateTargetCoverage method writeOutputRows.
/**
* Writes the row in the main matrix output file for a target and, if requested,
* the corresponding row in the row summary output file.
*
* @param countBuffer the counts for the target.
* @param index the index of target within the target collection.
*/
private void writeOutputRows(final int[] countBuffer, final long[] columnTotals, final int index) {
final String countString = IntStream.range(0, countBuffer.length).mapToObj(i -> transform.apply(countBuffer[i], columnTotals[i])).collect(Collectors.joining(COLUMN_SEPARATOR));
final String targetInfoString = targetOutInfo.composeTargetOutInfoString(index, targetCollection);
outputWriter.println(String.join(COLUMN_SEPARATOR, targetInfoString, countString));
if (rowSummaryOutputWriter != null) {
final long sum = MathUtils.sum(countBuffer);
final SimpleInterval location = targetCollection.location(index);
final int targetSize = location.size();
rowSummaryOutputWriter.println(String.join(COLUMN_SEPARATOR, targetInfoString, Long.toString(sum), String.format(AVERAGE_DOUBLE_FORMAT, sum / ((float) countColumns.columnCount() * targetSize))));
}
}
use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.
the class CalculateTargetCoverage method apply.
@Override
public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
final SimpleInterval readLocation = referenceContext.getInterval();
final int columnIndex = countColumns.columnIndex(read);
if (columnIndex >= 0) {
// < 0 would means that the read is to be ignored.
targetCollection.indexRange(readLocation).forEach(i -> counts[columnIndex][i]++);
}
}
use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.
the class ReadCountsReader method extractRecordWithTargetNameAndIntervalExtractorsAndTargetCollection.
private ReadCountRecord extractRecordWithTargetNameAndIntervalExtractorsAndTargetCollection(final DataLine data, final Function<DataLine, SimpleInterval> intervalExtractor, final Function<DataLine, String> targetNameExtractor, final Function<DataLine, double[]> countExtractor) {
final String name = targetNameExtractor.apply(data);
final SimpleInterval interval = intervalExtractor.apply(data);
final Target target = createRecordTarget(targets, interval);
if (target == null) {
return ignoreMissingTargets ? null : new ReadCountRecord(new Target(name, interval), countExtractor.apply(data));
} else if (!target.getName().equals(name)) {
throw formatException(String.format("conflicting target resolution from the name (%s) and interval (%s) provided", name, interval));
} else {
return new ReadCountRecord(target, countExtractor.apply(data));
}
}
use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.
the class GappedAlignmentSplitter method split.
/**
* Splits a gapped alignment into multiple alignment regions, based on input sensitivity: i.e. if the gap(s) in the input alignment
* is equal to or larger than the size specified by {@code sensitivity}, two alignment regions will be generated.
*
* To fulfill this functionality, needs to accomplish three key tasks correctly:
* <ul>
* <li>generate appropriate cigar,</li>
* <li>infer reference coordinates,</li>
* <li>infer contig coordinates</li>
* </ul>
* As an example, an alignment with CIGAR "397S118M2D26M6I50M7I26M1I8M13D72M398S", when {@code sensitivity} is set to "1", should be split into 5 alignments
* <ul>
* <li>"397S118M594S"</li>
* <li>"515S26M568S"</li>
* <li>"547S50M512S"</li>
* <li>"631S8M470S"</li>
* <li>"639S72M398S"</li>
* </ul>
* On the other hand, an alignment with CIGAR "10M10D10M60I10M10I10M50D10M", when {@code sensitivity} is set to "50", should be split into 3 alignments
* <ul>
* <li>"10M10D10M100S"</li>
* <li>"80S10M10I10M10S"</li>
* <li>"110S10M"</li>
* </ul>
* And when an alignment has hard clipping adjacent to soft clippings, e.g. "1H2S3M5I10M20D6M7S8H", it should be split into alignments with CIGAR resembling the original CIGAR as much as possible:
* <ul>
* <li>"1H2S3M28S8H"</li>
* <li>"1H10S10M13S8H"</li>
* <li>"1H20S6M7S8H"</li>
* </ul>
*
* @return an iterable of size >= 1. if size==1, the returned iterable contains only the input (i.e. either no gap or hasn't reached sensitivity)
*/
@VisibleForTesting
public static Iterable<AlignedAssembly.AlignmentInterval> split(final AlignedAssembly.AlignmentInterval oneRegion, final int sensitivity, final int unclippedContigLen) {
final List<CigarElement> cigarElements = checkCigarAndConvertTerminalInsertionToSoftClip(oneRegion.cigarAlong5to3DirectionOfContig);
if (cigarElements.size() == 1)
return new ArrayList<>(Collections.singletonList(oneRegion));
// blunt guess
final List<AlignedAssembly.AlignmentInterval> result = new ArrayList<>(3);
final int originalMapQ = oneRegion.mapQual;
final List<CigarElement> cigarMemoryList = new ArrayList<>();
final int clippedNBasesFromStart = SVVariantDiscoveryUtils.getNumClippedBases(true, cigarElements);
final int hardClippingAtBeginning = cigarElements.get(0).getOperator() == CigarOperator.H ? cigarElements.get(0).getLength() : 0;
final int hardClippingAtEnd = (cigarElements.get(cigarElements.size() - 1).getOperator() == CigarOperator.H) ? cigarElements.get(cigarElements.size() - 1).getLength() : 0;
final CigarElement hardClippingAtBeginningMaybeNull = hardClippingAtBeginning == 0 ? null : new CigarElement(hardClippingAtBeginning, CigarOperator.H);
int contigIntervalStart = 1 + clippedNBasesFromStart;
// we are walking along the contig following the cigar, which indicates that we might be walking backwards on the reference if oneRegion.forwardStrand==false
int refBoundary1stInTheDirectionOfContig = oneRegion.forwardStrand ? oneRegion.referenceInterval.getStart() : oneRegion.referenceInterval.getEnd();
for (final CigarElement cigarElement : cigarElements) {
final CigarOperator op = cigarElement.getOperator();
final int operatorLen = cigarElement.getLength();
switch(op) {
case M:
case EQ:
case X:
case S:
case H:
cigarMemoryList.add(cigarElement);
break;
case I:
case D:
if (operatorLen < sensitivity) {
cigarMemoryList.add(cigarElement);
break;
}
// collapse cigar memory list into a single cigar for ref & contig interval computation
final Cigar memoryCigar = new Cigar(cigarMemoryList);
final int effectiveReadLen = memoryCigar.getReadLength() + SVVariantDiscoveryUtils.getTotalHardClipping(memoryCigar) - SVVariantDiscoveryUtils.getNumClippedBases(true, memoryCigar);
// task 1: infer reference interval taking into account of strand
final SimpleInterval referenceInterval;
if (oneRegion.forwardStrand) {
referenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(), refBoundary1stInTheDirectionOfContig, refBoundary1stInTheDirectionOfContig + (memoryCigar.getReferenceLength() - 1));
} else {
referenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(), // step backward
refBoundary1stInTheDirectionOfContig - (memoryCigar.getReferenceLength() - 1), refBoundary1stInTheDirectionOfContig);
}
// task 2: infer contig interval
final int contigIntervalEnd = contigIntervalStart + effectiveReadLen - 1;
// task 3: now add trailing cigar element and create the real cigar for the to-be-returned AR
cigarMemoryList.add(new CigarElement(unclippedContigLen - contigIntervalEnd - hardClippingAtEnd, CigarOperator.S));
if (hardClippingAtEnd != 0) {
// be faithful to hard clipping (as the accompanying bases have been hard-clipped away)
cigarMemoryList.add(new CigarElement(hardClippingAtEnd, CigarOperator.H));
}
final Cigar cigarForNewAlignmentInterval = new Cigar(cigarMemoryList);
final AlignedAssembly.AlignmentInterval split = new AlignedAssembly.AlignmentInterval(referenceInterval, contigIntervalStart, contigIntervalEnd, cigarForNewAlignmentInterval, oneRegion.forwardStrand, originalMapQ, SVConstants.DiscoveryStepConstants.ARTIFICIAL_MISMATCH);
result.add(split);
// update cigar memory
cigarMemoryList.clear();
if (hardClippingAtBeginningMaybeNull != null) {
// be faithful about hard clippings
cigarMemoryList.add(hardClippingAtBeginningMaybeNull);
}
cigarMemoryList.add(new CigarElement(contigIntervalEnd - hardClippingAtBeginning + (op.consumesReadBases() ? operatorLen : 0), CigarOperator.S));
// update pointers into reference and contig
final int refBoundaryAdvance = op.consumesReadBases() ? memoryCigar.getReferenceLength() : memoryCigar.getReferenceLength() + operatorLen;
refBoundary1stInTheDirectionOfContig += oneRegion.forwardStrand ? refBoundaryAdvance : -refBoundaryAdvance;
contigIntervalStart += op.consumesReadBases() ? effectiveReadLen + operatorLen : effectiveReadLen;
break;
default:
// TODO: 1/20/17 still not quite sure if this is quite right, it doesn't blow up on NA12878 WGS, but who knows what happens in the future
throw new GATKException("Alignment CIGAR contains an unexpected N or P element: " + oneRegion.toPackedString());
}
}
if (result.isEmpty()) {
return new ArrayList<>(Collections.singletonList(oneRegion));
}
final SimpleInterval lastReferenceInterval;
if (oneRegion.forwardStrand) {
lastReferenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(), refBoundary1stInTheDirectionOfContig, oneRegion.referenceInterval.getEnd());
} else {
lastReferenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(), oneRegion.referenceInterval.getStart(), refBoundary1stInTheDirectionOfContig);
}
final Cigar lastForwardStrandCigar = new Cigar(cigarMemoryList);
int clippedNBasesFromEnd = SVVariantDiscoveryUtils.getNumClippedBases(false, cigarElements);
result.add(new AlignedAssembly.AlignmentInterval(lastReferenceInterval, contigIntervalStart, unclippedContigLen - clippedNBasesFromEnd, lastForwardStrandCigar, oneRegion.forwardStrand, originalMapQ, SVConstants.DiscoveryStepConstants.ARTIFICIAL_MISMATCH));
return result;
}
Aggregations