Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class CoverageServer, method printRaster.
/**
* Render the BAM over a small interval as an image, displaying individual reads.
*/
private void printRaster(final BamInput bam, final SimpleInterval midRegion, final SimpleInterval region, final HttpServletRequest request, final HttpServletResponse response) throws IOException, ServletException {
final IntToDoubleFunction position2pixel = X -> ((X - region.getStart()) / (double) region.getLengthOnReference()) * (double) image_width;
final SamReaderFactory srf = SamReaderFactory.make().validationStringency(ValidationStringency.LENIENT).referenceSequence(this.faidxRef);
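/* two reads can share a pileup row only if, once projected to pixels, the left read's unclipped end plus a one-pixel gap lies strictly before the right read's unclipped start */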
final Pileup<SAMRecord> pileup = new Pileup<>((L, R) -> position2pixel.applyAsDouble(L.getUnclippedEnd() + 1) + 1 < position2pixel.applyAsDouble(R.getUnclippedStart()));
try (SamReader sr = srf.open(bam.bamPath)) {
try (CloseableIterator<SAMRecord> iter = sr.query(region.getContig(), // extend the window so clipped reads overlapping the edges are included
Math.max(0, region.getStart() - this.small_region_size), region.getEnd() + this.small_region_size, false)) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (!acceptRead(rec))
continue;
if (rec.getUnclippedEnd() < region.getStart())
continue;
if (rec.getUnclippedStart() > region.getEnd())
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
pileup.add(rec);
}
} // end iterator
} // end samreader
ReferenceSequence refInInterval = null;
try (ReferenceSequenceFile refseq = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxRef)) {
final SAMSequenceRecord ssr = this.dictionary.getSequence(region.getContig());
if (region.getStart() <= ssr.getSequenceLength()) {
refInInterval = refseq.getSubsequenceAt(region.getContig(), region.getStart(), Math.min(region.getEnd(), ssr.getSequenceLength()));
}
}
final BufferedImage img = new BufferedImage(image_width, image_height, BufferedImage.TYPE_INT_RGB);
final Graphics2D g = img.createGraphics();
g.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY);
g.setColor(new Color(240, 240, 240));
g.fillRect(0, 0, image_width + 1, image_height + 1);
final Stroke oldStroke = g.getStroke();
g.setStroke(new BasicStroke(0.5f));
g.setColor(Color.WHITE);
final double mid_start = position2pixel.applyAsDouble(midRegion.getStart());
final double mid_end = position2pixel.applyAsDouble(midRegion.getEnd() + 1);
g.fill(new Rectangle2D.Double(mid_start, 0, (mid_end - mid_start), this.image_height));
final int[] int_coverage = new int[region.getLengthOnReference()];
final int margin_top = 12;
final double featureHeight = Math.min(20, (this.image_height - margin_top) / Math.max(1.0, (double) pileup.getRowCount()));
double y = image_height - featureHeight;
for (final List<SAMRecord> row : pileup) {
final double h2 = Math.min(featureHeight * 0.9, featureHeight - 2);
for (final SAMRecord rec : row) {
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
/* fill coverage array */
int ref1 = rec.getAlignmentStart();
for (CigarElement ce : cigar.getCigarElements()) {
if (ref1 > region.getEnd())
break;
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
int pos = ref1 + x;
if (pos < region.getStart())
continue;
if (pos > region.getEnd())
break;
int_coverage[pos - region.getStart()]++;
}
}
ref1 += ce.getLength();
}
}
/* draw rec itself */
final double midy = y + h2 / 2.0;
g.setColor(Color.DARK_GRAY);
g.draw(new Line2D.Double(position2pixel.applyAsDouble(rec.getUnclippedStart()), midy, position2pixel.applyAsDouble(rec.getUnclippedEnd()), midy));
ref1 = rec.getUnclippedStart();
final List<Double> insertions = new ArrayList<>();
for (final CigarElement ce : cigar.getCigarElements()) {
if (ref1 > region.getEnd())
break;
final CigarOperator op = ce.getOperator();
Shape shape = null;
Color fill = null;
Color stroke = Color.DARK_GRAY;
switch(op) {
case P:
break;
case M: // fall through
case X: // fall through
case EQ: // fall through
case S: // fall through
case H:
final double x1 = position2pixel.applyAsDouble(ref1);
shape = new Rectangle2D.Double(x1, y, position2pixel.applyAsDouble(ref1 + ce.getLength()) - x1, h2);
ref1 += ce.getLength();
switch(op) {
case H:
case S:
fill = Color.YELLOW;
break;
case X:
fill = Color.RED;
break;
case EQ:
case M:
fill = Color.LIGHT_GRAY;
break;
default:
break;
}
break;
case N: // fall through
case D:
shape = null;
fill = null;
stroke = null;
ref1 += ce.getLength();
break;
case I:
shape = null;
fill = null;
stroke = null;
insertions.add(position2pixel.applyAsDouble(ref1));
break;
default:
throw new IllegalStateException("" + op);
}
if (ref1 < region.getStart())
continue;
if (shape != null) {
if (fill != null) {
g.setColor(fill);
g.fill(shape);
}
if (stroke != null && h2 > 4) {
g.setColor(stroke);
g.draw(shape);
}
}
}
/* draw mismatched bases */
if (refInInterval != null && rec.getReadBases() != null && rec.getReadBases() != SAMRecord.NULL_SEQUENCE) {
final byte[] bases = rec.getReadBases();
final IntFunction<Character> baseRead = IDX -> IDX < 0 || IDX >= bases.length || bases == SAMRecord.NULL_SEQUENCE ? 'N' : (char) Character.toUpperCase(bases[IDX]);
int read0 = 0;
ref1 = rec.getAlignmentStart();
for (CigarElement ce : cigar.getCigarElements()) {
if (ref1 > region.getEnd())
break;
final CigarOperator op = ce.getOperator();
switch(op) {
case P:
break;
case H:
break;
case D:
case N:
ref1 += ce.getLength();
break;
case S:
case I:
read0 += ce.getLength();
break;
case EQ:
case M:
case X:
{
for (int j = 0; j < ce.getLength(); j++) {
if (ref1 + j < region.getStart())
continue;
if (ref1 + j >= region.getStart() + refInInterval.length())
break;
final int ref_base_idx = ref1 - region.getStart() + j;
char ctgBase = (char) (ref_base_idx < 0 || ref_base_idx >= refInInterval.length() ? 'N' : Character.toUpperCase(refInInterval.getBases()[ref_base_idx]));
if (ctgBase == 'N')
continue;
char readBase = baseRead.apply(read0 + j);
if (readBase == 'N')
continue;
if (readBase == ctgBase)
continue;
g.setColor(Color.ORANGE);
final double x1 = position2pixel.applyAsDouble(ref1 + j);
final double x2 = position2pixel.applyAsDouble(ref1 + j + 1);
g.fill(new Rectangle2D.Double(x1, y, x2 - x1, h2));
}
read0 += ce.getLength();
ref1 += ce.getLength();
break;
}
default:
break;
}
}
}
for (double px : insertions) {
g.setColor(Color.RED);
g.draw(new Line2D.Double(px, y - 0.5, px, y + h2 + 0.5));
}
}
y -= featureHeight;
}
double max_cov = IntStream.of(int_coverage).max().orElse(0);
g.setColor(Color.DARK_GRAY);
g.drawString("Sample:" + bam.sample + " max-cov:" + (int) max_cov + " " + region.toNiceString(), 10, 10);
/* plot coverage */
final GeneralPath gp = new GeneralPath();
for (int i = 0; max_cov > 0 && i < int_coverage.length; ++i) {
final double x1 = position2pixel.applyAsDouble(region.getStart() + i);
final double x2 = position2pixel.applyAsDouble(region.getStart() + i + 1);
final double y1 = image_height - (int_coverage[i] / max_cov) * (image_height - margin_top);
if (i == 0)
gp.moveTo(x1, y1);
else
gp.lineTo(x1, y1);
gp.lineTo(x2, y1);
}
g.setStroke(new BasicStroke(0.5f, BasicStroke.CAP_BUTT, BasicStroke.JOIN_BEVEL, 0f, new float[] { 1f, 2f, 1f }, 0f));
g.setColor(Color.BLUE);
g.draw(gp);
g.setStroke(oldStroke);
writeGenes(g, region);
writeKnownCnv(g, region);
g.setColor(Color.PINK);
g.draw(new Line2D.Double(mid_start, 0, mid_start, image_height));
g.draw(new Line2D.Double(mid_end, 0, mid_end, image_height));
writeImage(img, bam, region, response);
}
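The raster above is anchored on a linear genome-to-pixel mapping. The following standalone sketch (not jvarkit code; the region bounds and image width are hypothetical values) shows how positions inside the region project onto the image:

import java.util.function.IntToDoubleFunction;

public class Position2PixelDemo {
    public static void main(final String[] args) {
        // hypothetical region spanning 500 bases, drawn on a 1000-pixel-wide image
        final int regionStart = 1000;
        final int regionLength = 500; // getLengthOnReference()
        final int imageWidth = 1000;  // image_width
        final IntToDoubleFunction position2pixel = X -> ((X - regionStart) / (double) regionLength) * (double) imageWidth;
        System.out.println(position2pixel.applyAsDouble(1000)); // 0.0    : left edge
        System.out.println(position2pixel.applyAsDouble(1250)); // 500.0  : middle
        System.out.println(position2pixel.applyAsDouble(1500)); // 1000.0 : right edge
    }
}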
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class SamFindClippedRegions, method doWork.
@Override
public int doWork(final List<String> args) {
final int bad_mapq = 30;
try {
if (this.min_clip_depth > this.min_depth) {
LOG.error("this.min_clip_depth>this.min_depth");
return -1;
}
if (this.fraction < 0 || this.fraction > 1.0) {
LOG.error("bad ratio: " + fraction);
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceFai);
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.error("input is missing");
return -1;
}
final IntervalTreeMap<Interval> intronMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasIntron()).flatMap(T -> T.getIntrons().stream()).map(I -> new Interval(I.getContig(), I.getStart(), I.getEnd(), I.isNegativeStrand(), I.getTranscript().getId())).forEach(I -> intronMap.put(I, I));
}
}
final List<String> sample_list = new ArrayList<>();
final QueryInterval[] queryIntervals;
if (this.bedPath == null) {
queryIntervals = null;
} else {
try (BedLineReader br = new BedLineReader(this.bedPath).setValidationStringency(ValidationStringency.LENIENT).setContigNameConverter(ContigNameConverter.fromOneDictionary(dict))) {
queryIntervals = br.optimizeIntervals(dict);
}
}
SortingCollection<Base> sortingCollection = SortingCollection.newInstance(Base.class, new BaseCodec(), (A, B) -> A.compare1(B), writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPath());
sortingCollection.setDestructiveIteration(true);
final Predicate<Base> acceptBase = B -> {
return B.clip() > 0;
};
final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.referenceFai);
for (final Path path : inputs) {
try (SamReader sr = srf.open(path)) {
final SAMFileHeader header = sr.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
final String sample_name = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
if (sample_list.contains(sample_name)) {
LOG.error("duplicate sample " + sample_name + " in " + path);
return -1;
}
final int sample_idx = sample_list.size();
sample_list.add(sample_name);
int prev_tid = -1;
final SortedMap<Integer, Base> pos2base = new TreeMap<>();
/* get base in pos2base, create it if needed */
final BiFunction<Integer, Integer, Base> baseAt = (TID, POS) -> {
Base b = pos2base.get(POS);
if (b == null) {
b = new Base(sample_idx, TID, POS);
pos2base.put(POS, b);
}
return b;
};
try (CloseableIterator<SAMRecord> iter = queryIntervals == null ? sr.iterator() : sr.query(queryIntervals, false)) {
for (; ; ) {
final SAMRecord rec = (iter.hasNext() ? iter.next() : null);
if (rec != null && !SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
if (rec == null || prev_tid != rec.getReferenceIndex().intValue()) {
for (final Integer pos : pos2base.keySet()) {
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
}
if (rec == null)
break;
pos2base.clear();
prev_tid = rec.getReferenceIndex().intValue();
}
/* flush and emit bases located before the current read */
for (Iterator<Integer> rpos = pos2base.keySet().iterator(); rpos.hasNext(); ) {
final Integer pos = rpos.next();
if (pos.intValue() + this.max_clip_length >= rec.getUnclippedStart())
break;
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
rpos.remove();
}
final Cigar cigar = rec.getCigar();
int refPos = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
final Base gt = baseAt.apply(prev_tid, refPos + x);
gt.noClip++;
gt.noClip_sum_mapq += rec.getMappingQuality();
}
} else if (op.equals(CigarOperator.D) || op.equals(CigarOperator.N)) {
baseAt.apply(prev_tid, refPos).del++;
baseAt.apply(prev_tid, refPos + ce.getLength() - 1).del++;
}
refPos += ce.getLength();
}
}
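/* record soft/hard clips of sufficient length at the base flanking each side of the alignment */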
CigarElement ce = cigar.getFirstCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(prev_tid, rec.getStart() - 1).leftClip++;
}
ce = cigar.getLastCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(prev_tid, rec.getEnd() + 1).rightClip++;
}
}
}
/* write last bases */
for (final Integer pos : pos2base.keySet()) {
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
}
} // end open reader
} // end loop sam files
sortingCollection.doneAdding();
/* build VCF header */
final Allele reference_allele = Allele.create("N", true);
final Allele alt_allele = Allele.create("<CLIP>", false);
final Set<VCFHeaderLine> vcfHeaderLines = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(vcfHeaderLines, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
VCFStandardHeaderLines.addStandardInfoLines(vcfHeaderLines, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_FREQUENCY_KEY);
final VCFFormatHeaderLine leftClip = new VCFFormatHeaderLine("CL", 1, VCFHeaderLineType.Integer, "Left Clip");
vcfHeaderLines.add(leftClip);
final VCFFormatHeaderLine rightClip = new VCFFormatHeaderLine("RL", 1, VCFHeaderLineType.Integer, "Right Clip");
vcfHeaderLines.add(rightClip);
final VCFFormatHeaderLine totalCip = new VCFFormatHeaderLine("TL", 1, VCFHeaderLineType.Integer, "Total Clip");
vcfHeaderLines.add(totalCip);
final VCFFormatHeaderLine totalDel = new VCFFormatHeaderLine("DL", 1, VCFHeaderLineType.Integer, "Total Deletions");
vcfHeaderLines.add(totalDel);
final VCFFormatHeaderLine noClipMAPQ = new VCFFormatHeaderLine("MQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for reads without clip at this position.");
vcfHeaderLines.add(noClipMAPQ);
final VCFInfoHeaderLine averageMAPQ = new VCFInfoHeaderLine("AVG_MAPQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for called genotypes");
vcfHeaderLines.add(averageMAPQ);
final VCFInfoHeaderLine infoRetrogene = new VCFInfoHeaderLine("RETROGENE", 1, VCFHeaderLineType.String, "transcript name for Possible retrogene.");
vcfHeaderLines.add(infoRetrogene);
final VCFFilterHeaderLine filterRetrogene = new VCFFilterHeaderLine("POSSIBLE_RETROGENE", "Junction is a possible Retrogene.");
vcfHeaderLines.add(filterRetrogene);
final VCFFilterHeaderLine filterlowMapq = new VCFFilterHeaderLine("LOW_MAPQ", "Low average mapq (< " + bad_mapq + ")");
vcfHeaderLines.add(filterlowMapq);
final VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample_list);
vcfHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
this.writingVcfConfig.dictionary(dict);
try (VariantContextWriter w = this.writingVcfConfig.open(this.outputFile)) {
w.writeHeader(vcfHeader);
try (CloseableIterator<Base> r1 = sortingCollection.iterator()) {
try (EqualRangeIterator<Base> r2 = new EqualRangeIterator<>(r1, (A, B) -> A.compare2(B))) {
while (r2.hasNext()) {
final List<Base> array = r2.next();
final Base first = array.get(0);
if (first.pos < 1)
continue;
// skip if no sample has any clipped read at this position
if (array.stream().mapToInt(G -> G.clip()).sum() == 0)
continue;
if (array.stream().allMatch(G -> G.clip() < min_clip_depth))
continue;
if (array.stream().allMatch(G -> G.dp() < min_depth))
continue;
if (array.stream().allMatch(G -> G.ratio() < fraction))
continue;
final VariantContextBuilder vcb = new VariantContextBuilder();
final String chrom = dict.getSequence(first.tid).getSequenceName();
vcb.chr(chrom);
vcb.start(first.pos);
vcb.stop(first.pos);
vcb.alleles(Arrays.asList(reference_allele, alt_allele));
vcb.attribute(VCFConstants.DEPTH_KEY, array.stream().mapToInt(G -> G.dp()).sum());
final List<Genotype> genotypes = new ArrayList<>(array.size());
int AC = 0;
int AN = 0;
int max_clip = 1;
double sum_mapq = 0.0;
int count_mapq = 0;
for (final Base gt : array) {
final GenotypeBuilder gb = new GenotypeBuilder(sample_list.get(gt.sample_idx));
if (gt.clip() == 0 && gt.noClip == 0) {
gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
} else if (gt.noClip == 0) {
gb.alleles(Arrays.asList(alt_allele, alt_allele));
AC += 2;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
} else if (gt.clip() == 0) {
gb.alleles(Arrays.asList(reference_allele, reference_allele));
AN += 2;
} else {
gb.alleles(Arrays.asList(reference_allele, alt_allele));
AC++;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
}
gb.DP(gt.dp());
gb.attribute(leftClip.getID(), gt.leftClip);
gb.attribute(rightClip.getID(), gt.rightClip);
gb.attribute(totalCip.getID(), gt.clip());
gb.attribute(totalDel.getID(), gt.del);
gb.attribute(noClipMAPQ.getID(), gt.noClipMapq());
gb.AD(new int[] { gt.noClip, gt.clip() });
genotypes.add(gb.make());
max_clip = Math.max(max_clip, gt.clip());
}
if (count_mapq > 0) {
final int avg_mapq = (int) (sum_mapq / count_mapq);
vcb.attribute(averageMAPQ.getID(), avg_mapq);
if (avg_mapq < bad_mapq)
vcb.filter(filterlowMapq.getID());
}
if (gtfPath != null) {
final Locatable bounds1 = new SimpleInterval(chrom, Math.max(1, first.pos - max_intron_distance), first.pos + max_intron_distance);
intronMap.getOverlapping(bounds1).stream().filter(I -> Math.abs(I.getStart() - first.pos) <= this.max_intron_distance || Math.abs(I.getEnd() - first.pos) <= this.max_intron_distance).map(I -> I.getName()).findFirst().ifPresent(transcript_id -> {
vcb.attribute(infoRetrogene.getID(), transcript_id);
vcb.filter(filterRetrogene.getID());
});
}
vcb.log10PError(max_clip / -10.0);
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, AC);
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, AN);
if (AN > 0)
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, AC / (float) AN);
vcb.genotypes(genotypes);
w.add(vcb.make());
} // end while r2
} // end r2
} // end r1
} // end writer
sortingCollection.cleanup();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
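The filters applied to each candidate site above keep a position only when at least one sample reaches each threshold, not necessarily the same sample for every criterion. Below is a minimal sketch of that logic; the SiteCount type, threshold values and sample data are invented for illustration:

import java.util.List;

public class ClipFilterDemo {
    /** hypothetical per-sample counts at one genomic position */
    static final class SiteCount {
        final int clip;
        final int depth;
        SiteCount(final int clip, final int depth) { this.clip = clip; this.depth = depth; }
        double ratio() { return this.depth == 0 ? 0.0 : this.clip / (double) this.depth; }
    }

    static boolean keepSite(final List<SiteCount> samples, final int minClipDepth, final int minDepth, final double minFraction) {
        if (samples.stream().mapToInt(S -> S.clip).sum() == 0) return false; // no clip at all
        if (samples.stream().allMatch(S -> S.clip < minClipDepth)) return false;
        if (samples.stream().allMatch(S -> S.depth < minDepth)) return false;
        return !samples.stream().allMatch(S -> S.ratio() < minFraction);
    }

    public static void main(final String[] args) {
        final List<SiteCount> site = List.of(new SiteCount(8, 30), new SiteCount(0, 25));
        System.out.println(keepSite(site, 5, 10, 0.2)); // true: the first sample reaches every threshold
    }
}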
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class ValidateCnv, method doWork.
@Override
public int doWork(final List<String> args) {
if (this.extendFactor <= 0) {
LOG.error("bad extend factor " + this.extendFactor);
return -1;
}
if (this.treshold < 0 || this.treshold >= 0.25) {
LOG.error("Bad treshold 0 < " + this.treshold + " >=0.25 ");
return -1;
}
final Map<String, BamInfo> sample2bam = new HashMap<>();
VariantContextWriter out = null;
Iterator<? extends Locatable> iterIn = null;
try {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.rererencePath);
final CRAMReferenceSource cramReferenceSource = new ReferenceSource(this.rererencePath);
final List<Path> bamPaths = IOUtils.unrollPaths(this.bamFiles);
final String input = oneFileOrNull(args);
if (input == null) {
iterIn = IntervalListProvider.empty().dictionary(dict).skipUnknownContigs().fromInputStream(stdin(), "bed").iterator();
} else {
final IntervalListProvider ilp = IntervalListProvider.from(input).setVariantPredicate(CTX -> {
if (CTX.isSNP())
return false;
final String svType = CTX.getAttributeAsString(VCFConstants.SVTYPE, "");
if (svType != null && (svType.equals("INV") || svType.equals("BND")))
return false;
return true;
}).dictionary(dict).skipUnknownContigs();
iterIn = ilp.stream().iterator();
}
/* register each bam */
for (final Path p2 : bamPaths) {
final BamInfo bi = new BamInfo(p2, cramReferenceSource);
if (sample2bam.containsKey(bi.sampleName)) {
LOG.error("sample " + bi.sampleName + " specified twice.");
bi.close();
return -1;
}
sample2bam.put(bi.sampleName, bi);
}
if (sample2bam.isEmpty()) {
LOG.error("no bam was defined");
return -1;
}
final Set<VCFHeaderLine> metadata = new HashSet<>();
final VCFInfoHeaderLine infoSVSamples = new VCFInfoHeaderLine("N_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples that could carry a SV");
metadata.add(infoSVSamples);
final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length");
metadata.add(infoSvLen);
final BiFunction<String, String, VCFFormatHeaderLine> makeFmt = (TAG, DESC) -> new VCFFormatHeaderLine(TAG, 1, VCFHeaderLineType.Integer, DESC);
final VCFFormatHeaderLine formatCN = new VCFFormatHeaderLine("CN", 1, VCFHeaderLineType.Float, "normalized copy-number. Threshold was " + this.treshold);
metadata.add(formatCN);
final VCFFormatHeaderLine nReadsSupportingSv = makeFmt.apply("RSD", "number of split reads supporting SV.");
metadata.add(nReadsSupportingSv);
final VCFFilterHeaderLine filterAllDel = new VCFFilterHeaderLine("ALL_DEL", "number of samples greater than 1 and all are deletions");
metadata.add(filterAllDel);
final VCFFilterHeaderLine filterAllDup = new VCFFilterHeaderLine("ALL_DUP", "number of samples greater than 1 and all are duplication");
metadata.add(filterAllDup);
final VCFFilterHeaderLine filterNoSV = new VCFFilterHeaderLine("NO_SV", "There is no DUP or DEL in this variant");
metadata.add(filterNoSV);
final VCFFilterHeaderLine filterHomDel = new VCFFilterHeaderLine("HOM_DEL", "There is one Homozygous deletion.");
metadata.add(filterHomDel);
final VCFFilterHeaderLine filterHomDup = new VCFFilterHeaderLine("HOM_DUP", "There is one Homozygous duplication.");
metadata.add(filterHomDup);
VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_FILTER_KEY, VCFConstants.GENOTYPE_QUALITY_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY, VCFConstants.ALLELE_NUMBER_KEY);
final VCFHeader header = new VCFHeader(metadata, sample2bam.keySet());
if (dict != null)
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
out = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
out.writeHeader(header);
final Allele DUP_ALLELE = Allele.create("<DUP>", false);
final Allele DEL_ALLELE = Allele.create("<DEL>", false);
final Allele REF_ALLELE = Allele.create("N", true);
while (iterIn.hasNext()) {
final Locatable ctx = iterIn.next();
if (ctx == null)
continue;
final SAMSequenceRecord ssr = dict.getSequence(ctx.getContig());
if (ssr == null || ctx.getStart() >= ssr.getSequenceLength())
continue;
final int svLen = ctx.getLengthOnReference();
if (svLen < this.min_abs_sv_size)
continue;
if (svLen > this.max_abs_sv_size)
continue;
int n_samples_with_cnv = 0;
final SimplePosition breakPointLeft = new SimplePosition(ctx.getContig(), ctx.getStart());
final SimplePosition breakPointRight = new SimplePosition(ctx.getContig(), ctx.getEnd());
final int extend = 1 + (int) (svLen * this.extendFactor);
final int leftPos = Math.max(1, breakPointLeft.getPosition() - extend);
final int array_mid_start = breakPointLeft.getPosition() - leftPos;
final int array_mid_end = breakPointRight.getPosition() - leftPos;
final int rightPos = Math.min(breakPointRight.getPosition() + extend, ssr.getSequenceLength());
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ctx.getContig());
vcb.start(ctx.getStart());
vcb.stop(ctx.getEnd());
vcb.attribute(VCFConstants.END_KEY, ctx.getEnd());
final Set<Allele> alleles = new HashSet<>();
alleles.add(REF_ALLELE);
int count_dup = 0;
int count_del = 0;
int an = 0;
final Counter<Allele> countAlleles = new Counter<>();
final List<Genotype> genotypes = new ArrayList<>(sample2bam.size());
Double badestGQ = null;
final double[] raw_coverage = new double[CoordMath.getLength(leftPos, rightPos)];
for (final String sampleName : sample2bam.keySet()) {
final BamInfo bi = sample2bam.get(sampleName);
Arrays.fill(raw_coverage, 0.0);
int n_reads_supporting_sv = 0;
try (CloseableIterator<SAMRecord> iter2 = bi.samReader.queryOverlapping(ctx.getContig(), leftPos, rightPos)) {
while (iter2.hasNext()) {
final SAMRecord rec = iter2.next();
if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
// does any clip support the CNV breakpoints?
boolean read_supports_cnv = false;
final int breakpoint_distance = 10;
// any clip on left ?
if (cigar.isLeftClipped() && rec.getUnclippedStart() < rec.getAlignmentStart() && new SimpleInterval(ctx.getContig(), rec.getUnclippedStart(), rec.getAlignmentStart()).withinDistanceOf(breakPointLeft, breakpoint_distance)) {
read_supports_cnv = true;
}
// any clip on right ?
if (!read_supports_cnv && cigar.isRightClipped() && rec.getAlignmentEnd() < rec.getUnclippedEnd() && new SimpleInterval(ctx.getContig(), rec.getAlignmentEnd(), rec.getUnclippedEnd()).withinDistanceOf(breakPointRight, breakpoint_distance)) {
read_supports_cnv = true;
}
if (read_supports_cnv) {
n_reads_supporting_sv++;
}
int ref = rec.getStart();
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength() && ref + x - leftPos < raw_coverage.length; ++x) {
final int p = ref + x - leftPos;
if (p < 0 || p >= raw_coverage.length)
continue;
raw_coverage[p]++;
}
}
ref += ce.getLength();
}
}
} // end while iter record
} // end try query for iterator
// test for a large difference between the left and right median depth
final OptionalDouble medianDepthLeft = Percentile.median().evaluate(raw_coverage, 0, array_mid_start);
final OptionalDouble medianDepthRight = Percentile.median().evaluate(raw_coverage, array_mid_end, raw_coverage.length - array_mid_end);
// skip if either flanking depth is too low
if (!medianDepthLeft.isPresent() || medianDepthLeft.getAsDouble() < this.min_depth || !medianDepthRight.isPresent() || medianDepthRight.getAsDouble() < this.min_depth) {
final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("LowDp").make();
genotypes.add(gt2);
continue;
}
final double difference_factor = 2.0;
// skip if one side's depth, even halved, still exceeds the other side's
if (medianDepthLeft.getAsDouble() / difference_factor > medianDepthRight.getAsDouble() || medianDepthRight.getAsDouble() / difference_factor > medianDepthLeft.getAsDouble()) {
final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("DiffLR").make();
genotypes.add(gt2);
continue;
}
// apply a running median to smooth the coverage signal
final double[] smoothed_cov = new RunMedian(RunMedian.getTurlachSize(raw_coverage.length)).apply(raw_coverage);
final double[] bounds_cov = IntStream.concat(IntStream.range(0, array_mid_start), IntStream.range(array_mid_end, smoothed_cov.length)).mapToDouble(IDX -> raw_coverage[IDX]).toArray();
final OptionalDouble optMedianBound = Percentile.median().evaluate(bounds_cov);
if (!optMedianBound.isPresent() || optMedianBound.getAsDouble() == 0) {
final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("MedZero").make();
genotypes.add(gt2);
continue;
}
final double medianBound = optMedianBound.getAsDouble();
// normalize the mid-region coverage by the flanking median
final double[] normalized_mid_coverage = new double[array_mid_end - array_mid_start];
for (int i = 0; i < normalized_mid_coverage.length; ++i) {
normalized_mid_coverage[i] = smoothed_cov[array_mid_start + i] / medianBound;
}
final double normDepth = Percentile.median().evaluate(normalized_mid_coverage).getAsDouble();
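/* classify the normalized depth against the expected copy-number levels: ~0.0 hom-del, ~0.5 het-del, ~1.0 ref, ~1.5 het-dup, ~2.0 hom-dup */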
final boolean is_sv;
final boolean is_hom_deletion = Math.abs(normDepth - 0.0) <= this.treshold;
final boolean is_het_deletion = Math.abs(normDepth - 0.5) <= this.treshold || (!is_hom_deletion && normDepth <= 0.5);
final boolean is_hom_dup = Math.abs(normDepth - 2.0) <= this.treshold || normDepth > 2.0;
final boolean is_het_dup = Math.abs(normDepth - 1.5) <= this.treshold || (!is_hom_dup && normDepth >= 1.5);
final boolean is_ref = Math.abs(normDepth - 1.0) <= this.treshold;
final double theoritical_depth;
final GenotypeBuilder gb;
if (is_ref) {
gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, REF_ALLELE));
is_sv = false;
theoritical_depth = 1.0;
an += 2;
} else if (is_het_deletion) {
gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DEL_ALLELE));
alleles.add(DEL_ALLELE);
is_sv = true;
theoritical_depth = 0.5;
count_del++;
an += 2;
countAlleles.incr(DEL_ALLELE);
} else if (is_hom_deletion) {
gb = new GenotypeBuilder(sampleName, Arrays.asList(DEL_ALLELE, DEL_ALLELE));
alleles.add(DEL_ALLELE);
vcb.filter(filterHomDel.getID());
is_sv = true;
theoritical_depth = 0.0;
count_del++;
an += 2;
countAlleles.incr(DEL_ALLELE, 2);
} else if (is_het_dup) {
gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DUP_ALLELE));
alleles.add(DUP_ALLELE);
is_sv = true;
theoritical_depth = 1.5;
count_dup++;
an += 2;
countAlleles.incr(DUP_ALLELE);
} else if (is_hom_dup) {
gb = new GenotypeBuilder(sampleName, Arrays.asList(DUP_ALLELE, DUP_ALLELE));
alleles.add(DUP_ALLELE);
vcb.filter(filterHomDup.getID());
is_sv = true;
theoritical_depth = 2.0;
count_dup++;
an += 2;
countAlleles.incr(DUP_ALLELE, 2);
} else {
gb = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("Ambigous");
is_sv = false;
theoritical_depth = 1.0;
}
if (is_sv) {
n_samples_with_cnv++;
}
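/* genotype quality: squared distance between expected and observed normalized depth, capped at 0.5 and rescaled to the range 0-99 */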
double gq = Math.abs(theoritical_depth - normDepth);
gq = Math.min(0.5, gq);
gq = gq * gq;
gq = gq / 0.25;
gq = 99 * (1.0 - gq);
gb.GQ((int) gq);
if (badestGQ == null || badestGQ.compareTo(gq) > 0) {
badestGQ = gq;
}
gb.attribute(formatCN.getID(), normDepth);
gb.attribute(nReadsSupportingSv.getID(), n_reads_supporting_sv);
genotypes.add(gb.make());
}
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, an);
final List<Allele> orderedAlleles = new ArrayList<>(alleles);
Collections.sort(orderedAlleles);
if (orderedAlleles.size() > 1) {
final List<Integer> acL = new ArrayList<>();
final List<Double> afL = new ArrayList<>();
for (int i = 1; i < orderedAlleles.size(); i++) {
final Allele a = orderedAlleles.get(i);
final int c = (int) countAlleles.count(a);
acL.add(c);
if (an > 0)
afL.add(c / (double) an);
}
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, acL);
if (an > 0)
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, afL);
}
// if(alleles.size()<=1) continue;
vcb.alleles(orderedAlleles);
vcb.noID();
vcb.genotypes(genotypes);
vcb.attribute(infoSVSamples.getID(), n_samples_with_cnv);
vcb.attribute(infoSvLen.getID(), svLen);
if (count_dup == sample2bam.size() && sample2bam.size() != 1) {
vcb.filter(filterAllDup.getID());
}
if (count_del == sample2bam.size() && sample2bam.size() != 1) {
vcb.filter(filterAllDel.getID());
}
if (n_samples_with_cnv == 0) {
vcb.filter(filterNoSV.getID());
}
if (badestGQ != null) {
vcb.log10PError(badestGQ / -10.0);
}
out.add(vcb.make());
}
progress.close();
out.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iterIn);
CloserUtil.close(out);
sample2bam.values().forEach(F -> CloserUtil.close(F));
}
}
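The genotype quality assigned above grows with how closely the observed normalized depth matches the depth expected for the called genotype. A standalone sketch of that formula (the example values are hypothetical):

public class CnvGqDemo {
    /** squared distance, capped at 0.5, rescaled so 0 maps to GQ 99 and >= 0.5 maps to GQ 0 */
    static int gq(final double expectedDepth, final double normDepth) {
        double gq = Math.abs(expectedDepth - normDepth);
        gq = Math.min(0.5, gq);
        gq = (gq * gq) / 0.25;
        return (int) (99 * (1.0 - gq));
    }

    public static void main(final String[] args) {
        System.out.println(gq(0.5, 0.5));  // 99 : perfect heterozygous deletion
        System.out.println(gq(0.5, 0.75)); // 74 : observed depth a quarter off
        System.out.println(gq(1.0, 2.0));  // 0  : half a copy or more off
    }
}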
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class DepthAnomaly, method doWork.
@Override
public int doWork(final List<String> args) {
PrintWriter pw = null;
try {
final IndexCovUtils indexCovUtils = new IndexCovUtils(this.treshold);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().referenceSequence(DepthAnomaly.this.refPath).validationStringency(ValidationStringency.LENIENT);
final List<Path> inputBams = IOUtils.unrollPaths(this.bamsPath);
if (inputBams.isEmpty()) {
LOG.error("input bam file missing.");
return -1;
}
Iterator<? extends Locatable> iter;
final String input = oneFileOrNull(args);
if (input == null) {
final BedLineCodec codec = new BedLineCodec();
final LineIterator liter = new LineIterator(stdin());
iter = new AbstractIterator<Locatable>() {
@Override
protected Locatable advance() {
while (liter.hasNext()) {
final String line = liter.next();
final BedLine bed = codec.decode(line);
if (bed == null) {
continue;
}
return bed;
}
liter.close();
return null;
}
};
} else {
iter = IntervalListProvider.from(input).dictionary(dict).stream().iterator();
}
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
while (iter.hasNext()) {
final SimpleInterval locatable = new SimpleInterval(iter.next());
boolean found_anomaly_here = false;
if (this.min_anomaly_length * 3 >= locatable.getLengthOnReference()) {
LOG.warning("interval " + locatable.toNiceString() + " is too short. skipping.");
continue;
}
final int[] depth = new int[locatable.getLengthOnReference()];
final int[] copy = new int[depth.length];
for (final Path path : inputBams) {
try (SamReader sr = samReaderFactory.open(path)) {
final SAMFileHeader header = sr.getFileHeader();
final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
SequenceUtil.assertSequenceDictionariesEqual(dict, header.getSequenceDictionary());
Arrays.fill(depth, 0);
try (CloseableIterator<SAMRecord> siter = sr.queryOverlapping(locatable.getContig(), locatable.getStart(), locatable.getEnd())) {
while (siter.hasNext()) {
final SAMRecord rec = siter.next();
if (rec.getReadUnmappedFlag())
continue;
if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
int ref = rec.getStart();
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
for (CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
final int len = ce.getLength();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int i = 0; i < len; i++) {
final int pos = ref + i;
if (pos < locatable.getStart())
continue;
if (pos > locatable.getEnd())
break;
depth[pos - locatable.getStart()]++;
}
}
ref += len;
}
}
} // end loop cigar
} // end sam iterator
System.arraycopy(depth, 0, copy, 0, depth.length);
final double median = median(copy);
final List<CovInterval> anomalies = new ArrayList<>();
// int minDepth = Arrays.stream(depth).min().orElse(0);
int x0 = 0;
while (x0 < depth.length && median > 0.0) {
final int xi = x0;
double total = 0;
double count = 0;
IndexCovUtils.SvType prevType = null;
while (x0 < depth.length) {
final IndexCovUtils.SvType type;
final int depthHere = depth[x0];
final double normDepth = depthHere / (median == 0 ? 1.0 : median);
if (depthHere > this.max_depth) {
type = null;
} else {
type = indexCovUtils.getType(normDepth);
}
x0++;
if (type == null || !type.isVariant())
break;
if (prevType != null && !type.equals(prevType))
break;
if (prevType == null)
prevType = type;
total += depthHere;
count++;
}
if (prevType != null && count >= this.min_anomaly_length) {
anomalies.add(new CovInterval(locatable.getContig(), locatable.getStart() + xi, locatable.getStart() + x0 - 1, prevType, Collections.singletonList(total / count)));
}
}
if (!anomalies.isEmpty() || force_screen) {
int i = 0;
while (i + 1 < anomalies.size() && this.merge_intervals) {
final CovInterval loc1 = anomalies.get(i);
final CovInterval loc2 = anomalies.get(i + 1);
if (loc1.svtype.equals(loc2.svtype) && loc1.withinDistanceOf(loc2, this.min_anomaly_length)) {
final List<Double> newdepths = new ArrayList<>(loc1.depths);
newdepths.addAll(loc2.depths);
anomalies.set(i, new CovInterval(loc1.getContig(), loc1.getStart(), loc2.getEnd(), loc1.svtype, newdepths));
anomalies.remove(i + 1);
} else {
i++;
}
}
if (!found_anomaly_here) {
pw.println(">>> " + locatable.toNiceString() + " length:" + StringUtils.niceInt(locatable.getLengthOnReference()));
found_anomaly_here = true;
}
if (screen_width > 0) {
pw.print("#");
pw.print(String.format("%-15s", sample));
pw.print("[");
for (i = 0; i < screen_width; i++) {
double t = 0;
double n = 0;
final int x1 = (int) (((i + 0) / (double) screen_width) * depth.length);
final int x2 = (int) (((i + 1) / (double) screen_width) * depth.length);
for (int x3 = x1; x3 <= x2 && x3 < depth.length; ++x3) {
t += depth[x3];
n++;
}
double normDepth = t / n;
if (median > 0)
normDepth /= median;
// scale so a normal diploid depth maps to the middle of the histogram
normDepth /= 2.0;
final boolean is_anomaly = anomalies.stream().anyMatch(R -> CoordMath.overlaps(R.getStart(), R.getEnd(), locatable.getStart() + x1, locatable.getStart() + x2));
final AnsiUtils.AnsiColor color = is_anomaly ? AnsiColor.RED : null;
if (color != null)
pw.print(color.begin());
pw.print(AnsiUtils.getHistogram(normDepth));
if (color != null)
pw.print(color.end());
}
pw.print("]");
pw.println();
}
for (i = 0; i < anomalies.size(); i++) {
final CovInterval anomalie = anomalies.get(i);
pw.print(anomalie.getContig());
pw.print("\t");
pw.print(anomalie.getStart() - 1);
pw.print("\t");
pw.print(anomalie.getEnd());
pw.print("\t");
pw.print(anomalie.getLengthOnReference());
pw.print("\t");
pw.print(anomalie.svtype.name());
pw.print("\t");
pw.print(sample);
pw.print("\t");
pw.print(path);
pw.print("\t");
pw.print(i + 1);
pw.print("\t");
pw.print(anomalies.size());
pw.print("\t");
pw.print(locatable.toNiceString());
pw.print("\t");
pw.print((int) median);
pw.print("\t");
pw.print((int) anomalie.depths.stream().mapToDouble(X -> X.doubleValue()).average().orElse(0));
pw.print("\t");
pw.println();
}
}
}
}
if (found_anomaly_here) {
pw.println("<<< " + locatable.toNiceString() + " length:" + StringUtils.niceInt(locatable.getLengthOnReference()));
pw.println();
}
} // end while iter
pw.flush();
pw.close();
pw = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(pw);
}
}
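The anomaly scan above walks the depth array and grows maximal runs of a single SV type. The sketch below reproduces that two-level loop with a simple threshold classifier standing in for IndexCovUtils; the depth profile and cutoffs are invented:

import java.util.ArrayList;
import java.util.List;

public class DepthRunDemo {
    public static void main(final String[] args) {
        // hypothetical depth profile: a ~50% drop at positions 3..6
        final int[] depth = { 30, 31, 29, 14, 15, 16, 15, 30, 32 };
        final double median = 30.0;
        final int min_anomaly_length = 3;
        final List<String> runs = new ArrayList<>();
        int x0 = 0;
        while (x0 < depth.length) {
            final int xi = x0;
            int count = 0;
            String prevType = null;
            while (x0 < depth.length) {
                final double normDepth = depth[x0] / median;
                // stand-in classifier: <0.75 deletion, >1.5 duplication, else normal
                final String type = normDepth < 0.75 ? "DEL" : (normDepth > 1.5 ? "DUP" : null);
                x0++;
                if (type == null) break;
                if (prevType != null && !type.equals(prevType)) break;
                prevType = type;
                count++;
            }
            if (prevType != null && count >= min_anomaly_length) {
                runs.add(prevType + "[" + xi + ".." + (xi + count - 1) + "]");
            }
        }
        System.out.println(runs); // [DEL[3..6]]
    }
}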
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class Transcript, method getCdsInterval.
/**
* Return the genomic interval spanning between the start and stop codons, if both exist.
*/
public default Optional<Locatable> getCdsInterval() {
if (!hasCodonStartDefined() || !hasCodonStopDefined())
return Optional.empty();
int x1 = this.getEnd();
int x2 = this.getStart();
Locatable codon = getCodonStart().get();
x1 = Math.min(codon.getStart(), x1);
x2 = Math.max(codon.getEnd(), x2);
codon = getCodonStop().get();
x1 = Math.min(codon.getStart(), x1);
x2 = Math.max(codon.getEnd(), x2);
return Optional.of(new SimpleInterval(getContig(), x1, x2));
}
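A worked sketch of the same min/max computation on plain coordinates (all values hypothetical): the result spans from the smallest codon start to the largest codon end.

public class CdsSpanDemo {
    public static void main(final String[] args) {
        final int txStart = 100, txEnd = 500;               // transcript bounds
        final int codonStartBeg = 150, codonStartEnd = 152; // start codon
        final int codonStopBeg = 440, codonStopEnd = 442;   // stop codon
        int x1 = txEnd, x2 = txStart;
        x1 = Math.min(codonStartBeg, x1);
        x2 = Math.max(codonStartEnd, x2);
        x1 = Math.min(codonStopBeg, x1);
        x2 = Math.max(codonStopEnd, x2);
        System.out.println(x1 + "-" + x2); // 150-442
    }
}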