use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class SamFindClippedRegions method processInput.
@Override
protected int processInput(final SAMFileHeader header, final CloseableIterator<SAMRecord> iter) {
final Set<String> samples = header.getReadGroups().stream().map(this.partition).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
if (samples.isEmpty()) {
LOG.error("no sample in read group was defined.");
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final int bad_mapq = 30;
// SAMFileWriter w=null;
try {
final IntervalTreeMap<Interval> intronMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasIntron()).flatMap(T -> T.getIntrons().stream()).map(I -> new Interval(I.getContig(), I.getStart(), I.getEnd(), I.isNegativeStrand(), I.getTranscript().getId())).forEach(I -> intronMap.put(I, I));
}
}
/* build VCF header */
final Allele reference_allele = Allele.create("N", true);
final Allele alt_allele = Allele.create("<CLIP>", false);
final Set<VCFHeaderLine> vcfHeaderLines = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(vcfHeaderLines, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
VCFStandardHeaderLines.addStandardInfoLines(vcfHeaderLines, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_FREQUENCY_KEY);
final VCFFormatHeaderLine leftClip = new VCFFormatHeaderLine("CL", 1, VCFHeaderLineType.Integer, "Left Clip");
vcfHeaderLines.add(leftClip);
final VCFFormatHeaderLine rightClip = new VCFFormatHeaderLine("RL", 1, VCFHeaderLineType.Integer, "Right Clip");
vcfHeaderLines.add(rightClip);
final VCFFormatHeaderLine totalCip = new VCFFormatHeaderLine("TL", 1, VCFHeaderLineType.Integer, "Total Clip");
vcfHeaderLines.add(totalCip);
final VCFFormatHeaderLine totalDel = new VCFFormatHeaderLine("DL", 1, VCFHeaderLineType.Integer, "Total Deletions");
vcfHeaderLines.add(totalDel);
final VCFFormatHeaderLine noClipMAPQ = new VCFFormatHeaderLine("MQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for reads without clip at this position.");
vcfHeaderLines.add(noClipMAPQ);
final VCFInfoHeaderLine averageMAPQ = new VCFInfoHeaderLine("AVG_MAPQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for called genotypes");
vcfHeaderLines.add(averageMAPQ);
final VCFInfoHeaderLine infoRetrogene = new VCFInfoHeaderLine("RETROGENE", 1, VCFHeaderLineType.String, "transcript name for Possible retrogene.");
vcfHeaderLines.add(infoRetrogene);
final VCFFilterHeaderLine filterRetrogene = new VCFFilterHeaderLine("POSSIBLE_RETROGENE", "Junction is a possible Retrogene.");
vcfHeaderLines.add(filterRetrogene);
final VCFFilterHeaderLine filterlowMapq = new VCFFilterHeaderLine("LOW_MAPQ", "Low average mapq (< " + bad_mapq + ")");
vcfHeaderLines.add(filterlowMapq);
final VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
vcfHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
this.writingVcfConfig.dictionary(dict);
try (VariantContextWriter w = this.writingVcfConfig.open(this.outputFile)) {
w.writeHeader(vcfHeader);
@SuppressWarnings("resource") final VariantContextWriter finalVariantContextWriter = w;
/**
* dump a BASe into the VCF
*/
final BiConsumer<String, Base> baseConsumer = (CTG, B) -> {
if (B.pos < 1)
return;
// no clip
if (B.sample2gt.values().stream().mapToInt(G -> G.clip()).sum() == 0)
return;
if (B.sample2gt.values().stream().allMatch(G -> G.clip() < min_clip_depth))
return;
if (B.sample2gt.values().stream().allMatch(G -> G.dp() < min_depth))
return;
if (B.sample2gt.values().stream().allMatch(G -> G.ratio() < fraction))
return;
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(CTG);
vcb.start(B.pos);
vcb.stop(B.pos);
vcb.alleles(Arrays.asList(reference_allele, alt_allele));
vcb.attribute(VCFConstants.DEPTH_KEY, B.sample2gt.values().stream().mapToInt(G -> G.dp()).sum());
/* if gtf was specified, find intron which ends are near this pos */
if (gtfPath != null) {
final Locatable bounds1 = new SimpleInterval(CTG, Math.max(1, B.pos - max_intron_distance), B.pos + max_intron_distance);
intronMap.getOverlapping(bounds1).stream().filter(I -> Math.abs(I.getStart() - B.pos) <= this.max_intron_distance || Math.abs(I.getEnd() - B.pos) <= this.max_intron_distance).map(I -> I.getName()).findFirst().ifPresent(transcript_id -> {
vcb.attribute(infoRetrogene.getID(), transcript_id);
vcb.filter(filterRetrogene.getID());
});
;
}
final List<Genotype> genotypes = new ArrayList<>(B.sample2gt.size());
int AC = 0;
int AN = 0;
int max_clip = 1;
double sum_mapq = 0.0;
int count_mapq = 0;
for (final String sn : B.sample2gt.keySet()) {
final Gt gt = B.sample2gt.get(sn);
final GenotypeBuilder gb = new GenotypeBuilder(sn);
if (gt.clip() == 0 && gt.noClip == 0) {
gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
} else if (gt.noClip == 0) {
gb.alleles(Arrays.asList(alt_allele, alt_allele));
AC += 2;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
} else if (gt.clip() == 0) {
gb.alleles(Arrays.asList(reference_allele, reference_allele));
AN += 2;
} else {
gb.alleles(Arrays.asList(reference_allele, alt_allele));
AC++;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
}
gb.DP(gt.dp());
gb.attribute(leftClip.getID(), gt.leftClip);
gb.attribute(rightClip.getID(), gt.rightClip);
gb.attribute(totalCip.getID(), gt.clip());
gb.attribute(totalDel.getID(), gt.del);
gb.attribute(noClipMAPQ.getID(), gt.noClipMapq());
gb.AD(new int[] { gt.noClip, gt.clip() });
genotypes.add(gb.make());
max_clip = Math.max(max_clip, gt.clip());
}
if (count_mapq > 0) {
final int avg_mapq = (int) (sum_mapq / count_mapq);
vcb.attribute(averageMAPQ.getID(), avg_mapq);
if (avg_mapq < bad_mapq)
vcb.filter(filterlowMapq.getID());
}
vcb.log10PError(max_clip / -10.0);
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, AC);
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, AN);
if (AN > 0)
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, AC / (float) AN);
vcb.genotypes(genotypes);
finalVariantContextWriter.add(vcb.make());
};
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
String prevContig = null;
final SortedMap<Integer, Base> pos2base = new TreeMap<>();
/* get base in pos2base, create it if needed */
final Function<Integer, Base> baseAt = POS -> {
Base b = pos2base.get(POS);
if (b == null) {
b = new Base(POS, samples);
pos2base.put(POS, b);
}
return b;
};
for (; ; ) {
final SAMRecord rec = (iter.hasNext() ? progress.apply(iter.next()) : null);
if (rec != null && !SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
if (rec == null || !rec.getContig().equals(prevContig)) {
for (final Integer pos : pos2base.keySet()) {
baseConsumer.accept(prevContig, pos2base.get(pos));
}
if (rec == null)
break;
pos2base.clear();
prevContig = rec.getContig();
}
for (Iterator<Integer> rpos = pos2base.keySet().iterator(); rpos.hasNext(); ) {
final Integer pos = rpos.next();
if (pos.intValue() + this.max_clip_length >= rec.getUnclippedStart())
break;
baseConsumer.accept(prevContig, pos2base.get(pos));
rpos.remove();
}
final String rg = this.partition.getPartion(rec);
if (StringUtils.isBlank(rg))
continue;
for (final AlignmentBlock ab : rec.getAlignmentBlocks()) {
for (int n = 0; n < ab.getLength(); ++n) {
}
}
final Cigar cigar = rec.getCigar();
int refPos = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
final Gt gt = baseAt.apply(refPos + x).getGt(rg);
gt.noClip++;
gt.noClip_sum_mapq += rec.getMappingQuality();
}
} else if (op.equals(CigarOperator.D) || op.equals(CigarOperator.N)) {
baseAt.apply(refPos).getGt(rg).del++;
baseAt.apply(refPos + ce.getLength() - 1).getGt(rg).del++;
}
refPos += ce.getLength();
}
}
CigarElement ce = cigar.getFirstCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(rec.getStart() - 1).getGt(rg).leftClip++;
}
ce = cigar.getLastCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(rec.getEnd() + 1).getGt(rg).rightClip++;
}
}
}
// end of vcf writer
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
Aggregations