use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class BamToHaplotypes method processInput.
@Override
protected int processInput(final SAMFileHeader headerIn, final CloseableIterator<SAMRecord> iter0) {
SortingCollection<Haplotype> sorting = null;
try {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(headerIn);
final String sample = headerIn.getReadGroups().stream().map(RG -> RG.getSample()).filter(R -> !StringUtils.isBlank(R)).findFirst().orElse("SAMPLE");
sorting = SortingCollection.newInstance(Haplotype.class, new HaplotypeCodec(), (A, B) -> A.compareTo(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
if (this.paired_mode) {
try (EqualIterator<SAMRecord> iter = new EqualIterator<>(iter0, (A, B) -> A.getReadName().compareTo(B.getReadName()))) {
while (iter.hasNext()) {
final LinkedList<SAMRecord> buffer = new LinkedList<>(iter.next());
SAMRecord R1 = null;
SAMRecord R2 = null;
while (!buffer.isEmpty()) {
final SAMRecord rec = buffer.pop();
if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary()) {
continue;
} else if (!rec.getReadPairedFlag()) {
scanVariants(dict, Collections.singletonList(rec), sorting);
} else if (R1 == null && rec.getFirstOfPairFlag()) {
R1 = rec;
} else if (R2 == null && rec.getSecondOfPairFlag()) {
R2 = rec;
} else {
continue;
}
}
if (R1 != null && R2 != null) {
if (R1.contigsMatch(R2)) {
scanVariants(dict, Arrays.asList(R1, R2), sorting);
} else {
scanVariants(dict, Collections.singletonList(R1), sorting);
scanVariants(dict, Collections.singletonList(R2), sorting);
}
} else if (R1 != null && R2 == null) {
scanVariants(dict, Collections.singletonList(R1), sorting);
} else if (R2 != null && R1 == null) {
scanVariants(dict, Collections.singletonList(R2), sorting);
}
}
}
} else {
while (iter0.hasNext()) {
final SAMRecord rec = iter0.next();
if (rec.getReadUnmappedFlag()) {
continue;
}
scanVariants(dict, Collections.singletonList(rec), sorting);
}
}
sorting.doneAdding();
sorting.setDestructiveIteration(true);
try (CloseableIterator<Haplotype> iter = sorting.iterator()) {
final PeekableIterator<Haplotype> peek = new PeekableIterator<>(iter);
try (PrintWriter out = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
out.println("#CHROM\tSTART\tEND\tSAMPLE\tN-HAPLOTYPES\tN-VARIANTS\t(POS\\tALT)+");
while (peek.hasNext()) {
int n = 1;
final Haplotype hap = peek.next();
while (peek.hasNext()) {
final Haplotype hap2 = peek.peek();
if (!hap.equals(hap2))
break;
// consume the duplicate haplotype
peek.next();
n++;
}
out.print(dict.getSequence(hap.tid).getContig());
out.print("\t");
out.print(hap.getStart());
out.print("\t");
out.print(hap.getEnd());
out.print("\t");
out.print(sample);
out.print("\t");
out.print(n);
out.print("\t");
out.print(hap.changes.size());
for (Change c : hap.changes) {
out.print("\t");
out.print(c.pos1);
out.print("\t");
out.print((char) c.alt);
}
out.println();
}
out.flush();
}
peek.close();
}
sorting.cleanup();
return 0;
} catch (Throwable err) {
LOG.error(err);
return -1;
}
}
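A minimal sketch of the same pairing idiom, built only on htsjdk: PeekableIterator stands in for jvarkit's EqualIterator, the ReadNameGrouper class and its method names are hypothetical, and the input CloseableIterator is assumed to be queryname-sorted so that mates arrive consecutively.

import java.util.ArrayList;
import java.util.List;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.PeekableIterator;

public class ReadNameGrouper {
    /** Collects the next run of consecutive records sharing one read name. */
    static List<SAMRecord> nextGroup(final PeekableIterator<SAMRecord> peek) {
        final List<SAMRecord> group = new ArrayList<>();
        final String name = peek.peek().getReadName();
        while (peek.hasNext() && peek.peek().getReadName().equals(name)) {
            group.add(peek.next());
        }
        return group;
    }

    /** Drains a queryname-sorted iterator group by group. */
    static void scan(final CloseableIterator<SAMRecord> iter) {
        try (PeekableIterator<SAMRecord> peek = new PeekableIterator<>(iter)) {
            while (peek.hasNext()) {
                final List<SAMRecord> pair = nextGroup(peek);
                // processInput() would now pick R1/R2 out of 'pair' and call scanVariants(...)
            }
        }
    }
}

Closing the PeekableIterator also releases the wrapped iterator, which mirrors why processInput keeps its EqualIterator inside a try-with-resources block.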
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class BamPhased01 method scanVariants.
private void scanVariants(final List<SAMRecord> buffer0, final SAMFileWriter sfw) {
if (buffer0.isEmpty())
return;
final List<SAMRecord> buffer = new ArrayList<>(buffer0);
final SAMReadGroupRecord rg0 = buffer.get(0).getReadGroup();
if (rg0 == null) {
for (SAMRecord rec : buffer) failingSAMRecord(rec, sfw);
return;
}
final String rgid0 = rg0.getId();
for (int i = 1; !this.ignore_discordant_rg && i < buffer.size(); i++) {
if (buffer.get(i).getReadGroup() == null || !buffer.get(i).getReadGroup().getId().equals(rgid0)) {
// discordant read-group check is currently disabled:
/*
throw new SAMException("Two paired reads without the same RG-ID:\n"+
buffer.get(0).getSAMString()+"\n"+
buffer.get(i).getSAMString()
);
*/
}
}
final List<PosToCheck> supporting = new ArrayList<>();
int i = 0;
while (i < buffer.size()) {
final SAMRecord rec = buffer.get(i);
final byte[] bases = rec.getReadBases();
if (bases == null || bases == SAMRecord.NULL_SEQUENCE || bases.length == 0) {
failingSAMRecord(rec, sfw);
buffer.remove(i);
continue;
}
final String sn = rg0.getSample();
if (StringUtils.isBlank(sn) || !this.samplesInBam.contains(sn)) {
failingSAMRecord(rec, sfw);
buffer.remove(i);
continue;
}
final List<PosToCheck> candidates = new ArrayList<>();
try (CloseableIterator<VariantContext> iter = this.bufferedVCFReader.query(rec)) {
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
final Genotype gt = ctx.getGenotype(sn);
if (gt.isHomRef() || gt.isNoCall())
continue;
final Set<Byte> alts = gt.getAlleles().stream().filter(A -> A.isCalled() && !A.isReference() && !A.isSymbolic() && A.length() == 1).filter(A -> AcidNucleics.isATGC(A)).map(A -> (byte) Character.toUpperCase(A.getDisplayString().charAt(0))).collect(Collectors.toSet());
if (alts.isEmpty())
continue;
final PosToCheck pos = new PosToCheck(ctx, alts);
candidates.add(pos);
}
}
if (candidates.isEmpty()) {
failingSAMRecord(rec, sfw);
buffer.remove(i);
continue;
}
final List<PosToCheck> supporting0 = new ArrayList<>();
for (AlignmentBlock ab : rec.getAlignmentBlocks()) {
final int readPos1 = ab.getReadStart();
final int refPos1 = ab.getReferenceStart();
for (int x = 0; x < ab.getLength(); ++x) {
for (PosToCheck pos : candidates) {
if (pos.getPosition() != refPos1 + x)
continue;
final byte readBase = bases[(readPos1 - 1) + x];
if (pos.alts.contains(readBase)) {
supporting0.add(pos);
break;
}
}
}
}
if (supporting0.isEmpty()) {
failingSAMRecord(rec, sfw);
buffer.remove(i);
continue;
}
supporting.addAll(supporting0);
i++;
}
if (supporting.size() < this.num_supporting_variants) {
for (SAMRecord rec : buffer) failingSAMRecord(rec, sfw);
return;
}
for (final SAMRecord rec : buffer) {
if (!StringUtils.isBlank(this.XTAG)) {
rec.setAttribute(this.XTAG, supporting.stream().map(S -> S.getContig() + "_" + S.getStart() + "_" + S.ref + "_" + S.alts.stream().map(B -> "" + (char) B.byteValue()).collect(Collectors.joining("_"))).collect(Collectors.joining(";")));
}
rec.setAttribute("PG", this.samProgramRecord.getId());
sfw.addAlignment(rec);
}
}
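The heart of scanVariants is: for each read, query the VCF for overlapping sites, then test which read base sits on each site. Below is a hedged sketch of that idiom in plain htsjdk, assuming an indexed VCF: VCFFileReader replaces jvarkit's bufferedVCFReader, SAMRecord.getReadPositionAtReferencePosition replaces the manual AlignmentBlock walk, and ReadVariantSupport/basesAtVariants are illustrative names.

import java.util.ArrayList;
import java.util.List;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;

public class ReadVariantSupport {
    /** Returns the read base observed at each variant site overlapping the read. */
    static List<Byte> basesAtVariants(final VCFFileReader vcf, final SAMRecord rec) {
        final List<Byte> observed = new ArrayList<>();
        final byte[] bases = rec.getReadBases();
        if (rec.getReadUnmappedFlag() || bases == null || bases.length == 0) return observed;
        // query(...) needs a tabix/tribble index; the returned CloseableIterator must be closed
        try (CloseableIterator<VariantContext> it = vcf.query(rec.getContig(), rec.getStart(), rec.getEnd())) {
            while (it.hasNext()) {
                final VariantContext ctx = it.next();
                // 1-based position of the site within the read, or 0 if clipped/deleted there
                final int readPos1 = rec.getReadPositionAtReferencePosition(ctx.getStart());
                if (readPos1 > 0) observed.add(bases[readPos1 - 1]);
            }
        }
        return observed;
    }
}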
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class BamToMNV method doWork.
@Override
public int doWork(final List<String> args) {
try {
final List<Path> bams = IOUtils.unrollPaths(this.input_bams);
if (bams.isEmpty()) {
LOG.error("No bam was defined");
return -1;
}
final Pedigree pedigree;
if (this.pedigreePath != null) {
pedigree = new PedigreeParser().parse(this.pedigreePath);
pedigree.checkUniqIds();
} else {
pedigree = null;
}
try (VCFReader reader = VCFReaderFactory.makeDefault().open(oneAndOnlyOneFile(args), false)) {
final VCFHeader header = reader.getHeader();
final OrderChecker<VariantContext> order = new OrderChecker<>(header.getSequenceDictionary(), false);
try (CloseableIterator<VariantContext> r = reader.iterator()) {
this.all_variants.addAll(r.stream().filter(V -> V.isBiallelic() && V.isSNP()).map(V -> new VariantContextBuilder(V).noGenotypes().make()).map(order).collect(Collectors.toList()));
}
}
final List<Mnv> all_mnvs = new ArrayList<>();
for (int i = 0; i + 1 < this.all_variants.size(); i++) {
final VariantContext v1 = this.all_variants.get(i);
for (int j = i + 1; j < this.all_variants.size(); j++) {
final VariantContext v2 = this.all_variants.get(j);
if (!v1.withinDistanceOf(v2, min_distance_mnv))
break;
if (v1.getStart() == v2.getStart())
continue;
all_mnvs.add(new Mnv(i, j));
}
}
final Set<String> samples = new TreeSet<>();
final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.faidx);
for (final Path bam : bams) {
LOG.info(String.valueOf(bam));
try (SamReader sr = srf.open(bam)) {
final SAMFileHeader header = sr.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final String sample = header.getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(bam));
if (samples.contains(sample)) {
LOG.error("duplicate sample " + sample);
return -1;
}
samples.add(sample);
if (pedigree != null && pedigree.getSampleById(sample) == null) {
LOG.warn("sample " + sample + " from " + bam + " is not in pedigree.");
}
if (all_mnvs.isEmpty())
continue;
final QueryInterval[] intervals = QueryInterval.optimizeIntervals(this.all_variants.stream().map(V -> new QueryInterval(dict.getSequenceIndex(V.getContig()), V.getStart(), V.getEnd())).toArray(X -> new QueryInterval[X]));
final List<SAMRecord> sam_reads = new ArrayList<>();
try (CloseableIterator<SAMRecord> iter = sr.query(intervals, false)) {
while (iter.hasNext()) {
final SAMRecord record = iter.next();
if (!SAMRecordDefaultFilter.accept(record, this.minmapq))
continue;
if (record.getReadBases() == SAMRecord.NULL_SEQUENCE)
continue;
sam_reads.add(record);
}
}
// sort on query name
Collections.sort(sam_reads, (A, B) -> A.getReadName().compareTo(B.getReadName()));
for (final Mnv mnv : all_mnvs) {
final Phase phase = mnv.getPhase(sam_reads);
if (phase.equals(Phase.ambigous))
continue;
mnv.sample2phase.put(sample, phase);
}
}
}
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
pw.print("#CHROM1\tPOS1\tREF1\tALT1");
pw.print("\tCHROM2\tPOS2\tREF2\tALT2");
pw.print("\tdistance");
for (final String sn : samples) pw.print("\t" + sn);
if (pedigree != null) {
pw.print("\tcase-cis\tcase-trans\tcontrol-cis\tcontrol-trans\tfisher");
}
pw.println();
for (final Mnv mnv : all_mnvs) {
if (mnv.sample2phase.values().stream().allMatch(V -> V.equals(Phase.ambigous) || V.equals(Phase.ref)))
continue;
for (int side = 0; side < 2; ++side) {
final VariantContext ctx = mnv.get(side);
if (side > 0)
pw.print("\t");
pw.print(ctx.getContig());
pw.print("\t");
pw.print(ctx.getStart());
pw.print("\t");
pw.print(ctx.getReference().getDisplayString());
pw.print("\t");
pw.print(ctx.getAlleles().get(1).getDisplayString());
}
pw.print("\t");
pw.print(CoordMath.getLength(mnv.get(0).getStart(), mnv.get(1).getEnd()));
int case_cis = 0;
int case_trans = 0;
int ctrl_cis = 0;
int ctrl_trans = 0;
for (final String sn : samples) {
pw.print("\t");
final Phase phase = mnv.sample2phase.get(sn);
if (phase == null) {
pw.print(".");
continue;
}
pw.print(phase.name());
if (pedigree != null) {
final Sample sample = pedigree.getSampleById(sn);
if (sample == null) {
// nothing
} else if (sample.isAffected()) {
if (phase.equals(Phase.cis)) {
case_cis++;
} else if (phase.equals(Phase.trans)) {
case_trans++;
}
} else if (sample.isUnaffected()) {
if (phase.equals(Phase.cis)) {
ctrl_cis++;
} else if (phase.equals(Phase.trans)) {
ctrl_trans++;
}
}
}
}
if (pedigree != null) {
pw.print("\t");
pw.print(case_cis);
pw.print("\t");
pw.print(case_trans);
pw.print("\t");
pw.print(ctrl_cis);
pw.print("\t");
pw.print(ctrl_trans);
pw.print("\t");
final FisherExactTest fisher = FisherExactTest.compute(case_cis, case_trans, ctrl_cis, ctrl_trans);
pw.print(fisher.getAsDouble());
}
pw.println();
}
pw.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
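Note how doWork drains the whole VCF through CloseableIterator.stream() before touching any BAM. That step in isolation, as a sketch (BiallelicSnpLoader is a hypothetical name; loading everything into memory is only reasonable for small VCFs):

import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;

public class BiallelicSnpLoader {
    /** Loads every biallelic SNP of a VCF into memory. */
    static List<VariantContext> load(final Path vcfPath) {
        try (VCFFileReader reader = new VCFFileReader(vcfPath, false);
             CloseableIterator<VariantContext> it = reader.iterator()) {
            // CloseableIterator.stream() exposes the iterator as a java.util.stream.Stream
            return it.stream()
                    .filter(V -> V.isBiallelic() && V.isSNP())
                    .collect(Collectors.toList());
        }
    }
}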
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class SamFindClippedRegions method processInput.
@Override
protected int processInput(final SAMFileHeader header, final CloseableIterator<SAMRecord> iter) {
final Set<String> samples = header.getReadGroups().stream().map(this.partition).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
if (samples.isEmpty()) {
LOG.error("no sample in read group was defined.");
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final int bad_mapq = 30;
try {
final IntervalTreeMap<Interval> intronMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasIntron()).flatMap(T -> T.getIntrons().stream()).map(I -> new Interval(I.getContig(), I.getStart(), I.getEnd(), I.isNegativeStrand(), I.getTranscript().getId())).forEach(I -> intronMap.put(I, I));
}
}
/* build VCF header */
final Allele reference_allele = Allele.create("N", true);
final Allele alt_allele = Allele.create("<CLIP>", false);
final Set<VCFHeaderLine> vcfHeaderLines = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(vcfHeaderLines, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
VCFStandardHeaderLines.addStandardInfoLines(vcfHeaderLines, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_FREQUENCY_KEY);
final VCFFormatHeaderLine leftClip = new VCFFormatHeaderLine("CL", 1, VCFHeaderLineType.Integer, "Left Clip");
vcfHeaderLines.add(leftClip);
final VCFFormatHeaderLine rightClip = new VCFFormatHeaderLine("RL", 1, VCFHeaderLineType.Integer, "Right Clip");
vcfHeaderLines.add(rightClip);
final VCFFormatHeaderLine totalClip = new VCFFormatHeaderLine("TL", 1, VCFHeaderLineType.Integer, "Total Clip");
vcfHeaderLines.add(totalClip);
final VCFFormatHeaderLine totalDel = new VCFFormatHeaderLine("DL", 1, VCFHeaderLineType.Integer, "Total Deletions");
vcfHeaderLines.add(totalDel);
final VCFFormatHeaderLine noClipMAPQ = new VCFFormatHeaderLine("MQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for reads without clip at this position.");
vcfHeaderLines.add(noClipMAPQ);
final VCFInfoHeaderLine averageMAPQ = new VCFInfoHeaderLine("AVG_MAPQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for called genotypes");
vcfHeaderLines.add(averageMAPQ);
final VCFInfoHeaderLine infoRetrogene = new VCFInfoHeaderLine("RETROGENE", 1, VCFHeaderLineType.String, "transcript name for Possible retrogene.");
vcfHeaderLines.add(infoRetrogene);
final VCFFilterHeaderLine filterRetrogene = new VCFFilterHeaderLine("POSSIBLE_RETROGENE", "Junction is a possible Retrogene.");
vcfHeaderLines.add(filterRetrogene);
final VCFFilterHeaderLine filterlowMapq = new VCFFilterHeaderLine("LOW_MAPQ", "Low average mapq (< " + bad_mapq + ")");
vcfHeaderLines.add(filterlowMapq);
final VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
vcfHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
this.writingVcfConfig.dictionary(dict);
try (VariantContextWriter w = this.writingVcfConfig.open(this.outputFile)) {
w.writeHeader(vcfHeader);
@SuppressWarnings("resource") final VariantContextWriter finalVariantContextWriter = w;
/**
* dump a Base into the VCF
*/
final BiConsumer<String, Base> baseConsumer = (CTG, B) -> {
if (B.pos < 1)
return;
// no clip
if (B.sample2gt.values().stream().mapToInt(G -> G.clip()).sum() == 0)
return;
if (B.sample2gt.values().stream().allMatch(G -> G.clip() < min_clip_depth))
return;
if (B.sample2gt.values().stream().allMatch(G -> G.dp() < min_depth))
return;
if (B.sample2gt.values().stream().allMatch(G -> G.ratio() < fraction))
return;
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(CTG);
vcb.start(B.pos);
vcb.stop(B.pos);
vcb.alleles(Arrays.asList(reference_allele, alt_allele));
vcb.attribute(VCFConstants.DEPTH_KEY, B.sample2gt.values().stream().mapToInt(G -> G.dp()).sum());
/* if gtf was specified, find intron which ends are near this pos */
if (gtfPath != null) {
final Locatable bounds1 = new SimpleInterval(CTG, Math.max(1, B.pos - max_intron_distance), B.pos + max_intron_distance);
intronMap.getOverlapping(bounds1).stream().filter(I -> Math.abs(I.getStart() - B.pos) <= this.max_intron_distance || Math.abs(I.getEnd() - B.pos) <= this.max_intron_distance).map(I -> I.getName()).findFirst().ifPresent(transcript_id -> {
vcb.attribute(infoRetrogene.getID(), transcript_id);
vcb.filter(filterRetrogene.getID());
});
}
final List<Genotype> genotypes = new ArrayList<>(B.sample2gt.size());
int AC = 0;
int AN = 0;
int max_clip = 1;
double sum_mapq = 0.0;
int count_mapq = 0;
for (final String sn : B.sample2gt.keySet()) {
final Gt gt = B.sample2gt.get(sn);
final GenotypeBuilder gb = new GenotypeBuilder(sn);
if (gt.clip() == 0 && gt.noClip == 0) {
gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
} else if (gt.noClip == 0) {
gb.alleles(Arrays.asList(alt_allele, alt_allele));
AC += 2;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
} else if (gt.clip() == 0) {
gb.alleles(Arrays.asList(reference_allele, reference_allele));
AN += 2;
} else {
gb.alleles(Arrays.asList(reference_allele, alt_allele));
AC++;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
}
gb.DP(gt.dp());
gb.attribute(leftClip.getID(), gt.leftClip);
gb.attribute(rightClip.getID(), gt.rightClip);
gb.attribute(totalClip.getID(), gt.clip());
gb.attribute(totalDel.getID(), gt.del);
gb.attribute(noClipMAPQ.getID(), gt.noClipMapq());
gb.AD(new int[] { gt.noClip, gt.clip() });
genotypes.add(gb.make());
max_clip = Math.max(max_clip, gt.clip());
}
if (count_mapq > 0) {
final int avg_mapq = (int) (sum_mapq / count_mapq);
vcb.attribute(averageMAPQ.getID(), avg_mapq);
if (avg_mapq < bad_mapq)
vcb.filter(filterlowMapq.getID());
}
vcb.log10PError(max_clip / -10.0);
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, AC);
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, AN);
if (AN > 0)
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, AC / (float) AN);
vcb.genotypes(genotypes);
finalVariantContextWriter.add(vcb.make());
};
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
String prevContig = null;
final SortedMap<Integer, Base> pos2base = new TreeMap<>();
/* get base in pos2base, create it if needed */
final Function<Integer, Base> baseAt = POS -> pos2base.computeIfAbsent(POS, P -> new Base(P, samples));
for (; ; ) {
final SAMRecord rec = (iter.hasNext() ? progress.apply(iter.next()) : null);
if (rec != null && !SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
if (rec == null || !rec.getContig().equals(prevContig)) {
for (final Integer pos : pos2base.keySet()) {
baseConsumer.accept(prevContig, pos2base.get(pos));
}
if (rec == null)
break;
pos2base.clear();
prevContig = rec.getContig();
}
for (Iterator<Integer> rpos = pos2base.keySet().iterator(); rpos.hasNext(); ) {
final Integer pos = rpos.next();
if (pos.intValue() + this.max_clip_length >= rec.getUnclippedStart())
break;
baseConsumer.accept(prevContig, pos2base.get(pos));
rpos.remove();
}
final String rg = this.partition.getPartion(rec);
if (StringUtils.isBlank(rg))
continue;
final Cigar cigar = rec.getCigar();
int refPos = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
final Gt gt = baseAt.apply(refPos + x).getGt(rg);
gt.noClip++;
gt.noClip_sum_mapq += rec.getMappingQuality();
}
} else if (op.equals(CigarOperator.D) || op.equals(CigarOperator.N)) {
baseAt.apply(refPos).getGt(rg).del++;
baseAt.apply(refPos + ce.getLength() - 1).getGt(rg).del++;
}
refPos += ce.getLength();
}
}
CigarElement ce = cigar.getFirstCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(rec.getStart() - 1).getGt(rg).leftClip++;
}
ce = cigar.getLastCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(rec.getEnd() + 1).getGt(rg).rightClip++;
}
}
}
// end of vcf writer
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
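processInput above is a classic coordinate-sorted sweep: per-base counters live in a TreeMap and are flushed as soon as the sort order guarantees no later read can touch them (a contig change, or the position falling far enough behind the current read). A stripped-down sketch of just that flush discipline; SweepFlush and window are hypothetical names (window plays the role of max_clip_length) and the per-read state update is elided.

import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.function.ObjIntConsumer;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;

public class SweepFlush {
    /** Streams coordinate-sorted reads and flushes positions once they are final. */
    static void sweep(final CloseableIterator<SAMRecord> iter,
                      final ObjIntConsumer<String> flush, final int window) {
        final SortedMap<Integer, Object> state = new TreeMap<>();
        String prevContig = null;
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            if (rec.getReadUnmappedFlag()) continue;
            if (!rec.getContig().equals(prevContig)) {
                // new contig: everything accumulated so far can never change again
                for (final Integer pos : state.keySet()) flush.accept(prevContig, pos);
                state.clear();
                prevContig = rec.getContig();
            }
            // positions more than 'window' behind this read are final too
            for (final Iterator<Integer> r = state.keySet().iterator(); r.hasNext(); ) {
                final int pos = r.next();
                if (pos + window >= rec.getUnclippedStart()) break;
                flush.accept(prevContig, pos);
                r.remove();
            }
            // ... update 'state' for the positions covered by 'rec' here ...
        }
        for (final Integer pos : state.keySet()) flush.accept(prevContig, pos);
    }
}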
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class SamShortInvertion method doWork.
@Override
public int doWork(final List<String> args) {
if (this.max_size_inversion < 100) {
LOG.error("max size insersion must be >=100");
return -1;
}
final Map<SamReader, CloseableIterator<SAMRecord>> samReaders = new HashMap<>();
VariantContextWriter vcw = null;
final IntervalTreeMap<List<Arc>> database = new IntervalTreeMap<>();
try {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceFaidx);
final short SA_TAG = SAMTag.SA.getBinaryTag();
final QueryInterval[] queryIntervals = this.intervallistProvider == null ? null : this.intervallistProvider.dictionary(dict).optimizedQueryIntervals();
final AggregateFilter theFilter = new AggregateFilter(Arrays.asList(new MappingQualityFilter(this.mapq), new DuplicateReadFilter(), new SecondaryOrSupplementaryFilter(), new FailsVendorReadQualityFilter(), new SamRecordFilter() {
@Override
public boolean filterOut(SAMRecord first, SAMRecord second) {
return filterOut(first) || filterOut(second);
}
@Override
public boolean filterOut(final SAMRecord rec) {
if (rec.getReadUnmappedFlag())
return true;
if (rec.getAttribute(SA_TAG) == null)
return true;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty() || !cigar.isClipped())
return true;
return false;
}
}));
for (final Path samPath : IOUtils.unrollPaths(args)) {
final SamReader srf = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT).referenceSequence(this.referenceFaidx).open(samPath);
final CloseableIterator<SAMRecord> iter;
if (queryIntervals != null) {
iter = srf.query(queryIntervals, false);
} else {
iter = srf.iterator();
}
final FilteringSamIterator sfi = new FilteringSamIterator(iter, theFilter);
samReaders.put(srf, sfi);
}
final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, samReaders.keySet().stream().map(SR -> SR.getFileHeader()).collect(Collectors.toList()), false);
final MergingSamRecordIterator iter = new MergingSamRecordIterator(headerMerger, samReaders, true);
final Set<String> samples = headerMerger.getHeaders().stream().flatMap(R -> R.getReadGroups().stream()).map(RG -> this.partition.apply(RG, null)).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
if (samples.isEmpty()) {
iter.close();
LOG.error("No samples/bam defined");
return -1;
}
final ToIntBiFunction<Locatable, Locatable> distance = (A, B) -> {
if (CoordMath.overlaps(A.getStart(), A.getEnd(), B.getStart(), B.getEnd()))
return 0;
if (A.getEnd() < B.getStart()) {
return B.getStart() - A.getEnd();
} else {
return A.getStart() - B.getEnd();
}
};
final Set<VCFHeaderLine> meta = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(meta, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY);
VCFStandardHeaderLines.addStandardInfoLines(meta, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY);
meta.add(new VCFFormatHeaderLine("N5", 1, VCFHeaderLineType.Integer, "Number of validating clipped reads in 5'"));
meta.add(new VCFFormatHeaderLine("N3", 1, VCFHeaderLineType.Integer, "Number of validating clipped reads in 3'"));
meta.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
meta.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of sample having some split reads"));
meta.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples having some split reads"));
meta.add(new VCFInfoHeaderLine("DPMAX", 1, VCFHeaderLineType.Integer, "MAX DP among samples"));
meta.add(new VCFInfoHeaderLine("SVTYPE", 1, VCFHeaderLineType.String, "Structural variant type"));
final VCFHeader header = new VCFHeader(meta, samples);
JVarkitVersion.getInstance().addMetaData(this, header);
header.setSequenceDictionary(dict);
vcw = this.writingVariants.open(this.outputFile);
vcw.writeHeader(header);
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
String prevContig = null;
while (iter.hasNext()) {
final SAMRecord rec = progress.apply(iter.next());
if (theFilter.filterOut(rec))
continue;
final String sample = this.partition.getPartion(rec, null);
if (StringUtil.isBlank(sample))
continue;
final List<SAMRecord> others = SAMUtils.getOtherCanonicalAlignments(rec).stream().filter(R -> rec.getContig().equals(R.getContig())).filter(R -> rec.getReadNegativeStrandFlag() != R.getReadNegativeStrandFlag()).filter(R -> distance.applyAsInt(rec, R) < this.max_size_inversion).collect(Collectors.toList());
if (others.isEmpty())
continue;
if (!rec.getContig().equals(prevContig)) {
dump(dict, database, vcw, samples, null);
database.clear();
prevContig = rec.getContig();
} else {
final int before = rec.getUnclippedStart() - this.max_size_inversion * 2;
dump(dict, database, vcw, samples, before);
database.entrySet().removeIf(entries -> entries.getKey().getEnd() < before);
}
final Consumer<Arc> registerArc = (A) -> {
if (A.chromEnd <= A.chromStart)
throw new IllegalArgumentException(A.toString());
final Interval rgn = new Interval(rec.getContig(), A.chromStart, A.chromEnd);
List<Arc> list = database.get(rgn);
if (list == null) {
list = new ArrayList<>();
database.put(rgn, list);
}
list.add(A);
};
final Cigar cigar = rec.getCigar();
if (cigar.isLeftClipped()) {
for (final SAMRecord rec2 : others) {
// NO (check disabled): if(rec.getEnd()>= rec2.getStart()) continue;
final Arc arc = new Arc();
arc.sample = sample;
arc.tid = rec.getReferenceIndex();
arc.chromStart = Math.min(rec.getStart(), rec2.getStart());
arc.chromEnd = Math.max(rec.getEnd(), rec2.getEnd());
if (arc.length() > this.max_size_inversion)
continue;
arc.type = SUPPORTING_LEFT;
registerArc.accept(arc);
}
}
if (cigar.isRightClipped()) {
for (final SAMRecord rec2 : others) {
final Arc arc = new Arc();
arc.sample = sample;
arc.tid = rec.getReferenceIndex();
arc.chromStart = Math.min(rec.getStart(), rec2.getStart());
arc.chromEnd = Math.max(rec.getEnd(), rec2.getEnd());
if (arc.length() > this.max_size_inversion)
continue;
arc.type = SUPPORTING_RIGHT;
registerArc.accept(arc);
}
}
}
dump(dict, database, vcw, samples, null);
iter.close();
progress.close();
vcw.close();
vcw = null;
for (final SamReader sr : samReaders.keySet()) {
samReaders.get(sr).close();
sr.close();
}
return 0;
} catch (final Throwable e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(vcw);
}
}
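The multi-BAM plumbing in doWork — one CloseableIterator per SamReader, all merged by MergingSamRecordIterator — reduces to the following sketch. MultiBamScanner is a hypothetical name; filtering, header-merging options, and error handling are simplified relative to the tool above.

import java.nio.file.Path;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;

public class MultiBamScanner {
    /** Iterates the reads of several coordinate-sorted BAMs in merged order. */
    static void scan(final List<Path> bams) throws Exception {
        final Map<SamReader, CloseableIterator<SAMRecord>> readers = new LinkedHashMap<>();
        try {
            for (final Path p : bams) {
                final SamReader sr = SamReaderFactory.makeDefault().open(p);
                readers.put(sr, sr.iterator());
            }
            final SamFileHeaderMerger merger = new SamFileHeaderMerger(
                    SAMFileHeader.SortOrder.coordinate,
                    readers.keySet().stream().map(SamReader::getFileHeader).collect(Collectors.toList()),
                    false);
            try (MergingSamRecordIterator iter = new MergingSamRecordIterator(merger, readers, true)) {
                while (iter.hasNext()) {
                    final SAMRecord rec = iter.next();
                    // process 'rec'
                }
            }
        } finally {
            // close every per-file iterator, then its reader, as doWork does
            for (final Map.Entry<SamReader, CloseableIterator<SAMRecord>> e : readers.entrySet()) {
                e.getValue().close();
                e.getKey().close();
            }
        }
    }
}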