use of com.github.lindenb.jvarkit.util.bio.DistanceParser in project jvarkit by lindenb.
the class CnvSlidingWindow method doWork.
@Override
public int doWork(final List<String> args) {
final List<Sample> samples = new ArrayList<>();
try {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
final DistanceParser distParser = new DistanceParser();
final int[] windows_array = Arrays.stream(CharSplitter.SEMICOLON.split(windowDefs)).filter(S -> !StringUtils.isBlank(S)).mapToInt(N -> distParser.applyAsInt(N)).toArray();
if (windows_array.length == 0) {
LOG.error("No window defined.");
return -1;
}
if (windows_array.length % 2 != 0) {
LOG.error("odd number of windows ? " + this.windowDefs);
return -1;
}
final List<Path> inputBams = IOUtils.unrollPaths(args);
if (inputBams.isEmpty()) {
LOG.error("input bam file missing.");
return -1;
}
final Set<String> sampleNames = new TreeSet<>();
for (final Path samFile : inputBams) {
final Sample sample = new Sample(samFile);
if (sampleNames.contains(sample.name)) {
LOG.error("duplicate sample " + sample.name);
sample.close();
return -1;
}
sampleNames.add(sample.name);
samples.add(sample);
SequenceUtil.assertSequenceDictionariesEqual(dict, sample.dict);
}
final List<Locatable> contigs = dict.getSequences().stream().filter(SR -> !SR.getSequenceName().matches(this.excludeRegex)).collect(Collectors.toCollection(ArrayList::new));
if (this.excludeBedPath != null) {
final BedLineCodec bedLineCodec = new BedLineCodec();
final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(dict);
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
final List<SimpleInterval> exclude = br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> bedLineCodec.decode(L)).filter(B -> B != null && !StringUtils.isBlank(ctgConverter.apply(B.getContig()))).map(B -> new SimpleInterval(ctgConverter.apply(B.getContig()), B.getStart(), B.getEnd())).collect(Collectors.toList());
boolean done = false;
while (!done) {
done = true;
int i = 0;
while (i < contigs.size()) {
final Locatable contig = contigs.get(i);
final Locatable overlapper = exclude.stream().filter(EX -> EX.overlaps(contig)).findAny().orElse(null);
if (overlapper != null) {
contigs.remove(i);
contigs.addAll(split(contig, overlapper));
done = false;
} else {
i++;
}
}
}
}
}
contigs.sort(new ContigDictComparator(dict).createLocatableComparator());
final Allele ref_allele = Allele.create("N", true);
final Allele dup_allele = Allele.create("<DUP>", false);
final Allele del_allele = Allele.create("<DEL>", false);
final Function<Integer, List<Allele>> cnv2allele = CNV -> {
switch(CNV) {
case 0:
return Arrays.asList(ref_allele, ref_allele);
case 1:
return Arrays.asList(ref_allele, dup_allele);
case 2:
return Arrays.asList(dup_allele, dup_allele);
case -1:
return Arrays.asList(ref_allele, del_allele);
case -2:
return Arrays.asList(del_allele, del_allele);
default:
throw new IllegalArgumentException("cnv:" + CNV);
}
};
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
final VCFFormatHeaderLine fmtLeftCov = new VCFFormatHeaderLine("LC", 1, VCFHeaderLineType.Float, "Left median coverage.");
final VCFFormatHeaderLine fmtMidCov = new VCFFormatHeaderLine("MC", 1, VCFHeaderLineType.Float, "Middle median coverage.");
final VCFFormatHeaderLine fmtRightCov = new VCFFormatHeaderLine("RC", 1, VCFHeaderLineType.Float, "right median coverage.");
final VCFFormatHeaderLine fmtLeftMedian = new VCFFormatHeaderLine("LM", 1, VCFHeaderLineType.Float, "Left normalized median coverage.");
final VCFFormatHeaderLine fmtMidMedian = new VCFFormatHeaderLine("MM", 1, VCFHeaderLineType.Float, "Middle normalized median coverage.");
final VCFFormatHeaderLine fmtRightMedian = new VCFFormatHeaderLine("RM", 1, VCFHeaderLineType.Float, "right normalized median coverage.");
metaData.add(fmtLeftCov);
metaData.add(fmtMidCov);
metaData.add(fmtRightCov);
metaData.add(fmtLeftMedian);
metaData.add(fmtMidMedian);
metaData.add(fmtRightMedian);
final VCFHeader header = new VCFHeader(metaData, sampleNames);
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final VariantContextWriter vcw = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
vcw.writeHeader(header);
for (final Locatable contig : contigs) {
System.gc();
final short[] array = new short[contig.getLengthOnReference()];
final SortingCollection<Gt> sorter = SortingCollection.newInstance(Gt.class, new GtCodec(), (A, B) -> A.compare1(B), this.sorting.getMaxRecordsInRam(), this.sorting.getTmpPaths());
for (int bam_index = 0; bam_index < samples.size(); bam_index++) {
final Sample sampleBam = samples.get(bam_index);
Arrays.fill(array, (short) 0);
try (SAMRecordIterator iter = sampleBam.samReader.queryOverlapping(contig.getContig(), contig.getStart(), contig.getEnd())) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
int refPos = rec.getStart();
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int i = 0; i < ce.getLength(); i++) {
final int idx = refPos - contig.getStart() + i;
if (idx < 0)
continue;
if (idx >= array.length)
break;
if (array[idx] == Short.MAX_VALUE)
continue;
array[idx]++;
}
}
refPos += ce.getLength();
}
}
}
}
for (int widx = 0; widx < windows_array.length; widx += 2) {
final int window_size = windows_array[widx + 0];
final int extend = (int) Math.ceil(window_size * this.extend);
if (extend <= 0)
continue;
if (window_size > contig.getLengthOnReference())
continue;
final int window_shift = windows_array[widx + 1];
LOG.info(contig + " " + window_size + "+-" + extend + ";" + window_shift + " " + sampleBam.name);
final Coverage leftcov = new Coverage(extend);
final Coverage rightcov = new Coverage(extend);
final Coverage leftrightcov = new Coverage(extend + extend);
final Coverage midcov = new Coverage(window_size);
for (int pos1 = contig.getStart(); pos1 + window_size + extend + extend < contig.getEnd(); pos1 += window_shift) {
leftcov.reset();
rightcov.reset();
leftrightcov.reset();
midcov.reset();
for (int x = 0; x < extend; x++) {
final int idx = pos1 - contig.getStart() + x;
leftcov.add(array[idx]);
leftrightcov.add(array[idx]);
}
final double leftMedian = leftcov.median();
if (leftMedian < this.min_depth)
continue;
for (int x = 0; x < extend; x++) {
final int idx = pos1 - contig.getStart() + extend + window_size + x;
rightcov.add(array[idx]);
leftrightcov.add(array[idx]);
}
final double rightMedian = rightcov.median();
if (rightMedian < this.min_depth)
continue;
for (int x = 0; x < window_size; x++) {
final int idx = pos1 - contig.getStart() + extend + x;
midcov.add(array[idx]);
}
final double median = leftrightcov.median();
if (rightcov.median() < this.min_depth)
continue;
for (int x = 0; x < midcov.count; x++) {
midcov.array[x] /= median;
}
for (int x = 0; x < leftcov.count; x++) {
leftcov.array[x] /= median;
}
for (int x = 0; x < rightcov.count; x++) {
rightcov.array[x] /= median;
}
final double norm_depth = midcov.median();
final int cnv = getCNVIndex(norm_depth);
if (cnv != CNV_UNDEFINED && cnv != 1 && getCNVIndex(leftcov.median()) == 1 && getCNVIndex(rightcov.median()) == 1) {
final Gt gt = new Gt();
gt.start = pos1 + extend;
gt.end = gt.start + window_size;
gt.sample_idx = bam_index;
gt.cnv = cnv;
gt.leftMedian = (float) leftMedian;
gt.midMedian = (float) median;
gt.rightMedian = (float) rightMedian;
gt.leftCov = (float) leftcov.median();
gt.midCov = (float) norm_depth;
gt.rightCov = (float) rightcov.median();
sorter.add(gt);
}
}
}
}
sorter.setDestructiveIteration(true);
try (CloseableIterator<Gt> iter = sorter.iterator()) {
final EqualRangeIterator<Gt> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare0(B));
while (eq.hasNext()) {
final List<Gt> row = eq.next();
if (row.isEmpty())
continue;
final Gt first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
row.stream().flatMap(GT -> cnv2allele.apply(GT.cnv).stream()).forEach(CNV -> alleles.add(CNV));
alleles.add(ref_allele);
if (alleles.size() < 2)
continue;
final VariantContextBuilder vcb = new VariantContextBuilder(null, contig.getContig(), first.start, first.end, alleles);
vcb.attribute(VCFConstants.END_KEY, first.end);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
for (final Gt gt : row) {
final GenotypeBuilder gb = new GenotypeBuilder(samples.get(gt.sample_idx).name, cnv2allele.apply(gt.cnv));
gb.attribute(fmtLeftMedian.getID(), (double) gt.leftMedian);
gb.attribute(fmtMidMedian.getID(), (double) gt.midMedian);
gb.attribute(fmtRightMedian.getID(), (double) gt.rightMedian);
gb.attribute(fmtLeftCov.getID(), (double) gt.leftCov);
gb.attribute(fmtMidCov.getID(), (double) gt.midCov);
gb.attribute(fmtRightCov.getID(), (double) gt.rightCov);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
vcw.add(vcb.make());
}
eq.close();
}
sorter.cleanup();
}
vcw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
samples.forEach(S -> CloserUtil.close(S));
}
}
Aggregations