use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VCFBed method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator vcfin, final VariantContextWriter out) {
final VCFHeader header = vcfin.getHeader();
final VCFHeader h2 = new VCFHeader(header);
final VCFInfoHeaderLine infoHeader = new VCFInfoHeaderLine(this.infoName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "metadata added from " + this.inputBedFile + " . Format was " + this.formatPattern.replaceAll("[\"\'\\\\]", " "));
final VCFInfoHeaderLine infoCountBasicOverlap = new VCFInfoHeaderLine(this.infoName + "_N", 1, VCFHeaderLineType.Integer, "Number of raw overlap within distance " + this.within_distance + "bp.");
final VCFInfoHeaderLine infoCountFinerOverlap = new VCFInfoHeaderLine(this.infoName + "_C", 1, VCFHeaderLineType.Integer, "Number of finer overlap within distance " + inputBedFile + " and overlap bed:" + this.min_overlap_bed_fraction + " and overlap vcf: " + this.min_overlap_vcf_fraction);
if (infoCountBasicOverlap != null)
h2.addMetaDataLine(infoCountBasicOverlap);
if (infoCountFinerOverlap != null)
h2.addMetaDataLine(infoCountFinerOverlap);
h2.addMetaDataLine(infoHeader);
JVarkitVersion.getInstance().addMetaData(this, h2);
out.writeHeader(h2);
while (vcfin.hasNext()) {
final VariantContext ctx = vcfin.next();
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.rmAttribute(infoCountBasicOverlap.getID());
vcb.rmAttribute(infoCountFinerOverlap.getID());
vcb.rmAttribute(infoHeader.getID());
if (this.ignoreFILTERed && ctx.isFiltered()) {
out.add(vcb.make());
continue;
}
final String normalizedContig = this.contigNameConverter.apply(ctx.getContig());
if (StringUtil.isBlank(normalizedContig)) {
out.add(ctx);
continue;
}
final Optional<? extends Locatable> bndOpt = this.disable_bnd_eval ? Optional.empty() : Breakend.parseInterval(ctx);
final SimpleInterval theInterval;
if (bndOpt.isPresent()) {
theInterval = new SimpleInterval(normalizedContig, bndOpt.get().getStart(), bndOpt.get().getEnd());
} else {
theInterval = new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd());
}
final SimpleInterval extendedInterval;
if (this.within_distance <= 0) {
extendedInterval = theInterval;
} else {
extendedInterval = new SimpleInterval(theInterval.getContig(), Math.max(1, theInterval.getStart() + this.within_distance), theInterval.getEnd() + this.within_distance);
}
int count_basic_overlap = 0;
int count_finer_overlap = 0;
final Set<String> annotations = new LinkedHashSet<>();
if (this.intervalTreeMap != null) {
for (final Set<BedLine> bedLines : this.intervalTreeMap.getOverlapping(extendedInterval)) {
for (final BedLine bedLine : bedLines) {
count_basic_overlap++;
if (!testFinerIntersection(theInterval, bedLine))
continue;
count_finer_overlap++;
final String newannot = this.bedJexlToString.apply(new BedJEXLContext(bedLine, ctx));
if (!StringUtil.isBlank(newannot)) {
annotations.add(VCFUtils.escapeInfoField(newannot));
}
}
}
} else {
try (CloseableIterator<BedLine> iter = this.bedReader.iterator(theInterval.getContig(), Math.max(0, theInterval.getStart() - 1), theInterval.getEnd() + 1)) {
while (iter.hasNext()) {
final BedLine bedLine = iter.next();
if (!theInterval.contigsMatch(bedLine))
continue;
if (!extendedInterval.withinDistanceOf(bedLine, this.within_distance))
continue;
count_basic_overlap++;
if (!testFinerIntersection(theInterval, bedLine))
continue;
count_finer_overlap++;
final String newannot = this.bedJexlToString.apply(new BedJEXLContext(bedLine, ctx));
if (!StringUtil.isBlank(newannot))
annotations.add(VCFUtils.escapeInfoField(newannot));
}
} catch (final IOException ioe) {
LOG.error(ioe);
throw new RuntimeIOException(ioe);
}
}
if (!annotations.isEmpty()) {
vcb.attribute(infoHeader.getID(), annotations.toArray());
}
vcb.attribute(infoCountBasicOverlap.getID(), count_basic_overlap);
vcb.attribute(infoCountFinerOverlap.getID(), count_finer_overlap);
out.add(vcb.make());
}
out.close();
return 0;
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VCFMerge method doWork.
@Override
public int doWork(final List<String> args) {
final List<Path> userVcfFiles = new ArrayList<Path>();
final Set<String> genotypeSampleNames = new TreeSet<String>();
SAMSequenceDictionary dict = null;
VariantContextWriter w = null;
SortingCollection<VariantContext> array = null;
CloseableIterator<VariantContext> iter = null;
try {
userVcfFiles.addAll(IOUtils.unrollPaths(args));
final Set<String> fieldsSet = Arrays.stream(this.fieldsString.split("[,; ]")).collect(Collectors.toSet());
final boolean with_ac = fieldsSet.contains(VCFConstants.ALLELE_COUNT_KEY);
final boolean with_an = fieldsSet.contains(VCFConstants.ALLELE_NUMBER_KEY);
final boolean with_af = fieldsSet.contains(VCFConstants.ALLELE_FREQUENCY_KEY);
final boolean with_dp = fieldsSet.contains(VCFConstants.DEPTH_KEY);
final boolean with_gq = fieldsSet.contains(VCFConstants.GENOTYPE_QUALITY_KEY);
final boolean with_ad = fieldsSet.contains(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
final boolean with_pl = fieldsSet.contains(VCFConstants.GENOTYPE_PL_KEY);
if (userVcfFiles.isEmpty()) {
LOG.error("No input");
return -1;
}
final boolean requireIndex = !StringUtils.isBlank(this.regionStr);
for (final Path vcfFile : userVcfFiles) {
try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
final VCFHeader header = in.getHeader();
for (final String sn : header.getSampleNamesInOrder()) {
if (genotypeSampleNames.contains(sn)) {
LOG.error("duplicate sample name " + sn);
return -1;
}
genotypeSampleNames.add(sn);
}
final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(header);
if (dict == null) {
dict = dict1;
} else {
SequenceUtil.assertSequenceDictionariesEqual(dict, dict1);
}
}
}
if (dict == null) {
LOG.error("No sequence dictionary defined");
return -1;
}
final SAMSequenceDictionary finalDict = dict;
final Function<String, Integer> contig2tid = C -> {
final int tid = finalDict.getSequenceIndex(C);
if (tid == -1)
throw new JvarkitException.ContigNotFoundInDictionary(C, finalDict);
return tid;
};
final Comparator<String> compareContigs = (C1, C2) -> {
if (C1.equals(C2))
return 0;
return contig2tid.apply(C1) - contig2tid.apply(C2);
};
final Comparator<VariantContext> compareChromPos = (V1, V2) -> {
int i = compareContigs.compare(V1.getContig(), V2.getContig());
if (i != 0)
return i;
return V1.getStart() - V2.getStart();
};
final Comparator<VariantContext> compareChromPosRef = (V1, V2) -> {
int i = compareChromPos.compare(V1, V2);
if (i != 0)
return i;
return V1.getReference().compareTo(V2.getReference());
};
final SimpleInterval rgn;
final Predicate<VariantContext> accept;
if (!StringUtil.isBlank(VCFMerge.this.regionStr)) {
rgn = IntervalParserFactory.newInstance().dictionary(dict).enableWholeContig().make().apply(VCFMerge.this.regionStr).orElseThrow(IntervalParserFactory.exception(VCFMerge.this.regionStr));
accept = (CTX) -> {
return rgn.overlaps(CTX);
};
} else {
accept = (VOL) -> true;
rgn = null;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
if (with_dp)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
if (with_gq)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_QUALITY_KEY);
if (with_pl)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_PL_KEY);
if (with_ad)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
if (with_ac)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_COUNT_KEY);
if (with_an)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_NUMBER_KEY);
if (with_af)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_FREQUENCY_KEY);
if (with_dp)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
final VCFHeader mergedHeader = new VCFHeader(metaData, genotypeSampleNames);
mergedHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, mergedHeader);
array = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(mergedHeader), compareChromPosRef, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
array.setDestructiveIteration(true);
for (final Path vcfFile : userVcfFiles) {
try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
try (CloseableIterator<VariantContext> lit = (in.isQueryable() && rgn != null ? in.query(rgn) : in.iterator())) {
while (lit.hasNext()) {
final VariantContext ctx = lit.next();
if (!accept.test(ctx))
continue;
array.add(new VariantContextBuilder(ctx).unfiltered().genotypes(ctx.getGenotypes().stream().filter(G -> G.isCalled()).map(G -> {
final GenotypeBuilder gb = new GenotypeBuilder(G);
gb.noAttributes();
return gb.make();
}).collect(Collectors.toList())).rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make());
}
}
}
}
array.doneAdding();
LOG.info("merging..." + userVcfFiles.size() + " vcfs");
// create the context writer
w = this.writingVariantsDelegate.open(outputFile);
w.writeHeader(mergedHeader);
iter = array.iterator();
EqualRangeIterator<VariantContext> eqiter = new EqualRangeIterator<>(iter, compareChromPosRef);
while (eqiter.hasNext()) {
final List<VariantContext> row = eqiter.next();
final VariantContext first = row.get(0);
final List<Allele> alleles = new ArrayList<>();
alleles.add(first.getReference());
alleles.addAll(row.stream().flatMap(VC -> VC.getAlternateAlleles().stream()).filter(A -> !A.isNoCall()).collect(Collectors.toSet()));
final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), first.getEnd(), alleles);
final String id = row.stream().filter(V -> V.hasID()).map(ST -> ST.getID()).findFirst().orElse(null);
if (!StringUtils.isBlank(id)) {
vcb.id(id);
}
final Map<String, Genotype> sample2genotypes = new HashMap<>(genotypeSampleNames.size());
final Set<String> remainingSamples = new HashSet<String>(genotypeSampleNames);
int an = 0;
int dp = -1;
final Counter<Allele> ac = new Counter<>();
for (final VariantContext ctx : row) {
for (final Genotype gt : ctx.getGenotypes()) {
if (gt.isNoCall())
continue;
for (Allele a : gt.getAlleles()) {
ac.incr(a);
an++;
}
final GenotypeBuilder gb = new GenotypeBuilder(gt.getSampleName(), gt.getAlleles());
if (with_dp && gt.hasDP()) {
if (dp < 0)
dp = 0;
dp += gt.getDP();
gb.DP(gt.getDP());
}
if (with_gq && gt.hasGQ())
gb.GQ(gt.getGQ());
if (with_pl && gt.hasPL())
gb.PL(gt.getPL());
if (with_ad && gt.hasAD()) {
final int[] src_ad = gt.getAD();
final int[] dest_ad = new int[alleles.size()];
Arrays.fill(dest_ad, 0);
for (int i = 0; i < src_ad.length && i < ctx.getAlleles().size(); i++) {
final Allele a1 = ctx.getAlleles().get(i);
final int dest_idx = alleles.indexOf(a1);
if (dest_idx >= 0 && dest_idx < dest_ad.length) {
dest_ad[dest_idx] = src_ad[i];
}
gb.AD(dest_ad);
}
}
sample2genotypes.put(gt.getSampleName(), gb.make());
}
}
remainingSamples.removeAll(sample2genotypes.keySet());
for (String sampleName : remainingSamples) {
final Genotype gt;
if (this.useHomRefForUnknown) {
final List<Allele> list = new ArrayList<>(ploidy);
for (int i = 0; i < ploidy; i++) list.add(first.getReference());
an += ploidy;
gt = GenotypeBuilder.create(sampleName, list);
} else {
gt = GenotypeBuilder.createMissing(sampleName, this.ploidy);
}
sample2genotypes.put(sampleName, gt);
}
if (with_an)
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, an);
if (with_ac)
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, alleles.subList(1, alleles.size()).stream().mapToInt(A -> (int) ac.count(A)).toArray());
if (with_af && an > 0) {
final double finalAn = an;
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleles.subList(1, alleles.size()).stream().mapToDouble(A -> ac.count(A) / finalAn).toArray());
}
if (with_dp && dp >= 0) {
vcb.attribute(VCFConstants.DEPTH_KEY, dp);
}
vcb.genotypes(sample2genotypes.values());
w.add(vcb.make());
}
eqiter.close();
CloserUtil.close(w);
w = null;
array.cleanup();
array = null;
CloserUtil.close(iter);
iter = null;
return 0;
} catch (Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(iter);
if (array != null)
array.cleanup();
}
}
Aggregations