use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.
the class VcfStrechToSvg method run.
private void run(final ArchiveFactory archive, final BedLine bed, final VCFHeader header, final VCFReader in, final PrintWriter manifest) {
LOG.info("processing " + bed);
final Set<String> limitSamples;
if (StringUtils.isBlank(this.sampleStr)) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
} else if (this.sampleStr.startsWith("^")) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
limitSamples.removeAll(Arrays.stream(this.sampleStr.substring(1).split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
} else {
limitSamples = Arrays.stream(this.sampleStr.split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
}
final SAMSequenceDictionary dict = header.getSequenceDictionary();
try (CloseableIterator<VariantContext> iter = in.query(bed)) {
final List<VariantSet> L = iter.stream().filter(V -> acceptVariant(V)).map(V -> new VariantSet(V)).collect(Collectors.toCollection(ArrayList::new));
if (L.isEmpty()) {
LOG.warn("No valid variant found for \"" + bed + "\"");
return;
}
int i = 0;
while (i + 1 < L.size()) {
if (L.get(i).withinDistanceOf(L.get(i + 1), this.withinDistance)) {
L.get(i).variants.addAll(L.get(i + 1).variants);
L.remove(i + 1);
} else {
i++;
}
}
final int margin_left = 50;
final int margin_right = 10;
final double drawingAreaWidth = image_width_pixel - (margin_left + margin_right);
final int intervalLength = L.stream().mapToInt(V -> V.getLengthOnReference()).sum();
double x = 0;
for (i = 0; i < L.size(); i++) {
L.get(i).x = x;
x += (L.get(i).getLengthOnReference() / (double) intervalLength) * drawingAreaWidth;
}
for (i = 0; i < L.size(); i++) {
L.get(i).width = (i + 1 < L.size() ? L.get(i + 1).x : drawingAreaWidth) - L.get(i).x;
}
try {
final DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
final DocumentBuilder dom = db.newDocumentBuilder();
this.document = dom.newDocument();
final Element svgRoot = element("svg");
this.document.appendChild(svgRoot);
final String mainTitleStr = SequenceDictionaryUtils.getBuildName(dict).orElse("") + " " + new SimpleInterval(bed).toNiceString() + " length:" + StringUtils.niceInt(bed.getLengthOnReference());
/* SVG title */
{
final Element title = element("title");
svgRoot.appendChild(title);
title.appendChild(text(mainTitleStr));
}
/* SVG style */
{
final String gtopacity = this.dynamicParams.getOrDefault("gt.opacity", "0.7");
final Element style = element("style");
svgRoot.appendChild(style);
style.appendChild(text(".maintitle {text-anchor:middle;fill:blue} " + ".vc {stroke-width:0.5px;} " + ".transcript {stroke:black;stroke-width:1px;opacity:1;}" + ".exon {stroke:black;stroke-width:0.5px;fill:blue;opacity:1;}" + ".sample {fill:blue;font-size:7px;} " + ".samplelabel {stroke:gray;stroke-width:0.5px;font-size:" + this.dynamicParams.getOrDefault("sample.fontsize", "7") + "px;}\n" + ".coverage { fill:gray; stroke:yellow;opacity:0.2;} " + ".frame { fill:none; stroke: darkgray;} " + ".area0 {fill:white;}\n" + ".area1 {fill:floralwhite;}\n" + "circle.HOM_REF {fill:green;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HET {fill:blue;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HOM_VAR {fill:red;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "a {cursor: pointer;}\n"));
}
/* desc */
{
final Element descr = element("desc");
svgRoot.appendChild(descr);
descr.appendChild(text("Author: Pierre Lindenbaum\n" + JVarkitVersion.getInstance().getCompilationDate() + "\n" + JVarkitVersion.getInstance().getGitHash()));
}
// main title
{
final Element gtitle = element("text", mainTitleStr);
gtitle.setAttribute("class", "maintitle");
gtitle.setAttribute("x", format(this.image_width_pixel / 2.0));
gtitle.setAttribute("y", "15");
svgRoot.appendChild(wrapLoc(gtitle, bed));
}
int margin_top = 50;
double y = margin_top;
final double min_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r1", "1"));
final double max_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r2", "7"));
final Element main_g = element("g");
svgRoot.appendChild(main_g);
/**
* plot genes
*/
if (!this.all_genes.isEmpty()) {
final double transcript_height = 5;
final double exon_height = (transcript_height - 1);
final double save_y = y;
final Element g_genes = element("g");
g_genes.setAttribute("transform", "translate(" + margin_left + ",0)");
main_g.appendChild(g_genes);
/* loop over each vset */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
// find transcript in this vset
final List<Transcript> transcripts = this.all_genes.getOverlapping(vset).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.overlaps(vset)).collect(Collectors.toList());
if (transcripts.isEmpty())
continue;
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_genes.appendChild(g_vset);
// y in this vset
double y2 = save_y;
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
/* loop over transcripts */
for (final Transcript tr : transcripts) {
final Element g_tr = element("g");
g_tr.setAttribute("transform", "translate(0," + format(y2) + ")");
g_vset.appendChild(g_tr);
final Element line = element("line");
line.setAttribute("class", "transcript");
line.setAttribute("x1", format(Math.max(0, base2pixel.applyAsDouble(tr.getStart()))));
line.setAttribute("y1", format(transcript_height / 2.0));
line.setAttribute("x2", format(Math.min(vset.width, base2pixel.applyAsDouble(tr.getEnd()))));
line.setAttribute("y2", format(transcript_height / 2.0));
line.appendChild(element("title", tr.getId()));
g_tr.appendChild(wrapLoc(line, tr));
/* loop over exons */
for (final Exon exon : tr.getExons()) {
if (!exon.overlaps(vset))
continue;
final Element exRect = element("rect");
exRect.setAttribute("class", "exon");
final double x_start = Math.max(0, base2pixel.applyAsDouble(exon.getStart()));
exRect.setAttribute("x", format(x_start));
exRect.setAttribute("y", format(transcript_height / 2.0 - exon_height / 2.0));
final double x_end = Math.min(vset.width, base2pixel.applyAsDouble(exon.getEnd()));
exRect.setAttribute("width", format(x_end - x_start));
exRect.setAttribute("height", format(exon_height));
exRect.appendChild(element("title", exon.getName()));
g_tr.appendChild(wrapLoc(exRect, exon));
}
y2 += transcript_height + 0.5;
}
y = Math.max(y, y2);
}
y++;
}
final double sample_height = Double.parseDouble(this.dynamicParams.getOrDefault("sample.height", "25"));
final double sample_height2 = sample_height - (max_circle_radius * 2.0);
int space_between_samples = 2;
int got_n_samples = 0;
for (final String sn : header.getSampleNamesInOrder()) {
if (!limitSamples.contains(sn))
continue;
boolean got_this_sample = false;
final Element g_sample = element("g");
g_sample.setAttribute("transform", "translate(" + margin_left + "," + format(y) + ")");
/* get coverage */
final int maxCoverage;
if (this.sample2bam.containsKey(sn)) {
final CoverageFactory coverageFactory = new CoverageFactory();
try (SamReader sr = this.openSamReader(this.sample2bam.get(sn))) {
/* loop over each variant set */
for (final VariantSet vset : L) {
vset.coverage = coverageFactory.getSimpleCoverage(sr, vset, sn);
}
}
maxCoverage = L.stream().flatMapToInt(V -> V.coverage.stream()).max().orElse(0);
} else {
maxCoverage = 0;
for (final VariantSet vset : L) {
vset.coverage = null;
}
}
/* loop over each variant set */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_sample.appendChild(g_vset);
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
// plot set length
final Element rect = element("rect");
rect.setAttribute("class", "area" + (i % 2));
rect.setAttribute("x", "0");
rect.setAttribute("y", "0");
rect.setAttribute("width", format(vset.width));
rect.setAttribute("height", format(sample_height));
if (!remove_tooltip)
rect.appendChild(element("title", vset.toString()));
g_vset.appendChild(rect);
// plot coverage
if (maxCoverage > 0 && this.sample2bam.containsKey(sn)) {
final double[] scaled = vset.coverage.scaleAverage((int) vset.width);
final StringBuilder sb = new StringBuilder();
sb.append("0," + sample_height);
for (int t = 0; t < scaled.length; t++) {
if (t > 1 && t + 1 < scaled.length && format(scaled[t - 1]).equals(format(scaled[t + 1])) && format(scaled[t - 1]).equals(format(scaled[t])))
continue;
sb.append(" ").append(t).append(",");
sb.append(format(sample_height * (1.0 - scaled[t] / maxCoverage)));
}
sb.append(" " + format(vset.width) + "," + sample_height);
final Element polyline = element("polyline");
polyline.setAttribute("class", "coverage");
polyline.setAttribute("points", sb.toString());
g_vset.appendChild(polyline);
vset.coverage = null;
}
// plot vertical line if colorTag defined
if (!StringUtils.isBlank(this.colorTag)) {
for (final VariantContext vc : vset.variants) {
if (!vc.hasAttribute(this.colorTag))
continue;
final String cssColor = vc.getAttributeAsString(this.colorTag, "");
if (StringUtils.isBlank(cssColor))
continue;
final double x0 = base2pixel.applyAsDouble(vc.getStart());
final Element line = element("line");
line.setAttribute("class", "vc");
line.setAttribute("style", "stroke:" + cssColor);
line.setAttribute("x1", format(x0));
line.setAttribute("y1", "0");
line.setAttribute("x2", format(x0));
line.setAttribute("y2", format(sample_height));
g_vset.appendChild(line);
}
}
// print all variants in this vcfset for this sample
for (final VariantContext vc : vset.variants) {
final Genotype gt = vc.getGenotype(sn);
if (gt.isNoCall())
continue;
if (hide_hom_ref && gt.isHomRef())
continue;
if (gt.hasGQ() && gt.getGQ() < this.minGQ)
continue;
final OptionalDouble alt_ratio = getAltRatio(gt);
if (!alt_ratio.isPresent())
continue;
final OptionalDouble af = getAF(vc);
final double circle_radius = min_circle_radius + (max_circle_radius - min_circle_radius) * (1.0 - af.orElse(1.0));
// HOMREF=0; HET =0.5; HOMVAR = 1;
final double gtx = base2pixel.applyAsDouble(vc.getStart());
final double gty = sample_height - (sample_height2 * alt_ratio.getAsDouble() + (sample_height - sample_height2) / 2.0);
final Element circle = element("circle");
circle.setAttribute("class", gt.getType().name());
circle.setAttribute("cx", format(gtx));
circle.setAttribute("cy", format(gty));
circle.setAttribute("r", format(circle_radius));
if (!remove_tooltip)
circle.appendChild(element("title", vc.getStart() + " " + (vc.hasID() ? vc.getID() : "") + " " + vc.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/")) + " " + gt.getType().name() + " AF=" + format(af.orElse(-1))));
g_vset.appendChild(wrapLoc(circle, vc));
got_this_sample = true;
}
}
final Element frame_sample = element("rect");
frame_sample.setAttribute("class", "frame");
frame_sample.setAttribute("x", "0");
frame_sample.setAttribute("y", "0");
frame_sample.setAttribute("width", format(drawingAreaWidth));
frame_sample.setAttribute("height", format(sample_height));
g_sample.appendChild(frame_sample);
final Element label = element("text", sn + (maxCoverage == 0 ? "" : " Max Cov. " + maxCoverage));
label.setAttribute("class", "samplelabel");
label.setAttribute("x", "0");
label.setAttribute("y", "0");
// label.setAttribute("transform", "translate("+format(-10)+","+0+") rotate(90) ");
label.setAttribute("transform", "translate(12,12)");
if (!remove_tooltip)
label.appendChild(element("title", sn));
g_sample.appendChild(label);
if (got_this_sample) {
got_n_samples++;
main_g.appendChild(g_sample);
y += sample_height + space_between_samples;
} else {
LOG.warn("no valid data for sample " + sn + " in " + bed);
}
}
// remove extra sample space
y -= space_between_samples;
svgRoot.setAttribute("width", format(this.image_width_pixel + 1));
svgRoot.setAttribute("height", format(y + 1));
if (got_n_samples == 0) {
LOG.info("no sample/genotype found for " + bed);
return;
}
// save
final Transformer tr = TransformerFactory.newInstance().newTransformer();
final String filename = bed.getContig() + "_" + bed.getStart() + "_" + bed.getEnd() + ".svg" + (this.compressed_svg ? ".gz" : "");
LOG.info("writing " + filename);
if (this.compressed_svg) {
try (final OutputStream pw = archive.openOuputStream(filename)) {
try (GZIPOutputStream gzout = new GZIPOutputStream(pw)) {
tr.transform(new DOMSource(this.document), new StreamResult(gzout));
gzout.finish();
gzout.flush();
}
pw.flush();
}
} else {
try (final PrintWriter pw = archive.openWriter(filename)) {
tr.transform(new DOMSource(this.document), new StreamResult(pw));
pw.flush();
}
}
manifest.print(bed.getContig());
manifest.print("\t");
manifest.print(bed.getStart() - 1);
manifest.print("\t");
manifest.print(bed.getEnd());
manifest.print("\t");
manifest.print(filename);
manifest.println();
} catch (final Throwable err) {
throw new RuntimeException(err);
} finally {
this.document = null;
}
}
}
use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.
the class ScanStructuralVariants method doWork.
@Override
public int doWork(final List<String> args) {
final List<VCFReader> casesFiles = new ArrayList<>();
if (this.svComparator.getBndDistance() < 0) {
LOG.error("bad max_distance :" + this.svComparator.getBndDistance());
return -1;
}
VariantContextWriter out = null;
try {
final List<Path> casesPaths = (IOUtils.unrollPaths(args));
if (casesPaths.isEmpty()) {
LOG.error("cases list is empty");
return -1;
}
if (!print_all_ctx && casesPaths.size() == 1) {
LOG.warning("One case: switching to --all");
print_all_ctx = true;
}
if (this.controlsPath.size() == 1 && this.controlsPath.get(0).toString().endsWith(".list")) {
this.controlsPath = Files.lines(this.controlsPath.get(0)).filter(L -> !(L.startsWith("#") || StringUtils.isBlank(L))).map(L -> Paths.get(L)).collect(Collectors.toList());
}
SAMSequenceDictionary dict = null;
final Set<VCFHeaderLine> metadata = new HashSet<>();
for (int side = 0; side < 2; side++) {
for (final Path input : (side == 0 ? casesPaths : this.controlsPath)) {
final VCFReader vcfInput = VCFReaderFactory.makeDefault().open(input);
final VCFHeader header = vcfInput.getHeader();
if (side == 0) {
casesFiles.add(vcfInput);
} else {
vcfInput.close();
}
final SAMSequenceDictionary dict2 = SequenceDictionaryUtils.extractRequired(header);
if (dict == null) {
dict = dict2;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict2)) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict2, dict));
return -1;
}
}
}
final IntervalTreeMap<Boolean> intervalTreeMap;
if (intervalListProvider != null) {
intervalTreeMap = new IntervalTreeMap<>();
intervalListProvider.dictionary(dict).stream().forEach(R -> intervalTreeMap.put(new Interval(R), true));
} else {
intervalTreeMap = null;
}
casesFiles.stream().flatMap(F -> F.getHeader().getMetaDataInInputOrder().stream()).forEach(H -> metadata.add(H));
VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.END_KEY);
metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
metadata.add(new VCFInfoHeaderLine("CIPOS", 2, VCFHeaderLineType.Integer, "Confidence interval around POS for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("CIEND", 2, VCFHeaderLineType.Integer, "Confidence interval around END for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
metadata.add(new VCFInfoHeaderLine(ATT_FILENAME, 1, VCFHeaderLineType.String, "Source of variant"));
metadata.add(new VCFInfoHeaderLine(ATT_CLUSTER, 1, VCFHeaderLineType.String, "Variant cluster"));
/*metadata.add(new VCFFormatHeaderLine(
"OV",1,
VCFHeaderLineType.Integer,
"Number calls (with different sample) overlapping this genotype"
));*/
metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "SV type"));
metadata.add(new VCFFilterHeaderLine(ATT_CONTROL, "Variant is found in controls (max MAF=" + this.max_maf + ")"));
final VCFHeader header = new VCFHeader(metadata);
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final List<ShadowedVcfReader> controlShadowReaders = new ArrayList<>(this.controlsPath.size());
for (int i = 0; i < this.controlsPath.size(); i++) {
boolean large_flag = this.max_control_large_flag < 0 || i >= this.max_control_large_flag;
controlShadowReaders.add(new ShadowedVcfReader(this.controlsPath.get(i), large_flag));
}
out = super.openVariantContextWriter(this.outputFile);
out.writeHeader(header);
final CloseableIterator<VariantContext> iter = casesFiles.get(0).iterator();
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
final Decoy decoy = Decoy.getDefaultInstance();
VariantContext prevCtx = null;
while (iter.hasNext()) {
final VariantContext ctx = progress.apply(iter.next());
if (decoy.isDecoy(ctx.getContig()))
continue;
if (Breakend.parse(ctx).stream().anyMatch(B -> decoy.isDecoy(B.getContig())))
continue;
if (intervalTreeMap != null && !intervalTreeMap.containsOverlapping(ctx))
continue;
// in manta, I see the same variant multiple times in the same vcf
if (prevCtx != null && ctx.getContig().equals(prevCtx.getContig()) && ctx.getStart() == prevCtx.getStart() && ctx.getEnd() == prevCtx.getEnd())
continue;
prevCtx = ctx;
final List<VariantContext> candidate = new ArrayList<>(casesFiles.size());
candidate.add(ctx);
recursive(ctx, candidate, casesFiles, controlShadowReaders, out);
}
iter.close();
progress.close();
out.close();
out = null;
casesFiles.stream().forEach(F -> {
try {
F.close();
} catch (Exception err) {
}
});
controlShadowReaders.stream().forEach(F -> F.realClose());
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.
the class TestNg01 method streamVcf.
static Stream<VariantContext> streamVcf(final File f) {
final VCFReader r = VCFReaderFactory.makeDefault().open(f, false);
final CloseableIterator<VariantContext> iter = r.iterator();
return StreamSupport.stream(new IterableAdapter<VariantContext>(iter).spliterator(), false).onClose(() -> {
iter.close();
CloserUtil.close(r);
});
}
use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.
the class VCFCompareGT method doWork.
@Override
public int doWork(final List<String> arguments) {
if (arguments.isEmpty()) {
LOG.error("VCFs missing.");
return -1;
}
final List<String> vcfLabelList;
if (!StringUtil.isBlank(this.vcfLabelsStr)) {
vcfLabelList = Arrays.stream(this.vcfLabelsStr.split("[,]")).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toList());
if (new HashSet<>(vcfLabelList).size() != arguments.size()) {
LOG.error("bad number of labels in : " + this.vcfLabelsStr);
return -1;
}
if (vcfLabelList.stream().anyMatch(S -> !S.matches("[0-9A-Za-z]+"))) {
LOG.error("bad label in : " + this.vcfLabelsStr);
return -1;
}
} else {
vcfLabelList = new ArrayList<>();
for (int i = 0; i < arguments.size(); ++i) {
vcfLabelList.add(String.valueOf(i + 1));
}
}
VariantComparator varcmp = new VariantComparator();
SortingCollection<Variant> variants = null;
final Set<String> sampleNames = new LinkedHashSet<>();
try {
variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), varcmp, writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPaths());
variants.setDestructiveIteration(true);
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
final BiFunction<String, Integer, String> createName = (SN, IDX) -> SN + "_" + vcfLabelList.get(IDX);
for (int i = 0; i < arguments.size(); ++i) {
final File vcfFile = new File(arguments.get(i));
LOG.info("Opening " + vcfFile);
final VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfFile, false);
final CloseableIterator<VariantContext> iter = vcfFileReader.iterator();
final VCFHeader header = vcfFileReader.getHeader();
sampleNames.addAll(header.getSampleNamesInOrder());
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "_" + vcfLabelList.get(i), "File: " + vcfFile.getPath()));
long nLines = 0;
while (iter.hasNext()) {
final VariantContext var = iter.next();
if (nLines++ % 10000 == 0) {
LOG.info(vcfFile + " " + nLines);
}
if (!this.variantFilter.test(var))
continue;
if (!var.isVariant())
continue;
if (!var.hasGenotypes())
continue;
for (final Genotype genotype : var.getGenotypes()) {
final Variant rec = new Variant();
rec.file_index1 = i + 1;
rec.sampleName = genotype.getSampleName();
rec.chrom = var.getContig();
rec.start = var.getStart();
rec.end = var.getEnd();
rec.ref = var.getReference().getDisplayString().toUpperCase();
if (var.hasID()) {
rec.id = var.getID();
}
if (genotype.hasDP()) {
rec.dp = genotype.getDP();
}
if (genotype.hasGQ()) {
rec.gq = genotype.getGQ();
}
final List<Allele> alleles = genotype.getAlleles();
if (genotype.isNoCall() || !genotype.isAvailable() || alleles == null) {
if (!this.convertNoCallToHomRef)
continue;
rec.a1 = var.getReference().getDisplayString().toUpperCase();
rec.a2 = rec.a1;
} else if (alleles.size() == 1) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = rec.a1;
} else if (alleles.size() == 2) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = alleles.get(1).getDisplayString().toUpperCase();
if (rec.a1.compareTo(rec.a2) > 0) {
final String tmp = rec.a2;
rec.a2 = rec.a1;
rec.a1 = tmp;
}
} else {
continue;
}
variants.add(rec);
}
}
iter.close();
vcfFileReader.close();
}
variants.doneAdding();
LOG.info("Done Adding");
final String GenpotypeChangedKey = "GCH";
final String GenpotypeCreated = "GNW";
final String GenpotypeDiff = "GDF";
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY));
metaData.add(new VCFFormatHeaderLine(GenpotypeChangedKey, 1, VCFHeaderLineType.Integer, "Changed Genotype"));
metaData.add(new VCFFormatHeaderLine(GenpotypeCreated, 1, VCFHeaderLineType.Integer, "Genotype Created/Deleted"));
metaData.add(new VCFInfoHeaderLine(GenpotypeDiff, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples with Genotype Difference"));
metaData.add(new VCFFilterHeaderLine("DISCORDANCE", "something has changed."));
final Set<String> newSampleNames = new TreeSet<>();
for (int i = 0; i < vcfLabelList.size(); ++i) {
for (final String sample : sampleNames) {
newSampleNames.add(createName.apply(sample, i));
}
}
final VCFHeader header = new VCFHeader(metaData, new ArrayList<>(newSampleNames));
final VariantContextWriter w = super.openVariantContextWriter(outputFile);
w.writeHeader(header);
final PosComparator posCompare = new PosComparator();
final EqualRangeIterator<Variant> iter = new EqualRangeIterator<>(variants.iterator(), posCompare);
while (iter.hasNext()) {
final List<Variant> row = iter.next();
/**
* this sample is not always the same
*/
final Set<String> samplesModified = new TreeSet<>();
/**
* the number of sample is different from vcflist.size()
*/
final Set<String> samplesCreates = new TreeSet<>();
final Map<String, List<Variant>> sample2variants = row.stream().collect(Collectors.groupingBy(T -> T.sampleName));
for (final String sn : sample2variants.keySet()) {
boolean all_hom_ref = true;
final List<Variant> sampleVariants = sample2variants.get(sn);
for (int x = 0; x < /*+1 non, besoin de tester hom_ref */
sampleVariants.size(); ++x) {
final Variant var1 = sampleVariants.get(x);
if (!var1.isHomRef())
all_hom_ref = false;
for (int y = x + 1; y < sampleVariants.size(); ++y) {
final Variant var2 = sampleVariants.get(y);
if (var1.a1.equals(var2.a1) && var1.a2.equals(var2.a2))
continue;
samplesModified.add(var1.sampleName);
}
}
if (sampleVariants.size() != arguments.size()) {
if (!convertNoCallToHomRef || (this.convertNoCallToHomRef && !all_hom_ref)) {
samplesCreates.add(sn);
}
}
}
final Variant first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
alleles.add(Allele.create(first.ref, true));
for (final Variant var : row) {
alleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
alleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
}
final VariantContextBuilder b = new VariantContextBuilder(getClass().getName(), first.chrom, first.start, first.end, alleles);
// build genotypes
final List<Genotype> genotypes = new ArrayList<Genotype>();
for (final Variant var : row) {
// alleles for this genotype
final List<Allele> galleles = new ArrayList<Allele>();
galleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
galleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
final GenotypeBuilder gb = new GenotypeBuilder();
gb.DP(var.dp);
gb.alleles(galleles);
gb.name(createName.apply(var.sampleName, var.file_index1 - 1));
gb.GQ(var.gq);
gb.attribute(GenpotypeChangedKey, samplesModified.contains(var.sampleName) ? 1 : 0);
gb.attribute(GenpotypeCreated, samplesCreates.contains(var.sampleName) ? 1 : 0);
genotypes.add(gb.make());
}
b.genotypes(genotypes);
b.id(first.id);
if (!(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
final Set<String> set2 = new TreeSet<String>(samplesModified);
set2.addAll(samplesCreates);
b.attribute(GenpotypeDiff, set2.toArray());
b.filter("DISCORDANCE");
} else {
b.passFilters();
}
if (!only_print_modified || !(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
w.add(b.make());
}
}
iter.close();
w.close();
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (variants != null)
try {
variants.cleanup();
} catch (Exception err) {
}
}
return 0;
}
use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.
the class VCFMerge method doWork.
@Override
public int doWork(final List<String> args) {
final List<Path> userVcfFiles = new ArrayList<Path>();
final Set<String> genotypeSampleNames = new TreeSet<String>();
SAMSequenceDictionary dict = null;
VariantContextWriter w = null;
SortingCollection<VariantContext> array = null;
CloseableIterator<VariantContext> iter = null;
try {
userVcfFiles.addAll(IOUtils.unrollPaths(args));
final Set<String> fieldsSet = Arrays.stream(this.fieldsString.split("[,; ]")).collect(Collectors.toSet());
final boolean with_ac = fieldsSet.contains(VCFConstants.ALLELE_COUNT_KEY);
final boolean with_an = fieldsSet.contains(VCFConstants.ALLELE_NUMBER_KEY);
final boolean with_af = fieldsSet.contains(VCFConstants.ALLELE_FREQUENCY_KEY);
final boolean with_dp = fieldsSet.contains(VCFConstants.DEPTH_KEY);
final boolean with_gq = fieldsSet.contains(VCFConstants.GENOTYPE_QUALITY_KEY);
final boolean with_ad = fieldsSet.contains(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
final boolean with_pl = fieldsSet.contains(VCFConstants.GENOTYPE_PL_KEY);
if (userVcfFiles.isEmpty()) {
LOG.error("No input");
return -1;
}
final boolean requireIndex = !StringUtils.isBlank(this.regionStr);
for (final Path vcfFile : userVcfFiles) {
try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
final VCFHeader header = in.getHeader();
for (final String sn : header.getSampleNamesInOrder()) {
if (genotypeSampleNames.contains(sn)) {
LOG.error("duplicate sample name " + sn);
return -1;
}
genotypeSampleNames.add(sn);
}
final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(header);
if (dict == null) {
dict = dict1;
} else {
SequenceUtil.assertSequenceDictionariesEqual(dict, dict1);
}
}
}
if (dict == null) {
LOG.error("No sequence dictionary defined");
return -1;
}
final SAMSequenceDictionary finalDict = dict;
final Function<String, Integer> contig2tid = C -> {
final int tid = finalDict.getSequenceIndex(C);
if (tid == -1)
throw new JvarkitException.ContigNotFoundInDictionary(C, finalDict);
return tid;
};
final Comparator<String> compareContigs = (C1, C2) -> {
if (C1.equals(C2))
return 0;
return contig2tid.apply(C1) - contig2tid.apply(C2);
};
final Comparator<VariantContext> compareChromPos = (V1, V2) -> {
int i = compareContigs.compare(V1.getContig(), V2.getContig());
if (i != 0)
return i;
return V1.getStart() - V2.getStart();
};
final Comparator<VariantContext> compareChromPosRef = (V1, V2) -> {
int i = compareChromPos.compare(V1, V2);
if (i != 0)
return i;
return V1.getReference().compareTo(V2.getReference());
};
final SimpleInterval rgn;
final Predicate<VariantContext> accept;
if (!StringUtil.isBlank(VCFMerge.this.regionStr)) {
rgn = IntervalParserFactory.newInstance().dictionary(dict).enableWholeContig().make().apply(VCFMerge.this.regionStr).orElseThrow(IntervalParserFactory.exception(VCFMerge.this.regionStr));
accept = (CTX) -> {
return rgn.overlaps(CTX);
};
} else {
accept = (VOL) -> true;
rgn = null;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
if (with_dp)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
if (with_gq)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_QUALITY_KEY);
if (with_pl)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_PL_KEY);
if (with_ad)
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
if (with_ac)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_COUNT_KEY);
if (with_an)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_NUMBER_KEY);
if (with_af)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_FREQUENCY_KEY);
if (with_dp)
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
final VCFHeader mergedHeader = new VCFHeader(metaData, genotypeSampleNames);
mergedHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, mergedHeader);
array = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(mergedHeader), compareChromPosRef, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
array.setDestructiveIteration(true);
for (final Path vcfFile : userVcfFiles) {
try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
try (CloseableIterator<VariantContext> lit = (in.isQueryable() && rgn != null ? in.query(rgn) : in.iterator())) {
while (lit.hasNext()) {
final VariantContext ctx = lit.next();
if (!accept.test(ctx))
continue;
array.add(new VariantContextBuilder(ctx).unfiltered().genotypes(ctx.getGenotypes().stream().filter(G -> G.isCalled()).map(G -> {
final GenotypeBuilder gb = new GenotypeBuilder(G);
gb.noAttributes();
return gb.make();
}).collect(Collectors.toList())).rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make());
}
}
}
}
array.doneAdding();
LOG.info("merging..." + userVcfFiles.size() + " vcfs");
// create the context writer
w = this.writingVariantsDelegate.open(outputFile);
w.writeHeader(mergedHeader);
iter = array.iterator();
EqualRangeIterator<VariantContext> eqiter = new EqualRangeIterator<>(iter, compareChromPosRef);
while (eqiter.hasNext()) {
final List<VariantContext> row = eqiter.next();
final VariantContext first = row.get(0);
final List<Allele> alleles = new ArrayList<>();
alleles.add(first.getReference());
alleles.addAll(row.stream().flatMap(VC -> VC.getAlternateAlleles().stream()).filter(A -> !A.isNoCall()).collect(Collectors.toSet()));
final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), first.getEnd(), alleles);
final String id = row.stream().filter(V -> V.hasID()).map(ST -> ST.getID()).findFirst().orElse(null);
if (!StringUtils.isBlank(id)) {
vcb.id(id);
}
final Map<String, Genotype> sample2genotypes = new HashMap<>(genotypeSampleNames.size());
final Set<String> remainingSamples = new HashSet<String>(genotypeSampleNames);
int an = 0;
int dp = -1;
final Counter<Allele> ac = new Counter<>();
for (final VariantContext ctx : row) {
for (final Genotype gt : ctx.getGenotypes()) {
if (gt.isNoCall())
continue;
for (Allele a : gt.getAlleles()) {
ac.incr(a);
an++;
}
final GenotypeBuilder gb = new GenotypeBuilder(gt.getSampleName(), gt.getAlleles());
if (with_dp && gt.hasDP()) {
if (dp < 0)
dp = 0;
dp += gt.getDP();
gb.DP(gt.getDP());
}
if (with_gq && gt.hasGQ())
gb.GQ(gt.getGQ());
if (with_pl && gt.hasPL())
gb.PL(gt.getPL());
if (with_ad && gt.hasAD()) {
final int[] src_ad = gt.getAD();
final int[] dest_ad = new int[alleles.size()];
Arrays.fill(dest_ad, 0);
for (int i = 0; i < src_ad.length && i < ctx.getAlleles().size(); i++) {
final Allele a1 = ctx.getAlleles().get(i);
final int dest_idx = alleles.indexOf(a1);
if (dest_idx >= 0 && dest_idx < dest_ad.length) {
dest_ad[dest_idx] = src_ad[i];
}
gb.AD(dest_ad);
}
}
sample2genotypes.put(gt.getSampleName(), gb.make());
}
}
remainingSamples.removeAll(sample2genotypes.keySet());
for (String sampleName : remainingSamples) {
final Genotype gt;
if (this.useHomRefForUnknown) {
final List<Allele> list = new ArrayList<>(ploidy);
for (int i = 0; i < ploidy; i++) list.add(first.getReference());
an += ploidy;
gt = GenotypeBuilder.create(sampleName, list);
} else {
gt = GenotypeBuilder.createMissing(sampleName, this.ploidy);
}
sample2genotypes.put(sampleName, gt);
}
if (with_an)
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, an);
if (with_ac)
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, alleles.subList(1, alleles.size()).stream().mapToInt(A -> (int) ac.count(A)).toArray());
if (with_af && an > 0) {
final double finalAn = an;
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleles.subList(1, alleles.size()).stream().mapToDouble(A -> ac.count(A) / finalAn).toArray());
}
if (with_dp && dp >= 0) {
vcb.attribute(VCFConstants.DEPTH_KEY, dp);
}
vcb.genotypes(sample2genotypes.values());
w.add(vcb.make());
}
eqiter.close();
CloserUtil.close(w);
w = null;
array.cleanup();
array = null;
CloserUtil.close(iter);
iter = null;
return 0;
} catch (Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(iter);
if (array != null)
array.cleanup();
}
}
Aggregations