Use of htsjdk.variant.vcf.VCFReader in the project jvarkit by lindenb.
The class MantaMerger, method doWork:
/**
 * Merges structural variants from multiple Manta single-sample VCFs into one
 * multi-sample VCF. Variants judged equivalent by {@code this.svComparator}
 * are collapsed into a single record; genotypes are REF/ALT for samples
 * carrying the variant and REF/REF otherwise.
 *
 * @param args either one file ending in ".list" (lines: {@code vcf-path[TAB sample-name]},
 *             blank lines and "#" comments ignored) or such entries given directly.
 * @return 0 on success, -1 on failure.
 */
@Override
public int doWork(final List<String> args) {
	VariantContextWriter out = null;
	try {
		// sample name -> input VCF; TreeMap keeps samples in sorted order
		final Map<String, VcfInput> sample2inputs = new TreeMap<>();
		SAMSequenceDictionary dict = null;
		final List<String> lines;
		if (args.size() == 1 && args.get(0).endsWith(".list")) {
			// readAllLines instead of an unclosed Files.lines() stream:
			// the previous code leaked the underlying file handle.
			lines = Files.readAllLines(Paths.get(args.get(0))).stream()
				.filter(L -> !(StringUtils.isBlank(L) || L.startsWith("#")))
				.collect(Collectors.toList());
		} else {
			lines = args;
		}
		for (final String line : lines) {
			final String[] tokens = CharSplitter.TAB.split(line);
			final VcfInput vcfInput = new VcfInput();
			vcfInput.vcfPath = Paths.get(tokens[0]);
			IOUtil.assertFileIsReadable(vcfInput.vcfPath);
			// every input must share the same sequence dictionary
			final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(vcfInput.vcfPath);
			if (dict == null) {
				dict = dict1;
			} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict1)) {
				throw new JvarkitException.DictionariesAreNotTheSame(dict1, dict);
			}
			if (tokens.length < 2 || StringUtils.isBlank(tokens[1])) {
				// no explicit sample name: use the VCF's single sample, else fall back to the path
				try (VCFReader r = VCFReaderFactory.makeDefault().open(vcfInput.vcfPath, false)) {
					final List<String> snl = r.getHeader().getSampleNamesInOrder();
					if (snl.size() == 1) {
						vcfInput.sample = snl.get(0);
					} else {
						vcfInput.sample = vcfInput.vcfPath.toString();
					}
				}
			} else {
				vcfInput.sample = tokens[1];
			}
			if (sample2inputs.containsKey(vcfInput.sample)) {
				LOG.error("duplicate sample " + vcfInput.sample);
				return -1;
			}
			sample2inputs.put(vcfInput.sample, vcfInput);
		}
		if (sample2inputs.isEmpty()) {
			LOG.error("no input found");
			return -1;
		}
		if (!StringUtils.isBlank(this.limitContig) && dict.getSequence(this.limitContig) == null) {
			LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(this.limitContig, dict));
			return -1;
		}
		// predicate rejecting variants overlapping the optional exclude BED
		final Predicate<VariantContext> bedPredicate;
		if (this.excludeBedPath != null) {
			final BedLineCodec codec = new BedLineCodec();
			final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
			final IntervalTreeMap<Boolean> map = new IntervalTreeMap<>();
			try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
				br.lines()
					.filter(L -> !BedLine.isBedHeader(L))
					.map(L -> codec.decode(L))
					.filter(L -> L != null)
					.filter(B -> !StringUtils.isBlank(converter.apply(B.getContig())))
					.map(B -> new Interval(converter.apply(B.getContig()), B.getStart(), B.getEnd()))
					.forEach(R -> map.put(R, true));
			}
			bedPredicate = V -> !map.containsOverlapping(V);
		} else {
			bedPredicate = V -> true;
		}
		final Set<VCFHeaderLine> metaData = new HashSet<>();
		metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
		metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
		metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
		metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
		metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
		metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
		final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation length");
		metaData.add(infoSvLen);
		final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples");
		metaData.add(infoNSamples);
		// BUGFIX: description was the truncated string "amples"
		final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples");
		metaData.add(infoSamples);
		final VCFFilterHeaderLine filterSameNext = new VCFFilterHeaderLine("NEXT", "next variant in VCF is the same.");
		metaData.add(filterSameNext);
		// BUGFIX: description was copy-pasted from NEXT; PREV refers to the previous variant
		final VCFFilterHeaderLine filterSamePrev = new VCFFilterHeaderLine("PREV", "previous variant in VCF is the same.");
		metaData.add(filterSamePrev);
		final VCFHeader header = new VCFHeader(metaData, sample2inputs.keySet());
		header.setSequenceDictionary(dict);
		JVarkitVersion.getInstance().addMetaData(this, header);
		out = VCFUtils.createVariantContextWriterToPath(this.outputFile);
		out.writeHeader(header);
		final Decoy decoy = Decoy.getDefaultInstance();
		for (final SAMSequenceRecord ssr : dict.getSequences()) {
			if (!StringUtils.isBlank(this.limitContig)) {
				if (!ssr.getSequenceName().equals(this.limitContig))
					continue;
			}
			LOG.info("contig " + ssr.getSequenceName());
			if (decoy.isDecoy(ssr.getSequenceName()))
				continue;
			// archetype variant -> set of samples carrying an equivalent variant
			final Map<SVKey, Set<String>> variants2samples = new HashMap<>();
			// pass 1: collect every candidate SV on this contig from every input
			for (final VcfInput vcfinput : sample2inputs.values()) {
				vcfinput.contigCount = 0; // reset count for this contig
				try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
					vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength()).stream()
						.filter(V -> discard_bnd == false || !V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
						.filter(bedPredicate)
						.map(V -> new VariantContextBuilder(V)
							.unfiltered().noID().noGenotypes()
							// strip per-call Manta attributes that are meaningless after merging
							.rmAttribute("EVENT").rmAttribute("HOMSEQ").rmAttribute("HOMLEN")
							.rmAttribute("SVINSSEQ").rmAttribute("SVINSLEN").rmAttribute("MATEID")
							.rmAttribute("LEFT_SVINSSEQ").rmAttribute("RIGHT_SVINSSEQ")
							.rmAttribute("BND_DEPTH").rmAttribute("MATE_BND_DEPTH")
							.rmAttribute("JUNCTION_QUAL").rmAttribute("CIGAR")
							.make())
						.forEach(V -> {
							final SVKey key1 = new SVKey(V);
							if (!svComparator.test(V, V))
								throw new RuntimeException("compare to self failed ! " + V);
							variants2samples.put(key1, new HashSet<>());
							vcfinput.contigCount++;
						});
				}
			}
			if (variants2samples.isEmpty())
				continue;
			// build an interval tree for faster overlap queries
			final IntervalTree<SVKey> intervalTree = new IntervalTree<>();
			for (final SVKey key : variants2samples.keySet()) {
				final SimpleInterval r = new SimpleInterval(key.archetype).extend(this.svComparator.getBndDistance() + 1);
				intervalTree.put(r.getStart(), r.getEnd(), key);
				// paranoid check: the extended interval must allow finding its own archetype
				boolean found = false;
				final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
				while (nodeIter.hasNext()) {
					final SVKey key1 = nodeIter.next().getValue();
					if (this.svComparator.test(key1.archetype, key.archetype)) {
						found = true;
						break;
					}
				}
				if (!found) {
					// do NOT close 'out' here: the finally block owns it
					// (the previous explicit close caused a double-close)
					throw new RuntimeException("cannot find self " + key.archetype + " in " + r);
				}
			}
			// pass 2: assign each sample to every archetype its variants match
			for (final VcfInput vcfinput : sample2inputs.values()) {
				try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
					// try-with-resources: the iterator is now closed even if an exception occurs
					try (CloseableIterator<VariantContext> iter = vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength())) {
						while (iter.hasNext()) {
							final VariantContext ctx = iter.next();
							if (this.discard_bnd && ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
								continue;
							if (!bedPredicate.test(ctx))
								continue;
							final SimpleInterval r = new SimpleInterval(ctx).extend(this.svComparator.getBndDistance() + 1);
							final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
							while (nodeIter.hasNext()) {
								final SVKey key1 = nodeIter.next().getValue();
								if (!this.svComparator.test(key1.archetype, ctx))
									continue;
								variants2samples.get(key1).add(vcfinput.sample);
							}
						}
					}
				}
			}
			final Comparator<VariantContext> sorter = new ContigDictComparator(dict).createLocatableComparator();
			// keep only archetypes carried by at least one sample, in genomic order
			final List<SVKey> orderedKeys = variants2samples.keySet().stream()
				.filter(K -> !variants2samples.get(K).isEmpty())
				.sorted((A, B) -> sorter.compare(A.archetype, B.archetype))
				.collect(Collectors.toCollection(ArrayList::new));
			// remove consecutive duplicates: same variant AND same carrier set
			int i = 0;
			while (i + 1 < orderedKeys.size()) {
				final SVKey key1 = orderedKeys.get(i);
				final SVKey key2 = orderedKeys.get(i + 1);
				if (svComparator.test(key1.archetype, key2.archetype)
					&& variants2samples.get(key1).equals(variants2samples.get(key2))) {
					orderedKeys.remove(i + 1);
				} else {
					i++;
				}
			}
			for (int key_index = 0; key_index < orderedKeys.size(); key_index++) {
				final SVKey key = orderedKeys.get(key_index);
				final Set<String> samples = variants2samples.get(key);
				final Allele refAllele = key.archetype.getReference();
				final Allele altAllele = Allele.create("<SV>", false);
				final Object svType = key.archetype.getAttribute(VCFConstants.SVTYPE, ".");
				final VariantContextBuilder vcb = new VariantContextBuilder();
				vcb.chr(key.archetype.getContig());
				vcb.start(key.archetype.getStart());
				vcb.stop(key.archetype.getEnd());
				vcb.log10PError(key.archetype.getLog10PError());
				vcb.alleles(Arrays.asList(refAllele, altAllele));
				vcb.attribute(VCFConstants.END_KEY, key.archetype.getEnd());
				vcb.attribute(VCFConstants.SVTYPE, svType);
				// SVLEN is negative for deletions, by VCF convention
				vcb.attribute(infoSvLen.getID(), (svType.equals("DEL") ? -1 : 1) * key.archetype.getLengthOnReference());
				vcb.attribute(infoNSamples.getID(), samples.size());
				vcb.attribute(infoSamples.getID(), samples.stream().sorted().collect(Collectors.toList()));
				int ac = 0;
				final List<Genotype> genotypes = new ArrayList<>(sample2inputs.size());
				for (final String sn : sample2inputs.keySet()) {
					final List<Allele> gta;
					if (samples.contains(sn)) {
						ac++;
						gta = Arrays.asList(refAllele, altAllele);
					} else {
						gta = Arrays.asList(refAllele, refAllele);
					}
					genotypes.add(new GenotypeBuilder(sn, gta).make());
				}
				vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, ac);
				vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, sample2inputs.size() * 2);
				if (ac > 0) {
					// BUGFIX: AF = AC/AN. The previous expression
					// 'ac / (float) size * 2' computed (ac/size)*2, i.e. 4x the frequency.
					vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, ac / (float) (sample2inputs.size() * 2));
				}
				if (key_index > 0 && svComparator.test(key.archetype, orderedKeys.get(key_index - 1).archetype)) {
					vcb.filter(filterSamePrev.getID());
				}
				if (key_index + 1 < orderedKeys.size() && svComparator.test(key.archetype, orderedKeys.get(key_index + 1).archetype)) {
					// (removed leftover System.err debugging output)
					vcb.filter(filterSameNext.getID());
				}
				vcb.genotypes(genotypes);
				out.add(vcb.make());
			}
		}
		out.close();
		out = null;
		return 0;
	} catch (final Throwable err) {
		LOG.error(err);
		return -1;
	} finally {
		CloserUtil.close(out);
	}
}
Use of htsjdk.variant.vcf.VCFReader in the project jvarkit by lindenb.
The class VcfStrechToSvg, method doWork:
/**
 * Entry point: reads a VCF plus optional BAMs/GFF3, then renders each BED
 * interval via {@code run(...)} into the output archive, writing a manifest
 * along the way.
 *
 * @param args exactly one VCF file (indexed: it is opened with requireIndex=true).
 * @return 0 on success, -1 on failure.
 */
@Override
public int doWork(final List<String> args) {
	try {
		this.afExtractor = new AFExtractorFactory().parseFieldExtractor(this.afExtractorFactoryStr);
		final String input = super.oneAndOnlyOneFile(args);
		if (!this.bamListPath.isEmpty()) {
			LOG.info("reading bam list");
			for (final Path bamPath : IOUtils.unrollPaths(this.bamListPath)) {
				try (SamReader sr = openSamReader(bamPath)) {
					final SAMFileHeader hdr = sr.getFileHeader();
					// map every distinct, non-blank read-group sample to its BAM
					for (final String sn : hdr.getReadGroups().stream()
							.map(RG -> RG.getSample())
							.filter(S -> !StringUtils.isBlank(S))
							.collect(Collectors.toSet())) {
						if (this.sample2bam.containsKey(sn)) {
							LOG.info("duplicate sample in bam " + bamPath + " and " + this.sample2bam.get(sn));
							return -1;
						}
						this.sample2bam.put(sn, bamPath);
					}
				}
			}
		}
		try (VCFReader r = VCFReaderFactory.makeDefault().open(input, true)) {
			final VCFHeader header = r.getHeader();
			this.afExtractor.validateHeader(header);
			// FORMAT field used downstream to read the per-genotype signal
			final String searchFormat;
			switch (this.useFormat) {
				case PL:
					searchFormat = VCFConstants.GENOTYPE_PL_KEY;
					break;
				case AD: // fall through: AD is also the default
				default:
					searchFormat = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
					break;
			}
			if (header.getFormatHeaderLine(searchFormat) == null) {
				LOG.error("FORMAT/" + searchFormat + " undefined in " + input);
				return -1;
			}
			if (!header.hasGenotypingData()) {
				LOG.error("No genotype in input");
				return -1;
			}
			// if both a VCF dictionary and a reference are available, they must agree
			final SAMSequenceDictionary dict = header.getSequenceDictionary();
			if (dict != null && this.refFaidx != null) {
				SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(this.refFaidx));
			}
			if (this.gff3Path != null) {
				// BUGFIX(log): message said "reading gtf" with no separator before the path
				LOG.info("reading " + this.gff3Path);
				try (GtfReader gtfReader = new GtfReader(this.gff3Path)) {
					if (dict != null)
						gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
					gtfReader.getAllGenes().forEach(G -> {
						this.all_genes.put(new Interval(G), G);
					});
				}
			}
			try (BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile)) {
				// manifest is optional: fall back to a /dev/null-like writer
				try (PrintWriter manifest = (this.manifestPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.manifestPath))) {
					try (ArchiveFactory out = ArchiveFactory.open(this.outputFile)) {
						final BedLineCodec codec = new BedLineCodec();
						br.lines().map(L -> codec.decode(L)).filter(B -> B != null).forEach(B -> {
							run(out, B, header, r, manifest);
						});
					}
					manifest.flush();
				}
			}
		}
		return 0;
	} catch (final Throwable err) {
		LOG.error(err);
		return -1;
	}
	// removed the previous empty finally block: resources are handled by try-with-resources
}
Use of htsjdk.variant.vcf.VCFReader in the project jvarkit by lindenb.
The class TestNg01, method getVcfHeader:
/**
 * Reads and returns the VCF header of the given file.
 * Uses try-with-resources so the reader is closed even if getHeader() throws
 * (the previous version leaked the reader on the exception path).
 *
 * @param f a VCF file (no index required: opened with requireIndex=false)
 * @return the file's VCFHeader
 */
static VCFHeader getVcfHeader(final File f) {
	try (VCFReader r = VCFReaderFactory.makeDefault().open(f, false)) {
		return r.getHeader();
	} catch (final IOException err) {
		// close() may throw a checked IOException; keep the method signature unchanged
		throw new RuntimeException(err);
	}
}
Use of htsjdk.variant.vcf.VCFReader in the project jvarkit by lindenb.
The class BamAlleleBalance, method doWork:
// Computes, for each sample found in the input BAMs, a histogram of the
// ALT-allele ratio (altDepth / (refDepth+altDepth)) observed at a set of
// biallelic SNPs taken from a VCF, and prints one column per sample.
// Returns 0 on success, -1 on error.
@Override
public int doWork(List<String> args) {
try {
final List<Path> bamPaths = IOUtils.unrollPaths(args);
if (bamPaths.isEmpty()) {
LOG.error("Bam list is empty");
return -1;
}
final List<Snp> snps = new ArrayList<>(10_000);
// load the candidate sites: biallelic, unfiltered SNPs only
try (VCFReader vr = VCFReaderFactory.makeDefault().open(this.variantsPath, false)) {
try (CloseableIterator<VariantContext> iter = vr.iterator()) {
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
if (!ctx.isBiallelic() || ctx.isFiltered() || !ctx.isSNP())
continue;
snps.add(new Snp(ctx));
}
}
}
if (snps.isEmpty()) {
LOG.error("snp list is empty");
return -1;
}
LOG.info(String.valueOf(snps.size()) + " snp(s)");
// sample/partition name -> histogram of alt-ratio ranges
final Map<String, Counter<RangeOfDoubles.Range>> sample2count = new HashMap<>(bamPaths.size());
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null)
srf.referenceSequence(this.faidx);
for (final Path bamPath : bamPaths) {
try (SamReader samReader = srf.open(bamPath)) {
// partition label used when a read-group has no usable sample name
final String defaultPartition = IOUtils.getFilenameWithoutCommonSuffixes(bamPath);
final SAMFileHeader header = samReader.getFileHeader();
final Set<String> samples = header.getReadGroups().stream().map(RG -> this.groupBy.apply(RG, defaultPartition)).collect(Collectors.toSet());
samples.stream().forEach(SN -> sample2count.putIfAbsent(SN, new Counter<>()));
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
// VCF and BAM may use different contig naming (e.g. "chr1" vs "1")
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
for (final Snp snp : snps) {
final String bamContig = converter.apply(snp.contigSrc);
if (StringUtils.isBlank(bamContig))
continue;
// per-sample REF/ALT depth at this single site
final Map<String, SiteInfo> sample2siteinfo = new HashMap<>(samples.size());
for (final String sn : samples) {
sample2siteinfo.put(sn, new SiteInfo());
}
// pile up the reads overlapping the SNP position
try (CloseableIterator<SAMRecord> iter = samReader.queryOverlapping(bamContig, snp.pos1, snp.pos1)) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (!SAMRecordDefaultFilter.accept(rec, this.mapq))
continue;
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg == null)
continue;
String sample = this.groupBy.apply(rg, defaultPartition);
if (StringUtils.isBlank(sample) || !sample2siteinfo.containsKey(sample))
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
byte[] bases = rec.getReadBases();
if (bases == null || bases == SAMRecord.NULL_SEQUENCE)
continue;
// 1-based read position aligned to the SNP; <1 means deletion/clip there
final int readpos1 = rec.getReadPositionAtReferencePosition(snp.pos1);
if (readpos1 < 1)
continue;
final int readpos0 = readpos1 - 1;
if (readpos0 < 0 || readpos0 >= bases.length)
continue;
final char base = (char) Character.toUpperCase(bases[readpos0]);
final SiteInfo si = sample2siteinfo.get(sample);
if (si == null)
continue;
// count only reads matching REF or ALT; other bases are ignored
if (base == snp.ref) {
si.countRef++;
} else if (base == snp.alt) {
si.countAlt++;
}
}
// fold this site's counts into each sample's histogram
for (final String sample : sample2siteinfo.keySet()) {
final SiteInfo si = sample2siteinfo.get(sample);
final int depth = si.countAlt + si.countRef;
if (depth < this.min_depth)
continue;
// must be het
if (si.countAlt == 0 || si.countRef == 0)
continue;
final double ratio = si.countAlt / (double) depth;
final RangeOfDoubles.Range range = this.ratios.getRange(ratio);
sample2count.get(sample).incr(range);
}
}
// end of iter
}
// end of loop over snps
}
}
// print report
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
// columns sorted by each sample's total observation count (ascending)
final List<String> sortedSamples = sample2count.keySet().stream().sorted((A, B) -> Long.compare(sample2count.get(A).getTotal(), sample2count.get(B).getTotal())).collect(Collectors.toList());
pw.print("RANGE");
for (final String sn : sortedSamples) {
pw.print("\t");
pw.print(sn);
}
pw.println();
// one row per ratio range, one column per sample
for (final RangeOfDoubles.Range r : this.ratios.getRanges()) {
pw.print(r.toString());
for (final String sn : sortedSamples) {
pw.print("\t");
pw.print(sample2count.get(sn).count(r));
}
pw.println();
}
pw.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
Use of htsjdk.variant.vcf.VCFReader in the project jvarkit by lindenb.
The class AbstractBam2Raster, method loadVCFs:
/**
 * Fills {@code this.highlightPositions} with the start positions of every
 * variant overlapping {@code this.interval} in the configured VCF files.
 * Each VCF's sequence dictionary (when present) is used to translate the
 * interval's contig name to that file's naming scheme.
 */
protected void loadVCFs() {
	for (final String vcfFile : IOUtils.unrollFiles(variants)) {
		try (final VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(vcfFile), true)) {
			final VCFHeader header = vcfFileReader.getHeader();
			final SAMSequenceDictionary dict = header.getSequenceDictionary();
			final ContigNameConverter converter;
			if (dict != null) {
				converter = ContigNameConverter.fromOneDictionary(dict);
			} else {
				converter = ContigNameConverter.getIdentity();
			}
			// translate the interval's contig ("chr1" vs "1") to this VCF's naming
			final String newCtg = converter.apply(this.interval.getContig());
			if (!StringUtil.isBlank(newCtg)) {
				// BUGFIX: query with the CONVERTED contig name; the previous code
				// converted the name, then queried with the original one, so VCFs
				// using a different naming scheme silently returned no variants.
				// Also: try-with-resources closes the iterator on the exception path.
				try (CloseableIterator<VariantContext> r = vcfFileReader.query(newCtg, this.interval.getStart(), this.interval.getEnd())) {
					while (r.hasNext()) {
						this.highlightPositions.add(r.next().getStart());
					}
				}
			}
		} catch (final IOException err) {
			throw new RuntimeIOException(err);
		}
	}
}
Aggregations