use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class LumpyMoreSamples method doWork.
@Override
public int doWork(final List<String> args) {
VcfIterator r = null;
VariantContextWriter vcw = null;
final Map<String, SamReader> sample2samreaders = new HashMap<>();
try {
r = super.openVcfIterator(oneFileOrNull(args));
final VCFHeader headerIn = r.getHeader();
final SAMSequenceDictionary dict = headerIn.getSequenceDictionary();
if (dict == null) {
LOG.error(JvarkitException.VcfDictionaryMissing.getMessage("input vcf"));
return -1;
}
final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT);
IOUtil.slurpLines(this.bamFileList).stream().forEach(F -> {
if (F.trim().isEmpty())
return;
final SamReader sr = samReaderFactory.open(SamInputResource.of(F));
final SAMFileHeader samHeader = sr.getFileHeader();
final SAMSequenceDictionary dict2 = samHeader.getSequenceDictionary();
if (dict2 == null) {
throw new JvarkitException.BamDictionaryMissing(F);
}
if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict2)) {
throw new JvarkitException.DictionariesAreNotTheSame(dict, dict2);
}
for (final SAMReadGroupRecord rg : samHeader.getReadGroups()) {
final String sample = rg.getSample();
if (StringUtil.isBlank(sample))
continue;
final SamReader reader = sample2samreaders.get(sample);
if (reader == null) {
sample2samreaders.put(sample, reader);
} else if (reader == sr) {
continue;
} else {
throw new JvarkitException.UserError("more than one sample per bam:" + F);
}
}
});
final Set<String> inVcfSampleNames = new HashSet<>(headerIn.getSampleNamesInOrder());
final Set<String> outVcfSampleNames = new HashSet<>(inVcfSampleNames);
outVcfSampleNames.addAll(sample2samreaders.keySet());
final VCFHeader headerOut = new VCFHeader(headerIn.getMetaDataInInputOrder(), outVcfSampleNames);
final VCFFormatHeaderLine SU2 = new VCFFormatHeaderLine("SU2", 1, VCFHeaderLineType.Integer, "Number of pieces of evidence supporting the variant");
final VCFFormatHeaderLine PE2 = new VCFFormatHeaderLine("PE2", 1, VCFHeaderLineType.Integer, "Number of split reads supporting the variant");
final VCFFormatHeaderLine SR2 = new VCFFormatHeaderLine("SR2", 1, VCFHeaderLineType.Integer, "Number of paired-end reads supporting the variant");
headerOut.addMetaDataLine(SU2);
headerOut.addMetaDataLine(PE2);
headerOut.addMetaDataLine(SR2);
vcw = super.openVariantContextWriter(this.outputFile);
vcw.writeHeader(headerOut);
while (r.hasNext()) {
final VariantContext ctx = r.next();
final StructuralVariantType sttype = ctx.getStructuralVariantType();
if (sttype == null)
continue;
final int tid = dict.getSequenceIndex(ctx.getContig());
final Map<String, Genotype> genotypeMap = new HashMap<>();
ctx.getGenotypes().stream().forEach(G -> genotypeMap.put(G.getSampleName(), G));
for (final String sample : sample2samreaders.keySet()) {
final SamReader samReader = sample2samreaders.get(sample);
final SupportingReads sr = new SupportingReads();
switch(sttype) {
case DEL:
{
int pos = ctx.getStart();
int[] ci = confidenceIntervalPos(ctx);
final QueryInterval left = new QueryInterval(tid, pos - ci[0], pos + ci[1]);
int end = ctx.getEnd();
ci = confidenceIntervalEnd(ctx);
final QueryInterval right = new QueryInterval(tid, end - ci[0], end + ci[1]);
for (final SAMRecord rec : extractSupportingReads(ctx, sample, samReader, new QueryInterval[] { left, right })) {
final Cigar cigar = rec.getCigar();
if (cigar.isLeftClipped()) {
final QueryInterval qi = new QueryInterval(tid, rec.getUnclippedStart(), rec.getStart() - 1);
if (qi.overlaps(left)) {
sr.splitReads++;
if (rec.getReadPairedFlag())
sr.pairedReads++;
}
}
if (cigar.isRightClipped()) {
final QueryInterval qi = new QueryInterval(tid, rec.getEnd() + 1, rec.getUnclippedEnd());
if (qi.overlaps(right)) {
sr.splitReads++;
if (rec.getReadPairedFlag())
sr.pairedReads++;
}
}
}
break;
}
default:
break;
}
final GenotypeBuilder gb;
if (genotypeMap.containsKey(sample)) {
gb = new GenotypeBuilder(genotypeMap.get(sample));
} else {
gb = new GenotypeBuilder(sample, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
}
gb.attribute(SR2.getID(), sr.splitReads);
gb.attribute(PE2.getID(), sr.pairedReads);
gb.attribute(SU2.getID(), 0);
genotypeMap.put(sample, gb.make());
}
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
// add missing samples.
for (final String sampleName : outVcfSampleNames) {
if (genotypeMap.containsKey(sampleName))
continue;
genotypeMap.put(sampleName, new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).make());
}
vcb.genotypes(genotypeMap.values());
vcw.add(vcb.make());
}
r.close();
r = null;
sample2samreaders.values().stream().forEach(R -> CloserUtil.close(R));
LOG.info("done");
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class AlleleFrequencyCalculator method doWork.
@Override
public int doWork(final List<String> args) {
PrintStream out = null;
VcfIterator in = null;
try {
in = super.openVcfIterator(oneAndOnlyOneFile(args));
out = openFileOrStdoutAsPrintStream(outputFile);
out.println("CHR\tPOS\tID\tREF\tALT\tTOTAL_CNT\tALT_CNT\tFRQ");
while (in.hasNext() && !out.checkError()) {
final VariantContext ctx = in.next();
final Allele ref = ctx.getReference();
if (ref == null)
continue;
if (ctx.getNSamples() == 0 || ctx.getAlternateAlleles().isEmpty())
continue;
final Allele alt = ctx.getAltAlleleWithHighestAlleleCount();
if (alt == null)
continue;
final GenotypesContext genotypes = ctx.getGenotypes();
if (genotypes == null)
continue;
int total_ctn = 0;
int alt_ctn = 0;
for (int i = 0; i < genotypes.size(); ++i) {
final Genotype g = genotypes.get(i);
for (final Allele allele : g.getAlleles()) {
if (allele.equals(ref)) {
total_ctn++;
} else if (allele.equals(alt)) {
total_ctn++;
alt_ctn++;
}
}
}
out.print(ctx.getContig());
out.print("\t");
out.print(ctx.getStart());
out.print("\t");
out.print(ctx.hasID() ? ctx.getID() : ".");
out.print("\t");
out.print(ref.getBaseString());
out.print("\t");
out.print(alt.getBaseString());
out.print("\t");
out.print(total_ctn);
out.print("\t");
out.print(alt_ctn);
out.print("\t");
out.print(alt_ctn / (float) total_ctn);
out.println();
}
out.flush();
out.close();
out = null;
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
CloserUtil.close(in);
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class VcfCalledWithAnotherMethod method doVcfToVcf.
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
final List<ExternalVcf> externalVcfs = new ArrayList<>();
try {
final VCFHeader header = in.getHeader();
this.dictionary = header.getSequenceDictionary();
/**
* open primitive input
*/
if (this.dictionary == null) {
LOG.error("no dictionary in input");
return -1;
}
final Set<File> samtoolsFiles = new HashSet<>();
this.externalVcfStrs.stream().filter(S -> S.endsWith(".list")).map(S -> Paths.get(S)).forEach(P -> {
try {
samtoolsFiles.addAll(Files.readAllLines(P).stream().filter(L -> !(L.startsWith("#") || L.trim().isEmpty())).map(S -> new File(S)).collect(Collectors.toSet()));
} catch (final Exception err) {
throw new RuntimeIOException(err);
}
});
samtoolsFiles.addAll(this.externalVcfStrs.stream().filter(S -> !S.endsWith(".list")).map(S -> new File(S)).collect(Collectors.toSet()));
externalVcfs.addAll(samtoolsFiles.stream().map(F -> new ExternalVcf(F)).collect(Collectors.toList()));
/**
* check for uniq keys
*/
final Set<String> uniqKeys = new HashSet<>();
for (final ExternalVcf extvcf : externalVcfs) {
int n = 0;
for (; ; ) {
final String newkey = extvcf.key + (n == 0 ? "" : "_" + n);
if (!uniqKeys.contains(newkey)) {
extvcf.key = newkey;
uniqKeys.add(newkey);
break;
}
++n;
}
}
final VCFHeader h2 = new VCFHeader(header);
for (final ExternalVcf extvcf : externalVcfs) {
h2.addMetaDataLine(new VCFHeaderLine("otherVcf:" + extvcf.key, extvcf.vcfFile.getPath()));
}
final VCFFilterHeaderLine variantNotFoundElsewhereFILTER = (filterNameVariantNotFoundElseWhere.isEmpty() ? null : new VCFFilterHeaderLine(filterNameVariantNotFoundElseWhere, "Variant Was not found in other VCFs: " + externalVcfs.stream().map(S -> S.vcfFile.getPath()).collect(Collectors.joining(", "))));
if (variantNotFoundElsewhereFILTER != null) {
h2.addMetaDataLine(variantNotFoundElsewhereFILTER);
}
final VCFInfoHeaderLine variantFoundElseWhereKeys = new VCFInfoHeaderLine(this.infoFoundKey, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Variant was found in the VCFs designed by those keys");
h2.addMetaDataLine(variantFoundElseWhereKeys);
final VCFInfoHeaderLine variantFoundElseWhereCount = new VCFInfoHeaderLine(this.infoFoundKey, 1, VCFHeaderLineType.Integer, "Number of times the Variant was found in the VCFs");
h2.addMetaDataLine(variantFoundElseWhereCount);
final VCFFormatHeaderLine genotypeCountSame = new VCFFormatHeaderLine(this.formatCountSame, 1, VCFHeaderLineType.Integer, "Number of times the Genotype was found the same in other VCFs");
h2.addMetaDataLine(genotypeCountSame);
final VCFFormatHeaderLine genotypeCountDiscordant = new VCFFormatHeaderLine(this.formatCountDiscordant, 1, VCFHeaderLineType.Integer, "Number of times the Genotype was found dicordant in other VCFs");
h2.addMetaDataLine(genotypeCountDiscordant);
super.addMetaData(h2);
final VariantContextWriter w = super.openVariantContextWriter(outputFile);
w.writeHeader(h2);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dictionary);
while (in.hasNext()) {
final VariantContext ctx = progress.watch(in.next());
final List<GenotypeCount> genotypeCounts = ctx.getGenotypes().stream().map(G -> new GenotypeCount(G)).collect(Collectors.toList());
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
final Set<String> variantFoundInOtherVcfs = new HashSet<>();
for (final ExternalVcf extvcf : externalVcfs) {
final List<VariantContext> otherVariants = extvcf.get(ctx);
if (otherVariants.stream().filter(CTX2 -> {
if (ctx.getAlternateAlleles().isEmpty())
return true;
for (final Allele a : ctx.getAlternateAlleles()) {
if (CTX2.hasAllele(a))
return true;
}
return false;
}).findAny().isPresent()) {
variantFoundInOtherVcfs.add(extvcf.key);
}
for (final GenotypeCount gt : genotypeCounts) {
for (final VariantContext otherVariant : otherVariants) {
final Genotype otherGt = otherVariant.getGenotype(gt.gt.getSampleName());
if (otherGt == null)
continue;
if (gt.gt.sameGenotype(otherGt) || (this.noCallSameAsHomRef && ((gt.gt.isNoCall() && otherGt.isHomRef()) || (gt.gt.isHomRef() && otherGt.isNoCall())))) {
gt.countSame++;
} else {
gt.countDiscordant++;
}
}
}
}
vcb.genotypes(genotypeCounts.stream().map(G -> {
final GenotypeBuilder gb = new GenotypeBuilder(G.gt);
gb.attribute(genotypeCountSame.getID(), G.countSame);
gb.attribute(genotypeCountDiscordant.getID(), G.countDiscordant);
return gb.make();
}).collect(Collectors.toList()));
vcb.attribute(variantFoundElseWhereCount.getID(), variantFoundInOtherVcfs.size());
if (variantFoundInOtherVcfs.isEmpty()) {
if (variantNotFoundElsewhereFILTER != null) {
vcb.filter(variantNotFoundElsewhereFILTER.getID());
}
} else {
if (variantNotFoundElsewhereFILTER != null && !ctx.isFiltered()) {
vcb.passFilters();
}
vcb.attribute(variantFoundElseWhereKeys.getID(), new ArrayList<>(variantFoundInOtherVcfs));
}
w.add(vcb.make());
}
progress.finish();
while (!externalVcfs.isEmpty()) externalVcfs.remove(0).close();
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
while (!externalVcfs.isEmpty()) externalVcfs.remove(0).close();
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class VcfCreateDictionary method doWork.
@Override
public int doWork(final List<String> args) {
VcfIterator in = null;
Writer out = null;
if (this.outputFile != null && !outputFile.getName().endsWith(".dict")) {
LOG.error("output file should end with .dict :" + this.outputFile);
return -1;
}
try {
final LinkedHashMap<String, Integer> buildNewDictionary = new LinkedHashMap<String, Integer>();
int optind = 0;
do {
in = super.openVcfIterator(args.isEmpty() ? null : args.get(optind));
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader()).logger(LOG);
while (in.hasNext()) {
final VariantContext ctx = progress.watch(in.next());
Integer length = buildNewDictionary.get(ctx.getContig());
if (length == null)
length = 0;
if (ctx.getEnd() > length) {
length = ctx.getEnd();
buildNewDictionary.put(ctx.getContig(), length);
}
final Object END = ctx.getAttribute("END");
if (END != null) {
try {
final int end = Integer.parseInt(END.toString());
if (end > length) {
length = end;
buildNewDictionary.put(ctx.getContig(), length);
}
} catch (Throwable err) {
// not an integer
}
}
}
progress.finish();
in.close();
in = null;
++optind;
} while (optind < args.size());
final List<SAMSequenceRecord> list = new ArrayList<SAMSequenceRecord>(buildNewDictionary.size());
for (final String k : buildNewDictionary.keySet()) {
list.add(new SAMSequenceRecord(k, buildNewDictionary.get(k)));
}
final SAMFileHeader sfh = new SAMFileHeader();
sfh.setSequenceDictionary(new SAMSequenceDictionary(list));
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
codec.setValidationStringency(htsjdk.samtools.ValidationStringency.SILENT);
out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
codec.encode(out, sfh);
out.flush();
out.close();
out = null;
return 0;
} catch (final Exception err2) {
LOG.error(err2);
return -1;
} finally {
CloserUtil.close(out);
CloserUtil.close(in);
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class NgsFilesScanner method readVCF.
@Override
protected void readVCF(File f) {
if (!f.canRead())
return;
LOG.debug("readVCF " + f);
VcfIterator r = null;
InputStream in = null;
try {
r = VCFUtils.createVcfIteratorFromFile(f);
VCFHeader header = r.getHeader();
StringWriter sw = new StringWriter();
XMLOutputFactory xof = XMLOutputFactory.newFactory();
XMLStreamWriter out = xof.createXMLStreamWriter(new StreamResult(sw));
out.writeStartElement("vcf");
writeFile(out, f);
out.writeStartElement("samples");
for (String sample : header.getSampleNamesInOrder()) {
out.writeStartElement("sample");
out.writeCharacters(sample);
out.writeEndElement();
}
out.writeEndElement();
out.writeEndElement();
out.flush();
out.close();
sw.flush();
put(f, sw.toString());
} catch (Exception err) {
LOG.error(err);
} finally {
CloserUtil.close(r);
CloserUtil.close(in);
}
}
Aggregations