Example usage of htsjdk.variant.vcf.VCFHeader in the jvarkit project by lindenb:
the method doWork of the class VcfGeneEpistasis.
/**
 * Tests pairs of genes for epistasis: for every pair of non-overlapping genes
 * the variants of both genes are merged and scored with a SKAT test; the best
 * (lowest) score seen so far is checkpointed to the output file each time it
 * improves. If the gene BED file does not exist yet, it is first built by
 * scanning the whole VCF.
 *
 * @param args command-line arguments; must contain exactly one indexed VCF file
 * @return 0 on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
    if (this.geneBed == null) {
        LOG.error("gene file bed undefined");
        return -1;
    }
    if (this.outputFile == null) {
        LOG.error("output file undefined");
        return -1;
    }
    CloseableIterator<VariantContext> iter = null;
    try {
        final File vcfFile = new File(oneAndOnlyOneFile(args));
        this.vcfFileReader = new VCFFileReader(vcfFile, true);
        final VCFHeader header = this.vcfFileReader.getFileHeader();
        // pedigree comes from an explicit file when given, else from the VCF header
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            pedigree = new Pedigree.Parser().parse(header);
        }
        if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
            LOG.error("empty ped or no case/ctrl");
            return -1;
        }
        pedigree.verifyPersonsHaveUniqueNames();
        // keep only case/control individuals that are actually present in the VCF
        for (final Pedigree.Person p : pedigree.getPersons().stream()
                .filter(P -> P.isAffected() || P.isUnaffected())
                .filter(P -> header.getSampleNamesInOrder().contains(P.getId()))
                .collect(Collectors.toSet())) {
            this.id2samples.put(p.getId(), p);
        }
        this.vcfTools = new VcfTools(header);
        List<Interval> geneList;
        if (!this.geneBed.exists()) {
            // first run: build the gene BED by scanning every accepted variant
            final Map<String, Interval> gene2interval = new HashMap<>(50000);
            // FIX: missing space between message and file name in log output
            LOG.info("building gene file " + this.geneBed);
            iter = this.vcfFileReader.iterator();
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
            while (iter.hasNext()) {
                final VariantContext ctx = progress.watch(iter.next());
                if (!accept(ctx))
                    continue;
                for (final String geneName : getGenes(ctx)) {
                    final Interval old = gene2interval.get(geneName);
                    if (old == null) {
                        gene2interval.put(geneName, new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, geneName));
                        LOG.info("adding " + geneName + ". number of genes: " + gene2interval.size());
                    } else if (!old.getContig().equals(ctx.getContig())) {
                        // a gene must not span two chromosomes
                        LOG.error("boum :" + geneName + ": on chrom " + ctx.getContig() + " vs " + old);
                        return -1;
                    } else {
                        // grow the gene interval so it also covers this variant
                        gene2interval.put(geneName, new Interval(ctx.getContig(),
                                Math.min(ctx.getStart(), old.getStart()),
                                Math.max(ctx.getEnd(), old.getEnd()), false, geneName));
                    }
                }
            }
            iter.close();
            iter = null;
            progress.finish();
            geneList = new ArrayList<>(gene2interval.values());
            // FIX: try-with-resources so the writer is closed even if println throws
            try (PrintWriter pw = new PrintWriter(this.geneBed)) {
                for (final Interval g : geneList) {
                    // BED is 0-based half-open, Interval is 1-based inclusive
                    pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + (g.getEnd()) + "\t" + g.getName());
                }
                pw.flush();
            }
        } else {
            // reuse the previously generated gene BED
            final BedLineCodec codec = new BedLineCodec();
            // FIX: try-with-resources so the reader is closed even on exception
            try (BufferedReader r = IOUtil.openFileForBufferedReading(geneBed)) {
                geneList = r.lines()
                        .map(L -> codec.decode(L))
                        .filter(B -> B != null)
                        .map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3)))
                        .collect(Collectors.toList());
            }
        }
        if (geneList.isEmpty()) {
            LOG.error("gene List is empty");
            return -1;
        }
        final Comparator<VariantContext> ctxSorter = VCFUtils.createTidPosRefComparator(header.getSequenceDictionary());
        // loads the accepted variants of an interval that are annotated with that gene's name
        final Function<Interval, List<VariantContext>> loadVariants = (R) -> {
            final List<VariantContext> L = new ArrayList<>();
            // FIX: close the query iterator even if accept()/getGenes() throws
            try (CloseableIterator<VariantContext> r = this.vcfFileReader.query(R.getContig(), R.getStart(), R.getEnd())) {
                while (r.hasNext()) {
                    final VariantContext ctx = r.next();
                    if (!accept(ctx))
                        continue;
                    if (!getGenes(ctx).contains(R.getName()))
                        continue;
                    L.add(ctx);
                }
            }
            return L;
        };
        final SkatExecutor executor = this.skatFactory.build();
        Double bestSkat = null;
        LOG.info("number of genes : " + geneList.size());
        final int list_end_index = (this.user_end_index < 0 ? geneList.size() : Math.min(geneList.size(), this.user_end_index));
        for (int x = this.user_begin_index; x < list_end_index; ++x) {
            final Interval intervalx = geneList.get(x);
            final List<VariantContext> variantsx = loadVariants.apply(intervalx);
            if (variantsx.isEmpty())
                continue;
            // y starts at x so each gene is also tested alone; the inner loop runs to
            // the full list size on purpose (not list_end_index) to cover all pairs
            for (int y = x; y < geneList.size(); ++y) {
                final Interval intervaly;
                final List<VariantContext> merge;
                if (y == x) {
                    // testing gene x alone
                    intervaly = intervalx;
                    merge = variantsx;
                } else {
                    intervaly = geneList.get(y);
                    // overlapping genes would share variants: skip the pair
                    if (intervaly.intersects(intervalx))
                        continue;
                    final List<VariantContext> variantsy = loadVariants.apply(intervaly);
                    if (variantsy.isEmpty())
                        continue;
                    merge = new MergedList<>(variantsx, variantsy);
                }
                LOG.info("testing : [" + x + "]" + intervalx + " [" + y + "]" + intervaly + " N:" + geneList.size() + " best: " + bestSkat);
                final Double skat = eval(executor, merge);
                if (skat == null)
                    continue;
                // lower SKAT value is better; rewrite the output each time it improves
                // so a long run can be interrupted without losing the best result
                if (bestSkat == null || skat.compareTo(bestSkat) < 0) {
                    bestSkat = skat;
                    LOG.info("best " + bestSkat + " " + intervalx + " " + intervaly);
                    if (this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz")) {
                        final VCFHeader header2 = new VCFHeader(header);
                        header2.addMetaDataLine(new VCFHeaderLine(VcfGeneEpistasis.class.getName(),
                                intervalx.getName() + " " + intervaly.getName() + " " + bestSkat));
                        final VariantContextWriter w = VCFUtils.createVariantContextWriter(outputFile);
                        w.writeHeader(header2);
                        merge.stream().sorted(ctxSorter).forEach(V -> w.add(V));
                        w.close();
                    } else {
                        final PrintWriter w = super.openFileOrStdoutAsPrintWriter(outputFile);
                        w.println(String.valueOf(bestSkat) + "\t" + intervalx.getName() + "\t" + intervaly.getName());
                        w.flush();
                        w.close();
                    }
                }
            }
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(this.vcfFileReader);
    }
}
Example usage of htsjdk.variant.vcf.VCFHeader in the jvarkit project by lindenb:
the method doVcfToVcf of the class HaloplexParasite.
/**
 * Flags indel/mixed variants whose allele sequence matches "parasite" words,
 * i.e. short sequences harvested from soft-clipped read segments overlapping
 * the mutation in a set of Haloplex BAM files. Variants whose worst word
 * fraction exceeds {@code tresholdFraction} get the HALOPLEXPARASITE filter.
 *
 * @param inputName source name used for the VariantContextBuilder
 * @param in        input variant iterator
 * @param out       output writer
 * @return RETURN_OK on success, -1 on error
 */
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    SamReader samReader = null;
    final List<Mutation> mutations = new ArrayList<>();
    try {
        // FIX: Files.lines returns a stream backed by an open file; close it.
        // Fully-qualified Stream type avoids touching the import block.
        final Set<File> bamFiles;
        try (java.util.stream.Stream<String> lines = Files.lines(this.bamList.toPath())) {
            bamFiles = lines
                    .filter(T -> !(T.isEmpty() || T.startsWith("#")))
                    .map(T -> new File(T))
                    .collect(Collectors.toSet());
        }
        final VCFHeader header = new VCFHeader();
        header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
        final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
        final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
        super.addMetaData(header);
        // FIX: the FILTER/INFO lines must be registered BEFORE the header is
        // written; the original added them after writeHeader, so the emitted
        // header lacked their declarations.
        header.addMetaDataLine(filter);
        header.addMetaDataLine(infoWord);
        out.writeHeader(header);
        // pass 1: emit variants that cannot be parasites, collect the candidates
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
            if (!(ctx.isIndel() || ctx.isMixed())) {
                out.add(vcb.make());
                continue;
            }
            // need at least one concrete allele long enough to contain a word
            if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
                out.add(vcb.make());
                continue;
            }
            final Mutation mut = new Mutation(ctx);
            mutations.add(mut);
        }
        // pass 2: harvest soft-clip words around each candidate from every BAM
        final Counter<String> words = new Counter<>();
        for (final File bamFile : bamFiles) {
            LOG.info("Opening " + bamFile);
            samReader = createSamReaderFactory().open(bamFile);
            for (final Mutation mut : mutations) {
                // words seen in that mutation : don't use a Counter
                final Set<String> mutWords = new HashSet<>();
                /* loop over reads overlapping that mutation */
                final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
                // FIX: close the iterator even if an exception is thrown mid-loop
                try {
                    while (sri.hasNext()) {
                        final SAMRecord rec = sri.next();
                        if (rec.getReadUnmappedFlag())
                            continue;
                        if (rec.isSecondaryOrSupplementary())
                            continue;
                        if (rec.getDuplicateReadFlag())
                            continue;
                        if (rec.getReadFailsVendorQualityCheckFlag())
                            continue;
                        final Cigar cigar = rec.getCigar();
                        // a single-element cigar has no clipping at all
                        if (cigar.numCigarElements() == 1)
                            continue;
                        final byte[] bases = rec.getReadBases();
                        int refpos = rec.getUnclippedStart();
                        int readpos = 0;
                        /* scan cigar overlapping that mutation */
                        for (final CigarElement ce : cigar) {
                            final CigarOperator op = ce.getOperator();
                            final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
                            final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
                            /* check clip is large enough */
                            if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
                                /* check clip overlap mutation */
                                if (!(ref_end < mut.start || refpos > mut.end)) {
                                    /* break read of soft clip into words */
                                    for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
                                        final String substr = new String(bases, readpos + x, this.minClipSize);
                                        if (!substr.contains("N")) {
                                            // count a word and its reverse complement once per mutation
                                            final String revcomp = AcidNucleics.reverseComplement(substr);
                                            mutWords.add(substr);
                                            if (!revcomp.equals(substr))
                                                mutWords.add(revcomp);
                                        }
                                    }
                                }
                            }
                            refpos = ref_end;
                            readpos = read_end;
                        }
                    }
                } finally {
                    sri.close();
                }
                for (final String w : mutWords) {
                    words.incr(w);
                }
            }
            // end of for(mutations)
            samReader.close();
            samReader = null;
        }
        LOG.info("mutations:" + mutations.size());
        // pass 3: annotate/filter each candidate against the word counts
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        for (final Mutation mut : mutations) {
            progress.watch(mut.contig, mut.start);
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
            String worstString = null;
            Double worstFraction = null;
            final double totalWords = words.getTotal();
            // NOTE(review): if no word was ever seen, totalWords is 0 and the
            // fraction below is NaN (0/0); preserved as-is — confirm intent.
            for (final Allele a : mut.alleles) {
                if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
                    continue;
                // slide a window of minClipSize over the allele bases
                for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
                    final String substr = new String(a.getBases(), x, this.minClipSize);
                    final long count = words.count(substr);
                    final double fraction = count / totalWords;
                    if (worstFraction == null || worstFraction < fraction) {
                        worstString = substr + "|" + count + "|" + fraction;
                        worstFraction = fraction;
                    }
                }
            }
            if (worstString != null) {
                vcb.attribute(infoWord.getID(), worstString);
            }
            if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
                vcb.filter(filter.getID());
            }
            out.add(vcb.make());
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(samReader);
    }
}
Example usage of htsjdk.variant.vcf.VCFHeader in the jvarkit project by lindenb:
the method saveVariants of the class IgvReview.
/**
 * Writes the reviewed genotypes to a VCF file. Consecutive table rows that
 * share the same VariantContext instance are regrouped into a single output
 * variant carrying all their genotypes.
 *
 * @param owner window used as parent for the error dialog
 * @param f destination file, may be null
 * @return f on success, null on failure or when f is null
 */
private File saveVariants(Window owner, final File f) {
    if (f == null)
        return null;
    final VariantContextWriterBuilder writerBuilder = new VariantContextWriterBuilder();
    VariantContextWriter writer = null;
    try {
        final SAMSequenceDictionary dict = this.vcfHeader.getSequenceDictionary();
        if (dict != null) {
            writerBuilder.setReferenceDictionary(dict);
        }
        writerBuilder.setOutputFile(f);
        // clone the header and make sure the review FORMAT line is declared
        final VCFHeader outHeader = new VCFHeader(this.vcfHeader);
        if (outHeader.getFormatHeaderLine(this.reviewFormat.getID()) == null) {
            outHeader.addMetaDataLine(this.reviewFormat);
        }
        writer = writerBuilder.build();
        writer.writeHeader(outHeader);
        int begin = 0;
        while (begin < this.genotypeTable.getItems().size()) {
            final VariantContext first = this.genotypeTable.getItems().get(begin).ctx;
            final List<Genotype> genotypes = new ArrayList<>();
            genotypes.add(this.genotypeTable.getItems().get(begin).makeGenotype());
            // extend the run while rows point to the very same context
            // (identity comparison '==' is intentional here)
            int end = begin + 1;
            while (end < this.genotypeTable.getItems().size()
                    && this.genotypeTable.getItems().get(end).ctx == first) {
                genotypes.add(this.genotypeTable.getItems().get(end).makeGenotype());
                ++end;
            }
            writer.add(new VariantContextBuilder(first).genotypes(genotypes).make());
            begin = end;
        }
        writer.close();
        return f;
    } catch (final Exception err) {
        JfxUtils.dialog().cause(err).show(owner);
        return null;
    } finally {
        CloserUtil.close(writer);
    }
}
Example usage of htsjdk.variant.vcf.VCFHeader in the jvarkit project by lindenb:
the method doVcfToVcf of the class Biostar251649.
/**
 * Annotates each variant with the reference sequence flanking it: up to
 * {@code extend} bases on the 5' side and on the 3' side, stored in two new
 * INFO fields named from {@code leftTag}/{@code rightTag} plus the extend size.
 *
 * @param inputName input name (unused)
 * @param in input variant iterator
 * @param w output writer
 * @return 0 on success, -1 on error
 */
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter w) {
    try {
        final VCFHeader header = new VCFHeader(in.getHeader());
        final VCFInfoHeaderLine leftInfo = new VCFInfoHeaderLine(leftTag + extend, 1, VCFHeaderLineType.String, "Sequence on the 5' of mutation");
        final VCFInfoHeaderLine rightInfo = new VCFInfoHeaderLine(rightTag + extend, 1, VCFHeaderLineType.String, "Sequence on the 3' of mutation");
        // refuse to clobber an existing INFO field of the same name
        for (final VCFInfoHeaderLine info : new VCFInfoHeaderLine[] { leftInfo, rightInfo }) {
            if (header.getInfoHeaderLine(info.getID()) != null) {
                LOG.error("tag " + info.getID() + " already present in VCF header");
                return -1;
            }
        }
        header.addMetaDataLine(leftInfo);
        header.addMetaDataLine(rightInfo);
        // lazily-loaded reference sequence for the current contig
        GenomicSequence chrom = null;
        w.writeHeader(header);
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            if (chrom == null || !chrom.getChrom().equals(ctx.getContig())) {
                chrom = new GenomicSequence(this.faidx, ctx.getContig());
            }
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            if (ctx.getStart() > 0) {
                // 5' flank: walk from the furthest upstream base toward the variant,
                // appending in genomic order (equivalent to the insert-at-0 loop).
                final StringBuilder leftSeq = new StringBuilder(this.extend);
                for (int i = this.extend - 1; i >= 0; --i) {
                    final int pos0 = (ctx.getStart() - 2) - i;
                    // NOTE(review): pos0 == 0 (first reference base) is skipped,
                    // mirroring the original '<= 0' test — possibly an off-by-one;
                    // confirm before changing.
                    if (pos0 > 0) {
                        leftSeq.append(chrom.charAt(pos0));
                    }
                }
                if (leftSeq.length() > 0) {
                    vcb.attribute(leftInfo.getID(), leftSeq.toString());
                }
            }
            // 3' flank: bases immediately following the variant end
            final StringBuilder rightSeq = new StringBuilder(this.extend);
            int pos0 = ctx.getEnd();
            while (rightSeq.length() < this.extend && pos0 < chrom.length()) {
                rightSeq.append(chrom.charAt(pos0));
                ++pos0;
            }
            if (rightSeq.length() > 0) {
                vcb.attribute(rightInfo.getID(), rightSeq.toString());
            }
            w.add(vcb.make());
        }
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(faidx);
    }
}
Example usage of htsjdk.variant.vcf.VCFHeader in the jvarkit project by lindenb:
the method doVcfToVcf of the class VcfBurdenSplitter.
/**
 * Splits a VCF into per-key groups (e.g. per gene) using the configured
 * Splitter. The input is read line by line; variants are buffered per contig
 * in a SortingCollection keyed by the splitter key, then each key group is
 * emitted as a complete mini-VCF (header + records) concatenated onto the
 * single output stream. Groups whose variants are ALL filtered are dropped
 * (and optionally logged to allFilteredFileOut).
 *
 * Preconditions visible in the code: the input must have exactly one ALT
 * allele per variant, and variants are assumed grouped by contig so the
 * per-contig flush is correct.
 *
 * @param inputName VCF source (URI) or null for stdin
 * @param outorNull output file or null for stdout
 * @return RETURN_OK on success, otherwise the wrapException(...) result
 */
@Override
protected int doVcfToVcf(String inputName, File outorNull) {
SortingCollection<KeyAndLine> sortingcollection = null;
BufferedReader in = null;
CloseableIterator<KeyAndLine> iter = null;
PrintStream pw = null;
PrintWriter allDiscardedLog = null;
try {
in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
if (this.allFilteredFileOut != null) {
allDiscardedLog = IOUtils.openFileForPrintWriter(this.allFilteredFileOut);
}
// parse the VCF header once; cah.codec decodes the remaining lines
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
/**
 * find splitter by name
 */
Splitter splitter = null;
for (final Splitter s : this.splitters) {
if (this.splitterName.equals(s.getName())) {
splitter = s;
break;
}
}
if (splitter == null) {
return wrapException("Cannot find a splitter named " + this.splitterName);
}
splitter.initialize(cah.header);
LOG.info("splitter is " + splitter);
pw = super.openFileOrStdoutAsPrintStream(outorNull);
// read variants
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
String prev_contig = null;
for (; ; ) {
final String line = in.readLine();
// variant == null signals EOF; both EOF and a contig change trigger a flush
final VariantContext variant = (line == null ? null : progess.watch(cah.codec.decode(line)));
if (variant == null || !variant.getContig().equals(prev_contig)) {
if (sortingcollection != null) {
// flush: sort the buffered contig by key and emit one mini-VCF per key
sortingcollection.doneAdding();
iter = sortingcollection.iterator();
LOG.info("dumping data for CONTIG: \"" + prev_contig + "\"");
// EqualRangeIterator groups consecutive records sharing the same key
final EqualRangeIterator<KeyAndLine> eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {
@Override
public int compare(final KeyAndLine o1, final KeyAndLine o2) {
return o1.key.compareTo(o2.key);
}
});
while (eqiter.hasNext()) {
final List<KeyAndLine> buffer = eqiter.next();
final KeyAndLine first = buffer.get(0);
LOG.info(first.key);
final List<VariantContext> variants = new ArrayList<>(buffer.size());
boolean has_only_filtered = true;
for (final KeyAndLine kal : buffer) {
final VariantContext ctx = cah.codec.decode(kal.ctx);
variants.add(ctx);
if (isDebuggingVariant(ctx)) {
LOG.info("Adding variant to list for key " + kal.key + " " + shortName(ctx));
}
// sanity check: a key group must never cross the contig boundary
if (!ctx.getContig().equals(prev_contig)) {
eqiter.close();
return wrapException("illegal state");
}
if (!ctx.isFiltered() || this.acceptFiltered) {
has_only_filtered = false;
// break; NOOOONNN !!!
}
}
// all ctx are filtered
if (has_only_filtered) {
LOG.warn("ALL IS FILTERED in " + first.key);
if (allDiscardedLog != null) {
for (final VariantContext ctx : variants) {
if (isDebuggingVariant(ctx)) {
LOG.info("Variant " + shortName(ctx) + " is part of never filtered for " + first.key);
}
allDiscardedLog.println(String.join("\t", first.key, ctx.getContig(), String.valueOf(ctx.getStart()), ctx.getReference().getDisplayString(), ctx.getAlternateAllele(0).getDisplayString(), String.valueOf(ctx.getFilters())));
}
}
continue;
}
// save vcf file
// each key group becomes a full VCF (own header with the split key) appended
// to the shared stream; uncloseableOutputStream keeps pw open across groups
final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(pw));
final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
header2.addMetaDataLine(new VCFHeaderLine(VCF_HEADER_SPLITKEY, first.key));
out.writeHeader(header2);
for (final VariantContext ctx : variants) {
if (isDebuggingVariant(ctx)) {
LOG.info("saving variant " + shortName(ctx) + " to final output with key=" + first.key);
}
out.add(ctx);
}
// yes because wrapped into IOUtils.encloseableOutputSream
out.close();
pw.flush();
}
eqiter.close();
iter.close();
iter = null;
// dispose sorting collection
sortingcollection.cleanup();
sortingcollection = null;
}
// EOF met
if (variant == null)
break;
prev_contig = variant.getContig();
}
if (sortingcollection == null) {
/* create sorting collection for new contig */
sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.maxRecordsInRam, this.writingSortingCollection.getTmpPaths());
sortingcollection.setDestructiveIteration(true);
}
if (variant.getAlternateAlleles().size() != 1) {
return wrapException("Expected only one allele per variant. Please use VcfMultiToOneAllele https://github.com/lindenb/jvarkit/wiki/VcfMultiToOneAllele.");
}
// no check for ctx.ifFiltered here, we do this later.
// a variant may map to several keys (e.g. overlapping genes): add it once per key
for (final String key : splitter.keys(variant)) {
if (isDebuggingVariant(variant)) {
LOG.info("Adding variant with key " + key + " " + shortName(variant));
}
// the raw VCF line is stored (not the parsed context) to keep records compact
sortingcollection.add(new KeyAndLine(key, line));
}
}
progess.finish();
pw.flush();
pw.close();
pw = null;
if (allDiscardedLog != null) {
allDiscardedLog.flush();
allDiscardedLog.close();
allDiscardedLog = null;
}
return RETURN_OK;
} catch (final Exception err) {
return wrapException(err);
} finally {
// nulled-out resources are no-ops here; non-null ones are leftovers from an error path
CloserUtil.close(iter);
if (sortingcollection != null)
sortingcollection.cleanup();
CloserUtil.close(in);
CloserUtil.close(pw);
CloserUtil.close(allDiscardedLog);
}
}
End of aggregated VCFHeader usage examples.