Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
Class VcfSkatSlidingWindow, method doWork:
@Override
public int doWork(final List<String> args) {
    if (this.nJobs < 1) {
        this.nJobs = Math.max(1, Runtime.getRuntime().availableProcessors());
        LOG.info("setting njobs to " + this.nJobs);
    }
    VcfIterator r = null;
    try {
        final VCFHeader header;
        final SAMSequenceDictionary dict;
        final File vcfFile = new File(oneAndOnlyOneFile(args));
        try (final VCFFileReader vr = new VCFFileReader(vcfFile, true)) {
            header = vr.getFileHeader();
            dict = header.getSequenceDictionary();
        }
        if (dict == null || dict.isEmpty()) {
            throw new JvarkitException.VcfDictionaryMissing(vcfFile);
        }
        if (!this.limit_contigs.isEmpty()) {
            if (this.limit_contigs.stream().anyMatch(C -> dict.getSequence(C) == null)) {
                LOG.error("user contig missing in vcf dictionary.");
                return -1;
            }
        }
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            pedigree = new Pedigree.Parser().parse(header);
        }
        // keep only affected/unaffected persons that are also samples of the VCF
        final Set<Pedigree.Person> samples = new HashSet<>(pedigree.getPersons());
        samples.removeIf(I -> !(I.isAffected() || I.isUnaffected()) || !header.getSampleNamesInOrder().contains(I.getId()));
        this.writer = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        final Consumer<SkatCallerResult> writeResult = (R) -> {
            synchronized (this.writer) {
                this.writer.println(R.toString());
            }
        };
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            if (!this.limit_contigs.isEmpty() && !this.limit_contigs.contains(ssr.getSequenceName())) {
                LOG.warning("skipping contig " + ssr.getSequenceName());
                continue;
            }
            LOG.info("contig " + ssr.getSequenceName());
            // split the contig into nJobs slices, one SkatWorker per slice
            final ExecutorService executorService = new ThreadPoolExecutor(this.nJobs, this.nJobs, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingDeque<>(this.nJobs));
            final List<Future<Integer>> results = new ArrayList<>(this.nJobs);
            for (int i = 0; i < this.nJobs; i++) {
                final int windowLen = Math.max(1, ssr.getSequenceLength() / this.nJobs);
                final SkatWorker caller = new SkatWorker(vcfFile, new Interval(ssr.getSequenceName(), i * windowLen, Math.min(ssr.getSequenceLength(), (i + 1) * windowLen)), samples, this.skat.build(), writeResult);
                results.add(executorService.submit(caller));
            }
            executorService.shutdown();
            executorService.awaitTermination(365, TimeUnit.DAYS);
            for (final Future<Integer> fl : results) {
                try {
                    if (fl.get() != 0) {
                        LOG.error("An error occurred");
                        return -1;
                    }
                } catch (final Exception err) {
                    LOG.error(err);
                    return -1;
                }
            }
        }
        this.writer.flush();
        this.writer.close();
        this.writer = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(this.writer);
    }
}
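A note on the slicing above: windowLen is computed by integer division, so when the contig length is not a multiple of nJobs the last sequenceLength % nJobs bases fall outside every interval. A minimal sketch of a partition that always covers the full contig, using a hypothetical makeIntervals helper (not part of jvarkit) over htsjdk's 1-based Interval:

import java.util.ArrayList;
import java.util.List;

import htsjdk.samtools.util.Interval;

// Hypothetical helper: split [1, length] into up to nJobs contiguous 1-based
// intervals whose union is the whole contig, spreading the remainder over the
// first windows instead of dropping it.
static List<Interval> makeIntervals(final String contig, final int length, final int nJobs) {
    final List<Interval> intervals = new ArrayList<>(nJobs);
    final int base = length / nJobs;
    final int remainder = length % nJobs;
    int start = 1;
    for (int i = 0; i < nJobs && start <= length; i++) {
        final int len = base + (i < remainder ? 1 : 0); // first 'remainder' windows get one extra base
        final int end = Math.min(length, start + len - 1);
        intervals.add(new Interval(contig, start, end));
        start = end + 1;
    }
    return intervals;
}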
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
Class VcfToHilbert, method doWork:
@Override
public int doWork(final List<String> args) {
    if (this.imgOut == null) {
        LOG.error("output image file not defined");
        return -1;
    }
    if (this.imageWidth < 1) {
        LOG.error("Bad image size:" + this.imageWidth);
        return -1;
    }
    VcfIterator iter = null;
    try {
        iter = this.openVcfIterator(oneFileOrNull(args));
        final VCFHeader header = iter.getHeader();
        this.dict = header.getSequenceDictionary();
        if (this.dict == null) {
            throw new JvarkitException.FastaDictionaryMissing("no dict in input");
        }
        final List<String> samples = header.getSampleNamesInOrder();
        if (samples.isEmpty()) {
            throw new JvarkitException.SampleMissing("no.sample.in.vcf");
        }
        LOG.info("N-Samples:" + samples.size());
        double marginWidth = (this.imageWidth - 2) * 0.05;
        this.sampleWidth = ((this.imageWidth - 2) - marginWidth) / samples.size();
        LOG.info("sample Width:" + sampleWidth);
        BufferedImage img = new BufferedImage(this.imageWidth, this.imageWidth, BufferedImage.TYPE_INT_RGB);
        this.g = (Graphics2D) img.getGraphics();
        this.g.setColor(Color.WHITE);
        this.g.fillRect(0, 0, imageWidth, imageWidth);
        g.setColor(Color.BLACK);
        final Hershey hershey = new Hershey();
        EvalCurve evalCurve = new EvalCurve();
        evalCurve.run();
        this.genomicSizePerCurveUnit = ((double) dict.getReferenceLength() / (double) (evalCurve.count));
        if (this.genomicSizePerCurveUnit < 1)
            this.genomicSizePerCurveUnit = 1;
        LOG.info("genomicSizePerCurveUnit:" + genomicSizePerCurveUnit);
        for (int x = 0; x < samples.size(); ++x) {
            String samplex = samples.get(x);
            double labelHeight = marginWidth;
            if (labelHeight > 50)
                labelHeight = 50;
            g.setColor(Color.BLACK);
            hershey.paint(g, samplex, marginWidth + x * sampleWidth, marginWidth - labelHeight, sampleWidth * 0.9, labelHeight * 0.9);
            AffineTransform old = g.getTransform();
            AffineTransform tr = AffineTransform.getTranslateInstance(marginWidth, marginWidth + x * sampleWidth);
            tr.rotate(Math.PI / 2);
            g.setTransform(tr);
            hershey.paint(g, samplex, 0.0, 0.0, sampleWidth * 0.9, labelHeight * 0.9);
            // g.drawString(this.tabixFile.getFile().getName(),0,0);
            g.setTransform(old);
            double tx = marginWidth + x * sampleWidth;
            for (int y = 0; y < samples.size(); ++y) {
                double ty = marginWidth + y * sampleWidth;
                g.translate(tx, ty);
                g.setColor(Color.BLUE);
                g.draw(new Rectangle2D.Double(0, 0, sampleWidth, sampleWidth));
                // paint each chromosome
                paintReference();
                g.translate(-tx, -ty);
            }
        }
        LOG.info("genomicSizePerCurveUnit:" + (long) genomicSizePerCurveUnit * evalCurve.count + " " + dict.getReferenceLength() + " count=" + evalCurve.count);
        LOG.info("Scanning variants");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dict);
        while (iter.hasNext()) {
            final VariantContext var = progress.watch(iter.next());
            for (int x = 0; x < samples.size(); ++x) {
                final String samplex = samples.get(x);
                final Genotype gx = var.getGenotype(samplex);
                if (!gx.isCalled())
                    continue;
                double tx = marginWidth + x * sampleWidth;
                for (int y = 0; y < samples.size(); ++y) {
                    final String sampley = samples.get(y);
                    final Genotype gy = var.getGenotype(sampley);
                    if (!gy.isCalled())
                        continue;
                    if (gx.isHomRef() && gy.isHomRef())
                        continue;
                    double ty = marginWidth + y * sampleWidth;
                    g.translate(tx, ty);
                    final PaintVariant paint = new PaintVariant(var, x, y);
                    paint.run();
                    g.translate(-tx, -ty);
                }
            }
        }
        progress.finish();
        this.g.dispose();
        // save file
        LOG.info("saving " + imgOut);
        if (imgOut != null) {
            ImageIO.write(img, imgOut.getName().toLowerCase().endsWith(".png") ? "PNG" : "JPG", imgOut);
        } else {
            ImageIO.write(img, "PNG", stdout());
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
    }
}
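VcfToHilbert projects genomic positions onto a space-filling Hilbert curve: EvalCurve counts the curve units and genomicSizePerCurveUnit converts a base position into a distance along the curve. The project's own curve walking lives in EvalCurve/PaintVariant; for reference only, here is a sketch of the textbook distance-to-point conversion for a Hilbert curve (the standard d2xy algorithm, not jvarkit's code), where side is the curve's side length and must be a power of two:

// Convert a distance d along a Hilbert curve of side 'side' (a power of two)
// into (x, y) grid coordinates. Classic iterative algorithm.
static int[] d2xy(final long side, long d) {
    long x = 0, y = 0;
    for (long s = 1; s < side; s *= 2) {
        final long rx = 1 & (d / 2);
        final long ry = 1 & (d ^ rx);
        if (ry == 0) {
            if (rx == 1) { // reflect the quadrant
                x = s - 1 - x;
                y = s - 1 - y;
            }
            final long t = x; // swap x and y
            x = y;
            y = t;
        }
        x += s * rx;
        y += s * ry;
        d /= 4;
    }
    return new int[] { (int) x, (int) y };
}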
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
Class VcfMultiToOne, method doWork:
@Override
public int doWork(final List<String> arguments) {
    VariantContextWriter out = null;
    Set<String> args = IOUtils.unrollFiles(arguments);
    List<VcfIterator> inputs = new ArrayList<>(args.size() + 1);
    List<String> inputFiles = new ArrayList<>(args.size() + 1);
    try {
        if (args.isEmpty() && arguments.isEmpty()) {
            inputs.add(VCFUtils.createVcfIteratorStdin());
            inputFiles.add("stdin");
        } else if (args.isEmpty()) {
            LOG.error("No vcf provided");
            return -1;
        } else {
            for (final String vcfFile : args) {
                inputs.add(VCFUtils.createVcfIterator(vcfFile));
                inputFiles.add(VCFUtils.escapeInfoField(vcfFile));
            }
        }
        SAMSequenceDictionary dict = null;
        final Set<String> sampleNames = new HashSet<String>();
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        for (final VcfIterator in : inputs) {
            final VCFHeader header = in.getHeader();
            if (dict == null) {
                dict = header.getSequenceDictionary();
            } else if (header.getSequenceDictionary() == null) {
                LOG.error("No Dictionary in vcf");
                return -1;
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, header.getSequenceDictionary())) {
                LOG.error("Not the same dictionary between vcfs");
                return -1;
            }
            metaData.addAll(in.getHeader().getMetaDataInInputOrder());
            sampleNames.addAll(in.getHeader().getSampleNamesInOrder());
        }
        final Comparator<VariantContext> comparator = (dict == null ? VCFUtils.createChromPosRefComparator() : VCFUtils.createTidPosRefComparator(dict));
        // addMetaData(metaData);
        metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_TAGID, 1, VCFHeaderLineType.String, "Sample Name from multi-sample vcf"));
        metaData.add(new VCFInfoHeaderLine(DEFAULT_SAMPLE_FILETAGID, 1, VCFHeaderLineType.String, "Origin of sample"));
        for (final String sample : sampleNames) {
            metaData.add(new VCFHeaderLine(SAMPLE_HEADER_DECLARATION, sample));
        }
        final VCFHeader h2 = new VCFHeader(metaData, Collections.singleton(DEFAULT_VCF_SAMPLE_NAME));
        recalculator.setHeader(h2);
        out = super.openVariantContextWriter(this.outputFile);
        out.writeHeader(h2);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (;;) {
            if (out.checkError())
                break;
            /* get 'smallest' variant */
            VariantContext smallest = null;
            int idx = 0;
            int best_idx = -1;
            while (idx < inputs.size()) {
                final VcfIterator in = inputs.get(idx);
                if (!in.hasNext()) {
                    CloserUtil.close(in);
                    inputs.remove(idx);
                    inputFiles.remove(idx);
                } else {
                    final VariantContext ctx = in.peek();
                    if (smallest == null || comparator.compare(smallest, ctx) > 0) {
                        smallest = ctx;
                        best_idx = idx;
                    }
                    ++idx;
                }
            }
            if (smallest == null)
                break;
            final VariantContext ctx = progress.watch(inputs.get(best_idx).next());
            if (ctx.getNSamples() == 0) {
                if (!this.discard_no_call) {
                    final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                    vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
                    vcb.genotypes(GenotypeBuilder.createMissing(DEFAULT_VCF_SAMPLE_NAME, 2));
                    out.add(this.recalculator.apply(vcb.make()));
                }
                continue;
            }
            for (int i = 0; i < ctx.getNSamples(); ++i) {
                final Genotype g = ctx.getGenotype(i);
                final String sample = g.getSampleName();
                if (!g.isCalled() && this.discard_no_call)
                    continue;
                if (!g.isAvailable() && this.discard_non_available)
                    continue;
                if (g.isHomRef() && this.discard_hom_ref)
                    continue;
                final GenotypeBuilder gb = new GenotypeBuilder(g);
                gb.name(DEFAULT_VCF_SAMPLE_NAME);
                final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                vcb.attribute(DEFAULT_SAMPLE_TAGID, sample);
                vcb.attribute(DEFAULT_SAMPLE_FILETAGID, inputFiles.get(best_idx));
                vcb.genotypes(gb.make());
                out.add(this.recalculator.apply(vcb.make()));
            }
        }
        progress.finish();
        LOG.debug("done");
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(inputs);
        CloserUtil.close(out);
    }
}
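The merge loop above re-scans every open iterator to find the smallest pending variant, which is O(k) work per record for k inputs. A minimal sketch of the same k-way merge with a priority queue (O(log k) per record), using htsjdk's PeekableIterator rather than jvarkit's VcfIterator and omitting the per-file bookkeeping (best_idx / inputFiles) that the tool itself needs:

import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.function.Consumer;

import htsjdk.samtools.util.PeekableIterator;
import htsjdk.variant.variantcontext.VariantContext;

// Merge sorted variant iterators; 'comparator' plays the role of the
// chrom/pos/ref comparator built in the method above.
static void kWayMerge(final List<PeekableIterator<VariantContext>> inputs,
                      final Comparator<VariantContext> comparator,
                      final Consumer<VariantContext> sink) {
    // order the iterators by their next pending record
    final PriorityQueue<PeekableIterator<VariantContext>> queue =
            new PriorityQueue<>((a, b) -> comparator.compare(a.peek(), b.peek()));
    for (final PeekableIterator<VariantContext> it : inputs) {
        if (it.hasNext()) queue.add(it);
    }
    while (!queue.isEmpty()) {
        final PeekableIterator<VariantContext> top = queue.poll();
        sink.accept(top.next());            // emit the smallest pending variant
        if (top.hasNext()) queue.add(top);  // re-insert with its new head
    }
}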
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
Class VcfBurden, method doWork2:
private int doWork2(List<String> args) {
    ZipOutputStream zout = null;
    FileOutputStream fout = null;
    VcfIterator in = null;
    try {
        if (args.isEmpty()) {
            LOG.info("reading from stdin.");
            in = VCFUtils.createVcfIteratorStdin();
        } else if (args.size() == 1) {
            String filename = args.get(0);
            LOG.info("reading from " + filename);
            in = VCFUtils.createVcfIterator(filename);
        } else {
            LOG.error("Illegal number of arguments.");
            return -1;
        }
        if (outputFile == null) {
            LOG.error("undefined output");
            return -1;
        } else if (!outputFile.getName().endsWith(".zip")) {
            LOG.error("output " + outputFile + " should end with .zip");
            return -1;
        } else {
            fout = new FileOutputStream(outputFile);
            zout = new ZipOutputStream(fout);
        }
        final List<String> samples = in.getHeader().getSampleNamesInOrder();
        final VCFHeader header = in.getHeader();
        String prev_chrom = null;
        final VepPredictionParser vepPredParser = new VepPredictionParserFactory().header(header).get();
        final Map<GeneTranscript, List<VariantAndCsq>> gene2variants = new HashMap<>();
        final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
        final Set<SequenceOntologyTree.Term> acn = new HashSet<>();
        /* mail from Solena: these SO terms replace the current ones (VEP HIGH + MODERATE);
           no need to re-run the existing analyses, but they will serve for future burden analyses */
        String[] acn_list = new String[] {
            "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587",
            "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889",
            "SO:0001821", "SO:0001822", "SO:0001583", "SO:0001818"
            /* "SO:0001589", "SO:0001587", "SO:0001582", "SO:0001583",
               "SO:0001575", "SO:0001578", "SO:0001574", "SO:0001889",
               "SO:0001821", "SO:0001822", "SO:0001893" */
        };
        if (this.highdamage) {
            acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889" };
        }
        for (final String acns : acn_list) {
            final SequenceOntologyTree.Term tacn = soTree.getTermByAcn(acns);
            if (tacn == null) {
                in.close();
                throw new NullPointerException("tacn == null for " + acns);
            }
            acn.addAll(tacn.getAllDescendants());
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
        for (;;) {
            VariantContext ctx1 = null;
            if (in.hasNext()) {
                ctx1 = progress.watch(in.next());
                if (ctx1.getAlternateAlleles().size() != 1) {
                    // info("count(ALT)!=1 in "+ctx1.getChr()+":"+ctx1.getStart());
                    continue;
                }
                if (ctx1.getAlternateAlleles().get(0).equals(Allele.SPAN_DEL)) {
                    continue;
                }
            }
            // end of input or new chromosome: dump the genes accumulated so far
            if (ctx1 == null || !ctx1.getContig().equals(prev_chrom)) {
                LOG.info("DUMP to zip n=" + gene2variants.size());
                final Set<String> geneNames = new HashSet<>();
                for (GeneTranscript gene_transcript : gene2variants.keySet()) {
                    geneNames.add(gene_transcript.geneName);
                    final Set<VariantAndCsq> uniq = new TreeSet<>(this.variantAndCsqComparator);
                    uniq.addAll(gene2variants.get(gene_transcript));
                    dumpVariants(zout, prev_chrom, gene_transcript.geneName + "_" + gene_transcript.transcriptName, samples, new ArrayList<VariantAndCsq>(uniq));
                    LOG.info("dumped " + gene_transcript.geneName);
                }
                LOG.info("loop over geneName");
                for (final String geneName : geneNames) {
                    final SortedSet<VariantAndCsq> lis_nm = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_all = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_refseq = new TreeSet<>(this.variantAndCsqComparator);
                    final SortedSet<VariantAndCsq> lis_enst = new TreeSet<>(this.variantAndCsqComparator);
                    LOG.info("loop over gene2variants");
                    for (final GeneTranscript gene_transcript : gene2variants.keySet()) {
                        if (!geneName.equals(gene_transcript.geneName))
                            continue;
                        lis_all.addAll(gene2variants.get(gene_transcript));
                        if (gene_transcript.transcriptName.startsWith("NM_")) {
                            lis_nm.addAll(gene2variants.get(gene_transcript));
                        }
                        if (!gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_refseq.addAll(gene2variants.get(gene_transcript));
                        }
                        if (gene_transcript.transcriptName.startsWith("ENST")) {
                            lis_enst.addAll(gene2variants.get(gene_transcript));
                        }
                    }
                    LOG.info("dump_ALL_TRANSCRIPTS");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_TRANSCRIPTS", samples, new ArrayList<VariantAndCsq>(lis_all));
                    LOG.info("dump_ALL_NM");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_NM", samples, new ArrayList<VariantAndCsq>(lis_nm));
                    LOG.info("dump_ALL_REFSEQ");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_REFSEQ", samples, new ArrayList<VariantAndCsq>(lis_refseq));
                    LOG.info("dump_ALL_ENST");
                    dumpVariants(zout, prev_chrom, geneName + "_ALL_ENST", samples, new ArrayList<VariantAndCsq>(lis_enst));
                }
                if (ctx1 == null)
                    break;
                LOG.info("gene2variants");
                gene2variants.clear();
                LOG.info("System.gc();");
                System.gc();
                prev_chrom = ctx1.getContig();
                LOG.info("prev_chrom=" + prev_chrom);
            }
            final Set<GeneTranscript> seen_names = new HashSet<>();
            for (final VepPredictionParser.VepPrediction pred : vepPredParser.getPredictions(ctx1)) {
                String geneName = pred.getSymbol();
                if (geneName == null || geneName.trim().isEmpty())
                    continue;
                if (this._gene2seen != null) {
                    if (!this._gene2seen.containsKey(geneName))
                        continue;
                }
                final String transcriptName = pred.getFeature();
                if (transcriptName == null || transcriptName.trim().isEmpty()) {
                    LOG.info("No transcript in " + ctx1);
                    continue;
                }
                final GeneTranscript geneTranscript = new GeneTranscript(geneName, transcriptName);
                if (seen_names.contains(geneTranscript))
                    continue;
                boolean ok = false;
                for (SequenceOntologyTree.Term so : pred.getSOTerms()) {
                    if (acn.contains(so)) {
                        ok = true;
                    }
                }
                if (!printSOTerms && !ok)
                    continue;
                List<VariantAndCsq> L = gene2variants.get(geneTranscript);
                if (L == null) {
                    L = new ArrayList<>();
                    gene2variants.put(geneTranscript, L);
                }
                Float vqslod = null;
                if (this.printVQSLOD && ctx1.hasAttribute("VQSLOD")) {
                    vqslod = (float) ctx1.getAttributeAsDouble("VQSLOD", -9999999.0);
                }
                L.add(new VariantAndCsq(ctx1, pred.getSOTerms(), pred.getPositionInCDS(), vqslod));
                seen_names.add(geneTranscript);
                if (this._gene2seen != null) {
                    this._gene2seen.put(geneTranscript.geneName, Boolean.TRUE);
                }
            }
        }
        if (this._gene2seen != null) {
            final List<VariantAndCsq> emptylist = Collections.emptyList();
            for (final String gene : this._gene2seen.keySet()) {
                if (this._gene2seen.get(gene).equals(Boolean.TRUE))
                    continue;
                LOG.warning("Gene not found : " + gene);
                dumpVariants(zout, "UNDEFINED", gene + "_000000000000000.txt", samples, emptylist);
            }
        }
        progress.finish();
        zout.finish();
        fout.flush();
        zout.flush();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
        CloserUtil.close(zout);
        CloserUtil.close(fout);
    }
}
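The consequence filter above works because each accession in acn_list is expanded with getAllDescendants(): a VEP term more specific than the listed SO accession still matches the 'acn' set. A sketch of that transitive expansion, using a hypothetical minimal Term type (jvarkit's SequenceOntologyTree.Term provides the real getAllDescendants()):

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

// Hypothetical minimal ontology-term type for illustration only.
interface Term {
    Set<Term> getChildren();
}

// Collect each root term plus all of its transitive descendants.
static Set<Term> descendantClosure(final Set<Term> roots) {
    final Set<Term> seen = new HashSet<>();
    final Deque<Term> stack = new ArrayDeque<>(roots);
    while (!stack.isEmpty()) {
        final Term t = stack.pop();
        if (seen.add(t)) {              // first visit: schedule the children
            stack.addAll(t.getChildren());
        }
    }
    return seen;
}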
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
Class FindAVariation, method scan:
private void scan(final BufferedReader in) throws IOException {
    String line;
    while ((line = in.readLine()) != null) {
        if (line.isEmpty() || line.startsWith("#"))
            continue;
        final File f = new File(line);
        if (!f.isFile())
            continue;
        if (!f.canRead())
            continue;
        if (!VCFUtils.isVcfFile(f))
            continue;
        VcfIterator iter = null;
        if (VCFUtils.isTribbleVcfFile(f)) {
            VCFFileReader r = null;
            try {
                r = new VCFFileReader(f, true);
                final VCFHeader header = r.getFileHeader();
                for (final Mutation m : convertFromVcfHeader(f, header)) {
                    final CloseableIterator<VariantContext> iter2 = r.query(m.chrom, m.pos, m.pos);
                    while (iter2.hasNext()) {
                        final VariantContext ctx = iter2.next();
                        if (this.onlySnp) {
                            if (ctx.getStart() != m.pos || ctx.getEnd() != m.pos)
                                continue;
                        }
                        report(f, header, ctx, m);
                    }
                    CloserUtil.close(iter2);
                }
            } catch (final htsjdk.tribble.TribbleException.InvalidHeader err) {
                LOG.warn(f + "\t" + err.getMessage());
            } catch (final Exception err) {
                LOG.severe("cannot read " + f, err);
            } finally {
                CloserUtil.close(r);
            }
        } else if (VCFUtils.isTabixVcfFile(f)) {
            TabixVcfFileReader r = null;
            try {
                r = new TabixVcfFileReader(f.getPath());
                final VCFHeader header = r.getHeader();
                for (final Mutation m : convertFromVcfHeader(f, header)) {
                    final Iterator<VariantContext> iter2 = r.iterator(m.chrom, m.pos, m.pos);
                    while (iter2.hasNext()) {
                        final VariantContext ctx = iter2.next();
                        if (this.onlySnp) {
                            if (ctx.getStart() != m.pos || ctx.getEnd() != m.pos)
                                continue;
                        }
                        report(f, header, ctx, m);
                    }
                    CloserUtil.close(iter2);
                }
            } catch (final htsjdk.tribble.TribbleException.InvalidHeader err) {
                LOG.warn(f + "\t" + err.getMessage());
            } catch (final Exception err) {
                LOG.severe("cannot read " + f, err);
            } finally {
                CloserUtil.close(r);
            }
        } else if (!this.indexedOnly) {
            // no index available: stream the whole file with a VcfIterator
            try {
                iter = VCFUtils.createVcfIteratorFromFile(f);
                final VCFHeader header = iter.getHeader();
                final Set<Mutation> mutlist = convertFromVcfHeader(f, iter.getHeader());
                while (iter.hasNext()) {
                    final VariantContext ctx = iter.next();
                    final Mutation m = new Mutation(ctx.getContig(), ctx.getStart());
                    for (final Mutation m2 : mutlist) {
                        if (m.equals(m2)) {
                            if (this.onlySnp) {
                                if (ctx.getStart() != m2.pos || ctx.getEnd() != m2.pos)
                                    continue;
                            }
                            report(f, header, ctx, m2);
                            break;
                        }
                    }
                }
            } catch (final htsjdk.tribble.TribbleException.InvalidHeader err) {
                LOG.warn(f + "\t" + err.getMessage());
            } catch (final Exception err) {
                LOG.severe("Error in " + f, err);
            } finally {
                CloserUtil.close(iter);
            }
        }
    }
}
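Both indexed branches above follow the same pattern: open a reader, query one position, exhaust the iterator, close both. A minimal self-contained sketch of that pattern with htsjdk's VCFFileReader and try-with-resources instead of the manual CloserUtil calls (the contig and position arguments are illustrative placeholders):

import java.io.File;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;

// Query a single 1-based position in an indexed VCF and print the hits.
static void queryOnePosition(final File vcf, final String contig, final int pos) {
    try (final VCFFileReader reader = new VCFFileReader(vcf, true); // true = require an index
         final CloseableIterator<VariantContext> it = reader.query(contig, pos, pos)) {
        while (it.hasNext()) {
            final VariantContext ctx = it.next();
            System.out.println(ctx.getContig() + ":" + ctx.getStart());
        }
    }
}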