use of com.github.lindenb.jvarkit.util.vcf.AFExtractorFactory in project jvarkit by lindenb.
the class VcfStrechToSvg method doWork.
@Override
public int doWork(final List<String> args) {
try {
this.afExtractor = new AFExtractorFactory().parseFieldExtractor(this.afExtractorFactoryStr);
final String input = super.oneAndOnlyOneFile(args);
if (!this.bamListPath.isEmpty()) {
LOG.info("reading bam list");
for (Path bamPath : IOUtils.unrollPaths(this.bamListPath)) {
try (SamReader sr = openSamReader(bamPath)) {
final SAMFileHeader hdr = sr.getFileHeader();
for (final String sn : hdr.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet())) {
if (this.sample2bam.containsKey(sn)) {
LOG.info("duplicate sample in bam " + bamPath + " and " + this.sample2bam.get(sn));
return -1;
}
this.sample2bam.put(sn, bamPath);
}
}
}
}
try (VCFReader r = VCFReaderFactory.makeDefault().open(input, true)) {
final VCFHeader header = r.getHeader();
this.afExtractor.validateHeader(header);
final String searchFormat;
switch(this.useFormat) {
case PL:
searchFormat = VCFConstants.GENOTYPE_PL_KEY;
break;
// through
case AD:
default:
searchFormat = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
break;
}
if (header.getFormatHeaderLine(searchFormat) == null) {
LOG.error("FORMAT/" + searchFormat + " undefined in " + input);
return -1;
}
if (!header.hasGenotypingData()) {
LOG.error("No genotype in input");
return -1;
}
final SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict != null && this.refFaidx != null) {
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(this.refFaidx));
}
if (this.gff3Path != null) {
LOG.info("reading gtf" + this.gff3Path);
try (GtfReader gtfReader = new GtfReader(this.gff3Path)) {
if (dict != null)
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().forEach(G -> {
this.all_genes.put(new Interval(G), G);
});
}
}
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile)) {
try (PrintWriter manifest = (this.manifestPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.manifestPath))) {
try (ArchiveFactory out = ArchiveFactory.open(this.outputFile)) {
final BedLineCodec codec = new BedLineCodec();
br.lines().map(L -> codec.decode(L)).filter(B -> B != null).forEach(B -> {
run(out, B, header, r, manifest);
});
}
manifest.flush();
}
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.util.vcf.AFExtractorFactory in project jvarkit by lindenb.
the class CaseControlJfx method doWork.
@Override
public int doWork(final List<String> args) {
final VariantPartition partition;
Pedigree pedigree = null;
VCFIterator in = null;
try {
switch(this.partitionType) {
case variantType:
partition = new VariantTypePartition();
break;
case chromosome:
partition = new ChromosomePartition();
break;
case autosomes:
partition = new SexualContigPartition();
break;
case qual:
partition = new QualPartition();
break;
case vqslod:
partition = new VQSLODPartition();
break;
case typeFilter:
partition = new TypeAndFilterPartiton();
break;
case distance:
partition = new DisanceToDiagonalPartiton();
break;
case n_alts:
partition = new NAltsPartition();
break;
default:
throw new IllegalStateException(this.partitionType.name());
}
in = openVCFIterator(oneFileOrNull(args));
if (this.pedigreeFile != null) {
pedigree = Pedigree.newParser().parse(this.pedigreeFile);
} else {
pedigree = Pedigree.newParser().parse(in.getHeader());
}
final AFExtractor controlAFExtractor;
if (!StringUtil.isBlank(this.controlFields)) {
final List<AFExtractor> extractors = new AFExtractorFactory().parseFieldExtractors(this.controlFields);
if (extractors.size() != 1) {
LOG.error("extractor list should have size==1 . got " + extractors);
return -1;
}
controlAFExtractor = extractors.get(0);
if (!controlAFExtractor.validateHeader(in.getHeader())) {
LOG.error("Invalid : " + controlAFExtractor);
return -1;
}
} else {
controlAFExtractor = null;
}
int count = 0;
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(in.getHeader()).logger(LOG).build();
while (in.hasNext() && (this.limit_to_N_variants < 0 || count < this.limit_to_N_variants)) {
final VariantContext ctx = progress.apply(in.next());
if (this.ignore_ctx_filtered && ctx.isFiltered())
continue;
++count;
final List<Allele> alternates = ctx.getAlternateAlleles();
for (int alt_idx = 0; alt_idx < alternates.size(); ++alt_idx) {
final Allele alt = alternates.get(alt_idx);
final Double[] mafs = { null, null };
for (int i = 0; i < 2; ++i) {
if (i == 1 && controlAFExtractor != null) {
final List<Double> dvals = controlAFExtractor.parse(ctx);
if (alt_idx < dvals.size() && dvals.get(alt_idx) != null) {
final double d = dvals.get(alt_idx);
if (!Double.isNaN(d) && d >= 0 && d <= 1.0)
mafs[1] = d;
}
} else {
final MafCalculator mafCalculator = new MafCalculator(alt, ctx.getContig());
mafCalculator.setNoCallIsHomRef(no_call_is_homref);
for (Pedigree.Person person : (i == 0 ? pedigree.getAffected() : pedigree.getUnaffected())) {
if (selectSamples.equals(SelectSamples.males) && !person.isMale())
continue;
if (selectSamples.equals(SelectSamples.females) && !person.isFemale())
continue;
final Genotype genotype = ctx.getGenotype(person.getId());
if (genotype == null)
continue;
if (ignore_gt_filtered && genotype.isFiltered())
continue;
mafCalculator.add(genotype, person.isMale());
}
if (!mafCalculator.isEmpty()) {
mafs[i] = mafCalculator.getMaf();
}
}
}
if (mafs[0] == null || mafs[1] == null)
continue;
final XYChart.Data<Number, Number> data = new XYChart.Data<Number, Number>(mafs[0], mafs[1]);
partition.add(ctx, pedigree, data);
}
}
progress.close();
in.close();
in = null;
final NumberAxis xAxis = new NumberAxis(0.0, 1.0, 0.1);
xAxis.setLabel("Cases");
final NumberAxis yAxis = new NumberAxis(0.0, 1.0, 0.1);
yAxis.setLabel("Controls" + (StringUtil.isBlank(this.controlFields) ? "" : "[" + this.controlFields + "]"));
final ScatterChart<Number, Number> chart = new ScatterChart<>(xAxis, yAxis);
for (final XYChart.Series<Number, Number> series : partition.getSeries()) {
chart.getData().add(series);
}
String title = "Case/Control";
if (!args.isEmpty()) {
title = args.get(0);
int slash = title.lastIndexOf("/");
if (slash != -1)
title = title.substring(slash + 1);
if (title.endsWith(".vcf.gz"))
title = title.substring(0, title.length() - 7);
if (title.endsWith(".vcf"))
title = title.substring(0, title.length() - 4);
}
if (userTitle != null)
title = userTitle;
chart.setTitle(title);
chart.setAnimated(false);
// chart.setLegendSide(this.legendSide);
final RExporter rExporter = new RExporter();
final PrintWriter pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
rExporter.exportToR(pw, chart);
pw.flush();
pw.close();
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(in);
}
return 0;
}
use of com.github.lindenb.jvarkit.util.vcf.AFExtractorFactory in project jvarkit by lindenb.
the class CaseControlPlot method parseConfigFile.
private List<CaseControlExtractor> parseConfigFile(final VCFHeader header) throws Exception {
Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(this.xmlFile);
Element root = dom.getDocumentElement();
if (root == null || !root.getNodeName().equals("config"))
throw new JvarkitException.XmlDomError(root, "Root is note <config>");
final Map<String, JSPredigate> id2variantFilter = new HashMap<>();
// first pass, collect filters
for (Node n1 = root.getFirstChild(); n1 != null; n1 = n1.getNextSibling()) {
if (n1.getNodeType() != Node.ELEMENT_NODE)
continue;
Element e1 = Element.class.cast(n1);
if (!e1.getNodeName().equals("filter"))
continue;
final String filterid = e1.getAttribute("id");
if (StringUtil.isBlank(filterid))
throw new JvarkitException.XmlDomError(e1, "@id missing");
if (id2variantFilter.containsKey(filterid))
throw new JvarkitException.XmlDomError(e1, "duplicate filter id : " + filterid);
final String expression = e1.getTextContent();
if (StringUtil.isBlank(expression))
throw new JvarkitException.XmlDomError(e1, "expression missing");
id2variantFilter.put(filterid, new JSPredigate(header, expression));
}
// second pass collect maf extractors
final Map<String, MafExtractor> id2mafExtractor = new HashMap<>();
for (Node n1 = root.getFirstChild(); n1 != null; n1 = n1.getNextSibling()) {
if (n1.getNodeType() != Node.ELEMENT_NODE)
continue;
Element e1 = Element.class.cast(n1);
if (!e1.getNodeName().equals("maf"))
continue;
final String mafid = e1.getAttribute("id");
if (StringUtil.isBlank(mafid))
throw new JvarkitException.XmlDomError(e1, "@id missing");
if (id2mafExtractor.containsKey(mafid))
throw new JvarkitException.XmlDomError(e1, "duplicate maf id : " + mafid);
final MafExtractor mafExtractor;
if (e1.hasAttribute("attribute")) {
final String tag = e1.getAttribute("attribute");
if (StringUtil.isBlank(tag))
throw new JvarkitException.XmlDomError(e1, "@attribute is empty");
final List<AFExtractor> extractors = new AFExtractorFactory().parseFieldExtractors(tag);
if (extractors.size() != 1) {
throw new JvarkitException.XmlDomError(e1, "expected one AF extractor but got " + extractors);
}
final DefaultMafExtractor at = new DefaultMafExtractor(extractors.get(0));
mafExtractor = at;
} else {
final GenotypeMafExtractor at = new GenotypeMafExtractor();
mafExtractor = at;
}
id2mafExtractor.put(mafid, mafExtractor);
}
final List<CaseControlExtractor> excractors = new ArrayList<>();
// parse handlers
for (Node n1 = root.getFirstChild(); n1 != null; n1 = n1.getNextSibling()) {
if (n1.getNodeType() != Node.ELEMENT_NODE)
continue;
Element e1 = Element.class.cast(n1);
if (!e1.getNodeName().equals("handler"))
continue;
final CaseControlExtractor extractor = new CaseControlExtractor();
extractor.name = e1.getAttribute("name");
if (StringUtil.isBlank(extractor.name))
throw new JvarkitException.XmlDomError(e1, "@name missing");
extractor.name = this.prefix + extractor.name;
if (excractors.stream().filter(F -> F.getName().equals(extractor.name)).findAny().isPresent()) {
throw new JvarkitException.XmlDomError(e1, "duplicate name" + extractor.name);
}
for (Node n2 = n1.getFirstChild(); n2 != null; n2 = n2.getNextSibling()) {
if (n2.getNodeType() != Node.ELEMENT_NODE)
continue;
Element e2 = Element.class.cast(n2);
if (e2.getNodeName().equals("filter")) {
final Predicate<VariantContext> expr;
if (e2.hasAttribute("ref")) {
final String filterid = e2.getAttribute("ref");
JSPredigate predicate = id2variantFilter.get(filterid);
if (predicate == null) {
throw new JvarkitException.XmlDomError(e2, "no such filter:" + filterid);
}
expr = predicate;
} else {
final String expressionstr = e2.getTextContent();
if (StringUtil.isBlank(expressionstr))
throw new JvarkitException.XmlDomError(e2, "expression missing");
expr = new JSPredigate(header, expressionstr);
}
extractor.variantPredicate = extractor.variantPredicate.and(expr);
} else if (e2.getNodeName().equals("case") || e2.getNodeName().equals("ctrl")) {
final MafExtractor mafextractor;
if (e2.hasAttribute("ref")) {
final String mafid = e2.getAttribute("ref");
if (!id2mafExtractor.containsKey(e2.getAttribute(mafid))) {
throw new JvarkitException.XmlDomError(e2, "no such mafextractor:" + mafid);
}
mafextractor = id2mafExtractor.get(mafid);
} else {
final GenotypeMafExtractor genotypeMafExtractor = new GenotypeMafExtractor();
mafextractor = genotypeMafExtractor;
}
if (e2.getNodeName().equals("case")) {
extractor.caseExtractor = mafextractor;
} else {
extractor.ctrlExtractor = mafextractor;
}
} else {
LOG.error("unknown XML element " + e2.getNodeName());
}
}
excractors.add(extractor);
}
return excractors;
}
use of com.github.lindenb.jvarkit.util.vcf.AFExtractorFactory in project jvarkit by lindenb.
the class VcfAfInfoFilter method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator in, final VariantContextWriter out) {
try {
if (this.vcf_gnomad_nfe) {
this.user_fields_str += ",gnomad_exome_AF_NFE,gnomad_genome_AF_NFE";
this.user_fields_str += ",gnomad_exome_AC_NFE/gnomad_exome_AN_NFE," + "gnomad_genome_AC_NFE/gnomad_genome_AN_NFE";
}
if (!StringUtil.isBlank(this.deprecated_user_af_fields)) {
LOG.warn("-af use is deprecated.");
this.user_fields_str += "," + String.join(" , ", this.deprecated_user_af_fields.split("[,; \t\n]+"));
}
if (!StringUtil.isBlank(this.deprecated_user_af_fields)) {
LOG.warn("-af use is deprecated.");
this.user_fields_str += "," + String.join(" , ", this.deprecated_user_af_fields.split("[,; \t\n]+"));
}
if (!StringUtil.isBlank(this.deprecated_user_ac_an_fields)) {
LOG.warn("--acn use is deprecated.");
final String[] array = this.deprecated_user_ac_an_fields.split("[,; \t\n]+");
int i = 0;
while (i < array.length) {
String s = array[i];
if (StringUtil.isBlank(s)) {
i++;
continue;
}
if (s.contains("*")) {
this.user_fields_str += ";" + s;
} else {
final String acf = s;
i++;
while (i < array.length) {
s = array[i];
if (StringUtil.isBlank(s)) {
i++;
continue;
}
break;
}
if (i == array.length) {
LOG.error("missing AN for " + acf + " in " + this.deprecated_user_ac_an_fields);
return -1;
}
final String anf = s;
this.user_fields_str += ";" + acf + "/" + anf;
}
}
}
final AFExtractorFactory afExtractorFactory = new AFExtractorFactory();
final VCFHeader header = in.getHeader();
final List<AFExtractor> afExtractors = new ArrayList<>(afExtractorFactory.parseFieldExtractors(this.user_fields_str));
afExtractors.removeIf(T -> {
if (!T.validateHeader(header) && !this.ignore_INFO_field_validation) {
LOG.warn("Ignoring " + T + " because it's not valid.");
return true;
}
return false;
});
if (afExtractors.isEmpty()) {
LOG.warn("No extractor was defined !");
}
final Set<VCFHeaderLine> headerLines = new HashSet<>();
final VCFFilterHeaderLine noAltVariantFilter;
if (!StringUtil.isBlank(this.filterAllAltInGnomad)) {
noAltVariantFilter = new VCFFilterHeaderLine(this.filterAllAltInGnomad, "All ALT alleles don't pass the " + this.user_af_minimum + " < gnomad treshold AF < " + this.user_af_maximum);
headerLines.add(noAltVariantFilter);
} else {
noAltVariantFilter = null;
}
VCFStandardHeaderLines.addStandardFormatLines(headerLines, true, VCFConstants.GENOTYPE_FILTER_KEY);
JVarkitVersion.getInstance().addMetaData(getClass().getSimpleName(), header);
this.recalculator.setHeader(header);
headerLines.stream().forEach(H -> header.addMetaDataLine(H));
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(header).logger(LOG).build();
out.writeHeader(header);
while (in.hasNext()) {
final VariantContext ctx = progress.apply(in.next());
if (!ctx.isVariant()) {
out.add(ctx);
continue;
}
final List<Allele> alt_alleles = ctx.getAlternateAlleles();
final Set<Allele> ok_alleles = new HashSet<>(alt_alleles);
ok_alleles.remove(Allele.SPAN_DEL);
for (final AFExtractor afExtractor : afExtractors) {
if (ok_alleles.isEmpty())
break;
final List<Double> afo_list = afExtractor.parse(ctx);
if (afo_list == null)
continue;
if (afo_list.size() != alt_alleles.size()) {
LOG.warn("in " + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference() + " illegal number of AF values " + afExtractor);
}
for (int x = 0; x < afo_list.size() && x < alt_alleles.size(); ++x) {
final Double af = afo_list.get(x);
if (af == null)
continue;
if (af.doubleValue() < this.user_af_minimum || af.doubleValue() > this.user_af_maximum) {
if (this.filter_for_any_allele) {
ok_alleles.clear();
} else {
ok_alleles.remove(alt_alleles.get(x));
}
}
}
}
if (ok_alleles.isEmpty()) {
if (noAltVariantFilter != null) {
out.add(new VariantContextBuilder(ctx).filter(noAltVariantFilter.getID()).make());
}
continue;
}
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
final List<Genotype> genotypes = new ArrayList<>(ctx.getNSamples());
for (final Genotype gt : ctx.getGenotypes()) {
if (gt.isNoCall() || gt.isHomRef()) {
genotypes.add(gt);
continue;
}
boolean got_good_allele = false;
boolean got_bad_alt = false;
for (final Allele gta : gt.getAlleles()) {
if (!gta.isCalled() || gta.equals(Allele.SPAN_DEL)) {
continue;
} else if (gta.isReference() || ok_alleles.contains(gta)) {
got_good_allele = true;
} else if (alt_alleles.contains(gta)) {
got_bad_alt = true;
}
}
if (got_good_allele || !got_bad_alt) {
genotypes.add(gt);
continue;
}
if (StringUtil.isBlank(this.genotypeFilter)) {
genotypes.add(GenotypeBuilder.createMissing(gt.getSampleName(), gt.getPloidy()));
} else {
genotypes.add(new GenotypeBuilder(gt).filter(this.genotypeFilter).make());
}
}
vcb.genotypes(genotypes);
out.add(this.recalculator.apply(vcb.make()));
}
progress.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
Aggregations