use of com.github.lindenb.jvarkit.util.vcf.predictions.GeneExtractorFactory in project jvarkit by lindenb.
the class VcfGeneSplitter method run.
private int run(final List<String> args) {
SortingCollection<KeyAndLine> sortingcollection = null;
BufferedReader in = null;
FileOutputStream fos = null;
CloseableIterator<KeyAndLine> iter = null;
ArchiveFactory archiveFactory = null;
PrintWriter manifest = null;
try {
final Path tmpVcf = Files.createTempFile("tmp.", ".vcf.gz");
archiveFactory = ArchiveFactory.open(this.outputFile);
manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(manifestFile));
manifest.println("#chrom\tstart\tend\tsplitter\tgene\tkey\tpath\tCount_Variants");
in = super.openBufferedReader(oneFileOrNull(args));
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
final GeneExtractorFactory geneExtractorFactory = new GeneExtractorFactory(cah.header);
final List<GeneExtractorFactory.GeneExtractor> extractors = geneExtractorFactory.parse(this.extractorsNames);
if (extractors.isEmpty()) {
LOG.warn("No extractor defined!");
return -1;
}
final VCFEncoder vcfEncoder = new VCFEncoder(cah.header, false, false);
// read variants
final ProgressFactory.Watcher<VariantContext> progess = ProgressFactory.newInstance().dictionary(cah.header).logger(LOG).build();
String prevCtg = null;
for (; ; ) {
final String line = in.readLine();
final VariantContext ctx = (line == null ? null : cah.codec.decode(line));
if (ctx != null)
progess.apply(ctx);
if (this.ignoreFiltered && ctx.isFiltered())
continue;
if (ctx == null || !ctx.getContig().equals(prevCtg)) {
if (sortingcollection != null) {
sortingcollection.doneAdding();
iter = sortingcollection.iterator();
final EqualRangeIterator<KeyAndLine> eqiter = new EqualRangeIterator<>(iter, (o1, o2) -> o1.keyAndGene.compareTo(o2.keyAndGene));
while (eqiter.hasNext()) {
final List<KeyAndLine> buffer = eqiter.next();
if (buffer.size() < this.min_number_of_ctx)
continue;
if (this.max_number_of_ctx > 0 && buffer.size() > this.max_number_of_ctx)
continue;
final KeyAndLine first = buffer.get(0);
final VariantContextWriter out = VCFUtils.createVariantContextWriterToPath(tmpVcf);
final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
header2.addMetaDataLine(new VCFHeaderLine("GtfFileSplitter.Name", String.valueOf(first.keyAndGene.key)));
header2.addMetaDataLine(new VCFHeaderLine("GtfFileSplitter.Gene", String.valueOf(first.keyAndGene.gene)));
out.writeHeader(header2);
int minPos = Integer.MAX_VALUE;
int maxPos = 0;
for (final KeyAndLine kl : buffer) {
final VariantContext ctx2 = cah.codec.decode(kl.ctx);
minPos = Math.min(ctx2.getStart(), minPos);
maxPos = Math.max(ctx2.getEnd(), maxPos);
out.add(ctx2);
}
out.close();
final String md5 = StringUtils.md5(prevCtg + ":" + first.splitter + ":" + first.keyAndGene.key);
final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + buffer.get(0).keyAndGene.key.replaceAll("[/\\:]", "_") + ".vcf.gz";
final OutputStream os = archiveFactory.openOuputStream(filename);
IOUtils.copyTo(tmpVcf, os);
os.flush();
os.close();
manifest.print(prevCtg);
manifest.print('\t');
manifest.print(minPos - 1);
manifest.print('\t');
manifest.print(maxPos);
manifest.print('\t');
manifest.print(first.splitter);
manifest.print('\t');
manifest.print(first.keyAndGene.gene);
manifest.print('\t');
manifest.print(first.keyAndGene.key);
manifest.print('\t');
manifest.print((archiveFactory.isTarOrZipArchive() ? "" : this.outputFile.toString() + File.separator) + filename);
manifest.print('\t');
manifest.println(buffer.size());
}
eqiter.close();
iter.close();
}
sortingcollection = null;
if (ctx == null)
break;
prevCtg = ctx.getContig();
}
for (final GeneExtractorFactory.GeneExtractor ex : extractors) {
final Map<GeneExtractorFactory.KeyAndGene, Set<String>> gene2values = ex.apply(ctx);
if (gene2values.isEmpty())
continue;
for (final GeneExtractorFactory.KeyAndGene keyAndGene : gene2values.keySet()) {
final Set<String> values = gene2values.get(keyAndGene);
if (values.isEmpty())
continue;
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.rmAttribute(VepPredictionParser.getDefaultTag());
vcb.rmAttribute(AnnPredictionParser.getDefaultTag());
vcb.attribute(ex.getInfoTag(), new ArrayList<>(values));
if (sortingcollection == null) {
sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator2(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingcollection.setDestructiveIteration(true);
}
sortingcollection.add(new KeyAndLine(new KeyGene(keyAndGene.getKey(), keyAndGene.getGene()), ex.getName(), vcfEncoder.encode(vcb.make())));
}
}
}
progess.close();
manifest.flush();
manifest.close();
archiveFactory.close();
Files.deleteIfExists(tmpVcf);
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
if (sortingcollection != null)
sortingcollection.cleanup();
CloserUtil.close(in);
CloserUtil.close(fos);
CloserUtil.close(manifest);
}
}
use of com.github.lindenb.jvarkit.util.vcf.predictions.GeneExtractorFactory in project jvarkit by lindenb.
the class VCFComposite method doVcfToVcf.
protected int doVcfToVcf(final String inputName, final VCFIterator iterin, final VariantContextWriter out) {
final VCFHeader header = iterin.getHeader();
if (!header.hasGenotypingData()) {
LOG.error("No genotypes in " + inputName);
return -1;
}
final GeneExtractorFactory geneExtractorFactory = new GeneExtractorFactory(header);
final List<GeneExtractorFactory.GeneExtractor> extractors = geneExtractorFactory.parse(this.extractorsNames);
if (extractors.isEmpty()) {
LOG.error("no gene extractor found/defined.");
return -1;
}
final Pedigree pedigree;
try {
final Set<String> sampleNames = new HashSet<>(header.getSampleNamesInOrder());
final PedigreeParser pedParser = new PedigreeParser();
pedigree = pedParser.parse(this.pedigreeFile);
if (pedigree == null || pedigree.isEmpty()) {
LOG.error("pedigree missing/empty");
return -1;
}
this.affectedSamples.addAll(pedigree.getAffectedSamples());
this.affectedSamples.removeIf(S -> !sampleNames.contains(S.getId()));
if (this.affectedSamples.isEmpty()) {
LOG.error("No Affected sample in pedigree. " + this.pedigreeFile + "/" + inputName);
return -1;
}
this.unaffectedSamples.addAll(pedigree.getUnaffectedSamples());
this.unaffectedSamples.removeIf(S -> !sampleNames.contains(S.getId()));
if (pedigree.getUnaffectedSamples().isEmpty()) {
LOG.error("No Unaffected sample in " + this.pedigreeFile + "/" + inputName);
return -1;
}
} catch (final IOException err) {
throw new RuntimeIOException(err);
}
header.addMetaDataLine(new VCFInfoHeaderLine(INFO_TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Variant of VCFComposite"));
if (!StringUtils.isBlank(this.filterNonCompositeTag)) {
header.addMetaDataLine(new VCFFilterHeaderLine(this.filterNonCompositeTag, "Not a Variant fir VCFComposite"));
}
final SAMSequenceDictionary dict = header.getSequenceDictionary();
final Comparator<String> contigCmp;
if (dict == null || dict.isEmpty()) {
contigCmp = (A, B) -> A.compareTo(B);
} else {
contigCmp = new ContigDictComparator(dict);
}
final Comparator<VariantContext> ctxComparator = (V1, V2) -> {
int i = contigCmp.compare(V1.getContig(), V2.getContig());
if (i != 0)
return i;
i = Integer.compare(V1.getStart(), V2.getStart());
if (i != 0)
return i;
return V1.getReference().compareTo(V2.getReference());
};
final Comparator<VariantLine> variantLineComparator = (V1, V2) -> {
final int i = ctxComparator.compare(V1.ctx, V2.ctx);
if (i != 0)
return i;
return Long.compare(V1.id, V2.id);
};
long ID_GENERATOR = 0L;
this.vcfDecoder = VCFUtils.createDefaultVCFCodec();
this.vcfDecoder.setVCFHeader(header, VCFHeaderVersion.VCF4_2);
this.vcfEncoder = new VCFEncoder(header, false, true);
SortingCollection<GeneAndVariant> sorting = null;
SortingCollection<VariantLine> outputSorter = null;
try {
LOG.info("reading variants and genes");
/* Gene and variant sorter */
sorting = SortingCollection.newInstance(GeneAndVariant.class, new GeneAndVariantCodec(), GeneAndVariant::compareGeneThenIndex, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorting.setDestructiveIteration(true);
/* Variant sorter */
outputSorter = SortingCollection.newInstance(VariantLine.class, new VariantLineCodec(), variantLineComparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
outputSorter.setDestructiveIteration(true);
/* read input */
while (iterin.hasNext()) {
final VariantContext ctx = iterin.next();
final VariantLine variantLine = new VariantLine(++ID_GENERATOR, ctx);
if (!this.variantJexl.test(ctx)) {
outputSorter.add(variantLine);
continue;
}
if (!acceptVariant(ctx)) {
outputSorter.add(variantLine);
continue;
}
final Set<GeneIdentifier> geneKeys = new HashSet<>();
extractors.stream().map(EX -> EX.apply(ctx)).flatMap(H -> H.keySet().stream()).forEach(KG -> {
geneKeys.add(new GeneIdentifier(KG.getKey(), KG.getGene(), KG.getMethod().replace('/', '_')));
});
if (geneKeys.isEmpty()) {
outputSorter.add(variantLine);
continue;
}
for (final GeneIdentifier gk : geneKeys) {
final GeneAndVariant gav = new GeneAndVariant(gk, variantLine);
gav.gene.contig = ctx.getContig();
sorting.add(gav);
}
}
sorting.doneAdding();
LOG.info("compile per gene");
this.reportGeneWriter = (this.geneReportPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.geneReportPath));
this.reportGeneWriter.print("#CHROM");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("bed.start");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("bed.end");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("gene.key");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("gene.label");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("gene.source");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("count.variants");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("affected.counts");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("affected.total");
this.reportGeneWriter.print('\t');
this.reportGeneWriter.print("affected.samples");
this.reportGeneWriter.println();
this.reportWriter = (this.reportPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.reportPath));
this.reportWriter.print("#CHROM");
this.reportWriter.print('\t');
this.reportWriter.print("bed.start");
this.reportWriter.print('\t');
this.reportWriter.print("bed.end");
this.reportWriter.print('\t');
this.reportWriter.print("gene.index");
this.reportWriter.print('\t');
this.reportWriter.print("gene.key");
this.reportWriter.print('\t');
this.reportWriter.print("gene.label");
this.reportWriter.print('\t');
this.reportWriter.print("gene.source");
for (int side = 0; side < 2; ++side) {
this.reportWriter.print('\t');
final String prefix = "variant" + (side + 1) + ".";
this.reportWriter.print(prefix + "start");
this.reportWriter.print('\t');
this.reportWriter.print(prefix + "end");
this.reportWriter.print('\t');
this.reportWriter.print(prefix + "ref");
this.reportWriter.print('\t');
this.reportWriter.print(prefix + "alt");
this.reportWriter.print('\t');
this.reportWriter.print(prefix + "info");
for (final Sample sn : this.affectedSamples) {
this.reportWriter.print('\t');
this.reportWriter.print(prefix + "gt[" + sn.getId() + "].affected");
}
for (final Sample sn : this.unaffectedSamples) {
this.reportWriter.print('\t');
this.reportWriter.print(prefix + "gt[" + sn.getId() + "].unaffected");
}
}
this.reportWriter.println();
// compile data
CloseableIterator<GeneAndVariant> iter2 = sorting.iterator();
EqualRangeIterator<GeneAndVariant> eqiter = new EqualRangeIterator<>(iter2, (A, B) -> A.gene.compareTo(B.gene));
while (eqiter.hasNext()) {
final List<GeneAndVariant> variants = eqiter.next();
scan(variants.get(0).gene, variants.stream().map(L -> L.variant).collect(Collectors.toList()));
for (final GeneAndVariant ga : variants) outputSorter.add(ga.variant);
}
eqiter.close();
iter2.close();
sorting.cleanup();
//
this.reportWriter.flush();
this.reportWriter.close();
this.reportGeneWriter.flush();
this.reportGeneWriter.close();
LOG.info("write variants");
CloseableIterator<VariantLine> iter1 = outputSorter.iterator();
EqualRangeIterator<VariantLine> eqiter1 = new EqualRangeIterator<>(iter1, variantLineComparator);
out.writeHeader(header);
while (eqiter1.hasNext()) {
final List<VariantLine> array = eqiter1.next();
final VariantContext firstCtx = array.get(0).ctx;
final Set<String> set = getAnnotationsForVariant(firstCtx);
final VariantContext outCtx;
final VariantContextBuilder vcb = new VariantContextBuilder(firstCtx);
for (int y = 1; y < array.size(); ++y) {
set.addAll(getAnnotationsForVariant(array.get(y).ctx));
}
if (set.isEmpty()) {
if (StringUtils.isBlank(this.filterNonCompositeTag)) {
// ignore
continue;
} else {
vcb.filter(this.filterNonCompositeTag);
}
} else {
if (!firstCtx.isFiltered()) {
vcb.passFilters();
}
vcb.attribute(INFO_TAG, new ArrayList<>(set));
}
outCtx = vcb.make();
out.add(outCtx);
}
outputSorter.cleanup();
eqiter1.close();
iter1.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.util.vcf.predictions.GeneExtractorFactory in project jvarkit by lindenb.
the class GroupByGene method read.
private void read(final String input) throws IOException {
SortingCollection<Call> sortingCollection = null;
VCFIterator vcfIterator = null;
try {
final BcfIteratorBuilder iterbuilder = new BcfIteratorBuilder();
vcfIterator = (input == null ? iterbuilder.open(stdin()) : iterbuilder.open(input));
final VCFHeader header = vcfIterator.getHeader();
this.contigDictComparator = new ContigDictComparator(SequenceDictionaryUtils.extractRequired(header));
sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(header), (C1, C2) -> C1.compare2(C2), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingCollection.setDestructiveIteration(true);
final GeneExtractorFactory geneExtractorFactory = new GeneExtractorFactory(header);
final List<GeneExtractorFactory.GeneExtractor> geneExtractors = geneExtractorFactory.parse(this.extractorsNames);
final List<String> sampleNames;
if (header.getSampleNamesInOrder() != null) {
sampleNames = header.getSampleNamesInOrder();
} else {
sampleNames = Collections.emptyList();
}
final Pedigree pedigree;
if (this.pedigreePath != null) {
final PedigreeParser pedParser = new PedigreeParser();
pedigree = pedParser.parse(this.pedigreePath);
} else {
pedigree = PedigreeParser.empty();
}
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(header).logger(LOG).build();
while (vcfIterator.hasNext()) {
final VariantContext ctx = progress.apply(vcfIterator.next());
if (!ctx.isVariant())
continue;
if (ignore_filtered && ctx.isFiltered())
continue;
// simplify line
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.noID();
vcb.log10PError(VariantContext.NO_LOG10_PERROR);
vcb.unfiltered();
vcb.attributes(Collections.emptyMap());
final VariantContext ctx2 = vcb.make();
final SortingCollection<Call> finalSorter = sortingCollection;
geneExtractors.stream().flatMap(EX -> EX.apply(ctx).keySet().stream()).forEach(KG -> {
final Call c = new Call();
c.ctx = ctx2;
c.gene = new GeneName(KG.getKey(), KG.getGene(), KG.getMethod());
finalSorter.add(c);
});
}
CloserUtil.close(vcfIterator);
vcfIterator = null;
sortingCollection.doneAdding();
progress.close();
/**
* dump
*/
final Set<String> casesSamples = pedigree.getAffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> controlsSamples = pedigree.getUnaffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> maleSamples = pedigree.getSamples().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> femaleSamples = pedigree.getSamples().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Predicate<Genotype> genotypeFilter = genotype -> {
if (!genotype.isAvailable())
return false;
if (!genotype.isCalled())
return false;
if (genotype.isNoCall())
return false;
if (genotype.isHomRef())
return false;
if (this.ignore_filtered_genotype && genotype.isFiltered())
return false;
return true;
};
PrintStream pw = openPathOrStdoutAsPrintStream(this.outFile);
pw.print("#chrom");
pw.print('\t');
pw.print("min.POS");
pw.print('\t');
pw.print("max.POS");
pw.print('\t');
pw.print("gene.name");
pw.print('\t');
pw.print("gene.label");
pw.print('\t');
pw.print("gene.type");
pw.print('\t');
pw.print("samples.affected");
pw.print('\t');
pw.print("count.variations");
if (this.print_positions) {
pw.print('\t');
pw.print("positions");
}
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.cases");
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.controls");
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.males");
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.females");
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
pw.print('\t');
pw.print("fisher");
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample);
}
pw.println();
final CloseableIterator<Call> iter = sortingCollection.iterator();
final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
while (eqiter.hasNext()) {
final List<Call> row = eqiter.next();
final Call first = row.get(0);
final List<VariantContext> variantList = row.stream().map(R -> R.ctx).collect(Collectors.toList());
final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
final Set<String> sampleCarryingMut = new HashSet<String>();
final Counter<String> pedCasesCarryingMut = new Counter<String>();
final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
final Counter<String> malesCarryingMut = new Counter<String>();
final Counter<String> femalesCarryingMut = new Counter<String>();
final Counter<String> sample2count = new Counter<String>();
for (final VariantContext ctx : variantList) {
for (final Genotype genotype : ctx.getGenotypes()) {
if (!genotypeFilter.test(genotype))
continue;
final String sampleName = genotype.getSampleName();
sample2count.incr(sampleName);
sampleCarryingMut.add(sampleName);
if (casesSamples.contains(sampleName)) {
pedCasesCarryingMut.incr(sampleName);
}
if (controlsSamples.contains(sampleName)) {
pedCtrlsCarryingMut.incr(sampleName);
}
if (maleSamples.contains(sampleName)) {
malesCarryingMut.incr(sampleName);
}
if (femaleSamples.contains(sampleName)) {
femalesCarryingMut.incr(sampleName);
}
}
}
pw.print(first.getContig());
pw.print('\t');
// convert to bed
pw.print(minPos - 1);
pw.print('\t');
pw.print(maxPos);
pw.print('\t');
pw.print(first.gene.name);
pw.print('\t');
pw.print(first.gene.label);
pw.print('\t');
pw.print(first.gene.type);
pw.print('\t');
pw.print(sampleCarryingMut.size());
pw.print('\t');
pw.print(variantList.size());
if (this.print_positions) {
pw.print('\t');
pw.print(variantList.stream().map(CTX -> String.valueOf(CTX.getStart()) + ":" + CTX.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/"))).collect(Collectors.joining(";")));
}
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCasesCarryingMut.getCountCategories());
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCtrlsCarryingMut.getCountCategories());
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print(malesCarryingMut.getCountCategories());
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print(femalesCarryingMut.getCountCategories());
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
int count_case_mut = 0;
int count_ctrl_mut = 0;
int count_case_wild = 0;
int count_ctrl_wild = 0;
for (final String sampleName : header.getSampleNamesInOrder()) {
final boolean has_mutation = variantList.stream().map(V -> V.getGenotype(sampleName)).anyMatch(G -> G != null && genotypeFilter.test(G));
if (controlsSamples.contains(sampleName)) {
if (has_mutation) {
count_ctrl_mut++;
} else {
count_ctrl_wild++;
}
} else if (casesSamples.contains(sampleName)) {
if (has_mutation) {
count_case_mut++;
} else {
count_case_wild++;
}
}
}
final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
pw.print('\t');
pw.print(fisher.getAsDouble());
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample2count.count(sample));
}
pw.println();
if (pw.checkError())
break;
}
eqiter.close();
iter.close();
pw.flush();
if (this.outFile != null)
pw.close();
} finally {
CloserUtil.close(vcfIterator);
if (sortingCollection != null)
sortingCollection.cleanup();
}
}
Aggregations