Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
The class ForkVcf, method doWork: splits the input VCF into several output VCF files, either round-robin or, when split_by_chunk is set, into contiguous blocks of roughly equal size after buffering the whole input.
@Override
public int doWork(List<String> args) {
    if (this.outputFile == null || !this.outputFile.getName().contains(REPLACE_GROUPID)) {
        LOG.error("Output file pattern undefined or doesn't contain " + REPLACE_GROUPID + " : " + this.outputFile);
        return -1;
    }
    if (!(this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz"))) {
        LOG.error("output file must end with '.vcf' or '.vcf.gz'");
        return -1;
    }
    if (this.number_of_files <= 0) {
        LOG.error("Bad value for number of files:" + this.number_of_files);
        return -1;
    }
    BufferedReader r = null;
    VcfIterator in = null;
    PrintWriter manifestWriter = null;
    final List<SplitGroup> groups = new ArrayList<>();
    VCFBuffer vcfBuffer = null;
    try {
        in = openVcfIterator(oneFileOrNull(args));
        manifestWriter = (this.manifestFile == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openFileForPrintWriter(this.manifestFile));
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
        if (!this.split_by_chunk) {
            while (groups.size() < this.number_of_files) {
                final SplitGroup sg = new SplitGroup(groups.size() + 1);
                sg.open(in.getHeader());
                manifestWriter.println(sg.getFile().getPath());
                groups.add(sg);
            }
            int idx = 0;
            while (in.hasNext()) {
                final VariantContext ctx = progress.watch(in.next());
                groups.get(idx % this.number_of_files)._writer.add(ctx);
                ++idx;
            }
            in.close();
        } else {
            long count_variants = 0;
            vcfBuffer = new VCFBuffer(this.maxRecordsInRam, this.tmpDir);
            vcfBuffer.writeHeader(in.getHeader());
            while (in.hasNext()) {
                final VariantContext ctx = progress.watch(in.next());
                vcfBuffer.add(ctx);
                ++count_variants;
            }
            in.close();
            final long variant_per_file = Math.max(1L, (long) Math.ceil(count_variants / (double) this.number_of_files));
            LOG.info("done buffering. n=" + count_variants + " now forking " + variant_per_file + " variants for " + this.number_of_files + " files.");
            VcfIterator iter2 = vcfBuffer.iterator();
            long count_ctx = 0L;
            while (iter2.hasNext()) {
                if (groups.isEmpty() || count_ctx >= variant_per_file) {
                    if (!groups.isEmpty())
                        groups.get(groups.size() - 1).close();
                    final SplitGroup last = new SplitGroup(groups.size() + 1);
                    last.open(in.getHeader());
                    manifestWriter.println(last.getFile().getPath());
                    groups.add(last);
                    count_ctx = 0;
                }
                final VariantContext ctx = iter2.next();
                groups.get(groups.size() - 1)._writer.add(ctx);
                count_ctx++;
            }
            iter2.close();
            vcfBuffer.close();
            vcfBuffer.dispose();
            vcfBuffer = null;
            // save remaining empty VCFs
            while (groups.size() < this.number_of_files) {
                LOG.info("creating empty vcf");
                final SplitGroup sg = new SplitGroup(groups.size() + 1);
                sg.open(in.getHeader());
                manifestWriter.println(sg.getFile().getPath());
                sg.close();
                groups.add(sg);
            }
        }
        progress.finish();
        for (final SplitGroup g : groups) {
            g.close();
        }
        manifestWriter.flush();
        manifestWriter.close();
        manifestWriter = null;
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        for (final SplitGroup g : groups) {
            CloserUtil.close(g);
            if (in != null)
                g.getFile().delete();
        }
        return -1;
    } finally {
        if (vcfBuffer != null)
            vcfBuffer.dispose();
        CloserUtil.close(r);
        CloserUtil.close(in);
        IOUtils.flush(manifestWriter);
        CloserUtil.close(manifestWriter);
    }
}
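For context, here is a minimal standalone sketch of the round-robin branch above. It is not jvarkit's ForkVcf: the class name, the nFiles value and the "split.N.vcf" output names are hypothetical, the jvarkit SplitGroup/manifest machinery is replaced by plain htsjdk VariantContextWriterBuilder writers, and only the VcfIterator/VCFUtils calls shown in the listing are reused.

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import com.github.lindenb.jvarkit.util.vcf.VCFUtils;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;

public class RoundRobinSplitSketch {
    public static void main(final String[] args) throws Exception {
        final int nFiles = 3; // hypothetical number of output files
        final VcfIterator in = VCFUtils.createVcfIteratorFromFile(new File(args[0]));
        final List<VariantContextWriter> writers = new ArrayList<>(nFiles);
        for (int i = 0; i < nFiles; i++) {
            // plain, unindexed VCF writers named split.1.vcf, split.2.vcf, ...
            final VariantContextWriter w = new VariantContextWriterBuilder()
                .setOutputFile(new File("split." + (i + 1) + ".vcf"))
                .unsetOption(Options.INDEX_ON_THE_FLY)
                .build();
            w.writeHeader(in.getHeader());
            writers.add(w);
        }
        int idx = 0;
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            writers.get(idx % nFiles).add(ctx); // same dispatch as the non-chunk mode above
            ++idx;
        }
        in.close();
        for (final VariantContextWriter w : writers) {
            w.close();
        }
    }
}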
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
The class VcfTreePack, method doWork: scans one or more VCF files (or stdin), computes the treepack layout and writes the result as SVG.
@Override
public int doWork(List<String> args) {
    setDimension(this.dimensionStr);
    VcfIterator in = null;
    try {
        parseConfigFile();
        if (super.nodeFactoryChain.next == null) {
            LOG.error("no path defined");
            return -1;
        }
        if (args.isEmpty()) {
            LOG.info("Reading stdin");
            in = VCFUtils.createVcfIteratorFromStream(stdin());
            scan(in);
            CloserUtil.close(in);
        } else {
            for (final String f : args) {
                in = VCFUtils.createVcfIterator(f);
                scan(in);
                CloserUtil.close(in);
            }
        }
        this.layout();
        this.svg(this.outputFile);
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
    }
}
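A reduced sketch of the same "stdin or list of files" input pattern, reusing only the VCFUtils factory methods that appear in the listings. The class name and the per-contig counter standing in for VcfTreePack.scan() are hypothetical.

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import com.github.lindenb.jvarkit.util.vcf.VCFUtils;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.variant.variantcontext.VariantContext;

public class ScanManyVcfSketch {
    // hypothetical stand-in for scan(): count variants per contig
    private final Map<String, Long> contig2count = new LinkedHashMap<>();

    private void scan(final VcfIterator in) {
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            this.contig2count.merge(ctx.getContig(), 1L, Long::sum);
        }
    }

    public void run(final List<String> args) throws Exception {
        if (args.isEmpty()) {
            // no file argument: read the VCF from standard input
            final VcfIterator in = VCFUtils.createVcfIteratorStdin();
            scan(in);
            CloserUtil.close(in);
        } else {
            for (final String f : args) {
                final VcfIterator in = VCFUtils.createVcfIterator(f);
                scan(in);
                CloserUtil.close(in);
            }
        }
        this.contig2count.forEach((k, v) -> System.out.println(k + "\t" + v));
    }
}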
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
The class VcfToRdf, method scanVCF: reads a VCF file (or stdin) and emits RDF statements for each variant and, optionally, its alleles, filters, VEP predictions and genotypes.
private void scanVCF(final File filein) throws IOException {
    VcfIterator in = null;
    URI source = null;
    try {
        if (filein != null)
            source = filein.toURI();
        in = (filein == null ? VCFUtils.createVcfIteratorStdin() : VCFUtils.createVcfIteratorFromFile(filein));
        final VCFHeader header = in.getHeader();
        final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
        writeHeader(header, source);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (in.hasNext()) {
            if (this.w.checkError()) {
                LOG.warn("I/O interruption");
                break;
            }
            final VariantContext ctx = progress.watch(in.next());
            /* Variant */
            final URI variant = URI.create("urn:variant/" + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference().getBaseString());
            emit(variant, "rdf:type", "vcf:Variant", "vcf:chrom", URI.create("urn:chrom/" + ctx.getContig()), "vcf:position", ctx.getStart(), "vcf:ref", ctx.getReference().getBaseString(), "vcf:id", (ctx.hasID() ? ctx.getID() : null), "vcf:qual", (ctx.hasLog10PError() ? ctx.getPhredScaledQual() : null));
            if (this.printAlleles) {
                for (final Allele alt : ctx.getAlternateAlleles()) {
                    emit(variant, "vcf:alt", alt.getBaseString());
                }
            }
            if (this.printFilters) {
                for (final String f : ctx.getFilters()) {
                    emit(variant, "vcf:filter", URI.create("urn:filter/" + f));
                }
            }
            if (this.printVep) {
                for (final VepPrediction prediction : vepPredictionParser.getPredictions(ctx)) {
                    /*
                    final List<Object> L=new ArrayList<>();
                    L.add("rdf:type");L.add("vep:Prediction");
                    L.add("vcf:variant"); L.add(variant);
                    L.add("vcf:allele");L.add(prediction.getAllele().getBaseString());
                    for(final SequenceOntologyTree.Term term:prediction.getSOTerms())
                    {
                        L.add("vcf:so");
                        L.add(URI.create(term.getUri()));
                    }
                    if(prediction.getEnsemblTranscript()!=null)
                    {
                        final URI transcriptid=URI.create("http://www.ensembl.org/id/"+prediction.getEnsemblTranscript());
                        L.add("vep:transcript");
                        L.add(transcriptid);
                        if(prediction.getEnsemblGene()!=null)
                        {
                            emit(transcriptid,
                                "uniprot:transcribedFrom",//used in uniprot dump
                                URI.create("http://www.ensembl.org/id/"+prediction.getEnsemblGene())
                            );
                        }
                        if(prediction.getEnsemblProtein()!=null)
                        {
                            emit(
                                transcriptid,
                                "uniprot:translatedTo",//used in uniprot dump
                                URI.create("http://www.ensembl.org/id/"+prediction.getEnsemblProtein())
                            );
                        }
                    }
                    emit(
                        URI.create("urn:vep/"+(++id_generator)),
                        L.toArray()
                    );
                    */
                }
            }
            if (this.printGenotypes) {
                for (final String sample : ctx.getSampleNames()) {
                    final Genotype g = ctx.getGenotype(sample);
                    final List<Object> L = new ArrayList<>();
                    L.add("vcf:sample");
                    L.add(URI.create("urn:sample/" + sample));
                    L.add("vcf:variant");
                    L.add(variant);
                    L.add("rdf:type");
                    L.add("vcf:Genotype");
                    if (g.hasDP()) {
                        L.add("vcf:dp");
                        L.add(g.getDP());
                    }
                    if (g.hasGQ()) {
                        L.add("vcf:gq");
                        L.add(g.getGQ());
                    }
                    if (g.isCalled()) {
                        if (g.isHet()) {
                            if (g.isHetNonRef()) {
                                L.add("rdf:type");
                                L.add("vcf:HetNonRefGenotype");
                            } else {
                                L.add("rdf:type");
                                L.add("vcf:HetGenotype");
                            }
                        } else if (g.isHom()) {
                            if (g.isHomRef()) {
                                L.add("rdf:type");
                                L.add("vcf:HomRefGenotype");
                            } else {
                                L.add("rdf:type");
                                L.add("vcf:HomVarGenotype");
                            }
                        }
                        for (final Allele a : g.getAlleles()) {
                            L.add("vcf:allele");
                            L.add(a.getBaseString());
                        }
                    }
                    emit(URI.create("urn:gt/" + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference().getBaseString() + ":" + sample), L.toArray());
                }
            }
        }
        in.close();
        in = null;
        progress.finish();
    } catch (final Exception e) {
        throw new IOException(e);
    } finally {
        CloserUtil.close(in);
    }
}
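As a rough illustration of the genotype traversal above (not the real VcfToRdf writer), the following sketch prints tab-separated subject/predicate/object tuples per called genotype. The class name and the emit() helper are hypothetical; the urn: prefixes mirror the ones used in scanVCF, and the genotype typing is deliberately simplified.

import java.io.File;
import java.net.URI;

import com.github.lindenb.jvarkit.util.vcf.VCFUtils;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;

public class GenotypeTripleSketch {
    // hypothetical emit(): print one subject/predicate/object tuple per line
    private static void emit(final Object subject, final Object predicate, final Object object) {
        System.out.println(subject + "\t" + predicate + "\t" + object);
    }

    public static void main(final String[] args) throws Exception {
        final VcfIterator in = VCFUtils.createVcfIteratorFromFile(new File(args[0]));
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            final URI variant = URI.create("urn:variant/" + ctx.getContig() + ":" + ctx.getStart() + ":" + ctx.getReference().getBaseString());
            for (final String sample : ctx.getSampleNames()) {
                final Genotype g = ctx.getGenotype(sample);
                if (!g.isCalled()) continue;
                final URI gt = URI.create("urn:gt/" + ctx.getContig() + ":" + ctx.getStart() + ":" + sample);
                emit(gt, "vcf:sample", URI.create("urn:sample/" + sample));
                emit(gt, "vcf:variant", variant);
                if (g.hasDP()) emit(gt, "vcf:dp", g.getDP());
                if (g.hasGQ()) emit(gt, "vcf:gq", g.getGQ());
                // simplified: the real scanVCF distinguishes HetNonRef / HomRef / HomVar
                emit(gt, "rdf:type", g.isHet() ? "vcf:HetGenotype" : "vcf:HomGenotype");
            }
        }
        in.close();
    }
}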
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
The class VcfOptimizePedForSkat, method doWork: buffers the variants accepted by the SKAT executor's upstream filter (they must all lie on a single contig), loads the pedigree, and then repeatedly calls exec() to search for samples whose removal optimizes the SKAT test.
@Override
public int doWork(final List<String> args) {
    VcfIterator r = null;
    try {
        this.random = new Random(this.seed == -1 ? System.currentTimeMillis() : seed);
        if (this.nSamplesRemove < 1) {
            LOG.error("bad number of samples to remove.");
            return -1;
        }
        final SkatFactory.SkatExecutor executor = this.skatInstance.build();
        r = super.openVcfIterator(oneFileOrNull(args));
        final List<VariantContext> variants = new ArrayList<>();
        while (r.hasNext()) {
            final VariantContext ctx = r.next();
            if (!executor.getUpstreamVariantFilter().test(ctx))
                continue;
            variants.add(ctx);
        }
        LOG.info("number of variants : " + variants.size());
        if (variants.stream().map(V -> V.getContig()).collect(Collectors.toSet()).size() != 1) {
            LOG.error("multiple contig/chromosome in input.");
            return -1;
        }
        final VCFHeader header = r.getHeader();
        final Set<String> sampleNames = new HashSet<>(header.getSampleNamesInOrder());
        if (this.pedigreeFile != null) {
            this.pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            this.pedigree = new Pedigree.Parser().parse(header);
        }
        r.close();
        r = null;
        final List<Pedigree.Person> samples = this.pedigree.getPersons().stream().filter(P -> (P.isAffected() || P.isUnaffected())).filter(S -> sampleNames.contains(S.getId())).collect(Collectors.toList());
        if (samples.isEmpty()) {
            LOG.error("Not enough Samples in pedigree/vcf");
            return -1;
        }
        if (!samples.stream().anyMatch(P -> P.isAffected())) {
            LOG.error("No affected Samples in pedigree/vcf");
            return -1;
        }
        if (!samples.stream().anyMatch(P -> P.isUnaffected())) {
            LOG.error("No unaffected Samples in pedigree/vcf");
            return -1;
        }
        long nIter = 0L;
        while (max_iterations == -1L || nIter < max_iterations) {
            exec(nIter, header, variants, samples, executor);
            ++nIter;
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
    }
}
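A small sketch of the buffering step above, with an ordinary java.util.function.Predicate standing in for the jvarkit-specific SKAT upstream filter; the helper class and method names are hypothetical.

import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;

import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
import htsjdk.variant.variantcontext.VariantContext;

public class BufferVariantsSketch {
    /** Drain a VcfIterator into memory, keeping only variants accepted by 'filter'. */
    static List<VariantContext> buffer(final VcfIterator r, final Predicate<VariantContext> filter) {
        final List<VariantContext> variants = new ArrayList<>();
        while (r.hasNext()) {
            final VariantContext ctx = r.next();
            if (!filter.test(ctx)) continue;
            variants.add(ctx);
        }
        // a caller, like doWork above, can then check that exactly one contig is present:
        // variants.stream().map(VariantContext::getContig).distinct().count() == 1L
        return variants;
    }
}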
Use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
The class VCFFilterJS, method doVcfToVcf: evaluates a compiled script against each variant; depending on whether the result is an array/collection of variants, a single VariantContext, a boolean or a number, the variant is replaced, kept, FILTERed or dropped.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, final VariantContextWriter w) {
    try {
        final VCFHeader header = r.getHeader();
        final VcfTools vcfTools = new VcfTools(header);
        final VCFHeader h2 = new VCFHeader(header);
        addMetaData(h2);
        final Pedigree pedigree;
        if (pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            // try to read the pedigree from the VCF header
            Pedigree p = null;
            try {
                p = Pedigree.newParser().parse(header);
            } catch (final Exception err) {
                LOG.warn("cannot decode pedigree from vcf header");
                p = Pedigree.createEmptyPedigree();
            }
            pedigree = p;
        }
        final VCFFilterHeaderLine filterHeaderLine = (filteredTag.trim().isEmpty() ? null : new VCFFilterHeaderLine(this.filteredTag.trim(), "Filtered with " + getProgramName()));
        if (filterHeaderLine != null)
            h2.addMetaDataLine(filterHeaderLine);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        final Bindings bindings = this.compiledScript.getEngine().createBindings();
        bindings.put("header", header);
        bindings.put("tools", vcfTools);
        bindings.put("pedigree", pedigree);
        bindings.put("individuals", Collections.unmodifiableList(pedigree.getPersons().stream().filter(P -> (P.isAffected() || P.isUnaffected())).filter(P -> P.hasUniqId()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toList())));
        for (final String jsonkv : this.jsonFiles) {
            int eq = jsonkv.indexOf("=");
            if (eq <= 0)
                throw new JvarkitException.UserError("Bad format for json . expected key=/path/to/file.json but got '" + jsonkv + "'");
            final String key = jsonkv.substring(0, eq);
            final FileReader jsonFile = new FileReader(jsonkv.substring(eq + 1));
            JsonParser jsonParser = new JsonParser();
            final JsonElement root = jsonParser.parse(jsonFile);
            jsonFile.close();
            bindings.put(key, root);
        }
        w.writeHeader(h2);
        while (r.hasNext() && !w.checkError()) {
            final VariantContext variation = progress.watch(r.next());
            bindings.put("variant", variation);
            final Object result = compiledScript.eval(bindings);
            // result is an array or a collection of variants
            if (result != null && (result.getClass().isArray() || (result instanceof Collection))) {
                final Collection<?> col;
                if (result.getClass().isArray()) {
                    final Object[] array = (Object[]) result;
                    col = Arrays.asList(array);
                } else {
                    col = (Collection<?>) result;
                }
                // write all the variants
                for (final Object item : col) {
                    if (item == null)
                        throw new JvarkitException.UserError("item in array is null");
                    if (!(item instanceof VariantContext))
                        throw new JvarkitException.UserError("item in array is not a VariantContext " + item.getClass());
                    w.add(VariantContext.class.cast(item));
                }
            } else if (result != null && (result instanceof VariantContext)) {
                // result is a VariantContext
                w.add(VariantContext.class.cast(result));
            } else {
                boolean accept = true;
                if (result == null) {
                    accept = false;
                } else if (result instanceof Boolean) {
                    if (Boolean.FALSE.equals(result))
                        accept = false;
                } else if (result instanceof Number) {
                    if (((Number) result).intValue() != 1)
                        accept = false;
                } else {
                    LOG.warn("Script returned something that is not a boolean or a number:" + result.getClass());
                    accept = false;
                }
                if (!accept) {
                    if (filterHeaderLine != null) {
                        final VariantContextBuilder vcb = new VariantContextBuilder(variation);
                        vcb.filter(filterHeaderLine.getID());
                        w.add(vcb.make());
                    }
                    continue;
                }
                // set PASS filter if needed
                if (filterHeaderLine != null && !variation.isFiltered()) {
                    w.add(new VariantContextBuilder(variation).passFilters().make());
                    continue;
                }
                w.add(variation);
            }
        }
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
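To isolate the scalar accept/reject rules at the end of the loop, here is a hypothetical helper with the same semantics: null or FALSE rejects, a number other than 1 rejects, TRUE or 1 accepts (the array, Collection and VariantContext cases are handled earlier in the real method).

public final class ScriptResultPolicy {
    private ScriptResultPolicy() {}

    /** Hypothetical extraction of the scalar decision used in doVcfToVcf above. */
    public static boolean accept(final Object result) {
        if (result == null) return false; // null rejects
        if (result instanceof Boolean) return Boolean.TRUE.equals(result); // only TRUE accepts
        if (result instanceof Number) return ((Number) result).intValue() == 1; // only 1 accepts
        return false; // anything else: the original logs a warning and rejects
    }
}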