Use of com.github.lindenb.jvarkit.util.vcf.VCFBuffer in the project jvarkit by lindenb.
Class ForkVcf, method doWork.
/**
 * Splits the input VCF into {@code number_of_files} output VCF files and
 * prints the path of every output file to the manifest writer.
 *
 * <p>Two strategies, selected by {@code split_by_chunk}:
 * <ul>
 * <li>disabled: all outputs are opened up front and the i-th variant is
 * written to output number {@code i % number_of_files};</li>
 * <li>enabled: all variants are first stored in a {@code VCFBuffer} so they
 * can be counted, then written back as consecutive runs of
 * {@code ceil(count / number_of_files)} variants, one run per output.
 * Outputs that receive no variant are still opened with the input header
 * and closed immediately.</li>
 * </ul>
 *
 * <p>If any exception is raised, the output groups created so far are closed
 * and their files deleted.
 *
 * @param args command-line arguments, forwarded to {@code oneFileOrNull} to
 *             select the input
 * @return {@code RETURN_OK} on success; {@code -1} when the options are
 *         invalid, when an exception occurs, or when writing the manifest
 *         failed
 */
@Override
public int doWork(List<String> args) {
    // ---- option validation -------------------------------------------------
    if (this.outputFile == null || !this.outputFile.getName().contains(REPLACE_GROUPID)) {
        LOG.error("Output file pattern undefined or doesn't contain " + REPLACE_GROUPID + " : " + this.outputFile);
        return -1;
    }
    if (!(this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz"))) {
        LOG.error("output file must end with '.vcf' or '.vcf.gz'");
        return -1;
    }
    if (this.number_of_files <= 0) {
        LOG.error("Bad value for number of files:" + this.number_of_files);
        return -1;
    }

    VcfIterator in = null;
    PrintWriter manifestWriter = null;
    final List<SplitGroup> groups = new ArrayList<>();
    VCFBuffer vcfBuffer = null;
    try {
        in = openVcfIterator(oneFileOrNull(args));
        // Without a manifest file the paths are written to a null output stream.
        manifestWriter = (this.manifestFile == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openFileForPrintWriter(this.manifestFile));
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
        if (!this.split_by_chunk) {
            // Round-robin mode: open every output first...
            while (groups.size() < this.number_of_files) {
                final SplitGroup sg = new SplitGroup(groups.size() + 1);
                sg.open(in.getHeader());
                manifestWriter.println(sg.getFile().getPath());
                groups.add(sg);
            }
            // ...then deal the variants out one at a time.
            int idx = 0;
            while (in.hasNext()) {
                final VariantContext ctx = progress.watch(in.next());
                groups.get(idx % this.number_of_files)._writer.add(ctx);
                ++idx;
            }
            in.close();
        } else {
            // Chunk mode, pass 1: buffer everything so the total count is known.
            long count_variants = 0;
            vcfBuffer = new VCFBuffer(this.maxRecordsInRam, this.tmpDir);
            vcfBuffer.writeHeader(in.getHeader());
            while (in.hasNext()) {
                final VariantContext ctx = progress.watch(in.next());
                vcfBuffer.add(ctx);
                ++count_variants;
            }
            in.close();
            // At least one variant per file so that the roll-over test below
            // behaves when there are fewer variants than files.
            final long variant_per_file = Math.max(1L, (long) Math.ceil(count_variants / (double) this.number_of_files));
            LOG.info("done buffering. n=" + count_variants + " now forking " + variant_per_file + " variants for " + this.number_of_files + " files.");

            // Pass 2: replay the buffer, starting a new output every
            // variant_per_file records.
            VcfIterator iter2 = vcfBuffer.iterator();
            long count_ctx = 0L;
            while (iter2.hasNext()) {
                if (groups.isEmpty() || count_ctx >= variant_per_file) {
                    // Close the output that just filled up before opening the next one.
                    if (!groups.isEmpty()) {
                        groups.get(groups.size() - 1).close();
                    }
                    final SplitGroup last = new SplitGroup(groups.size() + 1);
                    // NOTE(review): the header is requested from 'in' after in.close();
                    // presumably the iterator keeps its header once closed - confirm.
                    last.open(in.getHeader());
                    manifestWriter.println(last.getFile().getPath());
                    groups.add(last);
                    count_ctx = 0;
                }
                final VariantContext ctx = iter2.next();
                groups.get(groups.size() - 1)._writer.add(ctx);
                count_ctx++;
            }
            iter2.close();
            vcfBuffer.close();
            vcfBuffer.dispose();
            // Cleared so that the finally block does not dispose it a second time.
            vcfBuffer = null;

            // save remaining empty VCFs
            while (groups.size() < this.number_of_files) {
                LOG.info("creating empty vcf");
                final SplitGroup sg = new SplitGroup(groups.size() + 1);
                sg.open(in.getHeader());
                manifestWriter.println(sg.getFile().getPath());
                sg.close();
                groups.add(sg);
            }
        }
        progress.finish();
        // NOTE(review): in chunk mode some groups were already closed above and are
        // closed again here; assumes SplitGroup.close() tolerates repeated calls - confirm.
        for (final SplitGroup g : groups) {
            g.close();
        }
        // PrintWriter never throws on I/O failure: checkError() flushes the
        // stream and reports whether any write failed, so that an incomplete
        // manifest is not silently reported as a success.
        if (manifestWriter.checkError()) {
            LOG.error("I/O error while writing manifest file " + this.manifestFile);
            return -1;
        }
        manifestWriter.close();
        manifestWriter = null;
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        // Roll back: close and remove every output created so far.
        for (final SplitGroup g : groups) {
            CloserUtil.close(g);
            if (in != null) {
                g.getFile().delete();
            }
        }
        return -1;
    } finally {
        // Non-null here only when chunk mode was interrupted before its own cleanup.
        if (vcfBuffer != null) {
            vcfBuffer.dispose();
        }
        CloserUtil.close(in);
        IOUtils.flush(manifestWriter);
        CloserUtil.close(manifestWriter);
    }
}
Aggregations