Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in project jvarkit by lindenb.
The class VcfGeneSplitter, method doVcfToVcf:
@Override
protected int doVcfToVcf(String inputName, File outputFile) {
SortingCollection<KeyAndLine> sortingcollection = null;
BufferedReader in = null;
FileOutputStream fos = null;
ZipOutputStream zout = null;
CloseableIterator<KeyAndLine> iter = null;
PrintWriter pw = null;
try {
in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
/**
* create the VEP prediction parser from the VCF header
*/
final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory().header(cah.header).get();
sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingcollection.setDestructiveIteration(true);
// read variants
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
String line;
while ((line = in.readLine()) != null) {
final VariantContext ctx = progess.watch(cah.codec.decode(line));
// no check for ctx.isFiltered() here; filtering is handled later.
for (final String key : this.getVariantKeys(vepPredictionParser, ctx)) {
sortingcollection.add(new KeyAndLine(key, line));
}
}
progess.finish();
sortingcollection.doneAdding();
LOG.info("creating zip " + outputFile);
fos = new FileOutputStream(outputFile);
zout = new ZipOutputStream(fos);
final File tmpReportFile = File.createTempFile("_tmp.", ".txt", writingSortingCollection.getTmpDirectories().get(0));
tmpReportFile.deleteOnExit();
pw = IOUtils.openFileForPrintWriter(tmpReportFile);
pw.println("#chrom\tstart\tend\tkey\tCount_Variants");
iter = sortingcollection.iterator();
final EqualRangeIterator<KeyAndLine> eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {
@Override
public int compare(final KeyAndLine o1, final KeyAndLine o2) {
return o1.key.compareTo(o2.key);
}
});
while (eqiter.hasNext()) {
final List<KeyAndLine> buffer = eqiter.next();
final KeyAndLine first = buffer.get(0);
LOG.info(first.key);
final List<VariantContext> variants = new ArrayList<>(buffer.size());
String contig = null;
int chromStart = Integer.MAX_VALUE;
int chromEnd = 0;
for (final KeyAndLine kal : buffer) {
final VariantContext ctx = cah.codec.decode(kal.ctx);
variants.add(ctx);
contig = ctx.getContig();
chromStart = Math.min(chromStart, ctx.getStart());
chromEnd = Math.max(chromEnd, ctx.getEnd());
}
pw.println(contig + "\t" + (chromStart - 1) + // -1 for bed compatibility
"\t" + chromEnd + "\t" + first.key + "\t" + variants.size());
// save vcf file
final ZipEntry ze = new ZipEntry(this.baseZipDir + "/" + first.key + ".vcf");
zout.putNextEntry(ze);
final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
header2.addMetaDataLine(new VCFHeaderLine("VcfGeneSplitter.Name", String.valueOf(first.key)));
out.writeHeader(header2);
for (final VariantContext ctx : variants) {
out.add(ctx);
}
// closing the writer here is safe because the zip stream was wrapped with IOUtils.uncloseableOutputStream above
out.close();
zout.closeEntry();
}
eqiter.close();
iter.close();
iter = null;
progess.finish();
LOG.info("saving report");
pw.flush();
pw.close();
final ZipEntry entry = new ZipEntry(this.baseZipDir + "/manifest.bed");
zout.putNextEntry(entry);
IOUtils.copyTo(tmpReportFile, zout);
zout.closeEntry();
zout.finish();
zout.close();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
if (sortingcollection != null)
sortingcollection.cleanup();
CloserUtil.close(in);
CloserUtil.close(fos);
CloserUtil.close(pw);
}
}
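The pattern above is: every VCF line is written to a SortingCollection keyed by gene, then EqualRangeIterator returns one List<KeyAndLine> per distinct key so each group can be flushed to its own zip entry. As a rough mental model only (this is a sketch, not jvarkit's implementation, and the class name EqualRunGrouper is purely illustrative), the grouping behaves like the following stand-alone iterator over an already-sorted stream:

// Illustrative sketch of "equal range" grouping: given an iterator whose elements
// are already sorted by a comparator, each call to next() returns the run of
// consecutive elements that compare as equal. Not jvarkit code.
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

class EqualRunGrouper<T> implements Iterator<List<T>> {
    private final Iterator<T> delegate;
    private final Comparator<T> comparator;
    private T pending = null; // first element of the next run, read ahead from the delegate

    EqualRunGrouper(final Iterator<T> delegate, final Comparator<T> comparator) {
        this.delegate = delegate;
        this.comparator = comparator;
        if (delegate.hasNext()) this.pending = delegate.next();
    }

    @Override
    public boolean hasNext() {
        return this.pending != null;
    }

    @Override
    public List<T> next() {
        if (this.pending == null) throw new NoSuchElementException();
        final List<T> run = new ArrayList<>();
        run.add(this.pending);
        this.pending = null;
        while (this.delegate.hasNext()) {
            final T item = this.delegate.next();
            if (this.comparator.compare(run.get(0), item) == 0) {
                run.add(item); // same key: extend the current run
            } else {
                this.pending = item; // different key: it starts the next run
                break;
            }
        }
        return run;
    }
}

This style of grouping only works because the SortingCollection guarantees that records with equal keys are adjacent; the grouping iterator itself never re-sorts anything.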
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in project jvarkit by lindenb.
The class VCFCompareGT, method doWork:
@Override
public int doWork(final List<String> arguments) {
final List<File> inputVcfFiles = new ArrayList<>(IOUtil.unrollFiles(arguments.stream().map(F -> new File(F)).collect(Collectors.toCollection(HashSet::new)), ".vcf", "vcf.gz"));
if (inputVcfFiles.isEmpty()) {
LOG.error("VCF missing.");
return -1;
}
VariantComparator varcmp = new VariantComparator();
SortingCollection<Variant> variants = null;
final Set<String> sampleNames = new LinkedHashSet<>();
try {
variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), varcmp, writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPaths());
variants.setDestructiveIteration(true);
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
for (int i = 0; i < inputVcfFiles.size(); ++i) {
final File vcfFile = inputVcfFiles.get(i);
LOG.info("Opening " + vcfFile);
final VCFFileReader vcfFileReader = new VCFFileReader(vcfFile, false);
final CloseableIterator<VariantContext> iter = vcfFileReader.iterator();
final VCFHeader header = vcfFileReader.getFileHeader();
sampleNames.addAll(header.getSampleNamesInOrder());
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "_" + ((i) + 1), "File: " + vcfFile.getPath()));
long nLines = 0;
while (iter.hasNext()) {
final VariantContext var = iter.next();
if (nLines++ % 10000 == 0) {
LOG.info(vcfFile + " " + nLines);
}
if (!var.isVariant())
continue;
if (!var.hasGenotypes())
continue;
for (final Genotype genotype : var.getGenotypes()) {
final Variant rec = new Variant();
if (!genotype.isAvailable())
continue;
if (!genotype.isCalled())
continue;
if (genotype.isNoCall())
continue;
rec.file_index = i + 1;
rec.sampleName = genotype.getSampleName();
rec.chrom = var.getContig();
rec.start = var.getStart();
rec.end = var.getEnd();
rec.ref = var.getReference().getDisplayString();
if (var.hasID()) {
rec.id = var.getID();
}
if (genotype.hasDP()) {
rec.dp = genotype.getDP();
}
if (genotype.hasGQ()) {
rec.gq = genotype.getGQ();
}
final List<Allele> alleles = genotype.getAlleles();
if (alleles == null)
continue;
if (alleles.size() == 1) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = rec.a1;
} else if (alleles.size() == 2) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = alleles.get(1).getDisplayString().toUpperCase();
if (rec.a1.compareTo(rec.a2) > 0) {
String tmp = rec.a2;
rec.a2 = rec.a1;
rec.a1 = tmp;
}
} else {
continue;
}
variants.add(rec);
}
}
iter.close();
vcfFileReader.close();
}
variants.doneAdding();
LOG.info("Done Adding");
final Set<String> newSampleNames = new HashSet<>();
for (int i = 0; i < inputVcfFiles.size(); ++i) {
for (final String sample : sampleNames) {
newSampleNames.add(sample + "_" + ((i) + 1));
}
}
final String GenpotypeChangedKey = "GCH";
final String GenpotypeCreated = "GNW";
final String GenpotypeDiff = "GDF";
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Qual"));
metaData.add(new VCFFormatHeaderLine(GenpotypeChangedKey, 1, VCFHeaderLineType.Integer, "Changed Genotype"));
metaData.add(new VCFFormatHeaderLine(GenpotypeCreated, 1, VCFHeaderLineType.Integer, "Genotype Created/Deleted"));
metaData.add(new VCFInfoHeaderLine(GenpotypeDiff, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples with Genotype Difference"));
final VCFHeader header = new VCFHeader(metaData, new ArrayList<String>(newSampleNames));
final VariantContextWriter w = super.openVariantContextWriter(outputFile);
w.writeHeader(header);
final PosComparator posCompare = new PosComparator();
final EqualRangeIterator<Variant> iter = new EqualRangeIterator<>(variants.iterator(), posCompare);
while (iter.hasNext()) {
final List<Variant> row = iter.next();
/**
* samples whose genotype differs between input files
*/
final Set<String> samplesModified = new TreeSet<>();
/**
* samples that are not seen in every input VCF (genotype created or deleted)
*/
final Set<String> samplesCreates = new TreeSet<>();
final Counter<String> samplesSeen = new Counter<>();
for (int x = 0; x < row.size(); ++x) {
final Variant var1 = row.get(x);
samplesSeen.incr(var1.sampleName);
for (int y = x + 1; y < row.size(); ++y) {
final Variant var2 = row.get(y);
if (!var2.sampleName.equals(var1.sampleName))
continue;
if (var1.a1.equals(var2.a1) && var1.a2.equals(var2.a2))
continue;
samplesModified.add(var1.sampleName);
}
}
for (final String sampleName : samplesSeen.keySet()) {
if (samplesSeen.count(sampleName) != inputVcfFiles.size()) {
samplesCreates.add(sampleName);
}
}
final Variant first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
alleles.add(Allele.create(first.ref, true));
for (final Variant var : row) {
alleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
alleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
}
final VariantContextBuilder b = new VariantContextBuilder(getClass().getName(), first.chrom, first.start, first.end, alleles);
// build genotypes
final List<Genotype> genotypes = new ArrayList<Genotype>();
for (final Variant var : row) {
// alleles for this genotype
final List<Allele> galleles = new ArrayList<Allele>();
galleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
galleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
final GenotypeBuilder gb = new GenotypeBuilder();
gb.DP(var.dp);
gb.alleles(galleles);
gb.name(var.sampleName + "_" + var.file_index);
gb.GQ(var.gq);
gb.attribute(GenpotypeChangedKey, samplesModified.contains(var.sampleName) ? 1 : 0);
gb.attribute(GenpotypeCreated, samplesCreates.contains(var.sampleName) ? 1 : 0);
genotypes.add(gb.make());
}
b.genotypes(genotypes);
b.id(first.id);
if (!(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
Set<String> set2 = new TreeSet<String>(samplesModified);
set2.addAll(samplesCreates);
b.attribute(GenpotypeDiff, set2.toArray());
}
if (!only_print_modified || !(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
w.add(b.make());
}
}
iter.close();
w.close();
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (variants != null)
try {
variants.cleanup();
} catch (Exception err) {
}
}
return 0;
}
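A detail that is easy to miss in doWork above: when a genotype has two alleles, a1 and a2 are stored in lexicographic order, so the later comparison of (a1, a2) pairs between files is insensitive to allele order. A small stand-alone illustration of that normalization (the class and method names are made up for this sketch and are not part of jvarkit):

// Sketch of the allele-pair normalization performed when filling Variant.a1/a2 above:
// the lexicographically smaller allele string is stored first, so "A/T" and "T/A"
// later compare as the same unphased genotype.
final class AllelePairNormalizer {
    private AllelePairNormalizer() {}

    static String[] normalize(final String alleleA, final String alleleB) {
        final String a = alleleA.toUpperCase();
        final String b = alleleB.toUpperCase();
        return a.compareTo(b) <= 0 ? new String[] { a, b } : new String[] { b, a };
    }

    public static void main(final String[] args) {
        // both calls print [A, T]
        System.out.println(java.util.Arrays.toString(normalize("T", "a")));
        System.out.println(java.util.Arrays.toString(normalize("A", "T")));
    }
}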
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in project jvarkit by lindenb.
The class VcfIn, method scanFileSorted:
private int scanFileSorted(final VariantContextWriter vcw, final String databaseVcfUri, final VcfIterator userVcfIn) {
EqualRangeVcfIterator equalRangeDbIter = null;
EqualRangeIterator<VariantContext> equalRangeUserVcf = null;
try {
final VCFHeader header = new VCFHeader(userVcfIn.getHeader());
final SAMSequenceDictionary userVcfDict = header.getSequenceDictionary();
// the user VCF must provide a sequence dictionary (checked just below)
if (userVcfDict == null) {
LOG.error(JvarkitException.VcfDictionaryMissing.getMessage("user file"));
return -1;
}
final Comparator<VariantContext> userVcfComparator = VCFUtils.createTidPosComparator(userVcfDict);
equalRangeDbIter = new EqualRangeVcfIterator(VCFUtils.createVcfIterator(databaseVcfUri), userVcfComparator);
this.addMetaData(header);
vcw.writeHeader(header);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(userVcfDict).logger(LOG);
equalRangeUserVcf = new EqualRangeIterator<>(userVcfIn, userVcfComparator);
while (equalRangeUserVcf.hasNext()) {
final List<VariantContext> ctxList = equalRangeUserVcf.next();
progress.watch(ctxList.get(0));
// collect the database variants matching this position
final List<VariantContext> dbContexes = new ArrayList<VariantContext>(equalRangeDbIter.next(ctxList.get(0)));
for (final VariantContext userCtx : ctxList) {
boolean keep = dbContexes.stream().filter(V -> sameContext(userCtx, V)).anyMatch(V -> allUserAltFoundInDatabase(userCtx, V));
addVariant(vcw, userCtx, keep);
}
if (vcw.checkError())
break;
}
equalRangeUserVcf.close();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(equalRangeDbIter);
CloserUtil.close(userVcfIn);
CloserUtil.close(vcw);
}
}
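The keep decision relies on two helpers, sameContext and allUserAltFoundInDatabase, which are defined elsewhere in VcfIn and are not shown in this snippet. A hedged guess at what such predicates typically check, written against the standard htsjdk VariantContext API (a sketch only, not the actual jvarkit code):

// Plausible reading of the two predicates used above; the real implementations
// live elsewhere in VcfIn and may differ.
import htsjdk.variant.variantcontext.VariantContext;

final class VcfInPredicatesSketch {
    private VcfInPredicatesSketch() {}

    /** same chromosome, same position, same REF allele */
    static boolean sameContext(final VariantContext user, final VariantContext db) {
        return user.getContig().equals(db.getContig())
            && user.getStart() == db.getStart()
            && user.getReference().equals(db.getReference());
    }

    /** every ALT allele of the user variant is present among the database ALT alleles */
    static boolean allUserAltFoundInDatabase(final VariantContext user, final VariantContext db) {
        return db.getAlternateAlleles().containsAll(user.getAlternateAlleles());
    }
}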
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in project jvarkit by lindenb.
The class VCFComposite, method doWork:
@Override
public int doWork(final List<String> args) {
PrintWriter out = null;
try {
out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
if (listModels) {
for (final Type t : Type.values()) {
out.println(t.name());
out.println("\t" + t.getDescription());
}
out.flush();
return 0;
}
this.pedigree = Pedigree.newParser().parse(pedigreeFile);
if (this.pedigree.getAffected().isEmpty()) {
LOG.error("No Affected sample in " + this.pedigreeFile);
return -1;
}
if (this.pedigree.getUnaffected().isEmpty()) {
LOG.error("No Unaffected sample in " + this.pedigreeFile);
return -1;
}
final DiseaseModel model = this.createModel();
final String inputName = super.oneFileOrNull(args);
final LineIterator r = (inputName == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(inputName));
final VCFCodec codec = new VCFCodec();
final VCFHeader header = (VCFHeader) codec.readActualHeader(r);
final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
// final VCFHeader h2=new VCFHeader(header.getMetaDataInInputOrder(),header.getSampleNamesInOrder());
// h2.addMetaDataLine(new VCFInfoHeaderLine(this.TAG,1,VCFHeaderLineType.String,"Values from bigwig file: "+BIGWIG));
SortingCollection<GeneAndVariant> sorting = null;
String prevContig = null;
for (; ; ) {
String line;
final VariantContext ctx;
if (r.hasNext()) {
line = r.next();
ctx = codec.decode(line);
} else {
line = null;
ctx = null;
}
if (ctx == null || !ctx.getContig().equals(prevContig)) {
if (sorting != null) {
LOG.debug("Dump contig " + prevContig);
sorting.doneAdding();
CloseableIterator<GeneAndVariant> iter2 = sorting.iterator();
EqualRangeIterator<GeneAndVariant> eqiter = new EqualRangeIterator<>(iter2, (A, B) -> A.gene.compareTo(B.gene));
while (eqiter.hasNext()) {
final List<GeneAndVariant> variants = eqiter.next();
model.scan(variants.get(0).gene, variants.stream().map(L -> codec.decode(L.ctxLine)).collect(Collectors.toList()), out);
}
eqiter.close();
iter2.close();
sorting.cleanup();
}
sorting = null;
if (ctx == null)
break;
prevContig = ctx.getContig();
}
if (!ctx.isVariant())
continue;
if (!acceptFiltered && ctx.isFiltered())
continue;
if (!acceptID && ctx.hasID())
continue;
if (!model.accept(ctx))
continue;
final Set<String> geneKeys = new HashSet<>();
for (final AnnPredictionParser.AnnPrediction pred : annParser.getPredictions(ctx)) {
geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
}
for (final VepPredictionParser.VepPrediction pred : vepParser.getPredictions(ctx)) {
geneKeys.addAll(pred.getGeneKeys().stream().map(S -> ctx.getContig() + "_" + S).collect(Collectors.toSet()));
}
if (sorting == null) {
sorting = SortingCollection.newInstance(GeneAndVariant.class, new GeneAndVariantCodec(), (A, B) -> {
int i = A.gene.compareTo(B.gene);
if (i != 0)
return i;
return A.ctxLine.compareTo(B.ctxLine);
}, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorting.setDestructiveIteration(true);
}
for (final String gk : geneKeys) {
final GeneAndVariant gav = new GeneAndVariant();
gav.gene = gk;
gav.ctxLine = line;
sorting.add(gav);
}
}
out.flush();
out.close();
out = null;
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
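Structurally, doWork above buffers variants per contig: a SortingCollection is created lazily for the current contig, and when the contig changes (or the input ends) the whole batch is grouped by gene and handed to the disease model before being discarded. Stripped of the VCF machinery, the control flow looks roughly like this (illustrative sketch with made-up names, not jvarkit code):

// Sketch of the per-contig flush pattern: records accumulate while the contig is
// unchanged; the batch is processed and dropped as soon as the contig changes or
// the input ends.
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.BiConsumer;

final class PerContigBatcher {
    private PerContigBatcher() {}

    /** feed (contig, line) pairs in file order; 'flush' is invoked once per contig */
    static void run(final Iterator<String[]> contigAndLine,
                    final BiConsumer<String, List<String>> flush) {
        String prevContig = null;
        List<String> batch = new ArrayList<>();
        while (contigAndLine.hasNext()) {
            final String[] rec = contigAndLine.next();
            if (prevContig != null && !prevContig.equals(rec[0])) {
                flush.accept(prevContig, batch); // contig changed: process the batch
                batch = new ArrayList<>();
            }
            prevContig = rec[0];
            batch.add(rec[1]);
        }
        if (prevContig != null) flush.accept(prevContig, batch); // final batch
    }
}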
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in project jvarkit by lindenb.
The class VcfLoopOverGenes, method doWork:
@SuppressWarnings("resource")
@Override
public int doWork(final List<String> args) {
PrintWriter pw = null;
VCFFileReader vcfFileReader = null;
CloseableIterator<VariantContext> iter = null;
CloseableIterator<GeneLoc> iter2 = null;
BufferedReader br = null;
ArchiveFactory archive = null;
try {
final File vcf = new File(oneAndOnlyOneFile(args));
vcfFileReader = new VCFFileReader(vcf, (this.geneFile != null || !StringUtil.isBlank(this.regionStr)));
this.dictionary = vcfFileReader.getFileHeader().getSequenceDictionary();
if (this.dictionary == null) {
throw new JvarkitException.VcfDictionaryMissing(vcf);
}
final VcfTools tools = new VcfTools(vcfFileReader.getFileHeader());
if (!this.prefix.isEmpty() && !this.prefix.endsWith(".")) {
this.prefix += ".";
}
if (this.geneFile == null) {
final SortingCollection<GeneLoc> sortingCollection = SortingCollection.newInstance(GeneLoc.class, new GeneLocCodec(), (A, B) -> A.compareTo(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingCollection.setDestructiveIteration(true);
if (StringUtil.isBlank(this.regionStr)) {
iter = vcfFileReader.iterator();
} else {
final IntervalParser parser = new IntervalParser(this.dictionary);
parser.setContigNameIsWholeContig(true);
final Interval interval = parser.parse(this.regionStr);
if (interval == null) {
LOG.error("Cannot parse interval " + this.regionStr);
return -1;
}
iter = vcfFileReader.query(interval.getContig(), interval.getStart(), interval.getEnd());
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfFileReader.getFileHeader()).logger(LOG);
if (this.splitMethod.equals(SplitMethod.Annotations)) {
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
for (final AnnPredictionParser.AnnPrediction pred : tools.getAnnPredictionParser().getPredictions(ctx)) {
if (this.snpEffNoIntergenic && pred.isIntergenicRegion()) {
continue;
}
if (!StringUtil.isBlank(pred.getGeneName())) {
sortingCollection.add(create(ctx, pred.getGeneName(), SourceType.ANN_GeneName));
}
if (!StringUtil.isBlank(pred.getGeneId())) {
sortingCollection.add(create(ctx, pred.getGeneId(), SourceType.ANN_GeneID));
}
if (!StringUtil.isBlank(pred.getFeatureId())) {
sortingCollection.add(create(ctx, pred.getFeatureId(), SourceType.ANN_FeatureID));
}
}
for (final VepPredictionParser.VepPrediction pred : tools.getVepPredictionParser().getPredictions(ctx)) {
if (!StringUtil.isBlank(pred.getGene())) {
sortingCollection.add(create(ctx, pred.getGene(), SourceType.VEP_Gene));
}
if (!StringUtil.isBlank(pred.getFeature())) {
sortingCollection.add(create(ctx, pred.getFeature(), SourceType.VEP_Feature));
}
if (!StringUtil.isBlank(pred.getSymbol())) {
sortingCollection.add(create(ctx, pred.getSymbol(), SourceType.VEP_Symbol));
}
if (!StringUtil.isBlank(pred.getHgncId())) {
sortingCollection.add(create(ctx, pred.getHgncId(), SourceType.VEP_HgncId));
}
}
}
} else if (this.splitMethod.equals(SplitMethod.VariantSlidingWindow)) {
// split the VCF per sliding window of variants
if (this.variantsWinCount < 1) {
LOG.error("Bad value for variantsWinCount");
return -1;
}
if (this.variantsWinShift < 1 || this.variantsWinShift > this.variantsWinCount) {
LOG.error("Bad value for variantsWinShift");
return -1;
}
final List<VariantContext> buffer = new ArrayList<>(this.variantsWinCount);
/**
* routine to dump buffer into sorting collection
*/
final Runnable dumpBuffer = () -> {
if (buffer.isEmpty())
return;
final String contig = buffer.get(0).getContig();
final int chromStart = buffer.stream().mapToInt(CTX -> CTX.getStart()).min().getAsInt();
// the window end is the largest START (not END) among the buffered variants
final int chromEnd0 = buffer.stream().mapToInt(CTX -> CTX.getStart()).max().getAsInt();
// final int chromEnd1 = buffer.stream().mapToInt(CTX->CTX.getEnd()).max().getAsInt();
final String identifier = contig + "_" + String.format(NUM_FORMAT, chromStart) + "_" + String.format(NUM_FORMAT, chromEnd0);
for (final VariantContext ctx : buffer) {
sortingCollection.add(create(ctx, identifier, SourceType.SlidingVariants));
}
};
while (iter.hasNext()) {
VariantContext ctx = progress.watch(iter.next());
/* reduce the memory footprint for this context */
ctx = new VariantContextBuilder(ctx).genotypes(Collections.emptyList()).unfiltered().rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make();
if (!buffer.isEmpty() && !buffer.get(0).getContig().equals(ctx.getContig())) {
dumpBuffer.run();
buffer.clear();
}
buffer.add(ctx);
if (buffer.size() >= this.variantsWinCount) {
dumpBuffer.run();
final int fromIndex = Math.min(this.variantsWinShift, buffer.size());
buffer.subList(0, fromIndex).clear();
}
}
dumpBuffer.run();
buffer.clear();
} else if (this.splitMethod.equals(SplitMethod.ContigSlidingWindow)) {
if (this.contigWinLength < 1) {
LOG.error("Bad value for contigWinCount");
return -1;
}
if (this.contigWinShift < 1 || this.contigWinShift > this.contigWinLength) {
LOG.error("Bad value for contigWinShift");
return -1;
}
while (iter.hasNext()) {
VariantContext ctx = progress.watch(iter.next());
/* reduce the memory footprint for this context */
ctx = new VariantContextBuilder(ctx).genotypes(Collections.emptyList()).unfiltered().rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make();
int start = 0;
while (start <= ctx.getStart()) {
if (start + this.contigWinLength >= ctx.getStart()) {
final int chromStart = start;
final int chromEnd0 = start + this.contigWinLength;
final String identifier = ctx.getContig() + "_" + String.format(NUM_FORMAT, chromStart) + "_" + String.format(NUM_FORMAT, chromEnd0);
sortingCollection.add(create(ctx, identifier, SourceType.SlidingContig));
}
start += this.contigWinShift;
}
}
} else {
throw new IllegalStateException("No such method: " + this.splitMethod);
}
sortingCollection.doneAdding();
progress.finish();
iter.close();
iter = null;
pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
iter2 = sortingCollection.iterator();
final EqualRangeIterator<GeneLoc> eqiter = new EqualRangeIterator<>(iter2, this.compareGeneName);
int geneIdentifierId = 0;
while (eqiter.hasNext()) {
final List<GeneLoc> gene = eqiter.next();
pw.print(gene.get(0).contig);
pw.print('\t');
// -1 for BED
pw.print(gene.stream().mapToInt(G -> G.start).min().getAsInt() - 1);
pw.print('\t');
pw.print(gene.stream().mapToInt(G -> G.end).max().getAsInt());
pw.print('\t');
pw.print(this.prefix + String.format("%09d", ++geneIdentifierId));
pw.print('\t');
pw.print(gene.get(0).geneName);
pw.print('\t');
pw.print(gene.get(0).sourceType);
pw.print('\t');
pw.print(gene.size());
pw.println();
}
pw.flush();
pw.close();
pw = null;
eqiter.close();
iter2.close();
iter2 = null;
sortingCollection.cleanup();
} else {
if (this.nJobs < 1) {
this.nJobs = Math.max(1, Runtime.getRuntime().availableProcessors());
LOG.info("setting njobs to " + this.nJobs);
}
final ExecutorService executorService;
final List<Future<Integer>> futureResults;
if (this.nJobs > 1) {
executorService = new ThreadPoolExecutor(this.nJobs, this.nJobs, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>());
futureResults = new ArrayList<>();
} else {
executorService = null;
futureResults = Collections.emptyList();
}
if (this.outputFile == null) {
LOG.error("When scanning a VCF with " + this.geneFile + ", the output file must be defined.");
return -1;
}
if (!this.exec.isEmpty()) {
if (this.outputFile.getName().endsWith(".zip")) {
LOG.error("Cannot execute " + this.exec + " when saving to a zip.");
return -1;
}
}
archive = ArchiveFactory.open(this.outputFile);
// when every per-gene VCF will be deleted after the command, no manifest is needed
PrintWriter manifest = this.deleteAfterCommand && !this.exec.isEmpty() ? new PrintWriter(new NullOuputStream()) : archive.openWriter(this.prefix + "manifest.txt");
br = IOUtils.openFileForBufferedReading(this.geneFile);
final BedLineCodec bedCodec = new BedLineCodec();
for (; ; ) {
if (!futureResults.isEmpty()) {
int i = 0;
while (i < futureResults.size()) {
final Future<Integer> r = futureResults.get(i);
if (r.isCancelled()) {
LOG.error("Task was canceled. Break.");
return -1;
} else if (r.isDone()) {
futureResults.remove(i);
int rez = r.get();
if (rez != 0) {
LOG.error("Task Failed (" + rez + "). Break");
}
} else {
i++;
}
}
}
final String line = br.readLine();
if (line == null)
break;
if (line.startsWith("#") || line.isEmpty())
continue;
final BedLine bedLine = bedCodec.decode(line);
if (bedLine == null)
continue;
// ID
final String geneIdentifier = bedLine.get(3);
// name
final String geneName = bedLine.get(4);
final SourceType sourceType = SourceType.valueOf(bedLine.get(5));
final String filename = geneIdentifier;
final String outputVcfName = (filename.startsWith(this.prefix) ? "" : this.prefix) + filename + ".vcf" + (this.compress ? ".gz" : "");
LOG.info(bedLine.getContig() + ":" + bedLine.getStart() + "-" + bedLine.getEnd() + " length :" + (bedLine.getEnd() - bedLine.getStart()));
if (bedLine.getEnd() - bedLine.getStart() > 1E6) {
LOG.warn("That's a large region ! " + bedLine);
}
OutputStream vcfOutputStream = null;
VariantContextWriter vw = null;
int countVariants = 0;
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfFileReader.getFileHeader()).logger(LOG).prefix(geneName + " " + bedLine.getContig() + ":" + bedLine.getStart() + "-" + bedLine.getEnd());
iter = vcfFileReader.query(bedLine.getContig(), bedLine.getStart(), bedLine.getEnd());
while (iter.hasNext()) {
VariantContext ctx = progress.watch(iter.next());
switch(sourceType) {
case SlidingVariants:
{
// nothing
break;
}
case SlidingContig:
{
// nothing
break;
}
case ANN_GeneName:
case ANN_FeatureID:
case ANN_GeneID:
{
final List<String> preds = new ArrayList<>();
for (final AnnPredictionParser.AnnPrediction pred : tools.getAnnPredictionParser().getPredictions(ctx)) {
final String predictionIdentifier;
switch(sourceType) {
case ANN_GeneName:
predictionIdentifier = pred.getGeneName();
break;
case ANN_FeatureID:
predictionIdentifier = pred.getFeatureId();
break;
case ANN_GeneID:
predictionIdentifier = pred.getGeneId();
break;
default:
throw new IllegalStateException(bedLine.toString());
}
if (StringUtil.isBlank(predictionIdentifier))
continue;
if (!geneName.equals(predictionIdentifier))
continue;
preds.add(pred.getOriginalAttributeAsString());
}
if (preds.isEmpty()) {
ctx = null;
} else {
ctx = new VariantContextBuilder(ctx).rmAttribute(tools.getAnnPredictionParser().getTag()).attribute(tools.getAnnPredictionParser().getTag(), preds).make();
}
break;
}
case VEP_Gene:
case VEP_Feature:
case VEP_Symbol:
case VEP_HgncId:
{
final List<String> preds = new ArrayList<>();
for (final VepPredictionParser.VepPrediction pred : tools.getVepPredictions(ctx)) {
final String predictionIdentifier;
switch(sourceType) {
case VEP_Gene:
predictionIdentifier = pred.getGene();
break;
case VEP_Feature:
predictionIdentifier = pred.getFeature();
break;
case VEP_Symbol:
predictionIdentifier = pred.getSymbol();
break;
case VEP_HgncId:
predictionIdentifier = pred.getHgncId();
break;
default:
throw new IllegalStateException(bedLine.toString());
}
if (StringUtil.isBlank(predictionIdentifier))
continue;
if (!geneName.equals(predictionIdentifier))
continue;
preds.add(pred.getOriginalAttributeAsString());
}
if (preds.isEmpty()) {
ctx = null;
} else {
ctx = new VariantContextBuilder(ctx).rmAttribute(tools.getVepPredictionParser().getTag()).attribute(tools.getVepPredictionParser().getTag(), preds).make();
}
break;
}
default:
throw new IllegalStateException(bedLine.toString());
}
if (ctx == null)
continue;
if (vcfOutputStream == null) {
LOG.info(filename);
manifest.println(outputVcfName);
final VCFHeader header = new VCFHeader(vcfFileReader.getFileHeader());
header.addMetaDataLine(new VCFHeaderLine(VCF_HEADER_SPLITKEY, filename));
vcfOutputStream = archive.openOuputStream(outputVcfName);
vw = VCFUtils.createVariantContextWriterToOutputStream(vcfOutputStream);
vw.writeHeader(header);
}
countVariants++;
vw.add(ctx);
if (countVariants % 1000 == 0) {
LOG.info("Loading : " + geneIdentifier + " N=" + countVariants);
}
}
progress.finish();
LOG.info(geneIdentifier + " N=" + countVariants);
if (vcfOutputStream != null) {
vw.close();
vcfOutputStream.flush();
vcfOutputStream.close();
vw = null;
if (!this.exec.isEmpty()) {
final Callable<Integer> callable = () -> {
final File vcfOutFile = new File(this.outputFile, outputVcfName);
IOUtil.assertFileIsReadable(vcfOutFile);
final String vcfPath = vcfOutFile.getPath();
final StringTokenizer st = new StringTokenizer(this.exec);
final List<String> command = new ArrayList<>(1 + st.countTokens());
while (st.hasMoreTokens()) {
String token = st.nextToken().replaceAll("__PREFIX__", this.prefix).replaceAll("__CONTIG__", bedLine.getContig()).replaceAll("__CHROM__", bedLine.getContig()).replaceAll("__ID__", geneIdentifier).replaceAll("__NAME__", geneName).replaceAll("__START__", String.valueOf(bedLine.getStart())).replaceAll("__END__", String.valueOf(bedLine.getEnd())).replaceAll("__SOURCE__", sourceType.name()).replaceAll("__VCF__", vcfPath);
command.add(token);
}
LOG.info(command.stream().map(S -> "'" + S + "'").collect(Collectors.joining(" ")));
final ProcessBuilder pb = new ProcessBuilder(command);
pb.redirectErrorStream(true);
final Process p = pb.start();
final Thread stdoutThread = new Thread(() -> {
try {
InputStream in = p.getInputStream();
IOUtils.copyTo(in, stdout());
} catch (Exception err) {
LOG.error(err);
}
});
stdoutThread.start();
int exitValue = p.waitFor();
if (exitValue != 0) {
LOG.error("Command failed (" + exitValue + "):" + String.join(" ", command));
return -1;
} else {
if (deleteAfterCommand) {
if (!vcfOutFile.delete()) {
LOG.warn("Cannot delete " + vcfOutFile);
}
}
return 0;
}
};
if (executorService != null) {
final Future<Integer> rez = executorService.submit(callable);
futureResults.add(rez);
} else {
final int ret = callable.call();
if (ret != 0) {
LOG.error("Error with process (" + ret + ")");
return ret;
}
}
}
} else {
manifest.println("#" + filename);
LOG.warn("No Variant Found for " + line);
}
iter.close();
}
if (executorService != null) {
LOG.info("shutdown");
executorService.shutdown();
executorService.awaitTermination(365, TimeUnit.DAYS);
}
br.close();
br = null;
manifest.close();
archive.close();
archive = null;
LOG.info("Done");
}
vcfFileReader.close();
vcfFileReader = null;
return 0;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
{
CloserUtil.close(iter2);
CloserUtil.close(iter);
CloserUtil.close(pw);
CloserUtil.close(vcfFileReader);
CloserUtil.close(br);
CloserUtil.close(archive);
}
}
}
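For the SplitMethod.VariantSlidingWindow branch above, the buffer bookkeeping is the interesting part: the window is dumped once it holds variantsWinCount variants, then advanced by variantsWinShift before more variants are added, with a final dump for the trailing partial window. A stand-alone sketch of that bookkeeping on plain integers (class, method names and the main() demo are illustrative, not jvarkit code):

// Sliding-window bookkeeping: windows hold up to 'winCount' items and successive
// windows advance by 'winShift' items, mirroring the buffer.subList(0, shift).clear()
// idiom used above.
import java.util.ArrayList;
import java.util.List;

final class SlidingWindowSketch {
    private SlidingWindowSketch() {}

    static List<List<Integer>> windows(final List<Integer> positions, final int winCount, final int winShift) {
        final List<List<Integer>> out = new ArrayList<>();
        final List<Integer> buffer = new ArrayList<>(winCount);
        for (final Integer pos : positions) {
            buffer.add(pos);
            if (buffer.size() >= winCount) {
                out.add(new ArrayList<>(buffer)); // dump the full window
                buffer.subList(0, Math.min(winShift, buffer.size())).clear(); // slide forward
            }
        }
        if (!buffer.isEmpty()) out.add(new ArrayList<>(buffer)); // trailing partial window
        return out;
    }

    public static void main(final String[] args) {
        // with winCount=4, winShift=2: [1,2,3,4], [3,4,5,6], [5,6,7,8], [7,8,9]
        System.out.println(windows(java.util.Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9), 4, 2));
    }
}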