use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfIn method scanFileSorted.
private int scanFileSorted(final VariantContextWriter vcw, final String databaseVcfUri, final VcfIterator userVcfIn) {
EqualRangeVcfIterator equalRangeDbIter = null;
EqualRangeIterator<VariantContext> equalRangeUserVcf = null;
try {
final VCFHeader header = new VCFHeader(userVcfIn.getHeader());
final SAMSequenceDictionary userVcfDict = header.getSequenceDictionary();
// / NO need if(dict1==null)
if (userVcfDict == null) {
LOG.error(JvarkitException.VcfDictionaryMissing.getMessage("user file"));
return -1;
}
final Comparator<VariantContext> userVcfComparator = VCFUtils.createTidPosComparator(userVcfDict);
equalRangeDbIter = new EqualRangeVcfIterator(VCFUtils.createVcfIterator(databaseVcfUri), userVcfComparator);
this.addMetaData(header);
vcw.writeHeader(header);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(userVcfDict).logger(LOG);
equalRangeUserVcf = new EqualRangeIterator<>(userVcfIn, userVcfComparator);
while (equalRangeUserVcf.hasNext()) {
final List<VariantContext> ctxList = equalRangeUserVcf.next();
progress.watch(ctxList.get(0));
// fill both contextes
final List<VariantContext> dbContexes = new ArrayList<VariantContext>(equalRangeDbIter.next(ctxList.get(0)));
for (final VariantContext userCtx : ctxList) {
boolean keep = dbContexes.stream().filter(V -> sameContext(userCtx, V)).anyMatch(V -> allUserAltFoundInDatabase(userCtx, V));
addVariant(vcw, userCtx, keep);
}
if (vcw.checkError())
break;
}
equalRangeUserVcf.close();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(equalRangeDbIter);
CloserUtil.close(userVcfIn);
CloserUtil.close(vcw);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfConcat method fromFiles.
private int fromFiles(final VariantContextWriter out) throws IOException {
List<VcfIterator> inputs = new ArrayList<VcfIterator>(this.inputFiles.size());
List<String> inputFiles = new ArrayList<>(this.inputFiles.size());
List<String> samples = new ArrayList<>();
SAMSequenceDictionary dict = null;
try {
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
/* open each vcf file */
for (String vcfFile : this.inputFiles) {
LOG.info("Opening " + vcfFile);
VcfIterator r = VCFUtils.createVcfIterator(vcfFile);
/* check VCF dict */
VCFHeader header = r.getHeader();
if (dict == null && inputs.isEmpty()) {
dict = header.getSequenceDictionary();
} else if (!inputs.isEmpty() && ((dict == null && header.getSequenceDictionary() != null) || (dict != null && header.getSequenceDictionary() == null))) {
LOG.error("not.the.same.sequence.dictionaries");
return -1;
} else if (!inputs.isEmpty() && dict != null && !SequenceUtil.areSequenceDictionariesEqual(dict, header.getSequenceDictionary())) {
LOG.error("not.the.same.sequence.dictionaries");
return -1;
}
/* check samples */
if (inputs.isEmpty()) {
samples = header.getSampleNamesInOrder();
} else if (!header.getSampleNamesInOrder().equals(samples)) {
LOG.error("No same samples");
return -1;
}
metaData.addAll(header.getMetaDataInInputOrder());
inputs.add(r);
inputFiles.add(VCFUtils.escapeInfoField(vcfFile));
}
/* create comparator according to dict*/
final Comparator<VariantContext> comparator = (dict == null ? VCFUtils.createChromPosRefComparator() : VCFUtils.createTidPosRefComparator(dict));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
metaData.add(new VCFInfoHeaderLine(VARIANTSOURCE, 1, VCFHeaderLineType.String, "Origin File of Varant"));
VCFHeader h2 = new VCFHeader(metaData, samples);
out.writeHeader(h2);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
for (; ; ) {
/* get 'smallest' variant */
VariantContext smallest = null;
int idx = 0;
int best_idx = -1;
while (idx < inputs.size()) {
VcfIterator in = inputs.get(idx);
if (!in.hasNext()) {
CloserUtil.close(in);
inputs.remove(idx);
inputFiles.remove(idx);
} else {
VariantContext ctx = in.peek();
if (smallest == null || comparator.compare(smallest, ctx) > 0) {
smallest = ctx;
best_idx = idx;
}
++idx;
}
}
if (smallest == null)
break;
final VariantContext ctx = progress.watch(inputs.get(best_idx).next());
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.attribute(VARIANTSOURCE, inputFiles.get(best_idx));
out.add(vcb.make());
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(inputs);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfToZip method doWork.
@Override
public int doWork(List<String> args) {
if (this.outputFile != null && this.outputFile.getName().endsWith(".zip")) {
LOG.error("Filename must end with '.zip' " + outputFile);
return -1;
}
LineIterator lr = null;
VcfIterator in = null;
ZipOutputStream zout = null;
FileOutputStream fout = null;
int num_vcfs = 0;
VariantContextWriter vcw = null;
args = new ArrayList<>(IOUtils.unrollFiles(args));
try {
int optind = 0;
if (this.outputFile != null) {
fout = new FileOutputStream(this.outputFile);
zout = new ZipOutputStream(fout);
} else {
zout = new ZipOutputStream(stdout());
}
do {
if (args.isEmpty()) {
LOG.info("reading concatenated vcf from stdin");
lr = IOUtils.openStreamForLineIterator(stdin());
} else {
LOG.info("reading concatenated vcf from " + args.get(optind));
lr = IOUtils.openURIForLineIterator(args.get(optind));
}
while (lr.hasNext()) {
++num_vcfs;
in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
final VCFHeader header = in.getHeader();
String filename = null;
if (this.titleHeaderStr != null && !this.titleHeaderStr.isEmpty()) {
final VCFHeaderLine h = header.getOtherHeaderLine(this.titleHeaderStr);
if (h != null && !h.getValue().trim().isEmpty())
filename = h.getValue().trim();
}
if (filename == null || filename.trim().isEmpty()) {
// create title
filename = String.format("vcf2zip.%05d.vcf", num_vcfs);
// set name in header
if (this.titleHeaderStr != null && !this.titleHeaderStr.isEmpty()) {
header.addMetaDataLine(new VCFHeaderLine(this.titleHeaderStr.trim(), filename));
}
}
if (!filename.endsWith(".vcf")) {
filename += ".vcf";
}
if (!this.zipPrefix.isEmpty()) {
filename = this.zipPrefix + (this.zipPrefix.endsWith("/") ? "" : "/") + filename;
}
LOG.info(filename);
final ZipEntry entry = new ZipEntry(filename);
entry.setComment("Created with " + getProgramName());
zout.putNextEntry(entry);
vcw = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
vcw.writeHeader(header);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (in.hasNext()) {
vcw.add(progress.watch(in.next()));
}
vcw.close();
progress.finish();
zout.closeEntry();
in.close();
}
++optind;
} while (optind < args.size());
zout.finish();
zout.flush();
zout.close();
zout = null;
CloserUtil.close(fout);
LOG.info("done. Number of VCFs:" + num_vcfs);
return RETURN_OK;
} catch (final Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(in);
CloserUtil.close(lr);
CloserUtil.close(zout);
CloserUtil.close(fout);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class FindMyVirus method doWork.
@Override
public int doWork(List<String> args) {
if (virusNames.isEmpty()) {
LOG.error("no virus name");
return -1;
}
SamReader sfr = null;
SAMFileWriter[] sfwArray = new SAMFileWriter[CAT.values().length];
try {
sfr = openSamReader(oneFileOrNull(args));
SAMFileHeader header = sfr.getFileHeader();
for (CAT category : CAT.values()) {
LOG.info("Opening " + category);
SAMFileHeader header2 = header.clone();
header2.addComment("Category:" + category.name());
header2.addComment("Description:" + category.getDescription());
SAMProgramRecord rec = header2.createProgramRecord();
rec.setCommandLine(this.getProgramCommandLine());
rec.setProgramName(getProgramName());
rec.setProgramVersion(getVersion());
rec.setAttribute("CAT", category.name());
File outputFile = new File(this.outputFile.getParentFile(), this.outputFile.getName() + "." + category.name() + ".bam");
LOG.info("Opening " + outputFile);
File countFile = new File(outputFile.getParentFile(), outputFile.getName() + "." + category.name() + ".count.txt");
SAMFileWriter sfw = writingBamArgs.openSAMFileWriter(outputFile, header2, true);
sfw = new SAMFileWriterCount(sfw, countFile, category);
sfwArray[category.ordinal()] = sfw;
}
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
OtherCanonicalAlignFactory xpAlignFactory = new OtherCanonicalAlignFactory(header);
SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
SAMRecord rec = iter.next();
progress.watch(rec);
CAT category = null;
List<OtherCanonicalAlign> xpList = Collections.emptyList();
if (category == null && !rec.getReadPairedFlag()) {
category = CAT.unpaired;
}
if (category == null && rec.isSecondaryOrSupplementary()) {
category = CAT.secondary;
}
if (category == null && rec.getReadFailsVendorQualityCheckFlag()) {
category = CAT.failsqual;
}
if (category == null && rec.getDuplicateReadFlag()) {
category = CAT.duplicate;
}
if (category == null && rec.getReadUnmappedFlag()) {
category = CAT.unmapped;
}
if (category == null) {
xpList = xpAlignFactory.getXPAligns(rec);
}
boolean xp_containsVirus = false;
boolean xp_containsChrom = false;
for (OtherCanonicalAlign xpa : xpList) {
if (virusNames.contains(xpa.getReferenceName())) {
xp_containsVirus = true;
} else {
xp_containsChrom = true;
}
}
/* both reads mapped on ref */
if (category == null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && !virusNames.contains(rec.getReferenceName()) && !virusNames.contains(rec.getMateReferenceName())) {
if (!xp_containsVirus) {
category = CAT.both_ref;
} else {
category = CAT.ref_and_virus_spliced;
}
}
/* pair(unmapped,mapped on reference) */
if (category == null && ((!rec.getReadUnmappedFlag() && rec.getMateUnmappedFlag() && !virusNames.contains(rec.getReferenceName())) || (rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && !virusNames.contains(rec.getMateReferenceName())))) {
if (!xp_containsVirus) {
category = CAT.ref_orphan;
} else {
category = CAT.ref_and_virus_spliced;
}
}
/* both reads mapped on virus */
if (category == null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && virusNames.contains(rec.getReferenceName()) && virusNames.contains(rec.getMateReferenceName())) {
if (!xp_containsChrom) {
category = CAT.both_virus;
} else {
category = CAT.ref_and_virus_spliced;
}
}
if (category == null && ((!rec.getReadUnmappedFlag() && rec.getMateUnmappedFlag() && virusNames.contains(rec.getReferenceName())) || (rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && virusNames.contains(rec.getMateReferenceName())))) {
if (!xp_containsChrom) {
category = CAT.virus_orphan;
} else {
category = CAT.ref_and_virus_spliced;
}
}
if (category == null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && ((virusNames.contains(rec.getReferenceName()) && !virusNames.contains(rec.getMateReferenceName())) || (!virusNames.contains(rec.getReferenceName()) && virusNames.contains(rec.getMateReferenceName())))) {
category = CAT.ref_and_virus;
}
/*dispatch */
if (category == null) {
LOG.warning("Not handled: " + rec);
category = CAT.undetermined;
}
sfwArray[category.ordinal()].addAlignment(rec);
}
for (SAMFileWriter sfw : sfwArray) {
LOG.info("Closing " + sfw);
sfw.close();
}
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
LOG.info("Closing");
CloserUtil.close(sfr);
CloserUtil.close(sfwArray);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class FixVcfMissingGenotypes method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
final Set<String> bamFiles = IOUtils.unrollFiles(bamList);
final Map<String, List<SamReader>> sample2bam = new HashMap<>(bamFiles.size());
try {
final VCFHeader header = in.getHeader();
for (final String bamFile : bamFiles) {
LOG.info("Reading header for " + bamFile);
final SamReader reader = super.openSamReader(bamFile);
if (!reader.hasIndex()) {
LOG.error("No BAM index available for " + bamFile);
return -1;
}
final SAMFileHeader samHeader = reader.getFileHeader();
for (final SAMReadGroupRecord g : samHeader.getReadGroups()) {
if (g.getSample() == null)
continue;
final String sample = g.getSample();
if (StringUtil.isBlank(sample))
continue;
List<SamReader> readers = sample2bam.get(sample);
if (readers == null) {
readers = new ArrayList<>();
sample2bam.put(sample, readers);
}
readers.add(reader);
}
}
final VCFHeader h2 = new VCFHeader(header);
if (h2.getFormatHeaderLine(VCFConstants.DEPTH_KEY) == null) {
h2.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
}
if (h2.getFormatHeaderLine(VCFConstants.GENOTYPE_KEY) == null) {
h2.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
}
if (h2.getFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY) == null) {
h2.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
}
h2.addMetaDataLine(new VCFFormatHeaderLine(this.fixedTag, 1, VCFHeaderLineType.Integer, "Genotype was set as homozygous (min depth =" + this.minDepth + ")"));
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
this.recalculator.setHeader(h2);
out.writeHeader(h2);
while (in.hasNext()) {
final VariantContext ctx = progress.watch(in.next());
boolean somethingWasChanged = false;
final List<Genotype> genotypes = new ArrayList<>(ctx.getNSamples());
for (int i = 0; i < ctx.getNSamples(); ++i) {
Genotype genotype = ctx.getGenotype(i);
final String sample = genotype.getSampleName();
if (!genotype.isCalled() || (!genotype.hasDP() && this.fixDP)) {
List<SamReader> samReaders = sample2bam.get(sample);
if (samReaders == null)
samReaders = Collections.emptyList();
final int depth = fetchDP(ctx, sample, samReaders);
if (genotype.isCalled() && !genotype.hasDP()) {
genotype = new GenotypeBuilder(genotype).DP(depth).make();
somethingWasChanged = true;
} else // genotype was not called
{
if (depth >= this.minDepth) {
final List<Allele> homozygous = new ArrayList<>(2);
homozygous.add(ctx.getReference());
homozygous.add(ctx.getReference());
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
gb.alleles(homozygous);
gb.attribute(this.fixedTag, 1);
gb.DP(depth);
if (!StringUtil.isBlank(this.fixedGenotypesAreFiltered))
gb.filter(this.fixedGenotypesAreFiltered);
somethingWasChanged = true;
genotype = gb.make();
} else if (// cannot fix but we can update DP
!genotype.hasDP()) {
genotype = new GenotypeBuilder(genotype).DP(depth).make();
somethingWasChanged = true;
}
}
}
genotypes.add(genotype);
}
if (somethingWasChanged) {
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.genotypes(genotypes);
out.add(this.recalculator.apply(vcb.make()));
} else {
out.add(ctx);
}
}
progress.finish();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
sample2bam.values().stream().flatMap(L -> L.stream()).forEach(R -> CloserUtil.close(R));
}
}
Aggregations