use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfUcsc method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
final String TAG;
if (StringUtil.isBlank(this.infoTag)) {
TAG = "UCSC_" + database.toUpperCase() + "_" + table.toUpperCase();
} else {
TAG = this.infoTag;
}
VCFHeader header = in.getHeader();
final VCFHeader h2 = new VCFHeader(header);
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, database + "." + table));
if (!StringUtil.isBlank(this.filterIn)) {
h2.addMetaDataLine(new VCFFilterHeaderLine(this.filterIn, "Set by " + this.getClass().getName()));
} else if (!StringUtil.isBlank(this.filterOut)) {
h2.addMetaDataLine(new VCFFilterHeaderLine(this.filterOut, "Set by " + this.getClass().getName()));
}
h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
out.writeHeader(h2);
final Map<Integer, PreparedStatement> bin2pstmt = new HashMap<>();
try {
if (!this.has_bin_column) {
bin2pstmt.put(0, createPreparedStatement(0));
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (in.hasNext()) {
final VariantContext ctx = progress.watch(in.next());
int start0, end0;
if (// mutation starts *after* the base
ctx.isIndel()) {
start0 = ctx.getStart();
end0 = ctx.getEnd();
} else {
start0 = ctx.getStart() - 1;
end0 = ctx.getEnd();
}
// extends left/right
start0 = Math.max(0, start0 - this.extend_bases);
end0 += this.extend_bases;
final Set<String> atts = new HashSet<String>();
if (this.has_bin_column) {
final List<Integer> binList = reg2bins(start0, end0);
PreparedStatement pstmt = bin2pstmt.get(binList.size());
if (pstmt == null) {
LOG.debug("create prepared statemement for bin.size=" + binList.size() + "[" + start0 + ":" + end0 + "]");
pstmt = createPreparedStatement(binList.size());
bin2pstmt.put(binList.size(), pstmt);
}
initPstmt(pstmt, ctx.getContig(), start0, end0);
for (int x = 0; x < binList.size(); ++x) {
pstmt.setInt(4 + x, binList.get(x));
}
select(atts, pstmt);
} else {
// already defined
final PreparedStatement pstmt = bin2pstmt.get(0);
initPstmt(pstmt, ctx.getContig(), start0, end0);
select(atts, pstmt);
}
if (atts.isEmpty() && StringUtil.isBlank(this.filterIn) && StringUtil.isBlank(this.filterOut)) {
out.add(ctx);
continue;
}
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
if (!StringUtil.isBlank(this.filterIn) && !atts.isEmpty()) {
vcb.filter(this.filterIn);
} else if (!StringUtil.isBlank(this.filterOut) && atts.isEmpty()) {
vcb.filter(this.filterOut);
}
if (!atts.isEmpty()) {
vcb.attribute(TAG, atts.toArray());
}
out.add(vcb.make());
}
progress.finish();
return 0;
} catch (final SQLException err) {
LOG.error(err);
throw new RuntimeException("SQLError", err);
} finally {
for (final PreparedStatement pstmt : bin2pstmt.values()) {
CloserUtil.close(pstmt);
}
bin2pstmt.clear();
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfFilterJdk method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator iter, final VariantContextWriter delegate) {
final CtxWriterFactory.CtxWriter out = this.component.open(delegate);
out.writeHeader(iter.getHeader());
out.filter_instance.userData.put("first.variant", Boolean.TRUE);
out.filter_instance.userData.put("last.variant", Boolean.FALSE);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(iter.getHeader()).logger(LOG);
while (iter.hasNext() && !out.checkError()) {
out.add(progress.watch(iter.next()));
out.filter_instance.userData.put("first.variant", Boolean.FALSE);
out.filter_instance.userData.put("last.variant", !iter.hasNext());
final Object stop = out.filter_instance.userData.get("STOP");
if (Boolean.TRUE.equals(stop))
break;
}
progress.finish();
out.close();
return 0;
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VCFFixIndels method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator r, VariantContextWriter w) {
long nChanged = 0L;
final String TAG = "INDELFIXED";
final VCFHeader header = r.getHeader();
final VCFHeader h2 = new VCFHeader(header);
addMetaData(h2);
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, 1, VCFHeaderLineType.String, "Fix Indels for @SolenaLS (position|alleles...)"));
w.writeHeader(h2);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (r.hasNext()) {
boolean somethingChanged = false;
VariantContext ctx = progress.watch(r.next());
/* map old allele to new allele */
Map<Allele, Allele> original2modified = new HashMap<Allele, Allele>();
/* did we found a strange allele (symbolic, etc ) */
boolean strange_allele_flag = false;
for (final Allele a : ctx.getAlleles()) {
original2modified.put(a, a);
if (a.isSymbolic() || a.isNoCall()) {
strange_allele_flag = true;
break;
}
}
if (strange_allele_flag || original2modified.size() < 2) /* at least 2 alleles: REF+ ALT */
{
w.add(ctx);
continue;
}
/* record chromStart if we need to shift to the right */
int chromStart = ctx.getStart();
/* trim right then left */
for (int side = 0; side < 2; ++side) {
boolean done = false;
while (!done) {
boolean ok_side = true;
/* common nucleotide at end/start */
Character targetChar = null;
done = true;
// scan side
Set<Allele> keys = new HashSet<>(original2modified.keySet());
for (Allele k : keys) {
Allele newAllele = original2modified.get(k);
if (newAllele.isSymbolic()) {
ok_side = false;
break;
}
String baseString = newAllele.getBaseString().trim().toUpperCase();
if (baseString.length() < 2) {
ok_side = false;
break;
}
/* first or last char or all sequences
* side==0 : right
* side==1 : left
* */
Character baseChar = (side == 0 ? baseString.charAt(baseString.length() - 1) : baseString.charAt(0));
if (targetChar == null) {
targetChar = baseChar;
} else if (!targetChar.equals(baseChar)) {
/* doesn't end with same nucleotide */
ok_side = false;
break;
}
}
/* ok we can shift all alleles */
if (ok_side && targetChar != null) {
done = false;
somethingChanged = true;
for (Allele k : keys) {
Allele newAllele = original2modified.get(k);
String baseString = newAllele.getBaseString().trim().toUpperCase();
if (// remove last nucleotide
side == 0) {
newAllele = Allele.create(baseString.substring(0, baseString.length() - 1), newAllele.isReference());
} else {
newAllele = Allele.create(baseString.substring(1), newAllele.isReference());
}
original2modified.put(k, newAllele);
}
if (side == 1)
chromStart++;
}
}
/* end of while done */
}
if (!somethingChanged) {
w.add(ctx);
continue;
}
final VariantContextBuilder b = new VariantContextBuilder(ctx);
b.start(chromStart);
Allele newRef = original2modified.get(ctx.getReference());
b.stop(chromStart + newRef.getBaseString().length() - 1);
b.alleles(original2modified.values());
List<Genotype> genotypes = new ArrayList<>();
for (final String sample : header.getSampleNamesInOrder()) {
final Genotype g = ctx.getGenotype(sample);
if (!g.isCalled()) {
genotypes.add(g);
continue;
}
final GenotypeBuilder gb = new GenotypeBuilder(g);
final List<Allele> aL = new ArrayList<>();
for (final Allele a : g.getAlleles()) {
aL.add(original2modified.get(a));
}
gb.alleles(aL);
genotypes.add(gb.make());
}
StringBuilder tagContent = new StringBuilder();
tagContent.append(String.valueOf(ctx.getStart()));
for (Allele a : ctx.getAlleles()) {
tagContent.append("|");
tagContent.append(a.toString());
}
b.attribute(TAG, tagContent.toString());
b.genotypes(genotypes);
w.add(b.make());
++nChanged;
if (w.checkError())
break;
}
progress.finish();
LOG.info("indels changed:" + nChanged);
return RETURN_OK;
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfOffsetsIndexFactory method indexVcfFile.
/**
* index a vcf file for its variant offsets
*/
public File indexVcfFile(final File vcfFile, final File indexFile) throws IOException {
LOG.info("indexing " + vcfFile);
IOUtil.assertFileIsReadable(vcfFile);
DataOutputStream daos = null;
BlockCompressedInputStream bgzin = null;
AsciiLineReader ascii = null;
VCFHeader header = null;
final VCFCodec codec = new VCFCodec();
SAMSequenceDictionaryProgress progress = null;
try {
daos = new DataOutputStream(new FileOutputStream(indexFile));
daos.write(MAGIC);
if (vcfFile.getName().endsWith(".vcf.gz")) {
bgzin = new BlockCompressedInputStream(vcfFile);
ascii = null;
} else if (vcfFile.getName().endsWith(".vcf")) {
bgzin = null;
ascii = new AsciiLineReader(new FileInputStream(vcfFile));
} else {
throw new IllegalArgumentException("not a vcf.gz or vcf file: " + vcfFile);
}
final List<String> headerLines = new ArrayList<>();
for (; ; ) {
final long offset = (ascii == null ? bgzin.getPosition() : ascii.getPosition());
final String line = (ascii == null ? bgzin.readLine() : ascii.readLine());
if (line == null)
break;
if (line.startsWith("#")) {
headerLines.add(line);
if (line.startsWith("#CHROM")) {
codec.readHeader(new LineIterator() {
int i = 0;
@Override
public String next() {
final String s = headerLines.get(i);
i++;
return s;
}
@Override
public boolean hasNext() {
return i < headerLines.size();
}
@Override
public String peek() {
return i < headerLines.size() ? headerLines.get(i) : null;
}
});
header = VCFUtils.parseHeader(headerLines).header;
progress = new SAMSequenceDictionaryProgress(header);
progress.logger(this.logger == null ? LOG : this.logger);
progress.setLogPrefix("indexing");
}
continue;
}
if (progress == null) {
throw new JvarkitException.FileFormatError("no vcf header in " + vcfFile);
}
final VariantContext ctx = codec.decode(line);
progress.watch(ctx);
if (this.acceptVariant != null) {
if (!acceptVariant.test(ctx))
continue;
}
daos.writeLong(offset);
}
if (progress == null) {
throw new JvarkitException.FileFormatError("no vcf header in " + vcfFile);
}
progress.finish();
daos.flush();
daos.close();
return indexFile;
} catch (final IOException err) {
throw err;
} finally {
CloserUtil.close(ascii);
CloserUtil.close(bgzin);
CloserUtil.close(daos);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfCompareCallersOneSample method doWork.
@Override
public int doWork(List<String> args) {
File inputFile = null;
List<EqualRangeVcfIterator> listChallengers = new ArrayList<>();
VariantContextWriter vcw = null;
VcfIterator in = null;
try {
in = super.openVcfIterator(oneFileOrNull(args));
VCFHeader header = in.getHeader();
if (header.getNGenotypeSamples() != 1) {
LOG.error("vcf.must.have.only.one.sample");
return -1;
}
VCFHeader h2 = new VCFHeader(header);
h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
LOG.error("no.dict.in.vcf");
return -1;
}
Comparator<VariantContext> ctxComparator = VCFUtils.createTidPosComparator(dict);
/* load files to be challenged */
for (File cf : this.challengerVcf) {
// do not challenge vs itself
if (inputFile != null && inputFile.equals(cf)) {
LOG.error("Ignoring challenger (self): " + cf);
continue;
}
VcfIterator cin = VCFUtils.createVcfIteratorFromFile(cf);
VCFHeader ch = cin.getHeader();
if (ch.getNGenotypeSamples() != 1) {
LOG.warning("vcf.must.have.only.one.sample");
cin.close();
continue;
}
if (!header.getSampleNamesInOrder().get(0).equals(ch.getSampleNamesInOrder().get(0))) {
LOG.warning("Ignoring " + cf + " because not the same sample.");
cin.close();
continue;
}
SAMSequenceDictionary hdict = ch.getSequenceDictionary();
if (hdict == null || !SequenceUtil.areSequenceDictionariesEqual(dict, hdict)) {
LOG.error("not.the.same.sequence.dictionaries");
return -1;
}
listChallengers.add(new EqualRangeVcfIterator(cin, ctxComparator));
}
vcw = super.openVariantContextWriter(outputFile);
vcw.writeHeader(h2);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
VariantContext prev_ctx = null;
while (in.hasNext() && !vcw.checkError()) {
VariantContext ctx = progress.watch(in.next());
// check input order
if (prev_ctx != null && ctxComparator.compare(prev_ctx, ctx) > 0) {
LOG.error("bad sort order : got\n\t" + prev_ctx + "\nbefore\n\t" + ctx + "\n");
return -1;
}
prev_ctx = ctx;
int countInOtherFiles = 0;
for (EqualRangeVcfIterator citer : listChallengers) {
boolean foundInThatFile = false;
List<VariantContext> ctxChallenging = citer.next(ctx);
for (VariantContext ctx2 : ctxChallenging) {
if (!ctx2.getReference().equals(ctx.getReference()))
continue;
boolean ok = true;
if (!this.ignoreAlternate) {
Set<Allele> myAlt = new HashSet<Allele>(ctx.getAlternateAlleles());
myAlt.removeAll(ctx2.getAlternateAlleles());
if (!myAlt.isEmpty())
ok = false;
}
if (ok) {
foundInThatFile = true;
break;
}
}
countInOtherFiles += (foundInThatFile ? 1 : 0);
}
if (countInOtherFiles >= minCountInclusive && countInOtherFiles <= maxCountInclusive) {
vcw.add(ctx);
}
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(vcw);
CloserUtil.close(listChallengers);
CloserUtil.close(in);
}
}
Aggregations