Use of htsjdk.variant.vcf.VCFFilterHeaderLine in the project gatk by broadinstitute: class LiftOverVcf, method doWork.
@Override
protected Object doWork() {
    // Validate all file arguments up front so we fail fast before any expensive work.
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);

    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);

    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    // BUG FIX: capture the target dictionary BEFORE closing the walker. The original
    // code called walker.getSequenceDictionary() twice AFTER CloserUtil.close(walker),
    // i.e. it used a closed resource.
    final SAMSequenceDictionary targetDictionary = walker.getSequenceDictionary();
    // Cache the full base sequence of every target contig for the reference-allele
    // sanity check performed after liftover.
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : targetDictionary.getSequences()) {
        refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    CloserUtil.close(walker);

    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(targetDictionary);
    final VariantContextWriter out = new VariantContextWriterBuilder()
            .setOption(Options.INDEX_ON_THE_FLY)
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(targetDictionary)
            .build();
    out.writeHeader(outHeader);

    // Rejected records keep the SOURCE coordinates, so the reject file is not indexed.
    final VariantContextWriter rejects = new VariantContextWriterBuilder()
            .setOutputFile(REJECT)
            .unsetOption(Options.INDEX_ON_THE_FLY)
            .build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);

    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");
    final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(
            VariantContext.class, new VCFRecordCodec(outHeader),
            outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");

    for (final VariantContext ctx : in) {
        ++total;
        final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false,
                ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
        // Require the whole interval to map (minMatch = 1.0).
        final Interval target = liftOver.liftOver(source, 1.0);
        if (target == null) {
            rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
            failedLiftover++;
        } else {
            // Fix the alleles if we went from positive to negative strand; symbolic
            // alleles (e.g. <DEL>) have no bases to reverse-complement.
            final List<Allele> alleles = new ArrayList<>();
            for (final Allele oldAllele : ctx.getAlleles()) {
                if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
                    alleles.add(oldAllele);
                } else {
                    alleles.add(Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()));
                }
            }
            // Build the new variant context at the lifted-over coordinates.
            final VariantContextBuilder builder = new VariantContextBuilder(
                    ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);
            builder.id(ctx.getID());
            builder.attributes(ctx.getAttributes());
            builder.genotypes(ctx.getGenotypes());
            builder.filters(ctx.getFilters());
            builder.log10PError(ctx.getLog10PError());

            // Check that the reference allele still agrees with the target reference
            // sequence (liftover can land on a locus whose base differs).
            boolean mismatchesReference = false;
            for (final Allele allele : builder.getAlleles()) {
                if (allele.isReference()) {
                    final byte[] ref = refSeqs.get(target.getContig());
                    final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length());
                    if (!refString.equalsIgnoreCase(allele.getBaseString())) {
                        mismatchesReference = true;
                    }
                    break; // only one reference allele per context
                }
            }
            if (mismatchesReference) {
                rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
                failedAlleleCheck++;
            } else {
                sorter.add(builder.make());
            }
        }
        progress.record(ctx.getContig(), ctx.getStart());
    }

    // BUG FIX: guard against division by zero (empty input VCF produced NaN%),
    // and use proper {} placeholders — the original passed varargs to logger.info
    // with no placeholders in the format string, silently dropping the values.
    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format(total == 0 ? 0d : (failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed {} variants.", total);
    logger.info("{} variants failed to liftover.", failedLiftover);
    logger.info("{} variants lifted over but had mismatching reference alleles after lift over.", failedAlleleCheck);
    logger.info("{} of variants were not successfully lifted over and written to the output.", pct);
    rejects.close();
    in.close();

    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");
    for (final VariantContext ctx : sorter) {
        out.add(ctx);
        progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();
    return null;
}
Use of htsjdk.variant.vcf.VCFFilterHeaderLine in the project jvarkit by lindenb: class VariantsInWindow, method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter writer) {
    // Copy the header so we can add our own declarations without mutating the input's.
    final VCFHeader header = new VCFHeader(in.getHeader());
    // Warn (but proceed, as before) when the window INFO id is already declared.
    if (header.getInfoHeaderLine(this.winName) != null) {
        LOG.error("VCF header already contains the INFO header ID=" + this.winName);
    }
    header.addMetaDataLine(new VCFInfoHeaderLine(this.winName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Window : start|end|number-of-matching-variants|number-of-non-matching-variants"));
    if (this.filterName != null && this.treshold > 0) {
        // BUG FIX: filterName declares a FILTER line, but the original checked
        // getInfoHeaderLine(filterName) and reported a "FORMAT header" collision.
        // Check the FILTER namespace and report it accurately.
        if (header.getFilterHeaderLine(this.filterName) != null) {
            LOG.error("VCF header already contains the FILTER header ID=" + this.filterName);
        }
        header.addMetaDataLine(new VCFFilterHeaderLine(this.filterName, "Filter defined in " + getProgramName()));
    }
    writer.writeHeader(header);
    // Stream every variant through the windowing logic, then flush pending windows.
    while (in.hasNext()) {
        final VariantContext ctx = in.next();
        mapVariant(writer, ctx);
    }
    flushVariants(writer, null);
    return 0;
}
Use of htsjdk.variant.vcf.VCFFilterHeaderLine in the project jvarkit by lindenb: class NoZeroVariationVCF, method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    final VCFHeader header = in.getHeader();
    if (in.hasNext()) {
        // Normal case: at least one variant exists, so just copy header + records through.
        LOG.info("found a variant in VCF.");
        VCFUtils.copyHeaderAndVariantsTo(in, out);
    } else {
        // Empty VCF: fabricate one FILTERed fake SNP so downstream tools that
        // reject zero-variant files can still run.
        LOG.info("no a variant in VCF. Creating a fake Variant");
        header.addMetaDataLine(new VCFFilterHeaderLine("FAKESNP", "Fake SNP created because vcf input was empty."));
        // Make sure the GT and GQ FORMAT declarations exist before we emit genotypes.
        VCFFormatHeaderLine gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_KEY);
        if (gtHeaderLine == null) {
            LOG.info("Adding GT to header");
            header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        }
        gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY);
        if (gtHeaderLine == null) {
            LOG.info("Adding GQ to header");
            header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality"));
        }
        out.writeHeader(header);
        SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        // choose random chrom, best is 'random' , but not 1...23,X,Y, etc...
        // Default to the first contig; the two loops below try to pick a less
        // prominent one ("_" contigs, then "random"/"gl" contigs).
        String chrom = dict.getSequence(0).getSequenceName();
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            String ssn = ssr.getSequenceName();
            if (ssn.contains("_")) {
                chrom = ssn;
                break;
            }
        }
        // NOTE(review): this second loop runs unconditionally and can override the
        // "_" contig chosen above with a "random"/"gl" contig — confirm intentional.
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            String ssn = ssr.getSequenceName();
            if (ssn.toLowerCase().contains("random")) {
                chrom = ssn;
                break;
            }
            if (ssn.toLowerCase().contains("gl")) {
                chrom = ssn;
                break;
            }
        }
        GenomicSequence gseq = new GenomicSequence(this.indexedFastaSequenceFile, chrom);
        char ref = 'N';
        char alt = 'N';
        int POS = 0;
        // Scan for the first non-'N' base; derive a distinct alt base for it.
        for (POS = 0; POS < gseq.length(); ++POS) {
            ref = Character.toUpperCase(gseq.charAt(POS));
            if (ref == 'N')
                continue;
            switch(ref) {
                case 'A':
                    alt = 'T';
                    break;
                case 'T':
                    alt = 'G';
                    break;
                case 'G':
                    alt = 'C';
                    break;
                case 'C':
                    alt = 'A';
                    break;
                default:
                    // non-ACGT IUPAC base: leave alt as 'N' and keep scanning
                    break;
            }
            if (alt == 'N')
                continue;
            break;
        }
        if (alt == 'N')
            throw new RuntimeException("found only N");
        // Build the fake bi-allelic SNP at the 1-based position POS+1.
        VariantContextBuilder vcb = new VariantContextBuilder();
        Allele a1 = Allele.create((byte) ref, true);
        Allele a2 = Allele.create((byte) alt, false);
        List<Allele> la1a2 = new ArrayList<Allele>(2);
        List<Genotype> genotypes = new ArrayList<Genotype>(header.getSampleNamesInOrder().size());
        la1a2.add(a1);
        la1a2.add(a2);
        vcb.chr(gseq.getChrom());
        vcb.start(POS + 1);
        vcb.stop(POS + 1);
        vcb.filter("FAKESNP");
        vcb.alleles(la1a2);
        vcb.log10PError(-0.1);
        // First sample gets a het call with minimal DP/GQ; all others are missing.
        for (String sample : header.getSampleNamesInOrder()) {
            final GenotypeBuilder gb = new GenotypeBuilder(sample);
            if (genotypes.isEmpty()) {
                gb.DP(1);
                gb.GQ(1);
                gb.alleles(la1a2);
                gb.noAD();
                gb.noPL();
                genotypes.add(gb.make());
            } else {
                genotypes.add(GenotypeBuilder.createMissing(sample, 2));
            }
        }
        vcb.genotypes(genotypes);
        vcb.noID();
        out.add(vcb.make());
    }
    return 0;
}
Use of htsjdk.variant.vcf.VCFFilterHeaderLine in the project jvarkit by lindenb: class FixVCF, method doWork.
/**
 * Re-emits a VCF whose header may be missing FILTER/INFO declarations used by its
 * records: pass 1 copies the body to a gzipped tmp file while collecting missing
 * declarations into a repaired header; pass 2 replays the tmp file under that header.
 */
private int doWork(String filenameIn, InputStream vcfStream, VariantContextWriter w) throws IOException {
    final AbstractVCFCodec vcfCodec = VCFUtils.createDefaultVCFCodec();
    // NOTE(review): this LineIterator is never closed on any path — confirm acceptable.
    LineIterator r = new LineIteratorImpl(new SynchronousLineReader(vcfStream));
    final VCFHeader header = (VCFHeader) vcfCodec.readActualHeader(r);
    // samples names have been changed by picard api and reordered !!!
    // re-create the original order
    // NOTE(review): O(n^2) over samples; fine for typical cohort sizes.
    List<String> sampleNamesInSameOrder = new ArrayList<String>(header.getSampleNamesInOrder().size());
    for (int col = 0; col < header.getSampleNamesInOrder().size(); ++col) {
        for (String sample : header.getSampleNameToOffset().keySet()) {
            if (header.getSampleNameToOffset().get(sample) == col) {
                sampleNamesInSameOrder.add(sample);
                break;
            }
        }
    }
    if (sampleNamesInSameOrder.size() != header.getSampleNamesInOrder().size()) {
        throw new IllegalStateException();
    }
    // h2 is the repaired header we will progressively add missing lines to.
    VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), sampleNamesInSameOrder);
    File tmp = IOUtil.newTempFile("tmp", ".vcf.gz", new File[] { tmpDir });
    tmp.deleteOnExit();
    PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(tmp)));
    // Pass 1: copy each record line to the tmp file and record any FILTER/INFO id
    // it uses that the header does not declare.
    while (r.hasNext()) {
        String line = r.next();
        pw.println(line);
        VariantContext ctx = null;
        try {
            ctx = vcfCodec.decode(line);
        } catch (Exception err) {
            pw.close();
            LOG.error(line);
            LOG.error(err);
            return -1;
        }
        for (String f : ctx.getFilters()) {
            if (h2.getFilterHeaderLine(f) != null)
                continue;
            // if(f.equals(VCFConstants.PASSES_FILTERS_v4)) continue; hum...
            if (f.isEmpty() || f.equals(VCFConstants.UNFILTERED))
                continue;
            LOG.info("Fixing missing Filter:" + f);
            h2.addMetaDataLine(new VCFFilterHeaderLine(f));
        }
        for (String tag : ctx.getAttributes().keySet()) {
            if (h2.getInfoHeaderLine(tag) != null)
                continue;
            LOG.info("Fixing missing INFO:" + tag);
            // Unknown type/count: declare as unbounded String so the record parses.
            h2.addMetaDataLine(new VCFInfoHeaderLine(tag, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "undefined. Saved by " + getClass()));
        }
    }
    pw.flush();
    pw.close();
    pw = null;
    LOG.info("re-reading VCF frm tmpFile:" + tmp);
    // Record provenance of the repair in the header itself.
    h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName(), "Saved VCF FILTER AND INFO from " + filenameIn));
    // save header in memory
    // Render h2 to bytes so it can be prepended to the (header-less) tmp body below.
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    VariantContextWriter w2 = VCFUtils.createVariantContextWriterToOutputStream(baos);
    w2.writeHeader(h2);
    w2.close();
    baos.close();
    // reopen tmp file
    // Pass 2: in-memory header + gunzipped tmp body form a complete VCF stream.
    @SuppressWarnings("resource") VcfIterator in = new VcfIteratorImpl(new SequenceInputStream(new ByteArrayInputStream(baos.toByteArray()), new GZIPInputStream(new FileInputStream(tmp))));
    w.writeHeader(h2);
    while (in.hasNext()) {
        w.add(in.next());
    }
    in.close();
    tmp.delete();
    return 0;
}
Use of htsjdk.variant.vcf.VCFFilterHeaderLine in the project jvarkit by lindenb: class VcfToRdf, method writeHeader.
/**
 * Emits the RDF triples describing a VCF header: the file itself, its
 * chromosomes, FILTER lines, pedigree members, and samples.
 */
private void writeHeader(final VCFHeader header, final URI source) {
    // Declare the VCF file as an RDF resource when a source URI is known.
    if (source != null) {
        emit(source, "rdf:type", "vcf:File");
    }
    // One vcf:Chromosome per sequence-dictionary entry (if a dictionary exists).
    final SAMSequenceDictionary dictionary = header.getSequenceDictionary();
    if (dictionary != null) {
        for (final SAMSequenceRecord seqRecord : dictionary.getSequences()) {
            emit(URI.create("urn:chrom/" + seqRecord.getSequenceName()), "rdf:type", "vcf:Chromosome", "dc:title", seqRecord.getSequenceName(), "vcf:length", seqRecord.getSequenceLength(), "vcf:sequenceIndex", seqRecord.getSequenceIndex());
        }
    }
    // One vcf:Filter per FILTER meta line, described by its id and description.
    for (final VCFFilterHeaderLine filterLine : header.getFilterLines()) {
        emit(URI.create("urn:filter/" + filterLine.getID()), "rdf:type", "vcf:Filter", "dc:title", filterLine.getID(), "dc:description", filterLine.getValue());
    }
    // Pedigree members become foaf:Person resources with gender/status facts.
    final Pedigree pedigree = Pedigree.newParser().parse(header);
    for (final Pedigree.Person person : pedigree.getPersons()) {
        final URI personUri = URI.create("urn:sample/" + person.getId());
        emit(personUri, "rdf:type", "foaf:Person", "foaf:name", person.getId(), "foaf:family", person.getFamily().getId());
        if (person.isMale()) {
            emit(personUri, "idt:gender", "male");
        }
        if (person.isFemale()) {
            emit(personUri, "idt:gender", "female");
        }
        if (person.isAffected()) {
            emit(personUri, "idt:status", "affected");
        }
        if (person.isUnaffected()) {
            emit(personUri, "idt:status", "unaffected");
        }
    }
    // Sample
    for (final String sampleName : header.getSampleNamesInOrder()) {
        emit(URI.create("urn:sample/" + sampleName), "rdf:type", "vcf:Sample", "dc:title", sampleName);
    }
}
Aggregations