Example use of htsjdk.variant.vcf.VCFFilterHeaderLine in the gatk project (broadinstitute):
class FilterMutectCalls, method onTraversalStart.
@Override
public void onTraversalStart() {
    final VCFHeader inputHeader = getHeaderForVariants();
    // Start from every metadata line already present in the input header.
    final Set<VCFHeaderLine> headerLines = new HashSet<>(inputHeader.getMetaDataInSortedOrder());
    // Standard Mutect2 filter definitions come from GATKVCFHeaderLines.
    for (final String filterName : Mutect2FilteringEngine.M_2_FILTER_NAMES) {
        headerLines.add(GATKVCFHeaderLines.getFilterLine(filterName));
    }
    // Filters with no canonical definition are declared inline as id/description pairs.
    final String[][] extraFilters = {
        { Mutect2FilteringEngine.ARTIFACT_IN_NORMAL_FILTER_NAME, "artifact_in_normal" },
        { Mutect2FilteringEngine.MEDIAN_BASE_QUALITY_DIFFERENCE_FILTER_NAME, "ref - alt median base quality" },
        { Mutect2FilteringEngine.MEDIAN_MAPPING_QUALITY_DIFFERENCE_FILTER_NAME, "ref - alt median mapping quality" },
        { Mutect2FilteringEngine.MEDIAN_CLIPPING_DIFFERENCE_FILTER_NAME, "ref - alt median clipping" },
        { Mutect2FilteringEngine.MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_FILTER_NAME, "abs(ref - alt) median fragment length" },
        { Mutect2FilteringEngine.READ_POSITION_FILTER_NAME, "median distance of alt variants from end of reads" },
        { Mutect2FilteringEngine.CONTAMINATION_FILTER_NAME, "contamination" }
    };
    for (final String[] idAndDescription : extraFilters) {
        headerLines.add(new VCFFilterHeaderLine(idAndDescription[0], idAndDescription[1]));
    }
    headerLines.addAll(getDefaultToolVCFHeaderLines());
    // Emit a header carrying the same samples as the input plus all filter definitions.
    final VCFHeader outputHeader = new VCFHeader(headerLines, inputHeader.getGenotypeSamples());
    vcfWriter = createVCFWriter(new File(outputVcf));
    vcfWriter.writeHeader(outputHeader);
}
Example use of htsjdk.variant.vcf.VCFFilterHeaderLine in the gatk project (broadinstitute):
class FilterVcf, method doWork.
@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    // Site-level filters (allele balance, strand bias, quality-by-depth) and
    // genotype-level filters (GQ, DP), built from the command-line thresholds.
    final List<VariantFilter> siteFilters = CollectionUtil.makeList(new AlleleBalanceFilter(MIN_AB), new FisherStrandFilter(MAX_FS), new QdFilter(MIN_QD));
    final List<GenotypeFilter> sampleFilters = CollectionUtil.makeList(new GenotypeQualityFilter(MIN_GQ), new DepthFilter(MIN_DP));
    try (final VCFFileReader reader = new VCFFileReader(INPUT, false)) {
        final FilterApplyingVariantIterator filteredVariants = new FilterApplyingVariantIterator(reader.iterator(), siteFilters, sampleFilters);
        try (final VariantContextWriter writer = new VariantContextWriterBuilder().setOutputFile(OUTPUT).build()) {
            final VCFHeader header = reader.getFileHeader();
            // Declare the site-level catch-all filter and the per-genotype FT field...
            header.addMetaDataLine(new VCFFilterHeaderLine("AllGtsFiltered", "Site filtered out because all genotypes are filtered out."));
            header.addMetaDataLine(new VCFFormatHeaderLine("FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype filters."));
            // ...plus every filter line that each site-level filter declares.
            for (final VariantFilter siteFilter : siteFilters) {
                siteFilter.headerLines().forEach(header::addMetaDataLine);
            }
            writer.writeHeader(reader.getFileHeader());
            // Stream the filtered variants straight into the output writer.
            while (filteredVariants.hasNext()) {
                writer.add(filteredVariants.next());
            }
        }
    }
    return null;
}
Example use of htsjdk.variant.vcf.VCFFilterHeaderLine in the jvarkit project (lindenb):
class HaloplexParasite, method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    SamReader samReader = null;
    final List<Mutation> mutations = new ArrayList<>();
    try {
        // One BAM path per non-blank, non-'#' line of the BAM list file.
        // BUG FIX: the stream returned by Files.lines must be closed, or the
        // underlying file handle leaks; close it with try-with-resources.
        final Set<File> bamFiles;
        try (java.util.stream.Stream<String> bamLines = Files.lines(this.bamList.toPath())) {
            bamFiles = bamLines.filter(T -> !(T.isEmpty() || T.startsWith("#"))).map(T -> new File(T)).collect(Collectors.toSet());
        }
        final VCFHeader header = new VCFHeader();
        header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
        final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
        final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
        super.addMetaData(header);
        // BUG FIX: register the FILTER/INFO lines BEFORE writing the header.
        // The old code added them after writeHeader(), so their definitions
        // never appeared in the emitted VCF header.
        header.addMetaDataLine(filter);
        header.addMetaDataLine(infoWord);
        out.writeHeader(header);
        // First pass: copy non-indel/non-mixed variants straight through and
        // collect candidate mutations (indel/mixed sites having at least one
        // concrete allele of length >= minClipSize).
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
            if (!(ctx.isIndel() || ctx.isMixed())) {
                out.add(vcb.make());
                continue;
            }
            if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
                out.add(vcb.make());
                continue;
            }
            final Mutation mut = new Mutation(ctx);
            mutations.add(mut);
        }
        // Count, across all BAMs, the soft-clip "words" of length minClipSize
        // observed around each candidate mutation (plus reverse complements).
        final Counter<String> words = new Counter<>();
        for (final File bamFile : bamFiles) {
            LOG.info("Opening " + bamFile);
            samReader = createSamReaderFactory().open(bamFile);
            for (final Mutation mut : mutations) {
                // words seen in that mutation : don't use a Counter
                final Set<String> mutWords = new HashSet<>();
                /* loop over reads overlapping that mutation */
                final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
                while (sri.hasNext()) {
                    final SAMRecord rec = sri.next();
                    if (rec.getReadUnmappedFlag())
                        continue;
                    if (rec.isSecondaryOrSupplementary())
                        continue;
                    if (rec.getDuplicateReadFlag())
                        continue;
                    if (rec.getReadFailsVendorQualityCheckFlag())
                        continue;
                    final Cigar cigar = rec.getCigar();
                    // a single-element cigar cannot contain a soft clip plus aligned bases
                    if (cigar.numCigarElements() == 1)
                        continue;
                    final byte[] bases = rec.getReadBases();
                    int refpos = rec.getUnclippedStart();
                    int readpos = 0;
                    /* scan cigar elements, looking for soft clips overlapping the mutation */
                    for (final CigarElement ce : cigar) {
                        final CigarOperator op = ce.getOperator();
                        final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
                        final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
                        /* check clip is large enough */
                        if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
                            /* check clip overlaps the mutation */
                            if (!(ref_end < mut.start || refpos > mut.end)) {
                                /* break the soft clip into overlapping words of length minClipSize */
                                for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
                                    final String substr = new String(bases, readpos + x, this.minClipSize);
                                    if (!substr.contains("N")) {
                                        final String revcomp = AcidNucleics.reverseComplement(substr);
                                        mutWords.add(substr);
                                        if (!revcomp.equals(substr))
                                            mutWords.add(revcomp);
                                    }
                                }
                            }
                        }
                        refpos = ref_end;
                        readpos = read_end;
                    }
                }
                sri.close();
                // each distinct word counts once per (BAM, mutation) pair
                for (final String w : mutWords) {
                    words.incr(w);
                }
            }
            // end of for(mutations)
            samReader.close();
            samReader = null;
        }
        LOG.info("mutations:" + mutations.size());
        // Second pass: emit the collected mutations, annotated with the most
        // frequent matching word and FILTERed when its fraction is too high.
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        for (final Mutation mut : mutations) {
            progress.watch(mut.contig, mut.start);
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
            String worstString = null;
            Double worstFraction = null;
            final double totalWords = words.getTotal();
            for (final Allele a : mut.alleles) {
                if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
                    continue;
                for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
                    final String substr = new String(a.getBases(), x, this.minClipSize);
                    final long count = words.count(substr);
                    // NOTE(review): if no words were collected, totalWords is 0 and
                    // fraction is NaN; NaN fails the threshold test below, so the
                    // variant is simply not filtered in that case.
                    final double fraction = count / totalWords;
                    if (worstFraction == null || worstFraction < fraction) {
                        worstString = substr + "|" + count + "|" + fraction;
                        worstFraction = fraction;
                    }
                }
            }
            if (worstString != null) {
                vcb.attribute(infoWord.getID(), worstString);
            }
            if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
                vcb.filter(filter.getID());
            }
            out.add(vcb.make());
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(samReader);
    }
}
Example use of htsjdk.variant.vcf.VCFFilterHeaderLine in the jvarkit project (lindenb):
class VcfBurdenFilterGenes, method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    final VCFHeader header = in.getHeader();
    try {
        final VCFHeader h2 = addMetaData(new VCFHeader(header));
        // Two modes: with a filter tag, non-matching variants are kept but FILTERed;
        // without one, non-matching variants are dropped and the VEP/ANN attributes
        // of kept variants are rewritten to contain only the matching predictions.
        final VCFFilterHeaderLine filterControlsHeader;
        if (!StringUtil.isBlank(this.filterTag)) {
            filterControlsHeader = new VCFFilterHeaderLine(this.filterTag.trim(), "Genes not in list " + this.geneFile);
            h2.addMetaDataLine(filterControlsHeader);
        } else {
            filterControlsHeader = null;
        }
        // VEP columns that may carry a gene identifier.
        final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary()).logger(LOG);
        out.writeHeader(h2);
        while (in.hasNext() && !out.checkError()) {
            final VariantContext ctx = progess.watch(in.next());
            boolean keep = false;
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            // drop-mode: clear the prediction attributes; the matching
            // predictions are re-installed below
            if (filterControlsHeader == null) {
                vcb.rmAttribute(vepParser.getTag());
                vcb.rmAttribute(annParser.getTag());
            }
            // collect every VEP prediction whose gene column matches the gene list
            final List<String> newVepList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                for (final String col : lookColumns) {
                    final String token = pred.getByCol(col);
                    if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                        newVepList.add(predStr);
                        keep = true;
                        break; // this prediction already matched; try the next one
                    }
                }
            }
            // collect every ANN prediction whose gene name matches the gene list
            final List<String> newEffList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                final String token = pred.getGeneName();
                if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                    newEffList.add(predStr);
                    keep = true;
                    // BUG FIX: no 'break' here. The old code broke out of the whole
                    // prediction loop after the first match, dropping every later
                    // matching ANN entry when the attribute is rewritten below —
                    // inconsistent with the VEP loop above, whose break only exits
                    // the inner column loop and so keeps ALL matching predictions.
                }
            }
            // drop-mode: re-install the surviving predictions
            if (filterControlsHeader == null) {
                if (!newVepList.isEmpty())
                    vcb.attribute(vepParser.getTag(), newVepList);
                if (!newEffList.isEmpty())
                    vcb.attribute(annParser.getTag(), newEffList);
            }
            if (filterControlsHeader != null) {
                if (!keep) {
                    vcb.filter(filterControlsHeader.getID());
                } else if (!ctx.isFiltered()) {
                    vcb.passFilters();
                }
                out.add(vcb.make());
            } else {
                if (keep)
                    out.add(vcb.make());
            }
        }
        progess.finish();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
    }
}
Example use of htsjdk.variant.vcf.VCFFilterHeaderLine in the jvarkit project (lindenb):
class VcfToSql, method read.
private void read(File filename) throws IOException {
    /* seed the allele table with the four canonical bases */
    this.alleleTable.insert(outputWriter, null, "A");
    this.alleleTable.insert(outputWriter, null, "C");
    this.alleleTable.insert(outputWriter, null, "G");
    this.alleleTable.insert(outputWriter, null, "T");
    /* register this VCF file */
    this.vcfFileTable.insert(outputWriter, null, filename);
    final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
    // caches mapping names to the SELECT statements that resolve their row ids
    final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
    final VcfIterator r = VCFUtils.createVcfIteratorFromFile(filename);
    try {
        final VCFHeader header = r.getHeader();
        /* parse samples */
        for (final String sampleName : header.getSampleNamesInOrder()) {
            this.sampleTable.insert(outputWriter, null, sampleName);
            SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
            sample2sampleid.put(sampleName, sample_id);
            this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
        }
        /* parse filters declared in the header, plus the implicit PASS filter */
        for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
            this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
            filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
        }
        filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        if (dict == null) {
            throw new RuntimeException("dictionary missing in VCF");
        }
        /* parse sequence dict */
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
            chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
        }
        VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        int nVariants = 0;
        while (r.hasNext()) {
            // stop early if the SQL output stream reports an error
            if (this.outputWriter.checkError())
                break;
            VariantContext var = progress.watch(r.next());
            ++nVariants;
            /* insert ref allele */
            this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
            /* insert variant */
            this.variantTable.insert(outputWriter, null, vcffile_id, nVariants, chrom2chromId.get(var.getContig()), var.getStart(), (var.hasID() ? var.getID() : null), new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()), (var.hasLog10PError() ? var.getPhredScaledQual() : null));
            SelectStmt variant_id = new SelectStmt(variantTable);
            /* insert alternate alleles */
            for (Allele alt : var.getAlternateAlleles()) {
                /* insert alt allele */
                this.alleleTable.insert(outputWriter, null, alt.getBaseString());
                this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
            }
            /* insert filters; every FILTER value must have been declared in the header */
            for (final String filter : var.getFilters()) {
                if (filter2filterid.get(filter) == null) {
                    throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
                }
                this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
            }
            if (!this.ignore_info) {
                for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
                    /*
                    vepPrediction.insert(
                    outputWriter,
                    null,
                    variant_id,
                    pred.getEnsemblGene(),
                    pred.getEnsemblTranscript(),
                    pred.getEnsemblProtein(),
                    pred.getSymbol()
                    );
                    SelectStmt pred_id = new SelectStmt(vepPrediction);
                    for(SequenceOntologyTree.Term t: pred.getSOTerms())
                    {
                    String term=t.getAcn().replace(':', '_');
                    soTermTable.insert(
                    outputWriter,
                    null,
                    term,
                    t.getAcn()
                    );//for bioportal compatibility
                    SelectStmt term_id = new SelectStmt(soTermTable,"acn",term);
                    vepPrediction2so.insert(
                    outputWriter,
                    null,
                    pred_id,
                    term_id
                    );
                    }
                    */
                }
            }
            /* insert genotypes */
            for (final String sampleName : sample2sampleid.keySet()) {
                final Genotype g = var.getGenotype(sampleName);
                if (!g.isAvailable() || g.isNoCall())
                    continue;
                genotypeTable.insert(outputWriter, null, variant_id, sample2sampleid.get(sampleName), g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null, g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null, g.hasDP() ? g.getDP() : null, g.hasGQ() ? g.getGQ() : null);
            }
        }
    } finally {
        // BUG FIX: the iterator was previously closed only on the success path,
        // leaking the underlying stream whenever an exception was thrown (e.g.
        // missing dictionary, undeclared filter) or the output errored out.
        r.close();
    }
}
Aggregations