use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb.
the class TestNg01 method getVcfHeader.
static VCFHeader getVcfHeader(final File f) {
final VCFFileReader r = new VCFFileReader(f, false);
final VCFHeader h = r.getFileHeader();
r.close();
return h;
}
use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb.
the class VcfStage method buildInfoHeaderTab.
/**
* build a table describing the INFO column
*/
private Tab buildInfoHeaderTab(final VCFHeader header) {
final TableView<VCFInfoHeaderLine> table = new TableView<>(FXCollections.observableArrayList(header.getInfoHeaderLines()));
table.getColumns().add(makeColumn("ID", F -> F.getID()));
table.getColumns().add(makeColumn("Type", F -> F.getType() == null ? null : F.getType().name()));
table.getColumns().add(makeColumn("Count", F -> F.isFixedCount() ? F.getCount() : null));
table.getColumns().add(makeColumn("Description", F -> F.getDescription()));
final Tab tab = new Tab("INFO", table);
tab.setClosable(false);
table.setPlaceholder(new Label("No INFO defined."));
return tab;
}
use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb.
the class VcfToSql method read.
private void read(File filename) throws IOException {
/* insert ATGC */
this.alleleTable.insert(outputWriter, null, "A");
this.alleleTable.insert(outputWriter, null, "C");
this.alleleTable.insert(outputWriter, null, "G");
this.alleleTable.insert(outputWriter, null, "T");
/* insert this sample */
this.vcfFileTable.insert(outputWriter, null, filename);
final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
final VcfIterator r = VCFUtils.createVcfIteratorFromFile(filename);
final VCFHeader header = r.getHeader();
/* parse samples */
for (final String sampleName : header.getSampleNamesInOrder()) {
this.sampleTable.insert(outputWriter, null, sampleName);
SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
sample2sampleid.put(sampleName, sample_id);
this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
}
/* parse filters */
for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
}
filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
final SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
throw new RuntimeException("dictionary missing in VCF");
}
/* parse sequence dict */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
}
VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
int nVariants = 0;
while (r.hasNext()) {
if (this.outputWriter.checkError())
break;
VariantContext var = progress.watch(r.next());
++nVariants;
/* insert ref allele */
this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
/* insert variant */
this.variantTable.insert(outputWriter, null, vcffile_id, nVariants, chrom2chromId.get(var.getContig()), var.getStart(), (var.hasID() ? var.getID() : null), new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()), (var.hasLog10PError() ? var.getPhredScaledQual() : null));
SelectStmt variant_id = new SelectStmt(variantTable);
/* insert alternate alleles */
for (Allele alt : var.getAlternateAlleles()) {
/* insert alt allele */
this.alleleTable.insert(outputWriter, null, alt.getBaseString());
this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
}
/* insert filters */
for (final String filter : var.getFilters()) {
if (filter2filterid.get(filter) == null) {
throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
}
this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
}
if (!this.ignore_info) {
for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
/*
vepPrediction.insert(
outputWriter,
null,
variant_id,
pred.getEnsemblGene(),
pred.getEnsemblTranscript(),
pred.getEnsemblProtein(),
pred.getSymbol()
);
SelectStmt pred_id = new SelectStmt(vepPrediction);
for(SequenceOntologyTree.Term t: pred.getSOTerms())
{
String term=t.getAcn().replace(':', '_');
soTermTable.insert(
outputWriter,
null,
term,
t.getAcn()
);//for bioportal compatibility
SelectStmt term_id = new SelectStmt(soTermTable,"acn",term);
vepPrediction2so.insert(
outputWriter,
null,
pred_id,
term_id
);
}
*/
}
}
/* insert genotypes */
for (final String sampleName : sample2sampleid.keySet()) {
final Genotype g = var.getGenotype(sampleName);
if (!g.isAvailable() || g.isNoCall())
continue;
genotypeTable.insert(outputWriter, null, variant_id, sample2sampleid.get(sampleName), g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null, g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null, g.hasDP() ? g.getDP() : null, g.hasGQ() ? g.getGQ() : null);
}
}
r.close();
}
use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb.
the class VCFPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, VariantContextWriter w) {
ReferenceContig genomicSequence = null;
try {
LOG.info("opening REF:" + this.referenceGenomeSource);
this.referenceGenome = new ReferenceGenomeFactory().open(this.referenceGenomeSource);
loadKnownGenesFromUri();
final VCFHeader header = (VCFHeader) r.getHeader();
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(this.referenceGenome.getDictionary());
contigNameConverter.setOnNotFound(OnNotFound.SKIP);
final VCFHeader h2 = new VCFHeader(header);
addMetaData(h2);
switch(this.outputSyntax) {
case Vep:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
break;
}
case SnpEff:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
break;
}
default:
{
final StringBuilder format = new StringBuilder();
for (FORMAT1 f : FORMAT1.values()) {
if (format.length() > 0)
format.append("|");
format.append(f.name());
}
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
break;
}
}
w.writeHeader(h2);
final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
final SequenceOntologyTree.Term so_intron = soTree.getTermByAcn("SO:0001627");
final SequenceOntologyTree.Term so_exon = soTree.getTermByAcn("SO:0001791");
final SequenceOntologyTree.Term so_splice_donor = soTree.getTermByAcn("SO:0001575");
final SequenceOntologyTree.Term so_splice_acceptor = soTree.getTermByAcn("SO:0001574");
final SequenceOntologyTree.Term so_5_prime_UTR_variant = soTree.getTermByAcn("SO:0001623");
final SequenceOntologyTree.Term so_3_prime_UTR_variant = soTree.getTermByAcn("SO:0001624");
final SequenceOntologyTree.Term so_splicing_variant = soTree.getTermByAcn("SO:0001568");
final SequenceOntologyTree.Term so_stop_lost = soTree.getTermByAcn("SO:0001578");
final SequenceOntologyTree.Term so_stop_gained = soTree.getTermByAcn("SO:0001587");
final SequenceOntologyTree.Term so_coding_synonymous = soTree.getTermByAcn("SO:0001819");
final SequenceOntologyTree.Term so_coding_non_synonymous = soTree.getTermByAcn("SO:0001583");
final SequenceOntologyTree.Term so_intergenic = soTree.getTermByAcn("SO:0001628");
final SequenceOntologyTree.Term so_nc_transcript_variant = soTree.getTermByAcn("SO:0001619");
final SequenceOntologyTree.Term so_non_coding_exon_variant = soTree.getTermByAcn("SO:0001792");
final SequenceOntologyTree.Term _2KB_upstream_variant = soTree.getTermByAcn("SO:0001636");
final SequenceOntologyTree.Term _5KB_upstream_variant = soTree.getTermByAcn("SO:0001635");
final SequenceOntologyTree.Term _5KB_downstream_variant = soTree.getTermByAcn("SO:0001633");
final SequenceOntologyTree.Term _500bp_downstream_variant = soTree.getTermByAcn("SO:0001634");
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (r.hasNext()) {
final VariantContext ctx = progress.watch(r.next());
final String normalizedContig = contigNameConverter.apply(ctx.getContig());
final List<KnownGene> genes = new ArrayList<>();
if (!StringUtil.isBlank(normalizedContig)) {
for (final List<KnownGene> l2 : this.knownGenes.getOverlapping(new Interval(normalizedContig, ctx.getStart(), // 1-based
ctx.getEnd()))) {
genes.addAll(l2);
}
}
final List<Annotation> ctx_annotations = new ArrayList<Annotation>();
if (genes == null || genes.isEmpty()) {
// intergenic
Annotation a = new Annotation();
a.seqont.add(so_intergenic);
ctx_annotations.add(a);
} else {
if (genomicSequence == null || !genomicSequence.hasName(normalizedContig)) {
LOG.info("getting genomic Sequence for " + normalizedContig);
genomicSequence = this.referenceGenome.getContig(normalizedContig);
if (genomicSequence == null)
throw new JvarkitException.ContigNotFoundInDictionary(normalizedContig, this.referenceGenome.getDictionary());
}
for (final KnownGene gene : genes) {
final GeneticCode geneticCode = GeneticCode.getStandard();
for (final Allele alt2 : ctx.getAlternateAlleles()) {
if (alt2.isNoCall())
continue;
if (alt2.isSymbolic()) {
LOG.warn("symbolic allele are not handled... " + alt2.getDisplayString());
continue;
}
if (alt2.isReference())
continue;
final Annotation annotations = new Annotation();
annotations.kg = gene;
annotations.alt2 = alt2;
if (gene.isNonCoding()) {
annotations.seqont.add(so_nc_transcript_variant);
continue;
}
ctx_annotations.add(annotations);
StringBuilder wildRNA = null;
ProteinCharSequence wildProt = null;
ProteinCharSequence mutProt = null;
MutedSequence mutRNA = null;
int position_in_cds = -1;
final int position = ctx.getStart() - 1;
if (!String.valueOf(genomicSequence.charAt(position)).equalsIgnoreCase(ctx.getReference().getBaseString())) {
if (isSimpleBase(ctx.getReference())) {
LOG.warn("Warning REF!=GENOMIC SEQ!!! at " + position + "/" + ctx.getReference());
}
continue;
}
if (gene.isPositiveStrand()) {
if (position < gene.getTxStart() - 2000) {
annotations.seqont.add(_5KB_upstream_variant);
} else if (position < gene.getTxStart()) {
annotations.seqont.add(_2KB_upstream_variant);
} else if (position >= gene.getTxEnd() + 500) {
annotations.seqont.add(_5KB_downstream_variant);
} else if (position >= gene.getTxEnd()) {
annotations.seqont.add(_500bp_downstream_variant);
} else if (position < gene.getCdsStart()) {
// UTR5
annotations.seqont.add(so_5_prime_UTR_variant);
} else if (gene.getCdsEnd() <= position) {
annotations.seqont.add(so_3_prime_UTR_variant);
} else {
int exon_index = 0;
while (exon_index < gene.getExonCount()) {
final KnownGene.Exon exon = gene.getExon(exon_index);
for (int i = exon.getStart(); i < exon.getEnd(); ++i) {
if (i == position) {
annotations.exon_name = exon.getName();
if (exon.isNonCoding()) {
annotations.seqont.add(so_non_coding_exon_variant);
}
}
if (i < gene.getTxStart())
continue;
if (i < gene.getCdsStart())
continue;
if (i >= gene.getCdsEnd())
break;
if (wildRNA == null) {
wildRNA = new StringBuilder();
mutRNA = new MutedSequence(wildRNA);
}
if (i == position) {
annotations.seqont.add(so_exon);
annotations.exon_name = exon.getName();
position_in_cds = wildRNA.length();
annotations.position_cds = position_in_cds;
// in splicing ?
if (exon.isSplicing(position)) {
if (exon.isSplicingAcceptor(position)) {
// SPLICING_ACCEPTOR
annotations.seqont.add(so_splice_acceptor);
} else if (exon.isSplicingDonor(position)) {
// SPLICING_DONOR
annotations.seqont.add(so_splice_donor);
} else // ??
{
annotations.seqont.add(so_splicing_variant);
}
}
}
wildRNA.append(genomicSequence.charAt(i));
if (i == position && isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
mutRNA.put(position_in_cds, alt2.getBaseString().charAt(0));
}
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
mutProt = new ProteinCharSequence(geneticCode, mutRNA);
}
}
final KnownGene.Intron intron = exon.getNextIntron();
if (intron != null && intron.contains(position)) {
annotations.intron_name = intron.getName();
annotations.seqont.add(so_intron);
if (intron.isSplicing(position)) {
if (intron.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (intron.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ???
{
annotations.seqont.add(so_splicing_variant);
}
}
}
++exon_index;
}
}
} else // reverse orientation
{
if (position >= gene.getTxEnd() + 2000) {
annotations.seqont.add(_5KB_upstream_variant);
} else if (position >= gene.getTxEnd()) {
annotations.seqont.add(_2KB_upstream_variant);
} else if (position < gene.getTxStart() - 500) {
annotations.seqont.add(_5KB_downstream_variant);
} else if (position < gene.getTxStart()) {
annotations.seqont.add(_500bp_downstream_variant);
} else if (position < gene.getCdsStart()) {
annotations.seqont.add(so_3_prime_UTR_variant);
} else if (gene.getCdsEnd() <= position) {
annotations.seqont.add(so_5_prime_UTR_variant);
} else {
int exon_index = gene.getExonCount() - 1;
while (exon_index >= 0) {
final KnownGene.Exon exon = gene.getExon(exon_index);
for (int i = exon.getEnd() - 1; i >= exon.getStart(); --i) {
if (i == position) {
annotations.exon_name = exon.getName();
if (exon.isNonCoding()) {
annotations.seqont.add(so_non_coding_exon_variant);
}
}
if (i >= gene.getCdsEnd())
continue;
if (i < gene.getCdsStart())
break;
if (wildRNA == null) {
wildRNA = new StringBuilder();
mutRNA = new MutedSequence(wildRNA);
}
if (i == position) {
annotations.seqont.add(so_exon);
position_in_cds = wildRNA.length();
annotations.position_cds = position_in_cds;
// in splicing ?
if (exon.isSplicing(position)) {
if (exon.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (exon.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ?
{
annotations.seqont.add(so_splicing_variant);
}
}
if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
mutRNA.put(position_in_cds, AcidNucleics.complement(alt2.getBaseString().charAt(0)));
}
}
wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(i)));
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
mutProt = new ProteinCharSequence(geneticCode, mutRNA);
}
}
final KnownGene.Intron intron = exon.getPrevIntron();
if (intron != null && intron.contains(position)) {
annotations.intron_name = intron.getName();
annotations.seqont.add(so_intron);
if (intron.isSplicing(position)) {
if (intron.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (intron.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ?
{
annotations.seqont.add(so_splicing_variant);
}
}
}
--exon_index;
}
}
}
if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference()) && wildProt != null && mutProt != null && position_in_cds >= 0) {
final int pos_aa = position_in_cds / 3;
final int mod = position_in_cds % 3;
annotations.wildCodon = ("" + wildRNA.charAt(position_in_cds - mod + 0) + wildRNA.charAt(position_in_cds - mod + 1) + wildRNA.charAt(position_in_cds - mod + 2));
annotations.mutCodon = ("" + mutRNA.charAt(position_in_cds - mod + 0) + mutRNA.charAt(position_in_cds - mod + 1) + mutRNA.charAt(position_in_cds - mod + 2));
annotations.position_protein = (pos_aa + 1);
annotations.wildAA = String.valueOf(wildProt.charAt(pos_aa));
annotations.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
annotations.seqont.remove(so_exon);
if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
annotations.seqont.add(so_stop_lost);
} else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
annotations.seqont.add(so_stop_gained);
} else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
annotations.seqont.add(so_coding_synonymous);
} else {
annotations.seqont.add(so_coding_non_synonymous);
}
}
}
}
}
final Set<String> info = new HashSet<String>(ctx_annotations.size());
for (final Annotation a : ctx_annotations) {
info.add(a.toString());
}
final VariantContextBuilder vb = new VariantContextBuilder(ctx);
final String thetag;
switch(this.outputSyntax) {
case Vep:
thetag = "CSQ";
break;
case SnpEff:
thetag = "ANN";
break;
default:
thetag = TAG;
break;
}
vb.attribute(thetag, info.toArray());
w.add(vb.make());
}
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.referenceGenome);
}
}
use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb.
the class VCFMerge method workUsingPeekIterator.
private int workUsingPeekIterator() {
VariantContextWriter out = null;
final List<PeekVCF> input = new ArrayList<PeekVCF>();
try {
final Set<String> genotypeSampleNames = new TreeSet<String>();
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
// get all VCF, check same dict
for (final String arg : this.userVcfFiles) {
LOG.info("Opening " + arg);
final PeekVCF p = new PeekVCF(arg);
input.add(p);
genotypeSampleNames.addAll(p.header.getSampleNamesInOrder());
metaData.addAll(p.header.getMetaDataInInputOrder());
if (this.global_dictionary == null) {
this.global_dictionary = p.header.getSequenceDictionary();
} else if (!SequenceUtil.areSequenceDictionariesEqual(this.global_dictionary, p.header.getSequenceDictionary())) {
throw new JvarkitException.DictionariesAreNotTheSame(this.global_dictionary, p.header.getSequenceDictionary());
}
}
if (this.global_dictionary == null) {
throw new IllegalStateException("No Dict");
}
// super.addMetaData(metaData);
if (this.doNotMergeRowLines) {
metaData.add(NO_MERGE_INFO_HEADER);
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.global_dictionary);
out = super.openVariantContextWriter(this.outputFile);
final VCFHeader headerOut = new VCFHeader(metaData, genotypeSampleNames);
out.writeHeader(headerOut);
final List<VariantContext> row = new ArrayList<VariantContext>(input.size());
// find smallest ordered variant
long nCountForGC = 0L;
for (; ; ) {
row.clear();
for (final PeekVCF peekVcf : input) {
final List<VariantContext> peekVcfList = peekVcf.peek();
if (peekVcf.peek().isEmpty())
continue;
final VariantContext ctx = peekVcfList.get(0);
if (row.isEmpty()) {
row.addAll(peekVcfList);
continue;
}
final int cmp = this.compareChromPosRef.compare(ctx, row.get(0));
if (cmp == 0) {
row.addAll(peekVcfList);
} else if (cmp < 0) {
row.clear();
row.addAll(peekVcfList);
}
}
if (row.isEmpty())
break;
for (final VariantContext merged : buildContextFromVariantContext(headerOut, row)) {
out.add(progress.watch(merged));
}
// consumme peeked variants
for (final PeekVCF peekVcf : input) {
peekVcf.reset(row.get(0));
}
if (nCountForGC++ % 100000 == 0)
System.gc();
}
for (final PeekVCF peekVcf : input) {
peekVcf.close();
}
input.clear();
CloserUtil.close(out);
out = null;
progress.finish();
LOG.info("Done peek sorting");
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
for (final PeekVCF p : input) {
p.close();
}
}
}
Aggregations