use of com.github.lindenb.jvarkit.util.bio.GeneticCode in project jvarkit by lindenb.
the class VCFPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, VariantContextWriter w) {
ReferenceContig genomicSequence = null;
try {
LOG.info("opening REF:" + this.referenceGenomeSource);
this.referenceGenome = new ReferenceGenomeFactory().open(this.referenceGenomeSource);
loadKnownGenesFromUri();
final VCFHeader header = (VCFHeader) r.getHeader();
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(this.referenceGenome.getDictionary());
contigNameConverter.setOnNotFound(OnNotFound.SKIP);
final VCFHeader h2 = new VCFHeader(header);
addMetaData(h2);
switch(this.outputSyntax) {
case Vep:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
break;
}
case SnpEff:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
break;
}
default:
{
final StringBuilder format = new StringBuilder();
for (FORMAT1 f : FORMAT1.values()) {
if (format.length() > 0)
format.append("|");
format.append(f.name());
}
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
break;
}
}
w.writeHeader(h2);
final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
final SequenceOntologyTree.Term so_intron = soTree.getTermByAcn("SO:0001627");
final SequenceOntologyTree.Term so_exon = soTree.getTermByAcn("SO:0001791");
final SequenceOntologyTree.Term so_splice_donor = soTree.getTermByAcn("SO:0001575");
final SequenceOntologyTree.Term so_splice_acceptor = soTree.getTermByAcn("SO:0001574");
final SequenceOntologyTree.Term so_5_prime_UTR_variant = soTree.getTermByAcn("SO:0001623");
final SequenceOntologyTree.Term so_3_prime_UTR_variant = soTree.getTermByAcn("SO:0001624");
final SequenceOntologyTree.Term so_splicing_variant = soTree.getTermByAcn("SO:0001568");
final SequenceOntologyTree.Term so_stop_lost = soTree.getTermByAcn("SO:0001578");
final SequenceOntologyTree.Term so_stop_gained = soTree.getTermByAcn("SO:0001587");
final SequenceOntologyTree.Term so_coding_synonymous = soTree.getTermByAcn("SO:0001819");
final SequenceOntologyTree.Term so_coding_non_synonymous = soTree.getTermByAcn("SO:0001583");
final SequenceOntologyTree.Term so_intergenic = soTree.getTermByAcn("SO:0001628");
final SequenceOntologyTree.Term so_nc_transcript_variant = soTree.getTermByAcn("SO:0001619");
final SequenceOntologyTree.Term so_non_coding_exon_variant = soTree.getTermByAcn("SO:0001792");
final SequenceOntologyTree.Term _2KB_upstream_variant = soTree.getTermByAcn("SO:0001636");
final SequenceOntologyTree.Term _5KB_upstream_variant = soTree.getTermByAcn("SO:0001635");
final SequenceOntologyTree.Term _5KB_downstream_variant = soTree.getTermByAcn("SO:0001633");
final SequenceOntologyTree.Term _500bp_downstream_variant = soTree.getTermByAcn("SO:0001634");
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (r.hasNext()) {
final VariantContext ctx = progress.watch(r.next());
final String normalizedContig = contigNameConverter.apply(ctx.getContig());
final List<KnownGene> genes = new ArrayList<>();
if (!StringUtil.isBlank(normalizedContig)) {
for (final List<KnownGene> l2 : this.knownGenes.getOverlapping(new Interval(normalizedContig, ctx.getStart(), // 1-based
ctx.getEnd()))) {
genes.addAll(l2);
}
}
final List<Annotation> ctx_annotations = new ArrayList<Annotation>();
if (genes == null || genes.isEmpty()) {
// intergenic
Annotation a = new Annotation();
a.seqont.add(so_intergenic);
ctx_annotations.add(a);
} else {
if (genomicSequence == null || !genomicSequence.hasName(normalizedContig)) {
LOG.info("getting genomic Sequence for " + normalizedContig);
genomicSequence = this.referenceGenome.getContig(normalizedContig);
if (genomicSequence == null)
throw new JvarkitException.ContigNotFoundInDictionary(normalizedContig, this.referenceGenome.getDictionary());
}
for (final KnownGene gene : genes) {
final GeneticCode geneticCode = GeneticCode.getStandard();
for (final Allele alt2 : ctx.getAlternateAlleles()) {
if (alt2.isNoCall())
continue;
if (alt2.isSymbolic()) {
LOG.warn("symbolic allele are not handled... " + alt2.getDisplayString());
continue;
}
if (alt2.isReference())
continue;
final Annotation annotations = new Annotation();
annotations.kg = gene;
annotations.alt2 = alt2;
if (gene.isNonCoding()) {
annotations.seqont.add(so_nc_transcript_variant);
continue;
}
ctx_annotations.add(annotations);
StringBuilder wildRNA = null;
ProteinCharSequence wildProt = null;
ProteinCharSequence mutProt = null;
MutedSequence mutRNA = null;
int position_in_cds = -1;
final int position = ctx.getStart() - 1;
if (!String.valueOf(genomicSequence.charAt(position)).equalsIgnoreCase(ctx.getReference().getBaseString())) {
if (isSimpleBase(ctx.getReference())) {
LOG.warn("Warning REF!=GENOMIC SEQ!!! at " + position + "/" + ctx.getReference());
}
continue;
}
if (gene.isPositiveStrand()) {
if (position < gene.getTxStart() - 2000) {
annotations.seqont.add(_5KB_upstream_variant);
} else if (position < gene.getTxStart()) {
annotations.seqont.add(_2KB_upstream_variant);
} else if (position >= gene.getTxEnd() + 500) {
annotations.seqont.add(_5KB_downstream_variant);
} else if (position >= gene.getTxEnd()) {
annotations.seqont.add(_500bp_downstream_variant);
} else if (position < gene.getCdsStart()) {
// UTR5
annotations.seqont.add(so_5_prime_UTR_variant);
} else if (gene.getCdsEnd() <= position) {
annotations.seqont.add(so_3_prime_UTR_variant);
} else {
int exon_index = 0;
while (exon_index < gene.getExonCount()) {
final KnownGene.Exon exon = gene.getExon(exon_index);
for (int i = exon.getStart(); i < exon.getEnd(); ++i) {
if (i == position) {
annotations.exon_name = exon.getName();
if (exon.isNonCoding()) {
annotations.seqont.add(so_non_coding_exon_variant);
}
}
if (i < gene.getTxStart())
continue;
if (i < gene.getCdsStart())
continue;
if (i >= gene.getCdsEnd())
break;
if (wildRNA == null) {
wildRNA = new StringBuilder();
mutRNA = new MutedSequence(wildRNA);
}
if (i == position) {
annotations.seqont.add(so_exon);
annotations.exon_name = exon.getName();
position_in_cds = wildRNA.length();
annotations.position_cds = position_in_cds;
// in splicing ?
if (exon.isSplicing(position)) {
if (exon.isSplicingAcceptor(position)) {
// SPLICING_ACCEPTOR
annotations.seqont.add(so_splice_acceptor);
} else if (exon.isSplicingDonor(position)) {
// SPLICING_DONOR
annotations.seqont.add(so_splice_donor);
} else // ??
{
annotations.seqont.add(so_splicing_variant);
}
}
}
wildRNA.append(genomicSequence.charAt(i));
if (i == position && isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
mutRNA.put(position_in_cds, alt2.getBaseString().charAt(0));
}
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
mutProt = new ProteinCharSequence(geneticCode, mutRNA);
}
}
final KnownGene.Intron intron = exon.getNextIntron();
if (intron != null && intron.contains(position)) {
annotations.intron_name = intron.getName();
annotations.seqont.add(so_intron);
if (intron.isSplicing(position)) {
if (intron.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (intron.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ???
{
annotations.seqont.add(so_splicing_variant);
}
}
}
++exon_index;
}
}
} else // reverse orientation
{
if (position >= gene.getTxEnd() + 2000) {
annotations.seqont.add(_5KB_upstream_variant);
} else if (position >= gene.getTxEnd()) {
annotations.seqont.add(_2KB_upstream_variant);
} else if (position < gene.getTxStart() - 500) {
annotations.seqont.add(_5KB_downstream_variant);
} else if (position < gene.getTxStart()) {
annotations.seqont.add(_500bp_downstream_variant);
} else if (position < gene.getCdsStart()) {
annotations.seqont.add(so_3_prime_UTR_variant);
} else if (gene.getCdsEnd() <= position) {
annotations.seqont.add(so_5_prime_UTR_variant);
} else {
int exon_index = gene.getExonCount() - 1;
while (exon_index >= 0) {
final KnownGene.Exon exon = gene.getExon(exon_index);
for (int i = exon.getEnd() - 1; i >= exon.getStart(); --i) {
if (i == position) {
annotations.exon_name = exon.getName();
if (exon.isNonCoding()) {
annotations.seqont.add(so_non_coding_exon_variant);
}
}
if (i >= gene.getCdsEnd())
continue;
if (i < gene.getCdsStart())
break;
if (wildRNA == null) {
wildRNA = new StringBuilder();
mutRNA = new MutedSequence(wildRNA);
}
if (i == position) {
annotations.seqont.add(so_exon);
position_in_cds = wildRNA.length();
annotations.position_cds = position_in_cds;
// in splicing ?
if (exon.isSplicing(position)) {
if (exon.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (exon.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ?
{
annotations.seqont.add(so_splicing_variant);
}
}
if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
mutRNA.put(position_in_cds, AcidNucleics.complement(alt2.getBaseString().charAt(0)));
}
}
wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(i)));
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
mutProt = new ProteinCharSequence(geneticCode, mutRNA);
}
}
final KnownGene.Intron intron = exon.getPrevIntron();
if (intron != null && intron.contains(position)) {
annotations.intron_name = intron.getName();
annotations.seqont.add(so_intron);
if (intron.isSplicing(position)) {
if (intron.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (intron.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ?
{
annotations.seqont.add(so_splicing_variant);
}
}
}
--exon_index;
}
}
}
if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference()) && wildProt != null && mutProt != null && position_in_cds >= 0) {
final int pos_aa = position_in_cds / 3;
final int mod = position_in_cds % 3;
annotations.wildCodon = ("" + wildRNA.charAt(position_in_cds - mod + 0) + wildRNA.charAt(position_in_cds - mod + 1) + wildRNA.charAt(position_in_cds - mod + 2));
annotations.mutCodon = ("" + mutRNA.charAt(position_in_cds - mod + 0) + mutRNA.charAt(position_in_cds - mod + 1) + mutRNA.charAt(position_in_cds - mod + 2));
annotations.position_protein = (pos_aa + 1);
annotations.wildAA = String.valueOf(wildProt.charAt(pos_aa));
annotations.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
annotations.seqont.remove(so_exon);
if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
annotations.seqont.add(so_stop_lost);
} else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
annotations.seqont.add(so_stop_gained);
} else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
annotations.seqont.add(so_coding_synonymous);
} else {
annotations.seqont.add(so_coding_non_synonymous);
}
}
}
}
}
final Set<String> info = new HashSet<String>(ctx_annotations.size());
for (final Annotation a : ctx_annotations) {
info.add(a.toString());
}
final VariantContextBuilder vb = new VariantContextBuilder(ctx);
final String thetag;
switch(this.outputSyntax) {
case Vep:
thetag = "CSQ";
break;
case SnpEff:
thetag = "ANN";
break;
default:
thetag = TAG;
break;
}
vb.attribute(thetag, info.toArray());
w.add(vb.make());
}
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.referenceGenome);
}
}
use of com.github.lindenb.jvarkit.util.bio.GeneticCode in project jvarkit by lindenb.
the class BackLocate method backLocate.
private void backLocate(final PrintStream out, final KnownGene gene, final String geneName, char aa1, char aa2, int peptidePos1) throws IOException {
final GeneticCode geneticCode = getGeneticCodeByChromosome(gene.getChromosome());
RNASequence wildRNA = null;
ProteinCharSequence wildProt = null;
if (this.genomicContig == null || !this.genomicContig.hasName(gene.getContig())) {
LOG.info("fetch genome");
this.genomicContig = this.referenceGenome.getContig(gene.getContig());
if (this.genomicContig == null) {
LOG.warn("No contig " + gene.getContig() + " in reference genome.");
return;
}
}
if (gene.isPositiveStrand()) {
int exon_index = 0;
while (exon_index < gene.getExonCount()) {
for (int i = gene.getExonStart(exon_index); i < gene.getExonEnd(exon_index); ++i) {
if (i < gene.getCdsStart())
continue;
if (i >= gene.getCdsEnd())
break;
if (wildRNA == null) {
wildRNA = new RNASequence(this.genomicContig, '+');
}
wildRNA.genomicPositions.add(i);
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
}
}
++exon_index;
}
} else // reverse orientation
{
int exon_index = gene.getExonCount() - 1;
while (exon_index >= 0) {
for (int i = gene.getExonEnd(exon_index) - 1; i >= gene.getExonStart(exon_index); --i) {
if (i >= gene.getCdsEnd())
continue;
if (i < gene.getCdsStart())
break;
if (wildRNA == null) {
wildRNA = new RNASequence(this.genomicContig, '-');
}
wildRNA.genomicPositions.add(i);
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
}
}
--exon_index;
}
}
if (wildProt == null) {
stderr().println("#no protein found for transcript:" + gene.getName());
return;
}
int peptideIndex0 = peptidePos1 - 1;
if (peptideIndex0 >= wildProt.length()) {
out.println("#index out of range for :" + gene.getName() + " petide length=" + wildProt.length());
return;
}
if (wildProt.charAt(peptideIndex0) != aa1) {
out.println("##Warning ref aminod acid for " + gene.getName() + " [" + peptidePos1 + "] is not the same (" + wildProt.charAt(peptideIndex0) + "/" + aa1 + ")");
} else {
out.println("##" + gene.getName());
}
int[] indexesInRNA = new int[] { 0 + peptideIndex0 * 3, 1 + peptideIndex0 * 3, 2 + peptideIndex0 * 3 };
final String wildCodon = "" + wildRNA.charAt(indexesInRNA[0]) + wildRNA.charAt(indexesInRNA[1]) + wildRNA.charAt(indexesInRNA[2]);
/* 2015 : adding possible mut codons */
final Set<String> possibleAltCodons = new HashSet<>();
final char[] bases = new char[] { 'A', 'C', 'G', 'T' };
for (int codon_pos = 0; codon_pos < 3; ++codon_pos) {
StringBuilder sb = new StringBuilder(wildCodon);
for (char mutBase : bases) {
sb.setCharAt(codon_pos, mutBase);
if (geneticCode.translate(sb.charAt(0), sb.charAt(1), sb.charAt(2)) == Character.toUpperCase(aa2)) {
possibleAltCodons.add(sb.toString());
}
}
}
for (int indexInRna : indexesInRNA) {
out.print(geneName);
out.print('\t');
out.print(aa1);
out.print('\t');
out.print(peptidePos1);
out.print('\t');
out.print(aa2);
out.print('\t');
out.print(gene.getName());
out.print('\t');
out.print(gene.getStrand() == Strand.NEGATIVE ? "-" : "+");
out.print('\t');
out.print(wildProt.charAt(peptideIndex0));
out.print('\t');
out.print(indexInRna);
out.print('\t');
out.print(wildCodon);
out.print('\t');
if (possibleAltCodons.isEmpty()) {
out.print('.');
} else {
boolean first = true;
for (String mutCodon : possibleAltCodons) {
if (!first)
out.print('|');
first = false;
out.print(mutCodon);
}
}
out.print('\t');
out.print(wildRNA.charAt(indexInRna));
out.print('\t');
out.print(gene.getChromosome());
out.print('\t');
out.print(wildRNA.genomicPositions.get(indexInRna));
out.print('\t');
String exonName = null;
for (KnownGene.Exon exon : gene.getExons()) {
int genome = wildRNA.genomicPositions.get(indexInRna);
if (exon.getStart() <= genome && genome < exon.getEnd()) {
exonName = exon.getName();
break;
}
}
out.print(exonName);
if (this.printSequences) {
String s = wildRNA.toString();
out.print('\t');
out.print(s.substring(0, indexInRna) + "[" + s.charAt(indexInRna) + "]" + (indexInRna + 1 < s.length() ? s.substring(indexInRna + 1) : ""));
s = wildProt.toString();
out.print('\t');
out.print(s.substring(0, peptideIndex0) + "[" + aa1 + "/" + aa2 + "/" + wildProt.charAt(peptideIndex0) + "]" + (peptideIndex0 + 1 < s.length() ? s.substring(peptideIndex0 + 1) : ""));
}
out.println();
}
}
Aggregations