use of com.github.lindenb.jvarkit.util.bio.structure.RNASequence in project jvarkit by lindenb.
the class BackLocate method backLocate.
private void backLocate(final PrintStream out, final Transcript transcript, final String geneName, final AminoAcid aa1, final AminoAcid aa2, int peptidePos1, final String extraUserData) throws IOException {
final Set<String> messages = new LinkedHashSet<>();
final GeneticCode geneticCode = getGeneticCodeByChromosome(transcript.getContig());
final RNASequenceFactory rnaSequenceFactory = new RNASequenceFactory();
if (this.genomicContig == null || !this.genomicContig.getChrom().equals(transcript.getContig())) {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
final SAMSequenceRecord ssr = dict.getSequence(transcript.getContig());
if (ssr == null) {
LOG.warn(JvarkitException.ContigNotFoundInDictionary.getMessage(transcript.getContig(), dict));
return;
}
this.genomicContig = new GenomicSequence(this.referenceGenome, transcript.getContig());
}
rnaSequenceFactory.setContigToGenomicSequence(C -> this.genomicContig);
final RNASequence wildRNA = rnaSequenceFactory.getCodingRNA(transcript);
final PeptideSequence<RNASequence> wildProt = PeptideSequence.of(wildRNA, geneticCode);
final int peptideIndex0 = peptidePos1 - 1;
if (peptideIndex0 >= wildProt.length()) {
out.println("##index out of range for :" + transcript.getId() + " petide length=" + wildProt.length());
return;
}
if (wildProt.charAt(peptideIndex0) != Character.toUpperCase(aa1.getOneLetterCode())) {
messages.add("REF aminod acid [" + peptidePos1 + "] is not the same (" + wildProt.charAt(peptideIndex0) + "/" + aa1 + ")");
}
final int[] indexesInRNA = new int[] { 0 + peptideIndex0 * 3, 1 + peptideIndex0 * 3, 2 + peptideIndex0 * 3 };
final String wildCodon = "" + wildRNA.charAt(indexesInRNA[0]) + wildRNA.charAt(indexesInRNA[1]) + wildRNA.charAt(indexesInRNA[2]);
/* 2015 : adding possible mut codons */
final Set<String> possibleAltCodons = new HashSet<>();
final char[] bases = new char[] { 'A', 'C', 'G', 'T' };
for (int codon_pos = 0; codon_pos < 3; ++codon_pos) {
final StringBuilder sb = new StringBuilder(wildCodon);
for (char mutBase : bases) {
sb.setCharAt(codon_pos, mutBase);
if (geneticCode.translate(sb.charAt(0), sb.charAt(1), sb.charAt(2)) == Character.toUpperCase(aa2.getOneLetterCode())) {
possibleAltCodons.add(sb.toString());
}
}
}
for (int indexInRna : indexesInRNA) {
out.print(geneName);
out.print('\t');
out.print(aa1.getThreeLettersCode());
out.print('\t');
out.print(peptidePos1);
out.print('\t');
out.print(aa2.getThreeLettersCode());
out.print('\t');
out.print(transcript.getGene().getGeneName());
out.print('\t');
out.print(transcript.getId());
out.print('\t');
out.print(transcript.getStrand());
out.print('\t');
out.print(wildProt.charAt(peptideIndex0));
out.print('\t');
out.print(indexInRna);
out.print('\t');
out.print(wildCodon);
out.print('\t');
if (possibleAltCodons.isEmpty()) {
out.print('.');
} else {
out.print(String.join("|", possibleAltCodons));
}
out.print('\t');
out.print(wildRNA.charAt(indexInRna));
out.print('\t');
out.print(transcript.getContig());
out.print('\t');
out.print(wildRNA.convertRnaIndex0ToGenomic0(indexInRna));
out.print('\t');
String exonName = null;
for (final Exon exon : transcript.getExons()) {
int genome = wildRNA.convertRnaIndex0ToGenomic0(indexInRna);
if (exon.contains(genome)) {
exonName = exon.getName();
break;
}
}
out.print(exonName);
if (this.printSequences) {
String s = wildRNA.toString();
out.print('\t');
out.print(s.substring(0, indexInRna) + "[" + s.charAt(indexInRna) + "]" + (indexInRna + 1 < s.length() ? s.substring(indexInRna + 1) : ""));
s = wildProt.toString();
out.print('\t');
out.print(s.substring(0, peptideIndex0) + "[" + aa1 + "/" + aa2 + "/" + wildProt.charAt(peptideIndex0) + "]" + (peptideIndex0 + 1 < s.length() ? s.substring(peptideIndex0 + 1) : ""));
}
out.print('\t');
out.print(messages.isEmpty() ? "." : String.join(";", messages));
out.print('\t');
out.print(extraUserData);
out.println();
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.RNASequence in project jvarkit by lindenb.
the class VCFPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
try {
this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxPath);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
loadGtf(dict);
final VCFHeader header = r.getHeader();
final VCFHeader h2 = new VCFHeader(header);
JVarkitVersion.getInstance().addMetaData(this, h2);
switch(this.outputSyntax) {
case Vep:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
break;
}
case SnpEff:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
break;
}
default:
{
final StringBuilder format = new StringBuilder();
for (FORMAT1 f : FORMAT1.values()) {
if (format.length() > 0)
format.append("|");
format.append(f.name());
}
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
break;
}
}
w.writeHeader(h2);
final RNASequenceFactory rnaSeqFactory = new RNASequenceFactory();
rnaSeqFactory.setContigToGenomicSequence(S -> getGenomicSequence(S));
while (r.hasNext()) {
final VariantContext ctx = r.next();
final String normalizedContig = contigNameConverter.apply(ctx.getContig());
if (StringUtil.isBlank(normalizedContig)) {
w.add(ctx);
continue;
}
final List<Transcript> transcripts = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
final List<Annotation> all_annotations = new ArrayList<>();
final List<Allele> alternateAlleles;
if (ctx.getNAlleles() <= 1) {
// not a variant, just REF
alternateAlleles = Arrays.asList(Allele.NO_CALL);
} else {
alternateAlleles = ctx.getAlternateAlleles();
}
for (final Allele altAllele : alternateAlleles) {
if (altAllele.isReference() || altAllele.equals(Allele.SPAN_DEL) || altAllele.equals(Allele.NON_REF_ALLELE))
continue;
/* intergenic ====================================================== */
if (transcripts.isEmpty()) {
Transcript leftGene = null;
String leftId = "";
String leftName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, 1, ctx.getStart())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (leftGene == null || leftGene.getEnd() < t.getEnd()) {
leftGene = t;
leftId = t.getGene().getId();
leftName = t.getGene().getGeneName();
}
}
Transcript rightGene = null;
String rightId = "";
String rightName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getEnd(), dict.getSequence(normalizedContig).getSequenceLength())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (rightGene == null || t.getStart() < rightGene.getStart()) {
rightGene = t;
rightId = t.getGene().getId();
rightName = t.getGene().getGeneName();
}
}
// intergenic
final Annotation annot = new Annotation(altAllele);
annot.seqont.add("intergenic");
annot.geneId = leftId + "-" + rightId;
annot.geneName = leftName + "-" + rightName;
all_annotations.add(annot);
} else {
for (final Transcript transcript : transcripts) {
final Annotation annotation = new Annotation(altAllele, transcript);
all_annotations.add(annotation);
if (!transcript.overlaps(ctx)) {
if (((ctx.getEnd() < transcript.getStart() && transcript.isNegativeStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isPositiveStrand()))) {
if (ctx.withinDistanceOf(transcript, 500)) {
annotation.seqont.add("500B_downstream_variant");
} else if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_downstream_variant");
}
} else if (((ctx.getEnd() < transcript.getStart() && transcript.isPositiveStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isNegativeStrand()))) {
if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_upstream_variant");
} else if (ctx.withinDistanceOf(transcript, 5_000)) {
annotation.seqont.add("5KB_upstream_variant");
}
}
continue;
}
if (CoordMath.encloses(ctx.getStart(), ctx.getEnd(), transcript.getStart(), transcript.getEnd())) {
// TODO can be inversion ,etc...
annotation.seqont.add("transcript_ablation");
continue;
}
for (int side = 0; side < 2; ++side) {
final Optional<UTR> opt_utr = (side == 0 ? transcript.getTranscriptUTR5() : transcript.getTranscriptUTR3());
if (!opt_utr.isPresent())
continue;
final UTR utr = opt_utr.get();
if (CoordMath.overlaps(utr.getStart(), utr.getEnd(), ctx.getStart(), ctx.getEnd())) {
annotation.seqont.add(side == 0 ? "5_prime_UTR_variant" : "3_prime_UTR_variant");
}
}
for (int side = 0; side < 2; ++side) {
final Optional<? extends ExonOrIntron> opt_ex;
if (side == 0) {
opt_ex = transcript.getExons().stream().filter(E -> E.overlaps(ctx)).findFirst();
} else {
opt_ex = transcript.getIntrons().stream().filter(E -> E.overlaps(ctx)).findFirst();
}
if (!opt_ex.isPresent())
continue;
final ExonOrIntron ei = opt_ex.get();
if (side == 0) {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_exon_variant");
} else {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_intron_variant");
annotation.seqont.add("intron");
}
if (ctx.getStart() == ctx.getEnd() && ei.isSplicing(ctx.getStart())) {
if (ei.isSplicingAcceptor(ctx.getStart())) {
// SPLICING_ACCEPTOR
annotation.seqont.add("splice_acceptor");
} else if (ei.isSplicingDonor(ctx.getStart())) {
// SPLICING_DONOR
annotation.seqont.add("splice_donor");
} else // ??
{
annotation.seqont.add("splicing_variant");
}
}
}
final StructuralVariantType svType = ctx.getStructuralVariantType();
if (svType != null) {
continue;
}
if (transcript.isNonCoding()) {
// TODO
annotation.seqont.add("non_coding_transcript_variant");
continue;
}
RNASequence cDNA = this.transcriptId2cdna.get(transcript.getId());
if (cDNA == null) {
cDNA = rnaSeqFactory.getCodingRNA(transcript);
this.transcriptId2cdna.put(transcript.getId(), cDNA);
}
final OptionalInt opt_pos_cdna0 = cDNA.convertGenomic0ToRnaIndex0(ctx.getStart() - 1);
if (!opt_pos_cdna0.isPresent())
continue;
final int pos_cdna0 = opt_pos_cdna0.getAsInt();
final int pos_aa = pos_cdna0 / 3;
final GeneticCode geneticCode = GeneticCode.getStandard();
if (AcidNucleics.isATGC(altAllele.getBaseString())) {
String bases = altAllele.getBaseString().toUpperCase();
if (transcript.isNegativeStrand()) {
bases = AcidNucleics.reverseComplement(bases);
}
final MutedSequence mutRNA = new MutedSequence(cDNA, pos_cdna0, ctx.getReference().length(), bases);
final PeptideSequence<CharSequence> wildProt = PeptideSequence.of(cDNA, geneticCode);
final PeptideSequence<CharSequence> mutProt = PeptideSequence.of(mutRNA, geneticCode);
final int mod = pos_cdna0 % 3;
annotation.wildCodon = ("" + cDNA.charAt(pos_cdna0 - mod + 0) + cDNA.charAt(pos_cdna0 - mod + 1) + cDNA.charAt(pos_cdna0 - mod + 2));
annotation.mutCodon = ("" + mutRNA.charAt(pos_cdna0 - mod + 0) + mutRNA.charAt(pos_cdna0 - mod + 1) + mutRNA.charAt(pos_cdna0 - mod + 2));
annotation.position_protein = (pos_aa + 1);
annotation.wildAA = String.valueOf(wildProt.charAt(pos_aa));
annotation.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_lost");
} else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_gained");
} else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
annotation.seqont.add("synonymous");
} else {
annotation.seqont.add("missense_variant");
}
}
}
}
}
final Set<String> info = new HashSet<String>(all_annotations.size());
for (final Annotation a : all_annotations) {
info.add(a.toString());
}
final VariantContextBuilder vb = new VariantContextBuilder(ctx);
final String thetag;
switch(this.outputSyntax) {
case Vep:
thetag = "CSQ";
break;
case SnpEff:
thetag = "ANN";
break;
default:
thetag = TAG;
break;
}
vb.attribute(thetag, info.toArray());
w.add(vb.make());
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(r);
CloserUtil.close(this.referenceGenome);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.RNASequence in project jvarkit by lindenb.
the class VCFCombineTwoSnvs method getTranscript.
private RNASequence getTranscript(final Transcript kg) {
RNASequence mrna = this.kgId2transcriptCache.get(kg.getId());
if (mrna != null && mrna.getTranscript().getContig().equals(kg.getContig()))
return mrna;
mrna = this.rnaSequenceFactory.getCodingRNA(kg);
this.kgId2transcriptCache.put(kg.getId(), mrna);
return mrna;
}
use of com.github.lindenb.jvarkit.util.bio.structure.RNASequence in project jvarkit by lindenb.
the class VCFCombineTwoSnvs method challenge.
private void challenge(final VariantContext ctx, final Allele allele, final Transcript gene, final String vcfLine) throws IOException {
if (allele.isSymbolic())
return;
if (allele.isNoCall())
return;
if (!allele.isCalled())
return;
if (allele.length() != 1)
return;
if (ctx.getReference().length() != 1)
return;
if (allele.equals(Allele.SPAN_DEL))
return;
final RNASequence wildRNA = this.getTranscript(gene);
final int positionContext0 = ctx.getStart() - 1;
final OptionalInt position_in_cnda0 = wildRNA.convertGenomic0ToRnaIndex0(positionContext0);
if (!position_in_cnda0.isPresent())
return;
final Variant variant = new Variant(ctx, allele, gene);
variant.sorting_id = ID_GENERATOR++;
variant.position_in_cdna = position_in_cnda0.getAsInt();
char mutBase = Character.toUpperCase(allele.getDisplayString().charAt(0));
if (gene.isNegativeStrand())
mutBase = AcidNucleics.complement(mutBase);
final MutedSequence mutRNA = new MutedSequence(wildRNA, variant.position_in_cdna, mutBase);
variant.wildCodon = "";
variant.mutCodon = "";
for (int i = 0; i < 3; ++i) {
final int pos = variant.codonStart() + i;
variant.wildCodon += (pos < wildRNA.length() ? wildRNA.charAt(pos) : '*');
variant.mutCodon += (pos < mutRNA.length() ? mutRNA.charAt(pos) : '*');
}
variant.wildCodon = variant.wildCodon.toUpperCase();
variant.mutCodon = variant.mutCodon.toUpperCase();
variant.vcfLine = vcfLine;
if (variant.wildCodon.equals(variant.mutCodon)) {
LOG.info("Uh??????? " + allele + " " + ctx.getContig() + ":" + ctx.getStart() + " " + position_in_cnda0 + " " + wildRNA.length() + " " + variant.wildCodon + " " + variant.mutCodon);
return;
}
this.variants.add(variant);
}
Aggregations