use of com.github.lindenb.jvarkit.util.bio.structure.PeptideSequence in project jvarkit by lindenb.
the class VCFPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
try {
this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxPath);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
loadGtf(dict);
final VCFHeader header = r.getHeader();
final VCFHeader h2 = new VCFHeader(header);
JVarkitVersion.getInstance().addMetaData(this, h2);
switch(this.outputSyntax) {
case Vep:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
break;
}
case SnpEff:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
break;
}
default:
{
final StringBuilder format = new StringBuilder();
for (FORMAT1 f : FORMAT1.values()) {
if (format.length() > 0)
format.append("|");
format.append(f.name());
}
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
break;
}
}
w.writeHeader(h2);
final RNASequenceFactory rnaSeqFactory = new RNASequenceFactory();
rnaSeqFactory.setContigToGenomicSequence(S -> getGenomicSequence(S));
while (r.hasNext()) {
final VariantContext ctx = r.next();
final String normalizedContig = contigNameConverter.apply(ctx.getContig());
if (StringUtil.isBlank(normalizedContig)) {
w.add(ctx);
continue;
}
final List<Transcript> transcripts = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
final List<Annotation> all_annotations = new ArrayList<>();
final List<Allele> alternateAlleles;
if (ctx.getNAlleles() <= 1) {
// not a variant, just REF
alternateAlleles = Arrays.asList(Allele.NO_CALL);
} else {
alternateAlleles = ctx.getAlternateAlleles();
}
for (final Allele altAllele : alternateAlleles) {
if (altAllele.isReference() || altAllele.equals(Allele.SPAN_DEL) || altAllele.equals(Allele.NON_REF_ALLELE))
continue;
/* intergenic ====================================================== */
if (transcripts.isEmpty()) {
Transcript leftGene = null;
String leftId = "";
String leftName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, 1, ctx.getStart())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (leftGene == null || leftGene.getEnd() < t.getEnd()) {
leftGene = t;
leftId = t.getGene().getId();
leftName = t.getGene().getGeneName();
}
}
Transcript rightGene = null;
String rightId = "";
String rightName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getEnd(), dict.getSequence(normalizedContig).getSequenceLength())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (rightGene == null || t.getStart() < rightGene.getStart()) {
rightGene = t;
rightId = t.getGene().getId();
rightName = t.getGene().getGeneName();
}
}
// intergenic
final Annotation annot = new Annotation(altAllele);
annot.seqont.add("intergenic");
annot.geneId = leftId + "-" + rightId;
annot.geneName = leftName + "-" + rightName;
all_annotations.add(annot);
} else {
for (final Transcript transcript : transcripts) {
final Annotation annotation = new Annotation(altAllele, transcript);
all_annotations.add(annotation);
if (!transcript.overlaps(ctx)) {
if (((ctx.getEnd() < transcript.getStart() && transcript.isNegativeStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isPositiveStrand()))) {
if (ctx.withinDistanceOf(transcript, 500)) {
annotation.seqont.add("500B_downstream_variant");
} else if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_downstream_variant");
}
} else if (((ctx.getEnd() < transcript.getStart() && transcript.isPositiveStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isNegativeStrand()))) {
if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_upstream_variant");
} else if (ctx.withinDistanceOf(transcript, 5_000)) {
annotation.seqont.add("5KB_upstream_variant");
}
}
continue;
}
if (CoordMath.encloses(ctx.getStart(), ctx.getEnd(), transcript.getStart(), transcript.getEnd())) {
// TODO can be inversion ,etc...
annotation.seqont.add("transcript_ablation");
continue;
}
for (int side = 0; side < 2; ++side) {
final Optional<UTR> opt_utr = (side == 0 ? transcript.getTranscriptUTR5() : transcript.getTranscriptUTR3());
if (!opt_utr.isPresent())
continue;
final UTR utr = opt_utr.get();
if (CoordMath.overlaps(utr.getStart(), utr.getEnd(), ctx.getStart(), ctx.getEnd())) {
annotation.seqont.add(side == 0 ? "5_prime_UTR_variant" : "3_prime_UTR_variant");
}
}
for (int side = 0; side < 2; ++side) {
final Optional<? extends ExonOrIntron> opt_ex;
if (side == 0) {
opt_ex = transcript.getExons().stream().filter(E -> E.overlaps(ctx)).findFirst();
} else {
opt_ex = transcript.getIntrons().stream().filter(E -> E.overlaps(ctx)).findFirst();
}
if (!opt_ex.isPresent())
continue;
final ExonOrIntron ei = opt_ex.get();
if (side == 0) {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_exon_variant");
} else {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_intron_variant");
annotation.seqont.add("intron");
}
if (ctx.getStart() == ctx.getEnd() && ei.isSplicing(ctx.getStart())) {
if (ei.isSplicingAcceptor(ctx.getStart())) {
// SPLICING_ACCEPTOR
annotation.seqont.add("splice_acceptor");
} else if (ei.isSplicingDonor(ctx.getStart())) {
// SPLICING_DONOR
annotation.seqont.add("splice_donor");
} else // ??
{
annotation.seqont.add("splicing_variant");
}
}
}
final StructuralVariantType svType = ctx.getStructuralVariantType();
if (svType != null) {
continue;
}
if (transcript.isNonCoding()) {
// TODO
annotation.seqont.add("non_coding_transcript_variant");
continue;
}
RNASequence cDNA = this.transcriptId2cdna.get(transcript.getId());
if (cDNA == null) {
cDNA = rnaSeqFactory.getCodingRNA(transcript);
this.transcriptId2cdna.put(transcript.getId(), cDNA);
}
final OptionalInt opt_pos_cdna0 = cDNA.convertGenomic0ToRnaIndex0(ctx.getStart() - 1);
if (!opt_pos_cdna0.isPresent())
continue;
final int pos_cdna0 = opt_pos_cdna0.getAsInt();
final int pos_aa = pos_cdna0 / 3;
final GeneticCode geneticCode = GeneticCode.getStandard();
if (AcidNucleics.isATGC(altAllele.getBaseString())) {
String bases = altAllele.getBaseString().toUpperCase();
if (transcript.isNegativeStrand()) {
bases = AcidNucleics.reverseComplement(bases);
}
final MutedSequence mutRNA = new MutedSequence(cDNA, pos_cdna0, ctx.getReference().length(), bases);
final PeptideSequence<CharSequence> wildProt = PeptideSequence.of(cDNA, geneticCode);
final PeptideSequence<CharSequence> mutProt = PeptideSequence.of(mutRNA, geneticCode);
final int mod = pos_cdna0 % 3;
annotation.wildCodon = ("" + cDNA.charAt(pos_cdna0 - mod + 0) + cDNA.charAt(pos_cdna0 - mod + 1) + cDNA.charAt(pos_cdna0 - mod + 2));
annotation.mutCodon = ("" + mutRNA.charAt(pos_cdna0 - mod + 0) + mutRNA.charAt(pos_cdna0 - mod + 1) + mutRNA.charAt(pos_cdna0 - mod + 2));
annotation.position_protein = (pos_aa + 1);
annotation.wildAA = String.valueOf(wildProt.charAt(pos_aa));
annotation.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_lost");
} else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_gained");
} else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
annotation.seqont.add("synonymous");
} else {
annotation.seqont.add("missense_variant");
}
}
}
}
}
final Set<String> info = new HashSet<String>(all_annotations.size());
for (final Annotation a : all_annotations) {
info.add(a.toString());
}
final VariantContextBuilder vb = new VariantContextBuilder(ctx);
final String thetag;
switch(this.outputSyntax) {
case Vep:
thetag = "CSQ";
break;
case SnpEff:
thetag = "ANN";
break;
default:
thetag = TAG;
break;
}
vb.attribute(thetag, info.toArray());
w.add(vb.make());
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(r);
CloserUtil.close(this.referenceGenome);
}
}
Aggregations