use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.
the class BackLocate method loadKnownGenesFromUri.
private void loadKnownGenesFromUri(String kgURI) throws IOException {
if (this.referenceGenome.getDictionary() == null) {
throw new JvarkitException.FastaDictionaryMissing("No sequence dictionary in " + this.indexedRefUri);
}
LOG.info("loading genes");
final Set<String> unknown = new HashSet<String>();
BufferedReader in = IOUtils.openURIForBufferedReading(kgURI);
String line;
final Pattern tab = Pattern.compile("[\t]");
while ((line = in.readLine()) != null) {
if (line.isEmpty())
continue;
final String[] tokens = tab.split(line);
final KnownGene g = new KnownGene(tokens);
final Interval rgn = new Interval(g.getContig(), g.getTxStart() + 1, g.getTxEnd());
if (this.referenceGenome.getDictionary().getSequence(rgn.getContig()) == null) {
if (!unknown.contains(g.getContig())) {
LOG.warn("The reference doesn't contain chromosome " + g.getContig());
unknown.add(g.getContig());
}
continue;
}
this.knwonGenes.put(g.getName(), g);
}
in.close();
LOG.info("genes:" + this.knwonGenes.size());
}
use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.
the class Biostar81455 method doWork.
@Override
public int doWork(List<String> args) {
BufferedReader r = null;
String line;
PrintStream out = null;
final Pattern tab = Pattern.compile("[\t]");
try {
if (this.kgUri == null || this.kgUri.trim().isEmpty()) {
LOG.error("undefined kguri");
return -1;
}
LOG.info("readubf " + kgUri);
this.kgMap = KnownGene.loadUriAsIntervalTreeMap(this.kgUri, (kg) -> kg.getExonCount() != 0);
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
}
try {
r = super.openBufferedReader(oneFileOrNull(args));
out = openFileOrStdoutAsPrintStream(this.outputFile);
while ((line = r.readLine()) != null) {
if (line.startsWith("#")) {
out.println(line);
continue;
}
boolean found = false;
String[] tokens = tab.split(line);
int pos0 = Integer.parseInt(tokens[1]);
final IntervalTree<List<KnownGene>> kgs = kgMap.debugGetTree(tokens[0]);
if (kgs == null) {
LOG.info("no gene found in chromosome " + tokens[0] + " (check chrom prefix?)");
} else {
KnownGene bestGene = null;
for (Iterator<IntervalTree.Node<List<KnownGene>>> iter = kgs.iterator(); iter.hasNext(); ) {
for (KnownGene kg : iter.next().getValue()) {
if (bestGene == null || Math.abs(distance(pos0, kg)) < Math.abs(distance(pos0, bestGene))) {
bestGene = kg;
}
}
}
if (bestGene != null) {
// get all transcritpts
final List<KnownGene> overlapKg = new ArrayList<>();
for (final List<KnownGene> lkg : kgMap.getOverlapping(new Interval(bestGene.getChromosome(), bestGene.getTxStart() + 1, bestGene.getTxEnd()))) {
overlapKg.addAll(lkg);
}
for (final KnownGene kg : overlapKg) {
KnownGene.Exon bestExon = null;
for (KnownGene.Exon exon : kg.getExons()) {
if (bestExon == null || Math.abs(distance(pos0, exon)) < Math.abs(distance(pos0, bestExon))) {
bestExon = exon;
}
}
if (bestExon != null) {
out.println(line + "\t" + kg.getName() + "\t" + kg.getTxStart() + "\t" + kg.getTxEnd() + "\t" + kg.getStrand() + "\t" + bestExon.getName() + "\t" + bestExon.getStart() + "\t" + bestExon.getEnd() + "\t" + distance(pos0, bestExon));
found = true;
}
}
}
}
if (!found) {
out.println(line + "\tNULL");
}
}
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.
the class VcfDoest method run.
private void run(final LineIterator lr, final PrintWriter pw) throws IOException {
SortingCollection<TranscriptInfo> sorting = null;
CloseableIterator<TranscriptInfo> iter2 = null;
try {
while (lr.hasNext()) {
VcfIterator in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
final VCFHeader header = in.getHeader();
final Pedigree pedigree = Pedigree.newParser().parse(header);
if (pedigree.isEmpty()) {
throw new IOException("No pedigree found in header VCF header. use VcfInjectPedigree to add it");
}
final SortedSet<Pedigree.Person> individuals = new TreeSet<>();
for (final Pedigree.Person individual : pedigree.getPersons()) {
if (individual.isAffected() || individual.isUnaffected()) {
individuals.add(individual);
}
}
boolean first = true;
pw.println("# samples ( 0: unaffected 1:affected)");
pw.print("population <- data.frame(family=c(");
first = true;
for (final Pedigree.Person person : individuals) {
if (!first)
pw.print(",");
pw.print("\"" + person.getFamily().getId() + "\"");
first = false;
}
pw.print("),name=c(");
first = true;
for (final Pedigree.Person person : individuals) {
if (!first)
pw.print(",");
pw.print("\"" + person.getId() + "\"");
first = false;
}
pw.print("),status=c(");
first = true;
for (final Pedigree.Person person : individuals) {
if (!first)
pw.print(",");
pw.print(person.isUnaffected() ? 0 : 1);
first = false;
}
pw.println("))");
sorting = SortingCollection.newInstance(TranscriptInfo.class, new TranscriptInfoCodec(), new TranscriptInfoCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorting.setDestructiveIteration(true);
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
/**
* loop over variants
*/
while (in.hasNext() && !pw.checkError()) {
final VariantContext ctx = progess.watch(in.next());
if (ctx.isFiltered())
continue;
if (ctx.getAlternateAlleles().isEmpty())
continue;
final Allele altAllele = ctx.getAltAlleleWithHighestAlleleCount();
final MafCalculator mafCalculator = new MafCalculator(altAllele, ctx.getContig());
boolean genotyped = false;
for (final Pedigree.Person p : pedigree.getPersons()) {
if (!(p.isAffected() || p.isUnaffected()))
continue;
final Genotype g = ctx.getGenotype(p.getId());
if (g == null)
throw new IOException("Strange I cannot find individual " + p + " in the pedigree. Aborting.");
if (g.isCalled()) {
mafCalculator.add(g, p.isMale());
}
if (g.isHet() || g.isHomVar()) {
if (!g.getAlleles().contains(altAllele))
continue;
genotyped = true;
break;
}
}
if (!genotyped)
continue;
final Interval interval = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd());
final List<KnownGene> genes = this.overlap(interval);
if (genes.isEmpty())
continue;
for (final KnownGene kg : genes) {
final TranscriptInfo trInfo = new TranscriptInfo();
trInfo.contig = kg.getContig();
trInfo.txStart = kg.getTxStart();
trInfo.txEnd = kg.getTxEnd();
trInfo.transcriptName = kg.getName();
trInfo.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
trInfo.exonCount = kg.getExonCount();
trInfo.transcriptLength = kg.getTranscriptLength();
trInfo.ctxStart = ctx.getStart();
trInfo.ref = ctx.getReference();
trInfo.alt = altAllele;
trInfo.maf = mafCalculator.getMaf();
trInfo.genotypes = new byte[individuals.size()];
int idx = 0;
for (final Pedigree.Person individual : individuals) {
final Genotype genotype = ctx.getGenotype(individual.getId());
final byte b;
if (genotype.isHomRef()) {
b = 0;
} else if (genotype.isHomVar() && genotype.getAlleles().contains(altAllele)) {
b = 2;
} else if (genotype.isHet() && genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
b = 1;
} else /* we treat 0/2 has hom-ref */
if (genotype.isHet() && !genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
b = 0;
} else /* we treat 2/2 has hom-ref */
if (genotype.isHomVar() && !genotype.getAlleles().contains(altAllele)) {
LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
b = 0;
} else {
b = -9;
}
trInfo.genotypes[idx] = b;
++idx;
}
KnownGene archetype = kg;
/* find gene archetype = longest overlapping */
for (final KnownGene kg2 : genes) {
if (kg2 == kg)
continue;
if (archetype.getStrand().equals(kg2.getStrand()) && archetype.getTranscriptLength() < kg2.getTranscriptLength()) {
archetype = kg2;
}
}
trInfo.archetypeName = archetype.getName();
trInfo.archetypeLength = archetype.getTranscriptLength();
boolean ctxWasFoundInExon = false;
final int ctxPos0 = ctx.getStart() - 1;
int indexInTranscript0 = 0;
for (final KnownGene.Exon exon : kg.getExons()) {
// variant in exon ?
if (!(exon.getStart() > (ctx.getEnd() - 1) || (ctx.getStart() - 1) >= exon.getEnd())) {
ctxWasFoundInExon = true;
indexInTranscript0 += (ctxPos0 - exon.getStart());
if (kg.isNegativeStrand()) {
indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
}
trInfo.indexInTranscript0 = indexInTranscript0;
trInfo.overlapName = exon.getName();
sorting.add(trInfo);
break;
} else {
indexInTranscript0 += (exon.getEnd() - exon.getStart());
}
}
if (ctxWasFoundInExon) {
continue;
}
indexInTranscript0 = 0;
// search closest intron/exon junction
for (int ex = 0; ex + 1 < kg.getExonCount(); ++ex) {
final KnownGene.Exon exon1 = kg.getExon(ex);
indexInTranscript0 += (exon1.getEnd() - exon1.getStart());
final KnownGene.Exon exon2 = kg.getExon(ex + 1);
if (exon1.getEnd() <= ctxPos0 && ctxPos0 < exon2.getStart()) {
final int dist_to_exon1 = ctxPos0 - exon1.getEnd();
final int dist_to_exon2 = exon2.getStart() - ctxPos0;
if (dist_to_exon2 < dist_to_exon1) {
indexInTranscript0++;
}
if (kg.isNegativeStrand()) {
indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
}
trInfo.indexInTranscript0 = indexInTranscript0;
trInfo.overlapName = exon1.getNextIntron().getName();
sorting.add(trInfo);
break;
}
}
}
// end loop over genes
}
// end while loop over variants
progess.finish();
sorting.doneAdding();
LOG.info("done adding");
iter2 = sorting.iterator();
final EqualRangeIterator<TranscriptInfo> eqiter = new EqualRangeIterator<TranscriptInfo>(iter2, new Comparator<TranscriptInfo>() {
@Override
public int compare(final TranscriptInfo o1, final TranscriptInfo o2) {
int i = o1.contig.compareTo(o2.contig);
if (i != 0)
return i;
i = o1.transcriptName.compareTo(o2.transcriptName);
return i;
}
});
while (eqiter.hasNext()) {
final List<TranscriptInfo> list = eqiter.next();
final TranscriptInfo front = list.get(0);
pw.println("# BEGIN TRANSCRIPT " + front.transcriptName + " ##########################################");
pw.println("transcript.chrom <- \"" + front.contig + "\"");
pw.println("transcript.txStart0 <- " + front.txStart + "");
pw.println("transcript.txEnd0 <- " + front.txEnd + "");
pw.println("transcript.name <- \"" + front.transcriptName + "\"");
pw.println("transcript.strand <- \"" + ((char) front.strand) + "\"");
pw.println("transcript.length <- " + front.transcriptLength + "");
pw.println("transcript.exonCount <- " + front.exonCount + "");
pw.println("archetype.name <- \"" + front.archetypeName + "\"");
pw.println("archetype.length <- " + front.archetypeLength + "");
pw.print("variants <- data.frame(chrom=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.contig + "\"");
first = false;
}
pw.print("),chromStart=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.ctxStart);
first = false;
}
pw.print("),chromEnd=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.ctxStart + v.ref.length() - 1);
first = false;
}
pw.print("),refAllele=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.ref.getDisplayString() + "\"");
first = false;
}
pw.print("),altAllele=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.alt.getDisplayString() + "\"");
first = false;
}
pw.print("),positionInTranscript1=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.indexInTranscript0 + 1);
first = false;
}
pw.print("),maf=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.maf);
first = false;
}
pw.print("),overlapName=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.overlapName + "\"");
first = false;
}
pw.println("))");
pw.println("# genotypes as a list. Should be a multiple of length(samples).");
pw.println("# 0 is homref (0/0), 1 is het (0/1), 2 is homvar (1/1)");
pw.println("# if the variant contains another ALT allele: (0/2) and (2/2) are considered 0 (homref)");
pw.print("genotypes <- c(");
first = true;
for (final TranscriptInfo tr : list) {
for (byte g : tr.genotypes) {
if (!first)
pw.print(",");
first = false;
pw.print((int) g);
}
}
pw.println(")");
pw.println("stopifnot(NROW(variants) * NROW(population) == length(genotypes) )");
if (this.userDefinedFunName == null || this.userDefinedFunName.trim().isEmpty()) {
pw.println("## WARNING not user-defined R function was defined");
} else {
pw.println("# consumme data with user-defined R function ");
pw.println(this.userDefinedFunName + "()");
}
pw.println("# END TRANSCRIPT " + front.transcriptName + " ##########################################");
}
// end while eqiter
eqiter.close();
iter2.close();
iter2 = null;
sorting.cleanup();
sorting = null;
}
} finally {
CloserUtil.close(iter2);
if (sorting != null)
sorting.cleanup();
}
}
use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.
the class VCFPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, VariantContextWriter w) {
ReferenceContig genomicSequence = null;
try {
LOG.info("opening REF:" + this.referenceGenomeSource);
this.referenceGenome = new ReferenceGenomeFactory().open(this.referenceGenomeSource);
loadKnownGenesFromUri();
final VCFHeader header = (VCFHeader) r.getHeader();
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(this.referenceGenome.getDictionary());
contigNameConverter.setOnNotFound(OnNotFound.SKIP);
final VCFHeader h2 = new VCFHeader(header);
addMetaData(h2);
switch(this.outputSyntax) {
case Vep:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
break;
}
case SnpEff:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
break;
}
default:
{
final StringBuilder format = new StringBuilder();
for (FORMAT1 f : FORMAT1.values()) {
if (format.length() > 0)
format.append("|");
format.append(f.name());
}
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
break;
}
}
w.writeHeader(h2);
final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
final SequenceOntologyTree.Term so_intron = soTree.getTermByAcn("SO:0001627");
final SequenceOntologyTree.Term so_exon = soTree.getTermByAcn("SO:0001791");
final SequenceOntologyTree.Term so_splice_donor = soTree.getTermByAcn("SO:0001575");
final SequenceOntologyTree.Term so_splice_acceptor = soTree.getTermByAcn("SO:0001574");
final SequenceOntologyTree.Term so_5_prime_UTR_variant = soTree.getTermByAcn("SO:0001623");
final SequenceOntologyTree.Term so_3_prime_UTR_variant = soTree.getTermByAcn("SO:0001624");
final SequenceOntologyTree.Term so_splicing_variant = soTree.getTermByAcn("SO:0001568");
final SequenceOntologyTree.Term so_stop_lost = soTree.getTermByAcn("SO:0001578");
final SequenceOntologyTree.Term so_stop_gained = soTree.getTermByAcn("SO:0001587");
final SequenceOntologyTree.Term so_coding_synonymous = soTree.getTermByAcn("SO:0001819");
final SequenceOntologyTree.Term so_coding_non_synonymous = soTree.getTermByAcn("SO:0001583");
final SequenceOntologyTree.Term so_intergenic = soTree.getTermByAcn("SO:0001628");
final SequenceOntologyTree.Term so_nc_transcript_variant = soTree.getTermByAcn("SO:0001619");
final SequenceOntologyTree.Term so_non_coding_exon_variant = soTree.getTermByAcn("SO:0001792");
final SequenceOntologyTree.Term _2KB_upstream_variant = soTree.getTermByAcn("SO:0001636");
final SequenceOntologyTree.Term _5KB_upstream_variant = soTree.getTermByAcn("SO:0001635");
final SequenceOntologyTree.Term _5KB_downstream_variant = soTree.getTermByAcn("SO:0001633");
final SequenceOntologyTree.Term _500bp_downstream_variant = soTree.getTermByAcn("SO:0001634");
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (r.hasNext()) {
final VariantContext ctx = progress.watch(r.next());
final String normalizedContig = contigNameConverter.apply(ctx.getContig());
final List<KnownGene> genes = new ArrayList<>();
if (!StringUtil.isBlank(normalizedContig)) {
for (final List<KnownGene> l2 : this.knownGenes.getOverlapping(new Interval(normalizedContig, ctx.getStart(), // 1-based
ctx.getEnd()))) {
genes.addAll(l2);
}
}
final List<Annotation> ctx_annotations = new ArrayList<Annotation>();
if (genes == null || genes.isEmpty()) {
// intergenic
Annotation a = new Annotation();
a.seqont.add(so_intergenic);
ctx_annotations.add(a);
} else {
if (genomicSequence == null || !genomicSequence.hasName(normalizedContig)) {
LOG.info("getting genomic Sequence for " + normalizedContig);
genomicSequence = this.referenceGenome.getContig(normalizedContig);
if (genomicSequence == null)
throw new JvarkitException.ContigNotFoundInDictionary(normalizedContig, this.referenceGenome.getDictionary());
}
for (final KnownGene gene : genes) {
final GeneticCode geneticCode = GeneticCode.getStandard();
for (final Allele alt2 : ctx.getAlternateAlleles()) {
if (alt2.isNoCall())
continue;
if (alt2.isSymbolic()) {
LOG.warn("symbolic allele are not handled... " + alt2.getDisplayString());
continue;
}
if (alt2.isReference())
continue;
final Annotation annotations = new Annotation();
annotations.kg = gene;
annotations.alt2 = alt2;
if (gene.isNonCoding()) {
annotations.seqont.add(so_nc_transcript_variant);
continue;
}
ctx_annotations.add(annotations);
StringBuilder wildRNA = null;
ProteinCharSequence wildProt = null;
ProteinCharSequence mutProt = null;
MutedSequence mutRNA = null;
int position_in_cds = -1;
final int position = ctx.getStart() - 1;
if (!String.valueOf(genomicSequence.charAt(position)).equalsIgnoreCase(ctx.getReference().getBaseString())) {
if (isSimpleBase(ctx.getReference())) {
LOG.warn("Warning REF!=GENOMIC SEQ!!! at " + position + "/" + ctx.getReference());
}
continue;
}
if (gene.isPositiveStrand()) {
if (position < gene.getTxStart() - 2000) {
annotations.seqont.add(_5KB_upstream_variant);
} else if (position < gene.getTxStart()) {
annotations.seqont.add(_2KB_upstream_variant);
} else if (position >= gene.getTxEnd() + 500) {
annotations.seqont.add(_5KB_downstream_variant);
} else if (position >= gene.getTxEnd()) {
annotations.seqont.add(_500bp_downstream_variant);
} else if (position < gene.getCdsStart()) {
// UTR5
annotations.seqont.add(so_5_prime_UTR_variant);
} else if (gene.getCdsEnd() <= position) {
annotations.seqont.add(so_3_prime_UTR_variant);
} else {
int exon_index = 0;
while (exon_index < gene.getExonCount()) {
final KnownGene.Exon exon = gene.getExon(exon_index);
for (int i = exon.getStart(); i < exon.getEnd(); ++i) {
if (i == position) {
annotations.exon_name = exon.getName();
if (exon.isNonCoding()) {
annotations.seqont.add(so_non_coding_exon_variant);
}
}
if (i < gene.getTxStart())
continue;
if (i < gene.getCdsStart())
continue;
if (i >= gene.getCdsEnd())
break;
if (wildRNA == null) {
wildRNA = new StringBuilder();
mutRNA = new MutedSequence(wildRNA);
}
if (i == position) {
annotations.seqont.add(so_exon);
annotations.exon_name = exon.getName();
position_in_cds = wildRNA.length();
annotations.position_cds = position_in_cds;
// in splicing ?
if (exon.isSplicing(position)) {
if (exon.isSplicingAcceptor(position)) {
// SPLICING_ACCEPTOR
annotations.seqont.add(so_splice_acceptor);
} else if (exon.isSplicingDonor(position)) {
// SPLICING_DONOR
annotations.seqont.add(so_splice_donor);
} else // ??
{
annotations.seqont.add(so_splicing_variant);
}
}
}
wildRNA.append(genomicSequence.charAt(i));
if (i == position && isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
mutRNA.put(position_in_cds, alt2.getBaseString().charAt(0));
}
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
mutProt = new ProteinCharSequence(geneticCode, mutRNA);
}
}
final KnownGene.Intron intron = exon.getNextIntron();
if (intron != null && intron.contains(position)) {
annotations.intron_name = intron.getName();
annotations.seqont.add(so_intron);
if (intron.isSplicing(position)) {
if (intron.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (intron.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ???
{
annotations.seqont.add(so_splicing_variant);
}
}
}
++exon_index;
}
}
} else // reverse orientation
{
if (position >= gene.getTxEnd() + 2000) {
annotations.seqont.add(_5KB_upstream_variant);
} else if (position >= gene.getTxEnd()) {
annotations.seqont.add(_2KB_upstream_variant);
} else if (position < gene.getTxStart() - 500) {
annotations.seqont.add(_5KB_downstream_variant);
} else if (position < gene.getTxStart()) {
annotations.seqont.add(_500bp_downstream_variant);
} else if (position < gene.getCdsStart()) {
annotations.seqont.add(so_3_prime_UTR_variant);
} else if (gene.getCdsEnd() <= position) {
annotations.seqont.add(so_5_prime_UTR_variant);
} else {
int exon_index = gene.getExonCount() - 1;
while (exon_index >= 0) {
final KnownGene.Exon exon = gene.getExon(exon_index);
for (int i = exon.getEnd() - 1; i >= exon.getStart(); --i) {
if (i == position) {
annotations.exon_name = exon.getName();
if (exon.isNonCoding()) {
annotations.seqont.add(so_non_coding_exon_variant);
}
}
if (i >= gene.getCdsEnd())
continue;
if (i < gene.getCdsStart())
break;
if (wildRNA == null) {
wildRNA = new StringBuilder();
mutRNA = new MutedSequence(wildRNA);
}
if (i == position) {
annotations.seqont.add(so_exon);
position_in_cds = wildRNA.length();
annotations.position_cds = position_in_cds;
// in splicing ?
if (exon.isSplicing(position)) {
if (exon.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (exon.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ?
{
annotations.seqont.add(so_splicing_variant);
}
}
if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference())) {
mutRNA.put(position_in_cds, AcidNucleics.complement(alt2.getBaseString().charAt(0)));
}
}
wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(i)));
if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
wildProt = new ProteinCharSequence(geneticCode, wildRNA);
mutProt = new ProteinCharSequence(geneticCode, mutRNA);
}
}
final KnownGene.Intron intron = exon.getPrevIntron();
if (intron != null && intron.contains(position)) {
annotations.intron_name = intron.getName();
annotations.seqont.add(so_intron);
if (intron.isSplicing(position)) {
if (intron.isSplicingAcceptor(position)) {
annotations.seqont.add(so_splice_acceptor);
} else if (intron.isSplicingDonor(position)) {
annotations.seqont.add(so_splice_donor);
} else // ?
{
annotations.seqont.add(so_splicing_variant);
}
}
}
--exon_index;
}
}
}
if (isSimpleBase(alt2) && isSimpleBase(ctx.getReference()) && wildProt != null && mutProt != null && position_in_cds >= 0) {
final int pos_aa = position_in_cds / 3;
final int mod = position_in_cds % 3;
annotations.wildCodon = ("" + wildRNA.charAt(position_in_cds - mod + 0) + wildRNA.charAt(position_in_cds - mod + 1) + wildRNA.charAt(position_in_cds - mod + 2));
annotations.mutCodon = ("" + mutRNA.charAt(position_in_cds - mod + 0) + mutRNA.charAt(position_in_cds - mod + 1) + mutRNA.charAt(position_in_cds - mod + 2));
annotations.position_protein = (pos_aa + 1);
annotations.wildAA = String.valueOf(wildProt.charAt(pos_aa));
annotations.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
annotations.seqont.remove(so_exon);
if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
annotations.seqont.add(so_stop_lost);
} else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
annotations.seqont.add(so_stop_gained);
} else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
annotations.seqont.add(so_coding_synonymous);
} else {
annotations.seqont.add(so_coding_non_synonymous);
}
}
}
}
}
final Set<String> info = new HashSet<String>(ctx_annotations.size());
for (final Annotation a : ctx_annotations) {
info.add(a.toString());
}
final VariantContextBuilder vb = new VariantContextBuilder(ctx);
final String thetag;
switch(this.outputSyntax) {
case Vep:
thetag = "CSQ";
break;
case SnpEff:
thetag = "ANN";
break;
default:
thetag = TAG;
break;
}
vb.attribute(thetag, info.toArray());
w.add(vb.make());
}
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.referenceGenome);
}
}
use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.
the class VCFPredictions method loadKnownGenesFromUri.
private void loadKnownGenesFromUri() throws IOException {
BufferedReader in = null;
try {
if (this.referenceGenome.getDictionary() == null) {
throw new JvarkitException.FastaDictionaryMissing(this.referenceGenomeSource);
}
int n_ignored = 0;
int n_genes = 0;
this.knownGenes = new IntervalTreeMap<>();
LOG.info("loading genes");
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(this.referenceGenome.getDictionary());
contigNameConverter.setOnNotFound(OnNotFound.SKIP);
in = IOUtils.openURIForBufferedReading(this.kgURI);
String line;
final Pattern tab = Pattern.compile("[\t]");
while ((line = in.readLine()) != null) {
if (StringUtil.isBlank(line))
continue;
final String[] tokens = tab.split(line);
final KnownGene g = new KnownGene(tokens);
final String normalizedContig = contigNameConverter.apply(g.getContig());
if (StringUtil.isBlank(normalizedContig)) {
++n_ignored;
continue;
}
if (this.referenceGenome.getDictionary().getSequence(normalizedContig) == null) {
++n_ignored;
continue;
}
// because we want to set
final int extend_gene_search = 5000;
// SO:5KB_upstream_variant
final Interval interval = new Interval(normalizedContig, Math.max(1, g.getTxStart() + 1 - extend_gene_search), g.getTxEnd() + extend_gene_search);
List<KnownGene> L = this.knownGenes.get(interval);
if (L == null) {
L = new ArrayList<>(2);
this.knownGenes.put(interval, L);
}
++n_genes;
L.add(g);
}
in.close();
in = null;
LOG.info("genes:" + n_genes + " (ignored: " + n_ignored + ")");
} finally {
CloserUtil.close(in);
}
}
Aggregations