Search in sources :

Example 16 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class TView method paint.

void paint(final PrintStream out) {
    final Colorizer colorizer;
    switch(this.formatOut) {
        case html:
            colorizer = new HtmlColorizer(out);
            break;
        case tty:
            colorizer = new AnsiColorizer(out);
            break;
        case plain:
            colorizer = new Colorizer(out);
            break;
        default:
            throw new IllegalStateException();
    }
    if (interval == null) {
        LOG.warn("No interval defined");
        return;
    }
    final GenomicSequence contigSequence;
    final Function<Integer, Character> refPosToBase;
    if (indexedFastaSequenceFile != null) {
        final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(referenceFile);
        if (dict.getSequence(this.interval.getContig()) == null) {
            LOG.warn("No interval with contig " + interval + " in REF");
            return;
        }
        contigSequence = new GenomicSequence(indexedFastaSequenceFile, interval.getContig());
        refPosToBase = POS -> {
            if (POS < 0 || POS >= contigSequence.length())
                return 'N';
            return contigSequence.charAt(POS);
        };
    } else {
        contigSequence = null;
        refPosToBase = POS -> 'N';
    }
    /**
     * test if genomic position is in interval
     */
    final Predicate<Integer> testInInterval = new Predicate<Integer>() {

        @Override
        public boolean test(final Integer pos) {
            return interval.getStart() <= pos && pos <= interval.getEnd();
        }
    };
    final int pixelWidth = this.interval.length();
    final Map<Integer, Integer> genomicpos2insertlen = new TreeMap<>();
    final Map<String, List<SAMRecord>> group2record = new TreeMap<>();
    for (final SamReader samReader : this.samReaders) {
        SAMRecordIterator iter = samReader.query(this.interval.getContig(), this.interval.getStart(), this.interval.getEnd(), false);
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            if (rec.getReadUnmappedFlag())
                continue;
            if (rec.getCigar() == null)
                continue;
            if (getRecordFilter().filterOut(rec))
                continue;
            if (!rec.getContig().equals(interval.getContig()))
                continue;
            if (right().apply(rec) < this.interval.getStart())
                continue;
            if (this.interval.getEnd() < left().apply(rec))
                continue;
            String group = this.groupBy.getPartion(rec);
            if (group == null || group.isEmpty()) {
                group = "undefined_" + this.groupBy.name();
            }
            List<SAMRecord> records = group2record.get(group);
            if (records == null) {
                records = new ArrayList<>();
                group2record.put(group, records);
            }
            records.add(rec);
            // loop over cigar, get the longest insert
            int refpos = rec.getAlignmentStart();
            for (final CigarElement ce : rec.getCigar().getCigarElements()) {
                if (!this.showInsertions)
                    break;
                final CigarOperator op = ce.getOperator();
                if (op.equals(CigarOperator.I) && testInInterval.test(refpos)) {
                    final Integer longestInsert = genomicpos2insertlen.get(refpos);
                    if (longestInsert == null || longestInsert.compareTo(ce.getLength()) < 0) {
                        genomicpos2insertlen.put(refpos, ce.getLength());
                    }
                }
                if (op.consumesReferenceBases()) {
                    refpos += ce.getLength();
                }
                if (refpos > interval.getEnd())
                    break;
            }
        }
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
    }
    /**
     * compute where are the insertions
     */
    // LOG.debug(genomicpos2insertlen);
    final Predicate<Integer> insertIsPresentAtX = SCREENX -> {
        int x = 0;
        int ref = interval.getStart();
        while (x < pixelWidth) {
            if (x > SCREENX)
                return false;
            final Integer insertLen = genomicpos2insertlen.get(ref);
            if (insertLen == null) {
                ++x;
                ++ref;
            } else {
                if (x <= SCREENX && SCREENX < x + insertLen)
                    return true;
                // (+1) I DON'T UNDERSTAND WHY, BUT IT WORKS
                x += (insertLen + 1);
                ++ref;
            }
        }
        return false;
    };
    final Function<Character, AnsiColor> base2ansiColor = BASE -> {
        switch(Character.toUpperCase(BASE)) {
            case 'A':
                return AnsiColor.BLUE;
            case 'T':
                return AnsiColor.GREEN;
            case 'G':
                return AnsiColor.CYAN;
            case 'C':
                return AnsiColor.YELLOW;
            default:
                return null;
        }
    };
    /**
     * print interval title
     */
    out.println(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
    /**
     * paint base position
     */
    int ref = this.interval.getStart();
    int x = 0;
    out.print(margin("POS:"));
    while (x < pixelWidth) {
        if (insertIsPresentAtX.test(x)) {
            colorizer.pen(AnsiColor.RED).print("^");
            ++x;
        } else if ((ref - this.interval.getStart()) % 10 == 0) {
            final String f = String.format("%d", ref);
            for (int i = 0; i < f.length() && x < pixelWidth; ++i) {
                colorizer.pen(AnsiColor.GREEN).print(f.charAt(i));
                if (!insertIsPresentAtX.test(x))
                    ++ref;
                ++x;
            }
        } else {
            out.print(".");
            ++ref;
            ++x;
        }
    }
    out.println();
    /* paint ref base */
    out.print(margin("REF:"));
    ref = this.interval.getStart();
    x = 0;
    while (x < pixelWidth) {
        if (insertIsPresentAtX.test(x)) {
            colorizer.paper(AnsiColor.YELLOW).print("*");
            ++x;
        } else {
            char refBase = refPosToBase.apply(ref - 1);
            colorizer.pen(base2ansiColor.apply(refBase)).print(refBase);
            ++ref;
            ++x;
        }
    }
    out.println();
    /* loop over samples **/
    for (final String groupName : group2record.keySet()) {
        if (this.maxReadRowPerGroup == 0)
            continue;
        final ConsensusBuilder consensus = new ConsensusBuilder();
        int y_group = 0;
        final List<List<SAMRecord>> rows = new ArrayList<>();
        out.println(margin(""));
        switch(this.layoutReads) {
            case name:
                {
                    rows.addAll(group2record.get(groupName).stream().sorted((R1, R2) -> R1.getReadName().compareTo(R2.getReadName())).map(R -> Collections.singletonList(R)).collect(Collectors.toList()));
                    break;
                }
            default:
                {
                    /* pileup reads */
                    for (final SAMRecord rec : group2record.get(groupName)) {
                        int y = 0;
                        for (y = 0; y < rows.size(); ++y) {
                            final List<SAMRecord> row = rows.get(y);
                            final SAMRecord last = row.get(row.size() - 1);
                            if (right().apply(last) + this.distance_between_reads < left().apply(rec)) {
                                row.add(rec);
                                break;
                            }
                        }
                        if (y == rows.size()) {
                            final List<SAMRecord> row = new ArrayList<>();
                            row.add(rec);
                            rows.add(row);
                        }
                    }
                    break;
                }
        }
        // each row is only one read, so we need to print the groupName
        if (layoutReads == LayoutReads.name) {
            out.print(margin(groupName));
            out.println();
        }
        /* print each row */
        for (final List<SAMRecord> row : rows) {
            ++y_group;
            boolean print_this_line = (this.maxReadRowPerGroup < 0 || y_group <= this.maxReadRowPerGroup);
            if (print_this_line) {
                // each row is only one read, print the read name
                if (layoutReads == LayoutReads.name) {
                    String readName = row.get(0).getReadName();
                    if (row.get(0).getReadPairedFlag()) {
                        if (row.get(0).getFirstOfPairFlag()) {
                            readName += "/1";
                        }
                        if (row.get(0).getSecondOfPairFlag()) {
                            readName += "/2";
                        }
                    }
                    out.print(margin(readName));
                } else {
                    out.print(margin(y_group == 1 ? groupName : ""));
                }
            }
            ref = interval.getStart();
            x = 0;
            for (final SAMRecord rec : row) {
                int readRef = left().apply(rec);
                // pad before record
                while (x < pixelWidth && ref < readRef && testInInterval.test(ref)) {
                    if (!insertIsPresentAtX.test(x))
                        ++ref;
                    ++x;
                    if (print_this_line)
                        out.print(' ');
                    consensus.add(' ');
                }
                int readpos = 0;
                /* get read base function */
                final Function<Integer, Character> baseAt = new Function<Integer, Character>() {

                    @Override
                    public Character apply(final Integer readpos) {
                        final byte[] readBases = rec.getReadBases();
                        if (readBases == SAMRecord.NULL_SEQUENCE)
                            return 'N';
                        if (readpos < 0 || readpos >= rec.getReadLength())
                            return '?';
                        return (char) readBases[readpos];
                    }
                };
                for (final CigarElement ce : rec.getCigar()) {
                    final CigarOperator op = ce.getOperator();
                    if (op.equals(CigarOperator.PADDING))
                        continue;
                    /* IN INSERTION, only print if showInsertions is true */
                    if (this.showInsertions && op.equals(CigarOperator.I)) {
                        int cigarIdx = 0;
                        while (x < pixelWidth && cigarIdx < ce.getLength()) {
                            if (testInInterval.test(readRef)) {
                                final char readbase = baseAt.apply(readpos);
                                if (print_this_line)
                                    colorizer.paper(AnsiColor.RED).print(readbase);
                                consensus.add(readbase);
                                ++x;
                            }
                            ++cigarIdx;
                            ++readpos;
                        }
                        continue;
                    }
                    int cigarIdx = 0;
                    while (x < pixelWidth && cigarIdx < ce.getLength()) {
                        colorizer.clear();
                        // pad before base
                        while (x < pixelWidth && testInInterval.test(readRef) && (insertIsPresentAtX.test(x))) {
                            ++x;
                            if (print_this_line)
                                colorizer.paper(AnsiColor.YELLOW).print("*");
                            consensus.add(' ');
                            continue;
                        }
                        switch(op) {
                            case I:
                                {
                                    // if visible, processed above
                                    if (showInsertions)
                                        throw new IllegalStateException();
                                    readpos++;
                                    break;
                                }
                            case P:
                                break;
                            case H:
                                {
                                    if (showClip) {
                                        if (testInInterval.test(readRef)) {
                                            if (print_this_line)
                                                colorizer.paper(AnsiColor.YELLOW).print('N');
                                            // CLIPPED base not part of consensus
                                            consensus.add(' ');
                                            ++x;
                                        }
                                        ++readRef;
                                    }
                                    break;
                                }
                            case S:
                                {
                                    if (showClip) {
                                        if (testInInterval.test(readRef)) {
                                            final char readBase = baseAt.apply(readpos);
                                            if (print_this_line)
                                                colorizer.paper(AnsiColor.YELLOW).print(readBase);
                                            // CLIPPED base not part of consensus
                                            consensus.add(' ');
                                            ++x;
                                        }
                                        ++readpos;
                                        ++readRef;
                                    } else {
                                        readpos++;
                                    }
                                    break;
                                }
                            case D:
                            case N:
                                {
                                    if (testInInterval.test(readRef)) {
                                        if (print_this_line)
                                            colorizer.paper(AnsiColor.RED).print('-');
                                        // deletion not not part of consensus
                                        consensus.add(' ');
                                        ++x;
                                    }
                                    ++readRef;
                                    break;
                                }
                            case EQ:
                            case M:
                            case X:
                                {
                                    if (testInInterval.test(readRef)) {
                                        final char refBase = Character.toUpperCase(refPosToBase.apply(readRef - 1));
                                        char readBase = Character.toUpperCase(baseAt.apply(readpos));
                                        consensus.add(readBase);
                                        colorizer.pen(base2ansiColor.apply(readBase));
                                        if (op.equals(CigarOperator.X) || (refBase != 'N' && readBase != 'N' && readBase != refBase)) {
                                            colorizer.pen(AnsiColor.RED);
                                        } else if (hideBases) {
                                            if (rec.getReadNegativeStrandFlag()) {
                                                readBase = ',';
                                            } else {
                                                readBase = '.';
                                            }
                                        }
                                        if (showReadName) {
                                            final String readName = rec.getReadName();
                                            if (readpos < 0 || readpos >= readName.length()) {
                                                readBase = '_';
                                            } else {
                                                readBase = readName.charAt(readpos);
                                            }
                                        }
                                        if (rec.getReadNegativeStrandFlag()) {
                                            readBase = Character.toLowerCase(readBase);
                                        } else {
                                            readBase = Character.toUpperCase(readBase);
                                        }
                                        if (print_this_line)
                                            colorizer.print(readBase);
                                        ++x;
                                    }
                                    ++readpos;
                                    ++readRef;
                                    break;
                                }
                        }
                        ++cigarIdx;
                    }
                }
                // end of loop cigar
                ref = readRef;
            }
            // out.println( " "+ref+" "+row.get(0).getAlignmentStart()+" "+row.get(0).getCigarString()+" "+row.get(0).getReadString());
            while (x < pixelWidth) {
                if (print_this_line)
                    out.print(" ");
                ++x;
            }
            if (print_this_line)
                out.println();
            consensus.eol();
            if (out.checkError())
                break;
        }
        if (out.checkError())
            break;
        if (!this.hideConsensus && consensus.bases.stream().anyMatch(C -> C.getCoverage() > 0)) {
            out.print(margin(groupName + " CONSENSUS"));
            x = 0;
            ref = interval.getStart();
            while (x < consensus.bases.size()) {
                final char refBase = Character.toUpperCase(refPosToBase.apply(ref - 1));
                final char consensusBase = consensus.bases.get(x).getConsensus();
                if (Character.isWhitespace(consensusBase)) {
                // nothing
                } else if (refBase != 'N' && consensusBase != refBase) {
                    colorizer.pen(AnsiColor.RED);
                } else {
                    colorizer.pen(base2ansiColor.apply(consensusBase));
                }
                if (!insertIsPresentAtX.test(x))
                    ++ref;
                colorizer.print(consensusBase);
                ++x;
            }
            out.println();
        }
        if (this.numCoverageRows > 0) {
            int minCov = consensus.bases.stream().mapToInt(C -> C.getCoverage()).min().orElse(0);
            final int maxCov = consensus.bases.stream().mapToInt(C -> C.getCoverage()).max().orElse(0);
            for (int y = 0; maxCov > 0 && y < this.numCoverageRows; ++y) {
                if (minCov == maxCov)
                    minCov--;
                double fract = (maxCov - minCov) / ((double) this.numCoverageRows);
                int inverse_y = (this.numCoverageRows - 1) - y;
                int d0 = (int) ((fract) * inverse_y);
                // int d1 = (int)((fract) * (inverse_y+1));
                out.print(margin(y == 0 ? groupName + " " + maxCov : (y + 1 == this.numCoverageRows ? String.valueOf(minCov) : "")));
                for (x = 0; x < consensus.bases.size(); ++x) {
                    int depth = consensus.bases.get(x).getCoverage() - minCov;
                    colorizer.print(depth >= d0 ? BLACK_SQUARE : ' ');
                }
                out.println();
            }
        }
    }
    if (this.tabixKnownGene != null && this.indexedFastaSequenceFile != null) {
        final List<KnownGene> genes = this.tabixKnownGene.getItemsInInterval(this.interval);
        if (!genes.isEmpty()) {
            out.println(this.knownGeneUri);
            for (final KnownGene gene : genes) {
                final KnownGene.CodingRNA codingRna = gene.getCodingRNA(contigSequence);
                final KnownGene.Peptide peptide = codingRna.getPeptide();
                out.print(margin(gene.getName()));
                x = 0;
                int ref0 = this.interval.getStart() - 1;
                while (x < pixelWidth) {
                    if (insertIsPresentAtX.test(x)) {
                        out.print("*");
                        ++x;
                    } else {
                        char pepChar = ' ';
                        if (ref0 >= gene.getTxStart() && ref0 < gene.getTxEnd()) {
                            pepChar = (gene.isPositiveStrand() ? '>' : '<');
                            int pepIdx = peptide.convertGenomicToPeptideCoordinate(ref0);
                            if (pepIdx != -1) {
                                final String aa3 = GeneticCode.aminoAcidTo3Letters(peptide.charAt(pepIdx));
                                final int[] offset = peptide.convertToGenomicCoordinates(pepIdx);
                                if (offset != null && offset.length == 3 && aa3 != null && aa3.length() == 3) {
                                    if (offset[0] == ref0)
                                        pepChar = aa3.charAt(0);
                                    else if (offset[1] == ref0)
                                        pepChar = aa3.charAt(1);
                                    else if (offset[2] == ref0)
                                        pepChar = aa3.charAt(2);
                                    else
                                        pepChar = '?';
                                } else {
                                    pepChar = '?';
                                }
                            }
                        }
                        out.print(pepChar);
                        ++ref0;
                        ++x;
                    }
                }
                while (x < pixelWidth) {
                    out.print(" ");
                    ++x;
                }
                out.println();
            }
        }
        out.println();
    }
    /**
     * variant section
     */
    if (!this.vcfReaders.isEmpty() && !out.checkError()) {
        final Function<GenotypeType, Character> gTypeToSymbol = new Function<GenotypeType, Character>() {

            @Override
            public Character apply(final GenotypeType gt) {
                switch(gt) {
                    case NO_CALL:
                        return '?';
                    case HOM_REF:
                        return '0';
                    case HET:
                        return '1';
                    case HOM_VAR:
                        return '2';
                    case MIXED:
                        return 'm';
                    case UNAVAILABLE:
                        return 'u';
                    default:
                        return '.';
                }
            }
        };
        out.println();
        for (final VcfSource r : this.vcfReaders) {
            if (out.checkError())
                break;
            final VCFHeader header = r.vcfFileReader.getFileHeader();
            final CloseableIterator<VariantContext> iter = r.vcfFileReader.query(this.interval.getContig(), interval.getStart(), interval.getEnd());
            final List<VariantContext> variants = new ArrayList<>();
            while (iter.hasNext()) {
                variants.add(iter.next());
            }
            iter.close();
            if (variants.isEmpty())
                continue;
            out.println(r.vcfFile.getPath());
            if (header.hasGenotypingData()) {
                for (final String sample : header.getSampleNamesInOrder()) {
                    if (!variants.stream().map(V -> V.getGenotype(sample)).filter(G -> !hideNoCall || (hideNoCall && !G.isNoCall())).filter(G -> !hideHomRef || (hideHomRef && !G.isHomRef())).findAny().isPresent()) {
                        continue;
                    }
                    out.print(margin(sample));
                    ref = this.interval.getStart();
                    x = 0;
                    while (x < pixelWidth) {
                        if (insertIsPresentAtX.test(x)) {
                            out.print("*");
                            ++x;
                        } else {
                            char refBase = ' ';
                            for (final VariantContext ctx : variants) {
                                if (ctx.getStart() == ref) {
                                    final Genotype g = ctx.getGenotype(sample);
                                    if (g.isNoCall() && this.hideNoCall)
                                        continue;
                                    if (g.isHomRef() && this.hideHomRef)
                                        continue;
                                    refBase = gTypeToSymbol.apply(g.getType());
                                    break;
                                }
                            }
                            out.print(refBase);
                            ++ref;
                            ++x;
                        }
                    }
                    out.println();
                }
            } else // no genotype
            {
                for (final VariantContext ctx : variants) {
                    out.print(margin(String.valueOf(ctx.getStart()) + ":" + ctx.getReference().getDisplayString() + "/" + ctx.getAlternateAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining(","))));
                    ref = this.interval.getStart();
                    x = 0;
                    while (x < pixelWidth) {
                        if (insertIsPresentAtX.test(x)) {
                            out.print("*");
                            ++x;
                        } else {
                            out.print(ctx.getStart() == ref ? '+' : ' ');
                            ++ref;
                            ++x;
                        }
                    }
                    out.println();
                }
            }
        }
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Parameter(com.beust.jcommander.Parameter) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) Map(java.util.Map) XMLStreamException(javax.xml.stream.XMLStreamException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) SAMSequenceDictionaryExtractor(htsjdk.variant.utils.SAMSequenceDictionaryExtractor) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintStream(java.io.PrintStream) Counter(com.github.lindenb.jvarkit.util.Counter) Predicate(java.util.function.Predicate) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) GenotypeType(htsjdk.variant.variantcontext.GenotypeType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) SamFilterParser(com.github.lindenb.jvarkit.util.bio.samfilter.SamFilterParser) IOException(java.io.IOException) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) TabixKnownGeneFileReader(com.github.lindenb.jvarkit.util.ucsc.TabixKnownGeneFileReader) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) List(java.util.List) SamInputResource(htsjdk.samtools.SamInputResource) TreeMap(java.util.TreeMap) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Closeable(java.io.Closeable) VariantContext(htsjdk.variant.variantcontext.VariantContext) Collections(java.util.Collections) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Predicate(java.util.function.Predicate) SamReader(htsjdk.samtools.SamReader) Function(java.util.function.Function) ArrayList(java.util.ArrayList) List(java.util.List) VCFHeader(htsjdk.variant.vcf.VCFHeader) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) TreeMap(java.util.TreeMap) CigarElement(htsjdk.samtools.CigarElement) SAMRecord(htsjdk.samtools.SAMRecord) GenotypeType(htsjdk.variant.variantcontext.GenotypeType) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene)

Example 17 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VCFCombineTwoSnvs method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, File saveAs) {
    BufferedReader bufferedReader = null;
    htsjdk.variant.variantcontext.writer.VariantContextWriter w = null;
    SortingCollection<CombinedMutation> mutations = null;
    CloseableIterator<Variant> varIter = null;
    CloseableIterator<CombinedMutation> mutIter = null;
    Map<String, SamReader> sample2samReader = new HashMap<>();
    try {
        bufferedReader = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(bufferedReader);
        /* get VCF header */
        final VCFHeader header = cah.header;
        final Set<String> sampleNamesInOrder = new HashSet<>(header.getSampleNamesInOrder());
        LOG.info("opening REF:" + referenceFile);
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.referenceFile);
        final SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        if (dict == null)
            throw new IOException("dictionary missing");
        if (this.bamIn != null) {
            /**
             * unroll and open bam file
             */
            for (final File bamFile : IOUtils.unrollFileCollection(Collections.singletonList(this.bamIn))) {
                LOG.info("opening BAM :" + this.bamIn);
                final SamReader samReader = SamReaderFactory.makeDefault().referenceSequence(this.referenceFile).validationStringency(ValidationStringency.LENIENT).open(this.bamIn);
                if (!samReader.hasIndex()) {
                    throw new IOException("Sam file is NOT indexed: " + bamFile);
                }
                final SAMFileHeader samHeader = samReader.getFileHeader();
                if (samHeader.getSequenceDictionary() == null || !SequenceUtil.areSequenceDictionariesEqual(dict, samReader.getFileHeader().getSequenceDictionary())) {
                    throw new IOException(bamFile + " and REF don't have the same Sequence Dictionary.");
                }
                /* get sample name */
                String sampleName = null;
                for (final SAMReadGroupRecord rg : samHeader.getReadGroups()) {
                    if (rg.getSample() == null)
                        continue;
                    if (sampleName != null && !sampleName.equals(rg.getSample())) {
                        samReader.close();
                        throw new IOException(bamFile + " Contains two samples " + sampleName + " " + rg.getSample());
                    }
                    sampleName = rg.getSample();
                }
                if (sampleName == null) {
                    samReader.close();
                    LOG.warn("no sample in " + bamFile);
                    continue;
                }
                if (!sampleNamesInOrder.contains(sampleName)) {
                    samReader.close();
                    LOG.warn("no sample " + sampleName + " in vcf");
                    continue;
                }
                sample2samReader.put(sampleName, samReader);
            }
        }
        loadKnownGenesFromUri();
        this.variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), new VariantComparator(dict), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        this.variants.setDestructiveIteration(true);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        String vcfLine = null;
        while ((vcfLine = bufferedReader.readLine()) != null) {
            final VariantContext ctx = progress.watch(cah.codec.decode(vcfLine));
            /* discard non SNV variant */
            if (!ctx.isVariant() || ctx.isIndel()) {
                continue;
            }
            /* find the overlapping genes : extend the interval of the variant to include the stop codon */
            final Collection<KnownGene> genes = new ArrayList<>();
            for (List<KnownGene> lkg : this.knownGenes.getOverlapping(new Interval(ctx.getContig(), Math.max(1, ctx.getStart() - 3), ctx.getEnd() + 3))) {
                genes.addAll(lkg);
            }
            final List<Allele> alternateAlleles = ctx.getAlternateAlleles();
            /* loop over overlapping genes */
            for (final KnownGene kg : genes) {
                /* loop over available alleles */
                for (int allele_idx = 0; allele_idx < alternateAlleles.size(); ++allele_idx) {
                    final Allele alt = alternateAlleles.get(allele_idx);
                    challenge(ctx, alt, kg, vcfLine);
                }
            }
        }
        progress.finish();
        this.variants.doneAdding();
        mutations = SortingCollection.newInstance(CombinedMutation.class, new MutationCodec(), new MutationComparator(dict), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        mutations.setDestructiveIteration(true);
        final VCFFilterHeaderLine vcfFilterHeaderLine = new VCFFilterHeaderLine("TwoHaplotypes", "(number of reads carrying both mutation) < (reads carrying variant 1 + reads carrying variant 2) ");
        varIter = this.variants.iterator();
        progress = new SAMSequenceDictionaryProgress(header);
        final ArrayList<Variant> buffer = new ArrayList<>();
        for (; ; ) {
            Variant variant = null;
            if (varIter.hasNext()) {
                variant = varIter.next();
                progress.watch(variant.contig, variant.genomicPosition1);
            }
            if (variant == null || !(!buffer.isEmpty() && buffer.get(0).contig.equals(variant.contig) && buffer.get(0).transcriptName.equals(variant.transcriptName))) {
                if (!buffer.isEmpty()) {
                    for (int i = 0; i < buffer.size(); ++i) {
                        final Variant v1 = buffer.get(i);
                        for (int j = i + 1; j < buffer.size(); ++j) {
                            final Variant v2 = buffer.get(j);
                            if (v1.codonStart() != v2.codonStart())
                                continue;
                            if (v1.positionInCodon() == v2.positionInCodon())
                                continue;
                            if (!v1.wildCodon.equals(v2.wildCodon)) {
                                throw new IllegalStateException();
                            }
                            final StringBuilder combinedCodon = new StringBuilder(v1.wildCodon);
                            combinedCodon.setCharAt(v1.positionInCodon(), v1.mutCodon.charAt(v1.positionInCodon()));
                            combinedCodon.setCharAt(v2.positionInCodon(), v2.mutCodon.charAt(v2.positionInCodon()));
                            final String pwild = new ProteinCharSequence(v1.wildCodon).getString();
                            final String p1 = new ProteinCharSequence(v1.mutCodon).getString();
                            final String p2 = new ProteinCharSequence(v2.mutCodon).getString();
                            final String pCombined = new ProteinCharSequence(combinedCodon).getString();
                            final String combinedSO;
                            final String combinedType;
                            /* both AA are synonymous, while combined is not */
                            if (!pCombined.equals(pwild) && p1.equals(pwild) && p2.equals(pwild)) {
                                combinedType = "combined_is_nonsynonymous";
                                if (pCombined.equals("*")) {
                                    /* http://www.sequenceontology.org/browser/current_svn/term/SO:0001587 */
                                    combinedSO = "stop_gained";
                                } else if (pwild.equals("*")) {
                                    /* http://www.sequenceontology.org/browser/current_svn/term/SO:0002012 */
                                    combinedSO = "stop_lost";
                                } else {
                                    /* http://www.sequenceontology.org/miso/current_svn/term/SO:0001992 */
                                    combinedSO = "nonsynonymous_variant";
                                }
                            } else if (!pCombined.equals(p1) && !pCombined.equals(p2) && !pCombined.equals(pwild)) {
                                combinedType = "combined_is_new";
                                if (pCombined.equals("*")) {
                                    /* http://www.sequenceontology.org/browser/current_svn/term/SO:0001587 */
                                    combinedSO = "stop_gained";
                                } else {
                                    /* http://www.sequenceontology.org/miso/current_svn/term/SO:0001992 */
                                    combinedSO = "nonsynonymous_variant";
                                }
                            } else {
                                combinedType = null;
                                combinedSO = null;
                            }
                            /**
                             * ok, there is something interesting here ,
                             * create two new Mutations carrying the
                             * two variants
                             */
                            if (combinedSO != null) {
                                /**
                                 * grantham score is max found combined vs (p1/p2/wild)
                                 */
                                int grantham_score = GranthamScore.score(pCombined.charAt(0), pwild.charAt(0));
                                grantham_score = Math.max(grantham_score, GranthamScore.score(pCombined.charAt(0), p1.charAt(0)));
                                grantham_score = Math.max(grantham_score, GranthamScore.score(pCombined.charAt(0), p2.charAt(0)));
                                /**
                                 * info that will be displayed in the vcf
                                 */
                                final Map<String, Object> info1 = v1.getInfo(v2);
                                final Map<String, Object> info2 = v2.getInfo(v1);
                                // filter for this combined: default it fails the filter
                                String filter = vcfFilterHeaderLine.getID();
                                final Map<String, Object> combinedMap = new LinkedHashMap<>();
                                combinedMap.put("CombinedCodon", combinedCodon);
                                combinedMap.put("CombinedAA", pCombined);
                                combinedMap.put("CombinedSO", combinedSO);
                                combinedMap.put("CombinedType", combinedType);
                                combinedMap.put("GranthamScore", grantham_score);
                                info1.putAll(combinedMap);
                                info2.putAll(combinedMap);
                                final Map<String, CoverageInfo> sample2coverageInfo = new HashMap<>(sample2samReader.size());
                                final int chromStart = Math.min(v1.genomicPosition1, v2.genomicPosition1);
                                final int chromEnd = Math.max(v1.genomicPosition1, v2.genomicPosition1);
                                /* get phasing info for each sample*/
                                for (final String sampleName : sample2samReader.keySet()) {
                                    final SamReader samReader = sample2samReader.get(sampleName);
                                    final CoverageInfo covInfo = new CoverageInfo();
                                    sample2coverageInfo.put(sampleName, covInfo);
                                    SAMRecordIterator iter = null;
                                    try {
                                        iter = samReader.query(v1.contig, chromStart, chromEnd, false);
                                        while (iter.hasNext()) {
                                            final SAMRecord rec = iter.next();
                                            if (rec.getReadUnmappedFlag())
                                                continue;
                                            if (rec.isSecondaryOrSupplementary())
                                                continue;
                                            if (rec.getDuplicateReadFlag())
                                                continue;
                                            if (rec.getReadFailsVendorQualityCheckFlag())
                                                continue;
                                            // get DEPTh for variant 1
                                            if (rec.getAlignmentStart() <= v1.genomicPosition1 && v1.genomicPosition1 <= rec.getAlignmentEnd()) {
                                                covInfo.depth1++;
                                            }
                                            // get DEPTh for variant 2
                                            if (rec.getAlignmentStart() <= v2.genomicPosition1 && v2.genomicPosition1 <= rec.getAlignmentEnd()) {
                                                covInfo.depth2++;
                                            }
                                            if (rec.getAlignmentEnd() < chromEnd)
                                                continue;
                                            if (rec.getAlignmentStart() > chromStart)
                                                continue;
                                            final Cigar cigar = rec.getCigar();
                                            if (cigar == null)
                                                continue;
                                            final byte[] bases = rec.getReadBases();
                                            if (bases == null)
                                                continue;
                                            int refpos1 = rec.getAlignmentStart();
                                            int readpos = 0;
                                            boolean found_variant1_on_this_read = false;
                                            boolean found_variant2_on_this_read = false;
                                            /**
                                             * loop over cigar
                                             */
                                            for (final CigarElement ce : cigar.getCigarElements()) {
                                                final CigarOperator op = ce.getOperator();
                                                switch(op) {
                                                    case P:
                                                        continue;
                                                    case S:
                                                    case I:
                                                        readpos += ce.getLength();
                                                        break;
                                                    case D:
                                                    case N:
                                                        refpos1 += ce.getLength();
                                                        break;
                                                    case H:
                                                        continue;
                                                    case EQ:
                                                    case M:
                                                    case X:
                                                        for (int x = 0; x < ce.getLength(); ++x) {
                                                            if (refpos1 == v1.genomicPosition1 && same(bases[readpos], v1.altAllele)) {
                                                                found_variant1_on_this_read = true;
                                                            } else if (refpos1 == v2.genomicPosition1 && same(bases[readpos], v2.altAllele)) {
                                                                found_variant2_on_this_read = true;
                                                            }
                                                            refpos1++;
                                                            readpos++;
                                                        }
                                                        break;
                                                    default:
                                                        throw new IllegalStateException(op.name());
                                                }
                                                /* skip remaining bases after last variant */
                                                if (refpos1 > chromEnd)
                                                    break;
                                            }
                                            /* sum-up what we found */
                                            if (found_variant1_on_this_read && found_variant2_on_this_read) {
                                                covInfo.count_reads_having_both_variants++;
                                            } else if (!found_variant1_on_this_read && !found_variant2_on_this_read) {
                                                covInfo.count_reads_having_no_variants++;
                                            } else if (found_variant1_on_this_read) {
                                                covInfo.count_reads_having_variant1++;
                                            } else if (found_variant2_on_this_read) {
                                                covInfo.count_reads_having_variant2++;
                                            }
                                        }
                                    /* end of loop over reads */
                                    } finally {
                                        iter.close();
                                        iter = null;
                                    }
                                    info1.put("N_READS_BOTH_VARIANTS_" + sampleName, covInfo.count_reads_having_both_variants);
                                    info2.put("N_READS_BOTH_VARIANTS_" + sampleName, covInfo.count_reads_having_both_variants);
                                    info1.put("N_READS_NO_VARIANTS_" + sampleName, covInfo.count_reads_having_no_variants);
                                    info2.put("N_READS_NO_VARIANTS_" + sampleName, covInfo.count_reads_having_no_variants);
                                    info1.put("N_READS_TOTAL_" + sampleName, covInfo.count_reads_having_both_variants + covInfo.count_reads_having_no_variants + covInfo.count_reads_having_variant1 + covInfo.count_reads_having_variant2);
                                    info2.put("N_READS_TOTAL_" + sampleName, covInfo.count_reads_having_both_variants + covInfo.count_reads_having_no_variants + covInfo.count_reads_having_variant1 + covInfo.count_reads_having_variant2);
                                    // count for variant 1
                                    info1.put("N_READS_ONLY_1_" + sampleName, covInfo.count_reads_having_variant1);
                                    info1.put("N_READS_ONLY_2_" + sampleName, covInfo.count_reads_having_variant2);
                                    info1.put("DEPTH_1_" + sampleName, covInfo.depth1);
                                    // inverse previous count
                                    info2.put("N_READS_ONLY_1_" + sampleName, covInfo.count_reads_having_variant2);
                                    info2.put("N_READS_ONLY_2_" + sampleName, covInfo.count_reads_having_variant1);
                                    info2.put("DEPTH_2_" + sampleName, covInfo.depth2);
                                    /* number of reads with both variant is greater than
									 * reads carrying only one variant: reset the filter 
									 */
                                    if (2 * covInfo.count_reads_having_both_variants > (covInfo.count_reads_having_variant1 + covInfo.count_reads_having_variant2)) {
                                        /* reset filter */
                                        filter = VCFConstants.UNFILTERED;
                                        info1.put("FILTER_1_" + sampleName, ".");
                                        info2.put("FILTER_2_" + sampleName, ".");
                                    } else {
                                        info1.put("FILTER_1_" + sampleName, vcfFilterHeaderLine.getID());
                                        info2.put("FILTER_2_" + sampleName, vcfFilterHeaderLine.getID());
                                    }
                                }
                                /* end of loop over bams */
                                final CombinedMutation m1 = new CombinedMutation();
                                m1.contig = v1.contig;
                                m1.genomicPosition1 = v1.genomicPosition1;
                                m1.id = v1.id;
                                m1.refAllele = v1.refAllele;
                                m1.altAllele = v1.altAllele;
                                m1.vcfLine = v1.vcfLine;
                                m1.info = mapToString(info1);
                                m1.filter = filter;
                                m1.grantham_score = grantham_score;
                                m1.sorting_id = ID_GENERATOR++;
                                mutations.add(m1);
                                final CombinedMutation m2 = new CombinedMutation();
                                m2.contig = v2.contig;
                                m2.genomicPosition1 = v2.genomicPosition1;
                                m2.id = v2.id;
                                m2.refAllele = v2.refAllele;
                                m2.altAllele = v2.altAllele;
                                m2.vcfLine = v2.vcfLine;
                                m2.info = mapToString(info2);
                                m2.filter = filter;
                                m2.grantham_score = grantham_score;
                                m2.sorting_id = ID_GENERATOR++;
                                mutations.add(m2);
                            }
                        }
                    }
                }
                buffer.clear();
                if (variant == null)
                    break;
            }
            buffer.add(variant);
        }
        progress.finish();
        mutations.doneAdding();
        varIter.close();
        varIter = null;
        variants.cleanup();
        variants = null;
        final ArrayList<CombinedMutation> mBuffer = new ArrayList<>();
        final VCFHeader header2 = new VCFHeader(header);
        header2.addMetaDataLine(new VCFHeaderLine(getProgramName() + "AboutQUAL", "QUAL is filled with Grantham Score  http://www.ncbi.nlm.nih.gov/pubmed/4843792"));
        final StringBuilder infoDesc = new StringBuilder("Variant affected by two distinct mutation. Format is defined in the INFO column. ");
        final VCFInfoHeaderLine infoHeaderLine = new VCFInfoHeaderLine("CodonVariant", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, infoDesc.toString());
        super.addMetaData(header2);
        header2.addMetaDataLine(infoHeaderLine);
        if (!sample2samReader.isEmpty()) {
            header2.addMetaDataLine(vcfFilterHeaderLine);
        }
        w = super.openVariantContextWriter(saveAs);
        w.writeHeader(header2);
        progress = new SAMSequenceDictionaryProgress(header);
        mutIter = mutations.iterator();
        for (; ; ) {
            CombinedMutation mutation = null;
            if (mutIter.hasNext()) {
                mutation = mutIter.next();
                progress.watch(mutation.contig, mutation.genomicPosition1);
            }
            if (mutation == null || !(!mBuffer.isEmpty() && mBuffer.get(0).contig.equals(mutation.contig) && mBuffer.get(0).genomicPosition1 == mutation.genomicPosition1 && mBuffer.get(0).refAllele.equals(mutation.refAllele))) {
                if (!mBuffer.isEmpty()) {
                    // default grantham score used in QUAL
                    int grantham_score = -1;
                    // default filter fails
                    String filter = vcfFilterHeaderLine.getID();
                    final CombinedMutation first = mBuffer.get(0);
                    final Set<String> info = new HashSet<>();
                    final VariantContext ctx = cah.codec.decode(first.vcfLine);
                    final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                    vcb.chr(first.contig);
                    vcb.start(first.genomicPosition1);
                    vcb.stop(first.genomicPosition1 + first.refAllele.length() - 1);
                    if (!first.id.equals(VCFConstants.EMPTY_ID_FIELD))
                        vcb.id(first.id);
                    for (final CombinedMutation m : mBuffer) {
                        info.add(m.info);
                        grantham_score = Math.max(grantham_score, m.grantham_score);
                        if (VCFConstants.UNFILTERED.equals(m.filter)) {
                            // at least one SNP is ok one this line
                            filter = null;
                        }
                    }
                    vcb.unfiltered();
                    if (filter != null && !sample2samReader.isEmpty()) {
                        vcb.filter(filter);
                    } else {
                        vcb.passFilters();
                    }
                    vcb.attribute(infoHeaderLine.getID(), new ArrayList<String>(info));
                    if (grantham_score > 0) {
                        vcb.log10PError(grantham_score / -10.0);
                    } else {
                        vcb.log10PError(VariantContext.NO_LOG10_PERROR);
                    }
                    w.add(vcb.make());
                }
                mBuffer.clear();
                if (mutation == null)
                    break;
            }
            mBuffer.add(mutation);
        }
        progress.finish();
        mutIter.close();
        mutations.cleanup();
        mutations = null;
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(mutIter);
        CloserUtil.close(varIter);
        if (this.variants != null)
            this.variants.cleanup();
        if (mutations != null)
            mutations.cleanup();
        this.variants = null;
        for (SamReader r : sample2samReader.values()) CloserUtil.close(r);
        CloserUtil.close(w);
        CloserUtil.close(bufferedReader);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) LinkedHashMap(java.util.LinkedHashMap) HashSet(java.util.HashSet) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) SAMRecord(htsjdk.samtools.SAMRecord) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) SAMFileHeader(htsjdk.samtools.SAMFileHeader) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) File(java.io.File) Interval(htsjdk.samtools.util.Interval) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader)

Example 18 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VCFCombineTwoSnvs method loadKnownGenesFromUri.

/**
 * load KnownGenes
 */
private void loadKnownGenesFromUri() throws IOException {
    BufferedReader in = null;
    try {
        final SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        if (dict == null)
            throw new IOException("dictionary missing");
        LOG.info("loading genes from " + this.kgURI);
        in = IOUtils.openURIForBufferedReading(this.kgURI);
        final Pattern tab = Pattern.compile("[\t]");
        String line = null;
        while ((line = in.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty())
                continue;
            String[] tokens = tab.split(line);
            final KnownGene g = new KnownGene(tokens);
            if (g.isNonCoding())
                continue;
            if (dict.getSequence(g.getContig()) == null) {
                LOG.warn("Unknown chromosome " + g.getContig() + " in dictionary");
                continue;
            }
            // use 1 based interval
            final Interval interval = new Interval(g.getContig(), g.getTxStart() + 1, g.getTxEnd());
            List<KnownGene> lkg = this.knownGenes.get(interval);
            if (lkg == null) {
                lkg = new ArrayList<>(2);
                this.knownGenes.put(interval, lkg);
            }
            lkg.add(g);
        }
        CloserUtil.close(in);
        in = null;
        LOG.info("genes:" + knownGenes.size());
    } finally {
        CloserUtil.close(in);
    }
}
Also used : Pattern(java.util.regex.Pattern) BufferedReader(java.io.BufferedReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) IOException(java.io.IOException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Interval(htsjdk.samtools.util.Interval)

Example 19 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VCFCombineTwoSnvs method challenge.

private void challenge(final VariantContext ctx, final Allele allele, final KnownGene gene, final String vcfLine) throws IOException {
    if (allele.isSymbolic())
        return;
    if (allele.isNoCall())
        return;
    if (!allele.isCalled())
        return;
    if (allele.length() != 1)
        return;
    if (ctx.getReference().length() != 1)
        return;
    if (allele.equals(Allele.SPAN_DEL))
        return;
    if (genomicSequence == null || !genomicSequence.getChrom().equals(ctx.getContig())) {
        LOG.info("getting genomic Sequence for " + gene.getContig());
        genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, gene.getContig());
    }
    final int positionContext0 = ctx.getStart() - 1;
    Variant variant = null;
    final StringBuilder wildRNA = new StringBuilder(1000);
    final MutedSequence mutRNA = new MutedSequence(wildRNA);
    if (gene.isPositiveStrand()) {
        for (int exon_index = 0; exon_index < gene.getExonCount(); ++exon_index) {
            final KnownGene.Exon exon = gene.getExon(exon_index);
            int genomicPosition = Math.max(gene.getCdsStart(), exon.getStart());
            for (; ; ) {
                // we need to consider the last stop codon
                if (genomicPosition >= exon.getEnd() || genomicPosition >= gene.getCdsEnd()) {
                    break;
                }
                wildRNA.append(genomicSequence.charAt(genomicPosition));
                if (variant == null && positionContext0 == genomicPosition) {
                    variant = new Variant(ctx, allele, gene);
                    variant.sorting_id = ID_GENERATOR++;
                    variant.position_in_cdna = wildRNA.length() - 1;
                    char mutBase = allele.getBaseString().charAt(0);
                    mutRNA.setMutation(wildRNA.length() - 1, wildRNA.length(), "" + mutBase);
                }
                ++genomicPosition;
            }
        }
    } else {
        int exon_index = gene.getExonCount() - 1;
        while (exon_index >= 0) {
            final KnownGene.Exon exon = gene.getExon(exon_index);
            int genomicPosition = Math.min(gene.getCdsEnd() - 1, exon.getEnd() - 1);
            for (; ; ) {
                if (genomicPosition < exon.getStart() || genomicPosition < gene.getCdsStart()) {
                    break;
                }
                wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(genomicPosition)));
                if (variant == null && positionContext0 == genomicPosition) {
                    variant = new Variant(ctx, allele, gene);
                    variant.sorting_id = ID_GENERATOR++;
                    variant.position_in_cdna = wildRNA.length() - 1;
                    char mutBase = AcidNucleics.complement(allele.getBaseString().charAt(0));
                    mutRNA.setMutation(wildRNA.length() - 1, wildRNA.length(), "" + mutBase);
                }
                --genomicPosition;
            }
            --exon_index;
        }
    }
    if (variant != null) {
        variant.wildCodon = "";
        variant.mutCodon = "";
        for (int i = 0; i < 3; ++i) {
            int pos = variant.codonStart() + i;
            variant.wildCodon += (pos < wildRNA.length() ? wildRNA.charAt(pos) : '*');
            variant.mutCodon += (pos < mutRNA.length() ? mutRNA.charAt(pos) : '*');
        }
        variant.wildCodon = variant.wildCodon.toUpperCase();
        variant.mutCodon = variant.mutCodon.toUpperCase();
        variant.vcfLine = vcfLine;
        if (variant.wildCodon.equals(variant.mutCodon)) {
            LOG.info("Uh??????? " + allele + " " + ctx);
            return;
        }
        this.variants.add(variant);
    }
}
Also used : GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene)

Example 20 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class BackLocate method backLocate.

private void backLocate(final PrintStream out, final KnownGene gene, final String geneName, char aa1, char aa2, int peptidePos1) throws IOException {
    final GeneticCode geneticCode = getGeneticCodeByChromosome(gene.getChromosome());
    RNASequence wildRNA = null;
    ProteinCharSequence wildProt = null;
    if (this.genomicContig == null || !this.genomicContig.hasName(gene.getContig())) {
        LOG.info("fetch genome");
        this.genomicContig = this.referenceGenome.getContig(gene.getContig());
        if (this.genomicContig == null) {
            LOG.warn("No contig " + gene.getContig() + " in reference genome.");
            return;
        }
    }
    if (gene.isPositiveStrand()) {
        int exon_index = 0;
        while (exon_index < gene.getExonCount()) {
            for (int i = gene.getExonStart(exon_index); i < gene.getExonEnd(exon_index); ++i) {
                if (i < gene.getCdsStart())
                    continue;
                if (i >= gene.getCdsEnd())
                    break;
                if (wildRNA == null) {
                    wildRNA = new RNASequence(this.genomicContig, '+');
                }
                wildRNA.genomicPositions.add(i);
                if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
                    wildProt = new ProteinCharSequence(geneticCode, wildRNA);
                }
            }
            ++exon_index;
        }
    } else // reverse orientation
    {
        int exon_index = gene.getExonCount() - 1;
        while (exon_index >= 0) {
            for (int i = gene.getExonEnd(exon_index) - 1; i >= gene.getExonStart(exon_index); --i) {
                if (i >= gene.getCdsEnd())
                    continue;
                if (i < gene.getCdsStart())
                    break;
                if (wildRNA == null) {
                    wildRNA = new RNASequence(this.genomicContig, '-');
                }
                wildRNA.genomicPositions.add(i);
                if (wildRNA.length() % 3 == 0 && wildRNA.length() > 0 && wildProt == null) {
                    wildProt = new ProteinCharSequence(geneticCode, wildRNA);
                }
            }
            --exon_index;
        }
    }
    if (wildProt == null) {
        stderr().println("#no protein found for transcript:" + gene.getName());
        return;
    }
    int peptideIndex0 = peptidePos1 - 1;
    if (peptideIndex0 >= wildProt.length()) {
        out.println("#index out of range for :" + gene.getName() + " petide length=" + wildProt.length());
        return;
    }
    if (wildProt.charAt(peptideIndex0) != aa1) {
        out.println("##Warning ref aminod acid for " + gene.getName() + "  [" + peptidePos1 + "] is not the same (" + wildProt.charAt(peptideIndex0) + "/" + aa1 + ")");
    } else {
        out.println("##" + gene.getName());
    }
    int[] indexesInRNA = new int[] { 0 + peptideIndex0 * 3, 1 + peptideIndex0 * 3, 2 + peptideIndex0 * 3 };
    final String wildCodon = "" + wildRNA.charAt(indexesInRNA[0]) + wildRNA.charAt(indexesInRNA[1]) + wildRNA.charAt(indexesInRNA[2]);
    /* 2015 : adding possible mut codons */
    final Set<String> possibleAltCodons = new HashSet<>();
    final char[] bases = new char[] { 'A', 'C', 'G', 'T' };
    for (int codon_pos = 0; codon_pos < 3; ++codon_pos) {
        StringBuilder sb = new StringBuilder(wildCodon);
        for (char mutBase : bases) {
            sb.setCharAt(codon_pos, mutBase);
            if (geneticCode.translate(sb.charAt(0), sb.charAt(1), sb.charAt(2)) == Character.toUpperCase(aa2)) {
                possibleAltCodons.add(sb.toString());
            }
        }
    }
    for (int indexInRna : indexesInRNA) {
        out.print(geneName);
        out.print('\t');
        out.print(aa1);
        out.print('\t');
        out.print(peptidePos1);
        out.print('\t');
        out.print(aa2);
        out.print('\t');
        out.print(gene.getName());
        out.print('\t');
        out.print(gene.getStrand() == Strand.NEGATIVE ? "-" : "+");
        out.print('\t');
        out.print(wildProt.charAt(peptideIndex0));
        out.print('\t');
        out.print(indexInRna);
        out.print('\t');
        out.print(wildCodon);
        out.print('\t');
        if (possibleAltCodons.isEmpty()) {
            out.print('.');
        } else {
            boolean first = true;
            for (String mutCodon : possibleAltCodons) {
                if (!first)
                    out.print('|');
                first = false;
                out.print(mutCodon);
            }
        }
        out.print('\t');
        out.print(wildRNA.charAt(indexInRna));
        out.print('\t');
        out.print(gene.getChromosome());
        out.print('\t');
        out.print(wildRNA.genomicPositions.get(indexInRna));
        out.print('\t');
        String exonName = null;
        for (KnownGene.Exon exon : gene.getExons()) {
            int genome = wildRNA.genomicPositions.get(indexInRna);
            if (exon.getStart() <= genome && genome < exon.getEnd()) {
                exonName = exon.getName();
                break;
            }
        }
        out.print(exonName);
        if (this.printSequences) {
            String s = wildRNA.toString();
            out.print('\t');
            out.print(s.substring(0, indexInRna) + "[" + s.charAt(indexInRna) + "]" + (indexInRna + 1 < s.length() ? s.substring(indexInRna + 1) : ""));
            s = wildProt.toString();
            out.print('\t');
            out.print(s.substring(0, peptideIndex0) + "[" + aa1 + "/" + aa2 + "/" + wildProt.charAt(peptideIndex0) + "]" + (peptideIndex0 + 1 < s.length() ? s.substring(peptideIndex0 + 1) : ""));
        }
        out.println();
    }
}
Also used : KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) HashSet(java.util.HashSet)

Aggregations

KnownGene (com.github.lindenb.jvarkit.util.ucsc.KnownGene)22 Interval (htsjdk.samtools.util.Interval)11 ArrayList (java.util.ArrayList)9 BufferedReader (java.io.BufferedReader)8 Pattern (java.util.regex.Pattern)8 IOException (java.io.IOException)7 VariantContext (htsjdk.variant.variantcontext.VariantContext)6 VCFHeader (htsjdk.variant.vcf.VCFHeader)5 File (java.io.File)5 HashSet (java.util.HashSet)5 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)4 List (java.util.List)4 Parameter (com.beust.jcommander.Parameter)3 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)3 GeneticCode (com.github.lindenb.jvarkit.util.bio.GeneticCode)3 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)3 Logger (com.github.lindenb.jvarkit.util.log.Logger)3 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)3 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)3 VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)3