Search in sources :

Example 16 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class ReferenceToVCF method doWork.

@Override
public int doWork(List<String> args) {
    if (this.bedFile != null) {
        if (limitBed == null)
            limitBed = new IntervalTreeMap<Boolean>();
        try {
            Pattern tab = Pattern.compile("[\t]");
            BufferedReader r = IOUtils.openFileForBufferedReading(this.bedFile);
            String line;
            while ((line = r.readLine()) != null) {
                if (BedLine.isBedHeader(line))
                    continue;
                if (line.startsWith("#") || line.isEmpty())
                    continue;
                String[] tokens = tab.split(line, 4);
                limitBed.put(new Interval(tokens[0], 1 + Integer.parseInt(tokens[1]), 1 + Integer.parseInt(tokens[2])), true);
            }
            CloserUtil.close(r);
        } catch (Exception err) {
            LOG.error(err);
            return -1;
        }
    }
    Random random = new Random(0L);
    VariantContextWriter out = null;
    try {
        final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(oneAndOnlyOneFile(args)));
        SAMSequenceDictionary dict = fasta.getSequenceDictionary();
        out = super.openVariantContextWriter(this.outputFile);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        VCFHeader header = new VCFHeader();
        header.setSequenceDictionary(dict);
        out.writeHeader(header);
        final List<List<Allele>> combination = new ArrayList<List<Allele>>(4);
        // always keep REF as first allele please
        combination.add(Arrays.asList(Allele.create("A", true), Allele.create("C", false), Allele.create("G", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("C", true), Allele.create("A", false), Allele.create("G", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("G", true), Allele.create("A", false), Allele.create("C", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("T", true), Allele.create("A", false), Allele.create("C", false), Allele.create("G", false)));
        for (SAMSequenceRecord ssr : dict.getSequences()) {
            GenomicSequence genome = new GenomicSequence(fasta, ssr.getSequenceName());
            if (this.limitBed != null) {
                Interval interval = new Interval(ssr.getSequenceName(), 1, genome.length());
                if (!this.limitBed.containsOverlapping(interval))
                    continue;
            }
            for (int n = 0; n < genome.length(); ++n) {
                progress.watch(ssr.getSequenceIndex(), n);
                List<Allele> alleles = null;
                byte ref = (byte) genome.charAt(n);
                switch(ref) {
                    case 'a':
                    case 'A':
                        alleles = combination.get(0);
                        break;
                    case 'c':
                    case 'C':
                        alleles = combination.get(1);
                        break;
                    case 'g':
                    case 'G':
                        alleles = combination.get(2);
                        break;
                    case 't':
                    case 'T':
                        alleles = combination.get(3);
                        break;
                    default:
                        break;
                }
                if (alleles == null)
                    continue;
                if (this.limitBed != null) {
                    Interval interval = new Interval(ssr.getSequenceName(), n + 1, n + 1);
                    if (!this.limitBed.containsOverlapping(interval))
                        continue;
                }
                if (!disjoint_alts) {
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.stop(n + 1);
                    vcb.alleles(alleles);
                    out.add(vcb.make());
                } else {
                    for (// index 0 is always REF
                    int a = 1; // index 0 is always REF
                    a < 4; // index 0 is always REF
                    ++a) {
                        VariantContextBuilder vcb = new VariantContextBuilder();
                        vcb.chr(ssr.getSequenceName());
                        vcb.start(n + 1);
                        vcb.stop(n + 1);
                        // index 0 is always REF
                        vcb.alleles(Arrays.asList(alleles.get(0), alleles.get(a)));
                        out.add(vcb.make());
                    }
                }
                if (insertion_size > 0 && n + 1 < genome.length()) {
                    alleles = new ArrayList<Allele>(2);
                    // REFERENCE
                    alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(n + 1), true));
                    StringBuilder sb = new StringBuilder(insertion_size + 2);
                    sb.append(genome.charAt(n));
                    for (int n2 = 0; n2 < insertion_size; ++n2) {
                        switch(random.nextInt(4)) {
                            case 0:
                                sb.append('A');
                                break;
                            case 1:
                                sb.append('C');
                                break;
                            case 2:
                                sb.append('G');
                                break;
                            case 3:
                                sb.append('T');
                                break;
                        }
                    }
                    sb.append(genome.charAt(n + 1));
                    alleles.add(Allele.create(sb.toString(), false));
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.alleles(alleles);
                    vcb.computeEndFromAlleles(alleles, n + 1);
                    out.add(vcb.make());
                }
                if (deletion_size > 0 && n + deletion_size + 1 < genome.length()) {
                    alleles = new ArrayList<Allele>(2);
                    // REF
                    StringBuilder sb = new StringBuilder(deletion_size + 2);
                    sb.append(genome.charAt(n));
                    int lastpos = n + 1;
                    for (int n2 = 0; n2 < deletion_size; ++n2, lastpos++) {
                        sb.append(genome.charAt(lastpos));
                    }
                    sb.append(genome.charAt(lastpos));
                    alleles.add(Allele.create(sb.toString(), true));
                    alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(lastpos), false));
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.alleles(alleles);
                    vcb.computeEndFromAlleles(alleles, n + 1);
                    out.add(vcb.make());
                }
                if (out.checkError())
                    break;
            }
            if (out.checkError())
                break;
        }
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
    }
}
Also used : ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Random(java.util.Random) ArrayList(java.util.ArrayList) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) Pattern(java.util.regex.Pattern) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Interval(htsjdk.samtools.util.Interval)

Example 17 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class Sam2Tsv method printAln.

private void printAln(final Row row) {
    final SAMRecord rec = row.rec;
    if (rec == null)
        return;
    Cigar cigar = rec.getCigar();
    if (cigar == null)
        return;
    row.readbases = rec.getReadBases();
    row.readQuals = rec.getBaseQualities();
    if (row.readbases == null) {
        row.op = null;
        row.refPos = -1;
        row.readPos = -1;
        writeAln(row);
        return;
    }
    if (rec.getReadUnmappedFlag()) {
        row.op = null;
        row.refPos = -1;
        for (int i = 0; i < row.readbases.length; ++i) {
            row.readPos = i;
            writeAln(row);
        }
        return;
    }
    // fix hard clipped reads
    StringBuilder fixReadBases = new StringBuilder(row.readbases.length);
    StringBuilder fixReadQuals = new StringBuilder(row.readbases.length);
    int readIndex = 0;
    for (final CigarElement ce : cigar.getCigarElements()) {
        final CigarOperator op = ce.getOperator();
        for (int i = 0; i < ce.getLength(); ++i) {
            if (op.equals(CigarOperator.H)) {
                fixReadBases.append('*');
                fixReadQuals.append('*');
            } else if (!op.consumesReadBases()) {
                break;
            } else {
                fixReadBases.append((char) row.readbases[readIndex]);
                fixReadQuals.append(row.readQuals == null || row.readQuals.length <= readIndex ? '*' : (char) row.readQuals[readIndex]);
                readIndex++;
            }
        }
    }
    row.readbases = fixReadBases.toString().getBytes();
    row.readQuals = fixReadQuals.toString().getBytes();
    if (this.indexedFastaSequenceFile != null) {
        if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(rec.getReferenceName())) {
            this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, rec.getReferenceName());
        }
    }
    readIndex = 0;
    int refIndex = rec.getUnclippedStart();
    for (final CigarElement e : cigar.getCigarElements()) {
        row.op = e.getOperator();
        switch(e.getOperator()) {
            case S:
            case // length of read has been fixed previously, so same as 'S'
            H:
                {
                    for (int i = 0; i < e.getLength(); ++i) {
                        row.readPos = readIndex;
                        row.refPos = refIndex;
                        writeAln(row);
                        readIndex++;
                        // because we used getUnclippedStart
                        refIndex++;
                    }
                    break;
                }
            case P:
                {
                    row.refPos = -1;
                    row.readPos = -1;
                    for (int i = 0; i < e.getLength(); ++i) {
                        writeAln(row);
                    }
                    break;
                }
            case I:
                {
                    row.refPos = -1;
                    for (int i = 0; i < e.getLength(); ++i) {
                        row.readPos = readIndex;
                        writeAln(row);
                        readIndex++;
                    }
                    break;
                }
            // cont. -- reference skip
            case N:
            case D:
                {
                    row.readPos = -1;
                    for (int i = 0; i < e.getLength(); ++i) {
                        row.refPos = refIndex;
                        writeAln(row);
                        refIndex++;
                    }
                    break;
                }
            case M:
            case EQ:
            case X:
                {
                    for (int i = 0; i < e.getLength(); ++i) {
                        row.refPos = refIndex;
                        row.readPos = readIndex;
                        writeAln(row);
                        refIndex++;
                        readIndex++;
                    }
                    break;
                }
            default:
                throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
        }
    }
    if (printAlignment) {
        final int len = Math.max(rec.getReadNameLength(), rec.getReferenceName().length()) + 2;
        this.out.printf(":%" + len + "s %8d %s %-8d\n", rec.getReferenceName(), rec.getUnclippedStart(), L3.toString(), rec.getUnclippedEnd());
        this.out.printf(":%" + len + "s %8s %s\n", "", "", L2.toString());
        this.out.printf(":%" + len + "s %8d %s %-8d\n", rec.getReadName(), 1, L1.toString(), rec.getReadLength());
        L1.setLength(0);
        L2.setLength(0);
        L3.setLength(0);
    }
}
Also used : Cigar(htsjdk.samtools.Cigar) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMRecord(htsjdk.samtools.SAMRecord) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement)

Example 18 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class SamFixCigar method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidx == null) {
        LOG.error("Reference was not specified.");
        return -1;
    }
    GenomicSequence genomicSequence = null;
    SamReader sfr = null;
    SAMFileWriter sfw = null;
    try {
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(faidx);
        sfr = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header = sfr.getFileHeader();
        sfw = this.writingBamArgs.setReferenceFile(this.faidx).openSAMFileWriter(outputFile, header, true);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        final List<CigarElement> newCigar = new ArrayList<CigarElement>();
        final SAMRecordIterator iter = sfr.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            Cigar cigar = rec.getCigar();
            byte[] bases = rec.getReadBases();
            if (rec.getReadUnmappedFlag() || cigar == null || cigar.getCigarElements().isEmpty() || bases == null) {
                sfw.addAlignment(rec);
                continue;
            }
            if (genomicSequence == null || genomicSequence.getSAMSequenceRecord().getSequenceIndex() != rec.getReferenceIndex()) {
                genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
            }
            newCigar.clear();
            int refPos1 = rec.getAlignmentStart();
            int readPos0 = 0;
            for (final CigarElement ce : cigar.getCigarElements()) {
                final CigarOperator op = ce.getOperator();
                if (op.equals(CigarOperator.M)) {
                    for (int i = 0; i < ce.getLength(); ++i) {
                        char c1 = Character.toUpperCase((char) bases[readPos0]);
                        char c2 = Character.toUpperCase(refPos1 - 1 < genomicSequence.length() ? genomicSequence.charAt(refPos1 - 1) : '*');
                        if (c2 == 'N' || c1 == c2) {
                            newCigar.add(new CigarElement(1, CigarOperator.EQ));
                        } else {
                            newCigar.add(new CigarElement(1, CigarOperator.X));
                        }
                        refPos1++;
                        readPos0++;
                    }
                } else {
                    newCigar.add(ce);
                    if (op.consumesReadBases())
                        readPos0 += ce.getLength();
                    if (op.consumesReferenceBases())
                        refPos1 += ce.getLength();
                }
            }
            int i = 0;
            while (i < newCigar.size()) {
                final CigarOperator op1 = newCigar.get(i).getOperator();
                final int length1 = newCigar.get(i).getLength();
                if (i + 1 < newCigar.size() && newCigar.get(i + 1).getOperator() == op1) {
                    final CigarOperator op2 = newCigar.get(i + 1).getOperator();
                    int length2 = newCigar.get(i + 1).getLength();
                    newCigar.set(i, new CigarElement(length1 + length2, op2));
                    newCigar.remove(i + 1);
                } else {
                    ++i;
                }
            }
            cigar = new Cigar(newCigar);
            // info("changed "+rec.getCigarString()+" to "+newCigarStr+" "+rec.getReadName()+" "+rec.getReadString());
            rec.setCigar(cigar);
            sfw.addAlignment(rec);
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(sfr);
        CloserUtil.close(sfw);
    }
}
Also used : SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileWriter(htsjdk.samtools.SAMFileWriter) ArrayList(java.util.ArrayList) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 19 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class TView method paint.

void paint(final PrintStream out) {
    final Colorizer colorizer;
    switch(this.formatOut) {
        case html:
            colorizer = new HtmlColorizer(out);
            break;
        case tty:
            colorizer = new AnsiColorizer(out);
            break;
        case plain:
            colorizer = new Colorizer(out);
            break;
        default:
            throw new IllegalStateException();
    }
    if (interval == null) {
        LOG.warn("No interval defined");
        return;
    }
    final GenomicSequence contigSequence;
    final Function<Integer, Character> refPosToBase;
    if (indexedFastaSequenceFile != null) {
        final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(referenceFile);
        if (dict.getSequence(this.interval.getContig()) == null) {
            LOG.warn("No interval with contig " + interval + " in REF");
            return;
        }
        contigSequence = new GenomicSequence(indexedFastaSequenceFile, interval.getContig());
        refPosToBase = POS -> {
            if (POS < 0 || POS >= contigSequence.length())
                return 'N';
            return contigSequence.charAt(POS);
        };
    } else {
        contigSequence = null;
        refPosToBase = POS -> 'N';
    }
    /**
     * test if genomic position is in interval
     */
    final Predicate<Integer> testInInterval = new Predicate<Integer>() {

        @Override
        public boolean test(final Integer pos) {
            return interval.getStart() <= pos && pos <= interval.getEnd();
        }
    };
    final int pixelWidth = this.interval.length();
    final Map<Integer, Integer> genomicpos2insertlen = new TreeMap<>();
    final Map<String, List<SAMRecord>> group2record = new TreeMap<>();
    for (final SamReader samReader : this.samReaders) {
        SAMRecordIterator iter = samReader.query(this.interval.getContig(), this.interval.getStart(), this.interval.getEnd(), false);
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            if (rec.getReadUnmappedFlag())
                continue;
            if (rec.getCigar() == null)
                continue;
            if (getRecordFilter().filterOut(rec))
                continue;
            if (!rec.getContig().equals(interval.getContig()))
                continue;
            if (right().apply(rec) < this.interval.getStart())
                continue;
            if (this.interval.getEnd() < left().apply(rec))
                continue;
            String group = this.groupBy.getPartion(rec);
            if (group == null || group.isEmpty()) {
                group = "undefined_" + this.groupBy.name();
            }
            List<SAMRecord> records = group2record.get(group);
            if (records == null) {
                records = new ArrayList<>();
                group2record.put(group, records);
            }
            records.add(rec);
            // loop over cigar, get the longest insert
            int refpos = rec.getAlignmentStart();
            for (final CigarElement ce : rec.getCigar().getCigarElements()) {
                if (!this.showInsertions)
                    break;
                final CigarOperator op = ce.getOperator();
                if (op.equals(CigarOperator.I) && testInInterval.test(refpos)) {
                    final Integer longestInsert = genomicpos2insertlen.get(refpos);
                    if (longestInsert == null || longestInsert.compareTo(ce.getLength()) < 0) {
                        genomicpos2insertlen.put(refpos, ce.getLength());
                    }
                }
                if (op.consumesReferenceBases()) {
                    refpos += ce.getLength();
                }
                if (refpos > interval.getEnd())
                    break;
            }
        }
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
    }
    /**
     * compute where are the insertions
     */
    // LOG.debug(genomicpos2insertlen);
    final Predicate<Integer> insertIsPresentAtX = SCREENX -> {
        int x = 0;
        int ref = interval.getStart();
        while (x < pixelWidth) {
            if (x > SCREENX)
                return false;
            final Integer insertLen = genomicpos2insertlen.get(ref);
            if (insertLen == null) {
                ++x;
                ++ref;
            } else {
                if (x <= SCREENX && SCREENX < x + insertLen)
                    return true;
                // (+1) I DON'T UNDERSTAND WHY, BUT IT WORKS
                x += (insertLen + 1);
                ++ref;
            }
        }
        return false;
    };
    final Function<Character, AnsiColor> base2ansiColor = BASE -> {
        switch(Character.toUpperCase(BASE)) {
            case 'A':
                return AnsiColor.BLUE;
            case 'T':
                return AnsiColor.GREEN;
            case 'G':
                return AnsiColor.CYAN;
            case 'C':
                return AnsiColor.YELLOW;
            default:
                return null;
        }
    };
    /**
     * print interval title
     */
    out.println(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
    /**
     * paint base position
     */
    int ref = this.interval.getStart();
    int x = 0;
    out.print(margin("POS:"));
    while (x < pixelWidth) {
        if (insertIsPresentAtX.test(x)) {
            colorizer.pen(AnsiColor.RED).print("^");
            ++x;
        } else if ((ref - this.interval.getStart()) % 10 == 0) {
            final String f = String.format("%d", ref);
            for (int i = 0; i < f.length() && x < pixelWidth; ++i) {
                colorizer.pen(AnsiColor.GREEN).print(f.charAt(i));
                if (!insertIsPresentAtX.test(x))
                    ++ref;
                ++x;
            }
        } else {
            out.print(".");
            ++ref;
            ++x;
        }
    }
    out.println();
    /* paint ref base */
    out.print(margin("REF:"));
    ref = this.interval.getStart();
    x = 0;
    while (x < pixelWidth) {
        if (insertIsPresentAtX.test(x)) {
            colorizer.paper(AnsiColor.YELLOW).print("*");
            ++x;
        } else {
            char refBase = refPosToBase.apply(ref - 1);
            colorizer.pen(base2ansiColor.apply(refBase)).print(refBase);
            ++ref;
            ++x;
        }
    }
    out.println();
    /* loop over samples **/
    for (final String groupName : group2record.keySet()) {
        if (this.maxReadRowPerGroup == 0)
            continue;
        final ConsensusBuilder consensus = new ConsensusBuilder();
        int y_group = 0;
        final List<List<SAMRecord>> rows = new ArrayList<>();
        out.println(margin(""));
        switch(this.layoutReads) {
            case name:
                {
                    rows.addAll(group2record.get(groupName).stream().sorted((R1, R2) -> R1.getReadName().compareTo(R2.getReadName())).map(R -> Collections.singletonList(R)).collect(Collectors.toList()));
                    break;
                }
            default:
                {
                    /* pileup reads */
                    for (final SAMRecord rec : group2record.get(groupName)) {
                        int y = 0;
                        for (y = 0; y < rows.size(); ++y) {
                            final List<SAMRecord> row = rows.get(y);
                            final SAMRecord last = row.get(row.size() - 1);
                            if (right().apply(last) + this.distance_between_reads < left().apply(rec)) {
                                row.add(rec);
                                break;
                            }
                        }
                        if (y == rows.size()) {
                            final List<SAMRecord> row = new ArrayList<>();
                            row.add(rec);
                            rows.add(row);
                        }
                    }
                    break;
                }
        }
        // each row is only one read, so we need to print the groupName
        if (layoutReads == LayoutReads.name) {
            out.print(margin(groupName));
            out.println();
        }
        /* print each row */
        for (final List<SAMRecord> row : rows) {
            ++y_group;
            boolean print_this_line = (this.maxReadRowPerGroup < 0 || y_group <= this.maxReadRowPerGroup);
            if (print_this_line) {
                // each row is only one read, print the read name
                if (layoutReads == LayoutReads.name) {
                    String readName = row.get(0).getReadName();
                    if (row.get(0).getReadPairedFlag()) {
                        if (row.get(0).getFirstOfPairFlag()) {
                            readName += "/1";
                        }
                        if (row.get(0).getSecondOfPairFlag()) {
                            readName += "/2";
                        }
                    }
                    out.print(margin(readName));
                } else {
                    out.print(margin(y_group == 1 ? groupName : ""));
                }
            }
            ref = interval.getStart();
            x = 0;
            for (final SAMRecord rec : row) {
                int readRef = left().apply(rec);
                // pad before record
                while (x < pixelWidth && ref < readRef && testInInterval.test(ref)) {
                    if (!insertIsPresentAtX.test(x))
                        ++ref;
                    ++x;
                    if (print_this_line)
                        out.print(' ');
                    consensus.add(' ');
                }
                int readpos = 0;
                /* get read base function */
                final Function<Integer, Character> baseAt = new Function<Integer, Character>() {

                    @Override
                    public Character apply(final Integer readpos) {
                        final byte[] readBases = rec.getReadBases();
                        if (readBases == SAMRecord.NULL_SEQUENCE)
                            return 'N';
                        if (readpos < 0 || readpos >= rec.getReadLength())
                            return '?';
                        return (char) readBases[readpos];
                    }
                };
                for (final CigarElement ce : rec.getCigar()) {
                    final CigarOperator op = ce.getOperator();
                    if (op.equals(CigarOperator.PADDING))
                        continue;
                    /* IN INSERTION, only print if showInsertions is true */
                    if (this.showInsertions && op.equals(CigarOperator.I)) {
                        int cigarIdx = 0;
                        while (x < pixelWidth && cigarIdx < ce.getLength()) {
                            if (testInInterval.test(readRef)) {
                                final char readbase = baseAt.apply(readpos);
                                if (print_this_line)
                                    colorizer.paper(AnsiColor.RED).print(readbase);
                                consensus.add(readbase);
                                ++x;
                            }
                            ++cigarIdx;
                            ++readpos;
                        }
                        continue;
                    }
                    int cigarIdx = 0;
                    while (x < pixelWidth && cigarIdx < ce.getLength()) {
                        colorizer.clear();
                        // pad before base
                        while (x < pixelWidth && testInInterval.test(readRef) && (insertIsPresentAtX.test(x))) {
                            ++x;
                            if (print_this_line)
                                colorizer.paper(AnsiColor.YELLOW).print("*");
                            consensus.add(' ');
                            continue;
                        }
                        switch(op) {
                            case I:
                                {
                                    // if visible, processed above
                                    if (showInsertions)
                                        throw new IllegalStateException();
                                    readpos++;
                                    break;
                                }
                            case P:
                                break;
                            case H:
                                {
                                    if (showClip) {
                                        if (testInInterval.test(readRef)) {
                                            if (print_this_line)
                                                colorizer.paper(AnsiColor.YELLOW).print('N');
                                            // CLIPPED base not part of consensus
                                            consensus.add(' ');
                                            ++x;
                                        }
                                        ++readRef;
                                    }
                                    break;
                                }
                            case S:
                                {
                                    if (showClip) {
                                        if (testInInterval.test(readRef)) {
                                            final char readBase = baseAt.apply(readpos);
                                            if (print_this_line)
                                                colorizer.paper(AnsiColor.YELLOW).print(readBase);
                                            // CLIPPED base not part of consensus
                                            consensus.add(' ');
                                            ++x;
                                        }
                                        ++readpos;
                                        ++readRef;
                                    } else {
                                        readpos++;
                                    }
                                    break;
                                }
                            case D:
                            case N:
                                {
                                    if (testInInterval.test(readRef)) {
                                        if (print_this_line)
                                            colorizer.paper(AnsiColor.RED).print('-');
                                        // deletion not not part of consensus
                                        consensus.add(' ');
                                        ++x;
                                    }
                                    ++readRef;
                                    break;
                                }
                            case EQ:
                            case M:
                            case X:
                                {
                                    if (testInInterval.test(readRef)) {
                                        final char refBase = Character.toUpperCase(refPosToBase.apply(readRef - 1));
                                        char readBase = Character.toUpperCase(baseAt.apply(readpos));
                                        consensus.add(readBase);
                                        colorizer.pen(base2ansiColor.apply(readBase));
                                        if (op.equals(CigarOperator.X) || (refBase != 'N' && readBase != 'N' && readBase != refBase)) {
                                            colorizer.pen(AnsiColor.RED);
                                        } else if (hideBases) {
                                            if (rec.getReadNegativeStrandFlag()) {
                                                readBase = ',';
                                            } else {
                                                readBase = '.';
                                            }
                                        }
                                        if (showReadName) {
                                            final String readName = rec.getReadName();
                                            if (readpos < 0 || readpos >= readName.length()) {
                                                readBase = '_';
                                            } else {
                                                readBase = readName.charAt(readpos);
                                            }
                                        }
                                        if (rec.getReadNegativeStrandFlag()) {
                                            readBase = Character.toLowerCase(readBase);
                                        } else {
                                            readBase = Character.toUpperCase(readBase);
                                        }
                                        if (print_this_line)
                                            colorizer.print(readBase);
                                        ++x;
                                    }
                                    ++readpos;
                                    ++readRef;
                                    break;
                                }
                        }
                        ++cigarIdx;
                    }
                }
                // end of loop cigar
                ref = readRef;
            }
            // out.println( " "+ref+" "+row.get(0).getAlignmentStart()+" "+row.get(0).getCigarString()+" "+row.get(0).getReadString());
            while (x < pixelWidth) {
                if (print_this_line)
                    out.print(" ");
                ++x;
            }
            if (print_this_line)
                out.println();
            consensus.eol();
            if (out.checkError())
                break;
        }
        if (out.checkError())
            break;
        if (!this.hideConsensus && consensus.bases.stream().anyMatch(C -> C.getCoverage() > 0)) {
            out.print(margin(groupName + " CONSENSUS"));
            x = 0;
            ref = interval.getStart();
            while (x < consensus.bases.size()) {
                final char refBase = Character.toUpperCase(refPosToBase.apply(ref - 1));
                final char consensusBase = consensus.bases.get(x).getConsensus();
                if (Character.isWhitespace(consensusBase)) {
                // nothing
                } else if (refBase != 'N' && consensusBase != refBase) {
                    colorizer.pen(AnsiColor.RED);
                } else {
                    colorizer.pen(base2ansiColor.apply(consensusBase));
                }
                if (!insertIsPresentAtX.test(x))
                    ++ref;
                colorizer.print(consensusBase);
                ++x;
            }
            out.println();
        }
        if (this.numCoverageRows > 0) {
            int minCov = consensus.bases.stream().mapToInt(C -> C.getCoverage()).min().orElse(0);
            final int maxCov = consensus.bases.stream().mapToInt(C -> C.getCoverage()).max().orElse(0);
            for (int y = 0; maxCov > 0 && y < this.numCoverageRows; ++y) {
                if (minCov == maxCov)
                    minCov--;
                double fract = (maxCov - minCov) / ((double) this.numCoverageRows);
                int inverse_y = (this.numCoverageRows - 1) - y;
                int d0 = (int) ((fract) * inverse_y);
                // int d1 = (int)((fract) * (inverse_y+1));
                out.print(margin(y == 0 ? groupName + " " + maxCov : (y + 1 == this.numCoverageRows ? String.valueOf(minCov) : "")));
                for (x = 0; x < consensus.bases.size(); ++x) {
                    int depth = consensus.bases.get(x).getCoverage() - minCov;
                    colorizer.print(depth >= d0 ? BLACK_SQUARE : ' ');
                }
                out.println();
            }
        }
    }
    if (this.tabixKnownGene != null && this.indexedFastaSequenceFile != null) {
        final List<KnownGene> genes = this.tabixKnownGene.getItemsInInterval(this.interval);
        if (!genes.isEmpty()) {
            out.println(this.knownGeneUri);
            for (final KnownGene gene : genes) {
                final KnownGene.CodingRNA codingRna = gene.getCodingRNA(contigSequence);
                final KnownGene.Peptide peptide = codingRna.getPeptide();
                out.print(margin(gene.getName()));
                x = 0;
                int ref0 = this.interval.getStart() - 1;
                while (x < pixelWidth) {
                    if (insertIsPresentAtX.test(x)) {
                        out.print("*");
                        ++x;
                    } else {
                        char pepChar = ' ';
                        if (ref0 >= gene.getTxStart() && ref0 < gene.getTxEnd()) {
                            pepChar = (gene.isPositiveStrand() ? '>' : '<');
                            int pepIdx = peptide.convertGenomicToPeptideCoordinate(ref0);
                            if (pepIdx != -1) {
                                final String aa3 = GeneticCode.aminoAcidTo3Letters(peptide.charAt(pepIdx));
                                final int[] offset = peptide.convertToGenomicCoordinates(pepIdx);
                                if (offset != null && offset.length == 3 && aa3 != null && aa3.length() == 3) {
                                    if (offset[0] == ref0)
                                        pepChar = aa3.charAt(0);
                                    else if (offset[1] == ref0)
                                        pepChar = aa3.charAt(1);
                                    else if (offset[2] == ref0)
                                        pepChar = aa3.charAt(2);
                                    else
                                        pepChar = '?';
                                } else {
                                    pepChar = '?';
                                }
                            }
                        }
                        out.print(pepChar);
                        ++ref0;
                        ++x;
                    }
                }
                while (x < pixelWidth) {
                    out.print(" ");
                    ++x;
                }
                out.println();
            }
        }
        out.println();
    }
    /**
     * variant section
     */
    if (!this.vcfReaders.isEmpty() && !out.checkError()) {
        final Function<GenotypeType, Character> gTypeToSymbol = new Function<GenotypeType, Character>() {

            @Override
            public Character apply(final GenotypeType gt) {
                switch(gt) {
                    case NO_CALL:
                        return '?';
                    case HOM_REF:
                        return '0';
                    case HET:
                        return '1';
                    case HOM_VAR:
                        return '2';
                    case MIXED:
                        return 'm';
                    case UNAVAILABLE:
                        return 'u';
                    default:
                        return '.';
                }
            }
        };
        out.println();
        for (final VcfSource r : this.vcfReaders) {
            if (out.checkError())
                break;
            final VCFHeader header = r.vcfFileReader.getFileHeader();
            final CloseableIterator<VariantContext> iter = r.vcfFileReader.query(this.interval.getContig(), interval.getStart(), interval.getEnd());
            final List<VariantContext> variants = new ArrayList<>();
            while (iter.hasNext()) {
                variants.add(iter.next());
            }
            iter.close();
            if (variants.isEmpty())
                continue;
            out.println(r.vcfFile.getPath());
            if (header.hasGenotypingData()) {
                for (final String sample : header.getSampleNamesInOrder()) {
                    if (!variants.stream().map(V -> V.getGenotype(sample)).filter(G -> !hideNoCall || (hideNoCall && !G.isNoCall())).filter(G -> !hideHomRef || (hideHomRef && !G.isHomRef())).findAny().isPresent()) {
                        continue;
                    }
                    out.print(margin(sample));
                    ref = this.interval.getStart();
                    x = 0;
                    while (x < pixelWidth) {
                        if (insertIsPresentAtX.test(x)) {
                            out.print("*");
                            ++x;
                        } else {
                            char refBase = ' ';
                            for (final VariantContext ctx : variants) {
                                if (ctx.getStart() == ref) {
                                    final Genotype g = ctx.getGenotype(sample);
                                    if (g.isNoCall() && this.hideNoCall)
                                        continue;
                                    if (g.isHomRef() && this.hideHomRef)
                                        continue;
                                    refBase = gTypeToSymbol.apply(g.getType());
                                    break;
                                }
                            }
                            out.print(refBase);
                            ++ref;
                            ++x;
                        }
                    }
                    out.println();
                }
            } else // no genotype
            {
                for (final VariantContext ctx : variants) {
                    out.print(margin(String.valueOf(ctx.getStart()) + ":" + ctx.getReference().getDisplayString() + "/" + ctx.getAlternateAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining(","))));
                    ref = this.interval.getStart();
                    x = 0;
                    while (x < pixelWidth) {
                        if (insertIsPresentAtX.test(x)) {
                            out.print("*");
                            ++x;
                        } else {
                            out.print(ctx.getStart() == ref ? '+' : ' ');
                            ++ref;
                            ++x;
                        }
                    }
                    out.println();
                }
            }
        }
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Parameter(com.beust.jcommander.Parameter) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) Map(java.util.Map) XMLStreamException(javax.xml.stream.XMLStreamException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) SAMSequenceDictionaryExtractor(htsjdk.variant.utils.SAMSequenceDictionaryExtractor) GeneticCode(com.github.lindenb.jvarkit.util.bio.GeneticCode) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintStream(java.io.PrintStream) Counter(com.github.lindenb.jvarkit.util.Counter) Predicate(java.util.function.Predicate) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) GenotypeType(htsjdk.variant.variantcontext.GenotypeType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) SamFilterParser(com.github.lindenb.jvarkit.util.bio.samfilter.SamFilterParser) IOException(java.io.IOException) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) TabixKnownGeneFileReader(com.github.lindenb.jvarkit.util.ucsc.TabixKnownGeneFileReader) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) List(java.util.List) SamInputResource(htsjdk.samtools.SamInputResource) TreeMap(java.util.TreeMap) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Closeable(java.io.Closeable) VariantContext(htsjdk.variant.variantcontext.VariantContext) Collections(java.util.Collections) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Predicate(java.util.function.Predicate) SamReader(htsjdk.samtools.SamReader) Function(java.util.function.Function) ArrayList(java.util.ArrayList) List(java.util.List) VCFHeader(htsjdk.variant.vcf.VCFHeader) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) TreeMap(java.util.TreeMap) CigarElement(htsjdk.samtools.CigarElement) SAMRecord(htsjdk.samtools.SAMRecord) GenotypeType(htsjdk.variant.variantcontext.GenotypeType) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene)

Example 20 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class VCFCombineTwoSnvs method challenge.

private void challenge(final VariantContext ctx, final Allele allele, final KnownGene gene, final String vcfLine) throws IOException {
    if (allele.isSymbolic())
        return;
    if (allele.isNoCall())
        return;
    if (!allele.isCalled())
        return;
    if (allele.length() != 1)
        return;
    if (ctx.getReference().length() != 1)
        return;
    if (allele.equals(Allele.SPAN_DEL))
        return;
    if (genomicSequence == null || !genomicSequence.getChrom().equals(ctx.getContig())) {
        LOG.info("getting genomic Sequence for " + gene.getContig());
        genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, gene.getContig());
    }
    final int positionContext0 = ctx.getStart() - 1;
    Variant variant = null;
    final StringBuilder wildRNA = new StringBuilder(1000);
    final MutedSequence mutRNA = new MutedSequence(wildRNA);
    if (gene.isPositiveStrand()) {
        for (int exon_index = 0; exon_index < gene.getExonCount(); ++exon_index) {
            final KnownGene.Exon exon = gene.getExon(exon_index);
            int genomicPosition = Math.max(gene.getCdsStart(), exon.getStart());
            for (; ; ) {
                // we need to consider the last stop codon
                if (genomicPosition >= exon.getEnd() || genomicPosition >= gene.getCdsEnd()) {
                    break;
                }
                wildRNA.append(genomicSequence.charAt(genomicPosition));
                if (variant == null && positionContext0 == genomicPosition) {
                    variant = new Variant(ctx, allele, gene);
                    variant.sorting_id = ID_GENERATOR++;
                    variant.position_in_cdna = wildRNA.length() - 1;
                    char mutBase = allele.getBaseString().charAt(0);
                    mutRNA.setMutation(wildRNA.length() - 1, wildRNA.length(), "" + mutBase);
                }
                ++genomicPosition;
            }
        }
    } else {
        int exon_index = gene.getExonCount() - 1;
        while (exon_index >= 0) {
            final KnownGene.Exon exon = gene.getExon(exon_index);
            int genomicPosition = Math.min(gene.getCdsEnd() - 1, exon.getEnd() - 1);
            for (; ; ) {
                if (genomicPosition < exon.getStart() || genomicPosition < gene.getCdsStart()) {
                    break;
                }
                wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(genomicPosition)));
                if (variant == null && positionContext0 == genomicPosition) {
                    variant = new Variant(ctx, allele, gene);
                    variant.sorting_id = ID_GENERATOR++;
                    variant.position_in_cdna = wildRNA.length() - 1;
                    char mutBase = AcidNucleics.complement(allele.getBaseString().charAt(0));
                    mutRNA.setMutation(wildRNA.length() - 1, wildRNA.length(), "" + mutBase);
                }
                --genomicPosition;
            }
            --exon_index;
        }
    }
    if (variant != null) {
        variant.wildCodon = "";
        variant.mutCodon = "";
        for (int i = 0; i < 3; ++i) {
            int pos = variant.codonStart() + i;
            variant.wildCodon += (pos < wildRNA.length() ? wildRNA.charAt(pos) : '*');
            variant.mutCodon += (pos < mutRNA.length() ? mutRNA.charAt(pos) : '*');
        }
        variant.wildCodon = variant.wildCodon.toUpperCase();
        variant.mutCodon = variant.mutCodon.toUpperCase();
        variant.vcfLine = vcfLine;
        if (variant.wildCodon.equals(variant.mutCodon)) {
            LOG.info("Uh??????? " + allele + " " + ctx);
            return;
        }
        this.variants.add(variant);
    }
}
Also used : GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene)

Aggregations

GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)24 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)15 ArrayList (java.util.ArrayList)12 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)10 CigarElement (htsjdk.samtools.CigarElement)10 SAMRecord (htsjdk.samtools.SAMRecord)10 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)10 SamReader (htsjdk.samtools.SamReader)10 VCFHeader (htsjdk.variant.vcf.VCFHeader)10 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)9 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)8 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)8 SAMFileHeader (htsjdk.samtools.SAMFileHeader)7 Allele (htsjdk.variant.variantcontext.Allele)7 Cigar (htsjdk.samtools.Cigar)6 VariantContext (htsjdk.variant.variantcontext.VariantContext)6 HashSet (java.util.HashSet)6 CigarOperator (htsjdk.samtools.CigarOperator)5 Genotype (htsjdk.variant.variantcontext.Genotype)5 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)5