Search in sources :

Example 1 with Strand

use of htsjdk.tribble.annotation.Strand in project jvarkit by lindenb.

the class KgToGff method process.

private void process(final Gff3Writer out, final String line) {
    try {
        final char delim = ':';
        final double UNDEFINED_SCORE = -1;
        final int UNDEFINED_PHASE = -1;
        final String[] tokens = CharSplitter.TAB.split(line);
        final KnownGene kg = new KnownGene(tokens);
        if (coding_only && kg.isNonCoding())
            return;
        final Map<String, String> atts = new LinkedHashMap<>();
        final int lclid = (++ID_GENERATOR);
        final String bioType = kg.isNonCoding() ? "misc_RNA" : "protein_coding";
        final String geneId = kg.getName() + ".g" + lclid;
        final Strand strand = Strand.decode(kg.getStrand().encodeAsChar());
        final String protName = (StringUtils.isBlank(kg.getName2()) ? kg.getName() : kg.getName2());
        atts.put("ID", "gene" + delim + geneId);
        atts.put("Name", protName + "." + lclid);
        atts.put("biotype", bioType);
        atts.put("gene_id", geneId);
        out.addFeature(new Gff3FeatureImpl(kg.getContig(), this.source, "gene", kg.getTxStart() + 1, kg.getTxEnd(), UNDEFINED_SCORE, strand, UNDEFINED_PHASE, convertMap(atts)));
        // final String transcriptId =;
        atts.clear();
        atts.put("ID", "transcript" + delim + geneId);
        atts.put("Parent", "gene" + delim + geneId);
        atts.put("Name", protName + "." + lclid);
        atts.put("biotype", bioType);
        atts.put("transcript_id", kg.getName() + ".t" + lclid);
        out.addFeature(new Gff3FeatureImpl(kg.getContig(), this.source, "mRNA", kg.getTxStart() + 1, kg.getTxEnd(), UNDEFINED_SCORE, strand, UNDEFINED_PHASE, convertMap(atts)));
        final CodingRNA cDNA = (kg.isNonCoding() ? null : kg.getCodingRNA());
        for (KnownGene.Exon exon : kg.getExons()) {
            atts.clear();
            atts.put("ID", kg.getName() + delim + "E" + exon.getIndex());
            atts.put("Parent", "transcript" + delim + geneId);
            atts.put("Name", kg.getName());
            atts.put("biotype", bioType);
            atts.put("exon_id", kg.getName() + delim + "E" + exon.getIndex());
            out.addFeature(new Gff3FeatureImpl(kg.getContig(), this.source, "exon", exon.getStart() + 1, exon.getEnd(), UNDEFINED_SCORE, strand, UNDEFINED_PHASE, convertMap(atts)));
            if (cDNA != null && !(exon.getEnd() <= kg.getCdsStart() || kg.getCdsEnd() <= exon.getStart())) {
                final int cdsStart = Math.max(exon.getStart(), kg.getCdsStart());
                final int cdsEnd = Math.min(exon.getEnd(), kg.getCdsEnd());
                if (cdsStart >= cdsEnd)
                    continue;
                atts.clear();
                atts.put("Parent", "transcript" + delim + geneId);
                atts.put("ID", "CDS" + delim + kg.getName() + delim + "CDS" + exon.getIndex());
                atts.put("protein_id", kg.getName());
                int i = 0;
                // the phase number in gff tells us how many bases to skip in this
                // feature to reach the first base of the next codon
                int phase = -1;
                if (kg.isNegativeStrand()) {
                    final int firstExonPos = cdsEnd - 1;
                    while (i < cDNA.length()) {
                        final int pos = cDNA.convertToGenomicCoordinate(i);
                        if (firstExonPos == pos) {
                            phase = i % 3;
                            phase = phase == 0 ? 0 : 3 - phase;
                            break;
                        }
                        i++;
                    }
                } else {
                    final int firstExonPos = cdsStart;
                    while (i < cDNA.length()) {
                        final int pos = cDNA.convertToGenomicCoordinate(i);
                        if (firstExonPos == pos) {
                            phase = i % 3;
                            phase = phase == 0 ? 0 : 3 - phase;
                            break;
                        }
                        i++;
                    }
                }
                if (phase < 0 || phase > 2) {
                    LOG.warn("cannot get phase for " + kg.getName() + " " + exon.getName() + " strand:" + kg.getStrand() + " kg=" + kg.getCdsStart() + "-" + kg.getCdsEnd() + " ex=" + exon.getStart() + "-" + exon.getEnd() + " rn:" + cdsStart + "-" + cdsEnd + " phase=" + phase + " leng=" + cDNA.length() + " " + line);
                    phase = UNDEFINED_PHASE;
                }
                out.addFeature(new Gff3FeatureImpl(kg.getContig(), this.source, "CDS", cdsStart + 1, cdsEnd, UNDEFINED_SCORE, strand, phase, convertMap(atts)));
            }
        }
    } catch (final Throwable err) {
        throw new RuntimeIOException(err);
    }
}
Also used : RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) LinkedHashMap(java.util.LinkedHashMap) Gff3FeatureImpl(htsjdk.tribble.gff.Gff3FeatureImpl) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) Strand(htsjdk.tribble.annotation.Strand) CodingRNA(com.github.lindenb.jvarkit.util.ucsc.KnownGene.CodingRNA)

Example 2 with Strand

use of htsjdk.tribble.annotation.Strand in project gridss by PapenfussLab.

the class GeneratePonBedpe method toPair.

private static Pair<BreakendSummary, Integer> toPair(SAMSequenceDictionary dictionary, BEDFeature feat) {
    String chr = feat.getContig();
    int start = feat.getStart();
    int end = feat.getEnd();
    int referenceIndex = dictionary.getSequenceIndex(chr);
    Strand strand = feat.getStrand();
    float score = feat.getScore();
    return Pair.create(new BreakendSummary(referenceIndex, strand == Strand.FORWARD ? BreakendDirection.Forward : BreakendDirection.Backward, start, start, end), (int) score);
}
Also used : Strand(htsjdk.tribble.annotation.Strand)

Example 3 with Strand

use of htsjdk.tribble.annotation.Strand in project gridss by PapenfussLab.

the class VirusBreakendFilter method transformToBreakpointNotation.

public static List<VariantContext> transformToBreakpointNotation(SAMSequenceDictionary dictionary, VariantContext vc, int minMapq) {
    List<ChimericAlignment> bealn = infoToChimeric(vc, VcfInfoAttributes.BREAKEND_ALIGNMENTS.attribute());
    bealn.sort(ChimericAlignment.ByMapqAlignedLength);
    ChimericAlignment humanAlignment = bealn.get(0);
    boolean viralNegative = vc.getAlternateAllele(0).getDisplayString().startsWith(".");
    boolean breakpointAtEndOfHostAlignment = viralNegative != humanAlignment.isNegativeStrand;
    int hostPosition = humanAlignment.pos + (breakpointAtEndOfHostAlignment ? humanAlignment.cigar.getReferenceLength() - 1 : 0);
    String hostId = vc.getID() + "_host";
    String virusId = vc.getID() + "_virus";
    String alt = vc.getAlternateAllele(0).getDisplayString();
    String breakendSeq = alt.substring(1, alt.length() - 1);
    char anchorBase = alt.charAt(0) == '.' ? alt.charAt(alt.length() - 1) : alt.charAt(0);
    String virusInsertedSequence;
    if (!viralNegative) {
        // virus - insertion - host
        // NIIIHHHHSSS.
        int insertLength = humanAlignment.getFirstAlignedBaseReadOffset();
        virusInsertedSequence = breakendSeq.substring(0, insertLength);
    } else {
        // virus - insertion - host
        // ClipHostInsert.
        int insertOffset = humanAlignment.getLastAlignedBaseReadOffset();
        virusInsertedSequence = breakendSeq.substring(insertOffset + 1);
    }
    int viralPosition = vc.getStart();
    Strand virusBreakpointOrientation = viralNegative ? Strand.NEGATIVE : Strand.POSITIVE;
    Strand hostBreakpointOrientation = breakpointAtEndOfHostAlignment ? Strand.POSITIVE : Strand.NEGATIVE;
    String hostChr = humanAlignment.rname;
    String viralChr = vc.getContig();
    String hostInsertedSequence = virusBreakpointOrientation == hostBreakpointOrientation ? SequenceUtil.reverseComplement(virusInsertedSequence) : virusInsertedSequence;
    VariantContextBuilder hostBuilder = new VariantContextBuilder().chr(humanAlignment.rname).start(hostPosition).stop(hostPosition).id(hostId).log10PError(vc.getLog10PError()).filters(getFilter(humanAlignment, vc.getFilters(), minMapq)).alleles("N", (hostBreakpointOrientation == Strand.FORWARD ? "N" + hostInsertedSequence : "") + (virusBreakpointOrientation == Strand.FORWARD ? "]" : "[") + viralChr + ":" + viralPosition + (virusBreakpointOrientation == Strand.FORWARD ? "]" : "[") + (hostBreakpointOrientation == Strand.FORWARD ? "" : hostInsertedSequence + "N"));
    VariantContextBuilder virusBuilder = new VariantContextBuilder(vc).id(virusId).filters(getFilter(humanAlignment, vc.getFilters(), minMapq)).alleles(String.valueOf(anchorBase), (virusBreakpointOrientation == Strand.FORWARD ? anchorBase + virusInsertedSequence : "") + (hostBreakpointOrientation == Strand.FORWARD ? "]" : "[") + hostChr + ":" + hostPosition + (hostBreakpointOrientation == Strand.FORWARD ? "]" : "[") + (virusBreakpointOrientation == Strand.FORWARD ? "" : virusInsertedSequence + anchorBase));
    for (VcfInfoAttributes attr : new VcfInfoAttributes[] { VcfInfoAttributes.BREAKEND_ALIGNMENTS, VcfInfoAttributes.INSERTED_SEQUENCE_NCBI_TAXONOMY_ID, VcfInfoAttributes.INSERTED_SEQUENCE_REPEATMASKER_SA_TAG, VcfInfoAttributes.INSERTED_SEQUENCE_REPEATMASKER_OVERLAP, VcfInfoAttributes.INSERTED_SEQUENCE_REPEATMASKER_REPEAT_TYPE, VcfInfoAttributes.INSERTED_SEQUENCE_REPEATMASKER_REPEAT_CLASS, VcfInfoAttributes.INSERTED_SEQUENCE_REPEATMASKER_ORIENTATION }) {
        if (vc.hasAttribute(attr.attribute())) {
            hostBuilder.attribute(attr.attribute(), vc.getAttribute(attr.attribute()));
        }
    }
    hostBuilder.attribute(VcfSvConstants.BREAKEND_EVENT_ID_KEY, vc.getID());
    virusBuilder.attribute(VcfSvConstants.BREAKEND_EVENT_ID_KEY, vc.getID());
    hostBuilder.attribute(VcfSvConstants.MATE_BREAKEND_ID_KEY, virusBuilder.getID());
    virusBuilder.attribute(VcfSvConstants.MATE_BREAKEND_ID_KEY, hostBuilder.getID());
    hostBuilder.attribute(VcfSvConstants.SV_TYPE_KEY, SvType.BND.name());
    virusBuilder.attribute(VcfSvConstants.SV_TYPE_KEY, SvType.BND.name());
    return ImmutableList.of(virusBuilder.make(), hostBuilder.make());
}
Also used : ChimericAlignment(au.edu.wehi.idsv.sam.ChimericAlignment) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Strand(htsjdk.tribble.annotation.Strand) VcfInfoAttributes(au.edu.wehi.idsv.vcf.VcfInfoAttributes)

Example 4 with Strand

use of htsjdk.tribble.annotation.Strand in project jvarkit by lindenb.

the class RNASeqPolyA method decodeGff3Feature.

private void decodeGff3Feature(final Gff3Feature geneFeat, final ContigNameConverter converter, final Map<String, GeneInfo> geneMap) {
    if (geneFeat == null)
        return;
    final String debugTranscript = dynaParams.getOrDefault("debug.transcript", "");
    final BiFunction<Gff3Feature, String, String> getKey = (FEAT, KEY) -> {
        List<String> atts = FEAT.getAttribute(KEY);
        return atts == null || atts.size() != 1 ? null : atts.get(0);
    };
    if (!geneFeat.getType().equals("gene")) {
        for (Gff3Feature other : geneFeat.getChildren()) {
            decodeGff3Feature(other, converter, geneMap);
        }
        return;
    }
    final String chrom = converter.apply(geneFeat.getContig());
    if (StringUtils.isBlank(chrom))
        return;
    if (!StringUtils.isBlank(this.limit_contig) && !chrom.equals(this.limit_contig))
        return;
    final String geneID = geneFeat.getID();
    if (StringUtils.isBlank(geneID)) {
        LOG.warn("not ID for " + geneFeat);
        return;
    }
    if (geneMap.containsKey(geneID))
        throw new IllegalArgumentException("duplicate gene ID " + geneID);
    final GeneInfo gene = new GeneInfo();
    geneMap.put(geneID, gene);
    gene.geneId = geneID;
    gene.contig = chrom;
    gene.geneName = getKey.apply(geneFeat, "Name");
    if (StringUtils.isBlank(gene.geneName))
        gene.geneName = getKey.apply(geneFeat, "gene_name");
    gene.biotype = getKey.apply(geneFeat, "biotype");
    if (StringUtils.isBlank(gene.biotype))
        gene.biotype = getKey.apply(geneFeat, "gene_type");
    final List<Interval> all_exons = new ArrayList<>();
    for (Gff3Feature trFeat : geneFeat.getChildren()) {
        /* ignore this , there is plenty of type under gene 
			if(!(trFeat.getType().equals("transcript") || trFeat.getType().equals("mRNA"))) {
				continue;
				}
			*/
        final String transcriptId = trFeat.getID();
        if (StringUtils.isBlank(transcriptId)) {
            LOG.warn("not ID for " + transcriptId);
            continue;
        }
        if (!StringUtils.isBlank(debugTranscript) && !debugTranscript.equals(transcriptId)) {
            continue;
        }
        if (gene.transcripts.containsKey(transcriptId))
            throw new IllegalArgumentException("duplicate transcriptId ID " + transcriptId);
        for (final Gff3Feature exFeat : trFeat.getChildren()) {
            if (!exFeat.getType().equals("exon")) {
                continue;
            }
            all_exons.add(new Interval(chrom, exFeat.getStart(), exFeat.getEnd()));
            LastExon lastExon = gene.transcripts.get(transcriptId);
            if (lastExon == null) {
                lastExon = new LastExon();
                lastExon.gene = gene;
                lastExon.transcriptId = transcriptId;
                lastExon.strand = exFeat.getStrand();
                lastExon.start = exFeat.getStart();
                lastExon.end = exFeat.getEnd();
                gene.transcripts.put(transcriptId, lastExon);
            } else if (!lastExon.strand.equals(exFeat.getStrand())) {
                throw new IllegalArgumentException("conflict strand for " + transcriptId);
            }
            if ((lastExon.isPlusStrand() && exFeat.getEnd() > lastExon.getEnd()) || (lastExon.isMinusStrand() && exFeat.getStart() < lastExon.getStart())) {
                lastExon.start = exFeat.getStart();
                lastExon.end = exFeat.getEnd();
            }
        }
    }
    if (this.remove_if_overlapping_exon) {
        final Set<String> toRemove = new HashSet<>();
        for (String id : gene.transcripts.keySet()) {
            final LastExon exi = gene.transcripts.get(id);
            if (all_exons.stream().anyMatch(EX -> EX.getStart() < exi.getPosition() && EX.getEnd() > exi.getPosition())) {
                toRemove.add(exi.transcriptId);
            }
        }
        for (String id : toRemove) gene.transcripts.remove(id);
    }
    if (this.remove_duplicate_transcripts) {
        final List<String> ids = new ArrayList<>(gene.transcripts.keySet());
        final Set<String> toRemove = new HashSet<>();
        for (int i = 0; i + 1 < ids.size(); i++) {
            final LastExon exi = gene.transcripts.get(ids.get(i));
            if (toRemove.contains(exi.transcriptId))
                continue;
            for (int j = i + 1; j < ids.size(); j++) {
                final LastExon exj = gene.transcripts.get(ids.get(j));
                if ((exi.isPlusStrand() && exj.isPlusStrand() && exi.getEnd() == exj.getEnd()) || (exi.isMinusStrand() && exj.isMinusStrand() && exi.getStart() == exj.getStart())) {
                    if (exi.otherIds == null)
                        exi.otherIds = new HashSet<>();
                    exi.otherIds.add(exj.transcriptId);
                    toRemove.add(exj.transcriptId);
                }
            }
        }
        for (String id : toRemove) gene.transcripts.remove(id);
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) AsciiLineReader(htsjdk.tribble.readers.AsciiLineReader) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) LineIterator(htsjdk.tribble.readers.LineIterator) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) BiFunction(java.util.function.BiFunction) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) UnaryOperator(java.util.function.UnaryOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) Map(java.util.Map) Path(java.nio.file.Path) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) Objects(java.util.Objects) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) HashMap(java.util.HashMap) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) Strand(htsjdk.tribble.annotation.Strand) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Gff3Feature(htsjdk.tribble.gff.Gff3Feature) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ToIntFunction(java.util.function.ToIntFunction) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) SamInputResource(htsjdk.samtools.SamInputResource) Gff3Codec(htsjdk.tribble.gff.Gff3Codec) QueryInterval(htsjdk.samtools.QueryInterval) DynamicParameter(com.beust.jcommander.DynamicParameter) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) InputStream(java.io.InputStream) Gff3Feature(htsjdk.tribble.gff.Gff3Feature) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval) HashSet(java.util.HashSet)

Example 5 with Strand

use of htsjdk.tribble.annotation.Strand in project jvarkit by lindenb.

the class RNASeqPolyA method doWork.

@Override
public int doWork(final List<String> args) {
    final Map<String, GeneInfo> geneToTrancripts = new HashMap<>();
    try {
        final String debugTranscript = dynaParams.getOrDefault("debug.transcript", "");
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.faidx);
        if (!StringUtils.isBlank(this.limit_contig) && dict.getSequence(this.limit_contig) == null) {
            throw new JvarkitException.ContigNotFoundInDictionary(this.limit_contig, dict);
        }
        final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
        final Gff3Codec gff3 = new Gff3Codec(Gff3Codec.DecodeDepth.DEEP);
        try (InputStream is = IOUtils.openPathForReading(this.gffPath)) {
            final AsciiLineReader asciiLineReader = AsciiLineReader.from(is);
            final LineIterator lr = new LineIteratorImpl(asciiLineReader);
            while (!gff3.isDone(lr)) {
                decodeGff3Feature(gff3.decode(lr), converter, geneToTrancripts);
            }
            gff3.close(lr);
            asciiLineReader.close();
        }
        if (geneToTrancripts.isEmpty()) {
            LOG.warn("no transcript was found.");
        // continue , empty VCF must be produced
        }
        final IntervalTreeMap<LastExon> exonMap = new IntervalTreeMap<>();
        // fill exonMap
        geneToTrancripts.values().stream().flatMap(K -> K.transcripts.values().stream()).forEach(X -> exonMap.put(new Interval(X), X));
        final ToIntFunction<String> toTid = C -> {
            final SAMSequenceRecord ssr = dict.getSequence(C);
            if (ssr == null)
                throw new JvarkitException.ContigNotFoundInDictionary(C, dict);
            return ssr.getSequenceIndex();
        };
        final List<Path> inputs = IOUtils.unrollPaths(args);
        final QueryInterval[] intervals = this.disable_bam_index || inputs.isEmpty() ? null : QueryInterval.optimizeIntervals(exonMap.values().stream().map(R -> new QueryInterval(toTid.applyAsInt(R.getContig()), R.getPosition(), R.getPosition())).toArray(N -> new QueryInterval[N]));
        final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.faidx);
        final String primerAAA;
        if (polyA_primer_size > 0) {
            primerAAA = StringUtils.repeat(this.polyA_primer_size, 'A');
        } else {
            primerAAA = null;
        }
        final Set<String> samples = new HashSet<>();
        int bam_index = 0;
        // loop over the bams
        for (; ; ) {
            final Path bamFilename = inputs.isEmpty() ? null : inputs.get(bam_index);
            try (SamReader sr = inputs.isEmpty() ? srf.open(SamInputResource.of(stdin())) : srf.open(bamFilename)) {
                final SAMFileHeader header0 = sr.getFileHeader();
                SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header0));
                final String sample = header0.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(bamFilename == null ? "STDIN" : IOUtils.getFilenameWithoutCommonSuffixes(bamFilename));
                if (samples.contains(sample)) {
                    LOG.error("duplicate sample " + sample);
                    return -1;
                }
                samples.add(sample);
                final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).build();
                try (CloseableIterator<SAMRecord> iter = (intervals == null || inputs.isEmpty() ? /*stdin*/
                sr.iterator() : sr.query(intervals, false))) {
                    while (iter.hasNext()) {
                        final SAMRecord rec = progress.apply(iter.next());
                        if (rec.getReadUnmappedFlag())
                            continue;
                        if (!StringUtils.isBlank(this.limit_contig) && !rec.getContig().equals(this.limit_contig))
                            continue;
                        if (this.default_read_filter && !SAMRecordDefaultFilter.accept(rec))
                            continue;
                        final Collection<LastExon> lastExons = exonMap.getOverlapping(rec);
                        if (lastExons.isEmpty())
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null || cigar.isEmpty())
                            continue;
                        final byte[] bases = rec.getReadBases();
                        if (bases == null || SAMRecord.NULL_SEQUENCE.equals(bases))
                            continue;
                        for (LastExon exon : lastExons) {
                            if (!StringUtils.isBlank(debugTranscript) && !exon.transcriptId.equals(debugTranscript)) {
                                continue;
                            }
                            ExonCount count = exon.sample2count.get(sample);
                            if (count == null) {
                                count = new ExonCount();
                                exon.sample2count.put(sample, count);
                            }
                            final StringBuilder sb = new StringBuilder();
                            boolean indel_flag = false;
                            boolean last_exon_in_intron_flag = false;
                            boolean match_last_base = false;
                            int ref1 = rec.getUnclippedStart();
                            int read0 = 0;
                            for (CigarElement ce : cigar) {
                                if (exon.isMinusStrand() && ref1 > exon.start)
                                    break;
                                if (this.ignore_with_indels && indel_flag)
                                    break;
                                final CigarOperator op = ce.getOperator();
                                switch(op) {
                                    case P:
                                        break;
                                    case I:
                                        indel_flag = true;
                                        for (int i = 0; i < ce.getLength(); i++) {
                                            if (exon.isAfterExon(ref1)) {
                                                sb.append((char) Character.toUpperCase(bases[read0]));
                                            }
                                            read0++;
                                        }
                                        break;
                                    case D:
                                    case N:
                                        if ((exon.isPlusStrand() && CoordMath.overlaps(ref1, ref1 + ce.getLength() - 1, exon.getEnd(), exon.getEnd() + 1)) || (exon.isMinusStrand() && CoordMath.overlaps(ref1, ref1 + ce.getLength() - 1, exon.getStart() - 1, exon.getStart()))) {
                                            last_exon_in_intron_flag = true;
                                        }
                                        ref1 += ce.getLength();
                                        indel_flag = true;
                                        break;
                                    case H:
                                        for (int i = 0; i < ce.getLength(); i++) {
                                            if (exon.isAfterExon(ref1)) {
                                                sb.append('N');
                                            }
                                            if (ref1 == exon.getPosition())
                                                match_last_base = true;
                                            ref1++;
                                        }
                                        break;
                                    case S:
                                    case M:
                                    case X:
                                    case EQ:
                                        for (int i = 0; i < ce.getLength(); i++) {
                                            if (exon.isAfterExon(ref1)) {
                                                sb.append((char) Character.toUpperCase(bases[read0]));
                                            }
                                            if (ref1 == exon.getPosition())
                                                match_last_base = true;
                                            read0++;
                                            ref1++;
                                        }
                                        break;
                                    default:
                                        throw new IllegalStateException(op.name());
                                }
                            }
                            // premature end or start
                            if (!match_last_base || (exon.isPlusStrand() && ref1 < exon.getEnd()) || (exon.isMinusStrand() && ref1 < exon.getStart()) || (this.ignore_with_indels && indel_flag) || last_exon_in_intron_flag) {
                                continue;
                            }
                            // if(read0!=bases.length) throw new IllegalStateException("read0:"+read0+" expected "+bases.length+" in "+rec.getReadName());
                            // if(ref1!=1+rec.getUnclippedEnd())throw new IllegalStateException("ref1:"+ref1+" expected 1+"+rec.getUnclippedEnd()+" in "+rec.getReadName());
                            ++count.n_tested_reads;
                            String polyA;
                            if (exon.isMinusStrand()) {
                                polyA = AcidNucleics.reverseComplement(sb);
                            } else {
                                polyA = sb.toString();
                            }
                            if (primerAAA != null) {
                                final int pos = polyA.indexOf(primerAAA);
                                if (pos > 0)
                                    polyA = polyA.substring(pos);
                            }
                            int count_polyA = 0;
                            for (int i = 0; i < polyA.length(); i++) {
                                if (polyA.charAt(i) != 'A')
                                    break;
                                count_polyA++;
                            }
                            if (count_polyA > 0) {
                                count.n_tested_reads_with_A++;
                                count.sum_polyA += count_polyA;
                            }
                            if (count_polyA > count.max_length_polyA) {
                                count.max_length_polyA = count_polyA;
                            }
                        }
                    // end of loop last exon
                    }
                    progress.close();
                }
            }
            ++bam_index;
            if (inputs.isEmpty() || bam_index >= inputs.size())
                break;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        final VCFInfoHeaderLine infoGeneId = new VCFInfoHeaderLine("GENE", 1, VCFHeaderLineType.String, "Gene ID in " + this.gffPath);
        metaData.add(infoGeneId);
        final VCFInfoHeaderLine infoTranscriptId = new VCFInfoHeaderLine("TRANSCRIPT", 1, VCFHeaderLineType.String, "Transcript ID in " + this.gffPath);
        metaData.add(infoTranscriptId);
        final VCFInfoHeaderLine infoStrand = new VCFInfoHeaderLine("STRAND", 1, VCFHeaderLineType.String, "Strand");
        metaData.add(infoStrand);
        final VCFInfoHeaderLine infoTranscriptMaxPolyA = new VCFInfoHeaderLine("TRANSCRIPT_MAX", 1, VCFHeaderLineType.Integer, "Max poly A in Transcript");
        metaData.add(infoTranscriptMaxPolyA);
        final VCFInfoHeaderLine infoGeneMaxPolyA = new VCFInfoHeaderLine("GENE_MAX", 1, VCFHeaderLineType.Integer, "Max poly A in Gene");
        metaData.add(infoGeneMaxPolyA);
        final VCFInfoHeaderLine infoEndPos = new VCFInfoHeaderLine("POS3", 1, VCFHeaderLineType.Integer, "End 3 prime position");
        metaData.add(infoEndPos);
        final VCFInfoHeaderLine infoGeneName = new VCFInfoHeaderLine("GENE_NAME", 1, VCFHeaderLineType.String, "Gene Name");
        metaData.add(infoGeneName);
        final VCFInfoHeaderLine infoBiotype = new VCFInfoHeaderLine("GENE_BIOTYPE", 1, VCFHeaderLineType.String, "Gene Biotype");
        metaData.add(infoBiotype);
        final int n_last_exon_bases = Math.max(0, Integer.parseInt(dynaParams.getOrDefault("last.n.exons", "10")));
        final VCFInfoHeaderLine infoLastExonBases = new VCFInfoHeaderLine("LAST_BASES", 1, VCFHeaderLineType.String, "Last exon bases N=" + n_last_exon_bases + ". Reverse-complemented for negative strand.");
        metaData.add(infoLastExonBases);
        final int n_after_exon_bases = Math.max(0, Integer.parseInt(dynaParams.getOrDefault("after.n.exons", "10")));
        final VCFInfoHeaderLine infoAfterExonBases = new VCFInfoHeaderLine("AFTER_BASES", 1, VCFHeaderLineType.String, "Bases after exon N=" + n_after_exon_bases + ". Reverse-complemented for negative strand.");
        metaData.add(infoAfterExonBases);
        final VCFInfoHeaderLine infoOtherIdss = new VCFInfoHeaderLine("OTHER_IDS", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Other transcripts ending at the same coordinate.");
        metaData.add(infoOtherIdss);
        final VCFFormatHeaderLine fmtMaxPolyA = new VCFFormatHeaderLine("MAX", 1, VCFHeaderLineType.Integer, "Max poly A");
        metaData.add(fmtMaxPolyA);
        final VCFFormatHeaderLine fmtReadPolyA = new VCFFormatHeaderLine("DPA", 1, VCFHeaderLineType.Integer, "Read with at least one A");
        metaData.add(fmtReadPolyA);
        final VCFFormatHeaderLine fmtAveragePolyA = new VCFFormatHeaderLine("AVG", 1, VCFHeaderLineType.Float, "average length of poly-A for reads carrying at least one A.");
        metaData.add(fmtAveragePolyA);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY);
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
        final VCFHeader header = new VCFHeader(metaData, samples.stream().sorted().collect(Collectors.toList()));
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final UnaryOperator<String> afterColon = S -> {
            if (!(S.startsWith("gene:") || S.startsWith("transcript:")))
                return S;
            int colon = S.indexOf(":");
            return S.substring(colon + 1);
        };
        final List<Allele> ALLELES = Collections.singletonList(Allele.create("N", true));
        try (VariantContextWriter w = writingVariantsDelegate.dictionary(dict).open(this.outputFile);
            ReferenceSequenceFile fai = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
            w.writeHeader(header);
            exonMap.values().stream().sorted(new ContigDictComparator(dict).createLocatableComparator()).forEach(T -> {
                if (T.getDP() == 0)
                    return;
                final String lastBases;
                final String afterBases;
                if (T.isPlusStrand()) {
                    lastBases = fai.getSubsequenceAt(T.getContig(), Math.max(T.getStart(), T.getEnd() - n_last_exon_bases), T.getEnd()).getBaseString();
                    final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(T.getContig()));
                    afterBases = fai.getSubsequenceAt(T.getContig(), T.getEnd() + 1, Math.min(T.getEnd() + n_after_exon_bases, ssr.getLengthOnReference())).getBaseString();
                } else if (T.isMinusStrand()) {
                    lastBases = AcidNucleics.reverseComplement(fai.getSubsequenceAt(T.getContig(), T.getStart(), Math.min(T.getEnd(), T.getStart() + n_last_exon_bases)).getBaseString());
                    afterBases = AcidNucleics.reverseComplement(fai.getSubsequenceAt(T.getContig(), Math.max(1, T.getStart() - n_after_exon_bases), T.getStart() - 1).getBaseString());
                } else {
                    lastBases = null;
                    afterBases = null;
                }
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(T.getContig());
                vcb.start(T.getStart());
                vcb.stop(T.getEnd());
                vcb.id(afterColon.apply(T.transcriptId));
                vcb.attribute(VCFConstants.END_KEY, T.getEnd());
                vcb.attribute(infoGeneId.getID(), afterColon.apply(T.gene.geneId));
                vcb.attribute(infoTranscriptId.getID(), afterColon.apply(T.transcriptId));
                vcb.attribute(infoStrand.getID(), T.strand.name());
                vcb.attribute(infoEndPos.getID(), T.getPosition());
                if (T.otherIds != null && !T.otherIds.isEmpty()) {
                    vcb.attribute(infoOtherIdss.getID(), T.otherIds.stream().map(afterColon).collect(Collectors.toList()));
                }
                if (!StringUtils.isBlank(lastBases)) {
                    vcb.attribute(infoLastExonBases.getID(), lastBases);
                }
                if (!StringUtils.isBlank(afterBases)) {
                    vcb.attribute(infoAfterExonBases.getID(), afterBases);
                }
                if (!StringUtils.isBlank(T.gene.geneName)) {
                    vcb.attribute(infoGeneName.getID(), T.gene.geneName);
                }
                if (!StringUtils.isBlank(T.gene.biotype)) {
                    vcb.attribute(infoBiotype.getID(), T.gene.biotype);
                }
                final List<Genotype> genotypes = new ArrayList<>(samples.size());
                for (String sn : samples) {
                    final ExonCount count = T.sample2count.get(sn);
                    final GenotypeBuilder gb = new GenotypeBuilder(sn);
                    gb.attribute(fmtMaxPolyA.getID(), count == null ? 0 : count.max_length_polyA);
                    gb.attribute(fmtReadPolyA.getID(), count == null ? 0 : count.n_tested_reads_with_A);
                    gb.attribute(fmtAveragePolyA.getID(), count == null || count.n_tested_reads_with_A == 0 ? 0f : count.sum_polyA / (float) count.n_tested_reads_with_A);
                    gb.DP(count == null ? 0 : count.n_tested_reads);
                    genotypes.add(gb.make());
                }
                vcb.alleles(ALLELES);
                vcb.genotypes(genotypes);
                vcb.attribute(VCFConstants.DEPTH_KEY, T.getDP());
                final int score = T.getMaxPolyA();
                if (score > 0)
                    vcb.log10PError(score / -10.0);
                vcb.attribute(infoTranscriptMaxPolyA.getID(), score);
                vcb.attribute(infoGeneMaxPolyA.getID(), T.gene.getmaxPolyA());
                w.add(vcb.make());
            });
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) AsciiLineReader(htsjdk.tribble.readers.AsciiLineReader) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) LineIterator(htsjdk.tribble.readers.LineIterator) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) BiFunction(java.util.function.BiFunction) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) UnaryOperator(java.util.function.UnaryOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) Map(java.util.Map) Path(java.nio.file.Path) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) Objects(java.util.Objects) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) HashMap(java.util.HashMap) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) Strand(htsjdk.tribble.annotation.Strand) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Gff3Feature(htsjdk.tribble.gff.Gff3Feature) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ToIntFunction(java.util.function.ToIntFunction) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) SamInputResource(htsjdk.samtools.SamInputResource) Gff3Codec(htsjdk.tribble.gff.Gff3Codec) QueryInterval(htsjdk.samtools.QueryInterval) DynamicParameter(com.beust.jcommander.DynamicParameter) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) InputStream(java.io.InputStream) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) AsciiLineReader(htsjdk.tribble.readers.AsciiLineReader) Genotype(htsjdk.variant.variantcontext.Genotype) CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarElement(htsjdk.samtools.CigarElement) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) LineIterator(htsjdk.tribble.readers.LineIterator) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SamReader(htsjdk.samtools.SamReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) SamReaderFactory(htsjdk.samtools.SamReaderFactory) InputStream(java.io.InputStream) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) Gff3Codec(htsjdk.tribble.gff.Gff3Codec) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)

Aggregations

Strand (htsjdk.tribble.annotation.Strand)5 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)3 DynamicParameter (com.beust.jcommander.DynamicParameter)2 Parameter (com.beust.jcommander.Parameter)2 ParametersDelegate (com.beust.jcommander.ParametersDelegate)2 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)2 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)2 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)2 SAMRecordDefaultFilter (com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter)2 JVarkitVersion (com.github.lindenb.jvarkit.util.JVarkitVersion)2 AcidNucleics (com.github.lindenb.jvarkit.util.bio.AcidNucleics)2 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)2 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)2 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)2 Program (com.github.lindenb.jvarkit.util.jcommander.Program)2 Logger (com.github.lindenb.jvarkit.util.log.Logger)2 ProgressFactory (com.github.lindenb.jvarkit.util.log.ProgressFactory)2 ContigDictComparator (com.github.lindenb.jvarkit.util.samtools.ContigDictComparator)2 WritingVariantsDelegate (com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate)2 Cigar (htsjdk.samtools.Cigar)2