Search in sources :

Example 71 with SimpleInterval

use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.

the class SamFindClippedRegions method processInput.

@Override
protected int processInput(final SAMFileHeader header, final CloseableIterator<SAMRecord> iter) {
    final Set<String> samples = header.getReadGroups().stream().map(this.partition).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
    if (samples.isEmpty()) {
        LOG.error("no sample in read group was defined.");
        return -1;
    }
    final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
    final int bad_mapq = 30;
    // SAMFileWriter w=null;
    try {
        final IntervalTreeMap<Interval> intronMap = new IntervalTreeMap<>();
        if (this.gtfPath != null) {
            try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
                gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
                gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasIntron()).flatMap(T -> T.getIntrons().stream()).map(I -> new Interval(I.getContig(), I.getStart(), I.getEnd(), I.isNegativeStrand(), I.getTranscript().getId())).forEach(I -> intronMap.put(I, I));
            }
        }
        /* build VCF header */
        final Allele reference_allele = Allele.create("N", true);
        final Allele alt_allele = Allele.create("<CLIP>", false);
        final Set<VCFHeaderLine> vcfHeaderLines = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(vcfHeaderLines, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        VCFStandardHeaderLines.addStandardInfoLines(vcfHeaderLines, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_FREQUENCY_KEY);
        final VCFFormatHeaderLine leftClip = new VCFFormatHeaderLine("CL", 1, VCFHeaderLineType.Integer, "Left Clip");
        vcfHeaderLines.add(leftClip);
        final VCFFormatHeaderLine rightClip = new VCFFormatHeaderLine("RL", 1, VCFHeaderLineType.Integer, "Right Clip");
        vcfHeaderLines.add(rightClip);
        final VCFFormatHeaderLine totalCip = new VCFFormatHeaderLine("TL", 1, VCFHeaderLineType.Integer, "Total Clip");
        vcfHeaderLines.add(totalCip);
        final VCFFormatHeaderLine totalDel = new VCFFormatHeaderLine("DL", 1, VCFHeaderLineType.Integer, "Total Deletions");
        vcfHeaderLines.add(totalDel);
        final VCFFormatHeaderLine noClipMAPQ = new VCFFormatHeaderLine("MQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for reads without clip at this position.");
        vcfHeaderLines.add(noClipMAPQ);
        final VCFInfoHeaderLine averageMAPQ = new VCFInfoHeaderLine("AVG_MAPQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for called genotypes");
        vcfHeaderLines.add(averageMAPQ);
        final VCFInfoHeaderLine infoRetrogene = new VCFInfoHeaderLine("RETROGENE", 1, VCFHeaderLineType.String, "transcript name for Possible retrogene.");
        vcfHeaderLines.add(infoRetrogene);
        final VCFFilterHeaderLine filterRetrogene = new VCFFilterHeaderLine("POSSIBLE_RETROGENE", "Junction is a possible Retrogene.");
        vcfHeaderLines.add(filterRetrogene);
        final VCFFilterHeaderLine filterlowMapq = new VCFFilterHeaderLine("LOW_MAPQ", "Low average mapq (< " + bad_mapq + ")");
        vcfHeaderLines.add(filterlowMapq);
        final VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
        vcfHeader.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
        this.writingVcfConfig.dictionary(dict);
        try (VariantContextWriter w = this.writingVcfConfig.open(this.outputFile)) {
            w.writeHeader(vcfHeader);
            @SuppressWarnings("resource") final VariantContextWriter finalVariantContextWriter = w;
            /**
             * dump a BASe into the VCF
             */
            final BiConsumer<String, Base> baseConsumer = (CTG, B) -> {
                if (B.pos < 1)
                    return;
                // no clip
                if (B.sample2gt.values().stream().mapToInt(G -> G.clip()).sum() == 0)
                    return;
                if (B.sample2gt.values().stream().allMatch(G -> G.clip() < min_clip_depth))
                    return;
                if (B.sample2gt.values().stream().allMatch(G -> G.dp() < min_depth))
                    return;
                if (B.sample2gt.values().stream().allMatch(G -> G.ratio() < fraction))
                    return;
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(CTG);
                vcb.start(B.pos);
                vcb.stop(B.pos);
                vcb.alleles(Arrays.asList(reference_allele, alt_allele));
                vcb.attribute(VCFConstants.DEPTH_KEY, B.sample2gt.values().stream().mapToInt(G -> G.dp()).sum());
                /* if gtf was specified, find intron which ends are near this pos */
                if (gtfPath != null) {
                    final Locatable bounds1 = new SimpleInterval(CTG, Math.max(1, B.pos - max_intron_distance), B.pos + max_intron_distance);
                    intronMap.getOverlapping(bounds1).stream().filter(I -> Math.abs(I.getStart() - B.pos) <= this.max_intron_distance || Math.abs(I.getEnd() - B.pos) <= this.max_intron_distance).map(I -> I.getName()).findFirst().ifPresent(transcript_id -> {
                        vcb.attribute(infoRetrogene.getID(), transcript_id);
                        vcb.filter(filterRetrogene.getID());
                    });
                    ;
                }
                final List<Genotype> genotypes = new ArrayList<>(B.sample2gt.size());
                int AC = 0;
                int AN = 0;
                int max_clip = 1;
                double sum_mapq = 0.0;
                int count_mapq = 0;
                for (final String sn : B.sample2gt.keySet()) {
                    final Gt gt = B.sample2gt.get(sn);
                    final GenotypeBuilder gb = new GenotypeBuilder(sn);
                    if (gt.clip() == 0 && gt.noClip == 0) {
                        gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
                    } else if (gt.noClip == 0) {
                        gb.alleles(Arrays.asList(alt_allele, alt_allele));
                        AC += 2;
                        sum_mapq += gt.noClipMapq();
                        count_mapq++;
                        AN += 2;
                    } else if (gt.clip() == 0) {
                        gb.alleles(Arrays.asList(reference_allele, reference_allele));
                        AN += 2;
                    } else {
                        gb.alleles(Arrays.asList(reference_allele, alt_allele));
                        AC++;
                        sum_mapq += gt.noClipMapq();
                        count_mapq++;
                        AN += 2;
                    }
                    gb.DP(gt.dp());
                    gb.attribute(leftClip.getID(), gt.leftClip);
                    gb.attribute(rightClip.getID(), gt.rightClip);
                    gb.attribute(totalCip.getID(), gt.clip());
                    gb.attribute(totalDel.getID(), gt.del);
                    gb.attribute(noClipMAPQ.getID(), gt.noClipMapq());
                    gb.AD(new int[] { gt.noClip, gt.clip() });
                    genotypes.add(gb.make());
                    max_clip = Math.max(max_clip, gt.clip());
                }
                if (count_mapq > 0) {
                    final int avg_mapq = (int) (sum_mapq / count_mapq);
                    vcb.attribute(averageMAPQ.getID(), avg_mapq);
                    if (avg_mapq < bad_mapq)
                        vcb.filter(filterlowMapq.getID());
                }
                vcb.log10PError(max_clip / -10.0);
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, AC);
                vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, AN);
                if (AN > 0)
                    vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, AC / (float) AN);
                vcb.genotypes(genotypes);
                finalVariantContextWriter.add(vcb.make());
            };
            final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
            String prevContig = null;
            final SortedMap<Integer, Base> pos2base = new TreeMap<>();
            /* get base in pos2base, create it if needed */
            final Function<Integer, Base> baseAt = POS -> {
                Base b = pos2base.get(POS);
                if (b == null) {
                    b = new Base(POS, samples);
                    pos2base.put(POS, b);
                }
                return b;
            };
            for (; ; ) {
                final SAMRecord rec = (iter.hasNext() ? progress.apply(iter.next()) : null);
                if (rec != null && !SAMRecordDefaultFilter.accept(rec, this.min_mapq))
                    continue;
                if (rec == null || !rec.getContig().equals(prevContig)) {
                    for (final Integer pos : pos2base.keySet()) {
                        baseConsumer.accept(prevContig, pos2base.get(pos));
                    }
                    if (rec == null)
                        break;
                    pos2base.clear();
                    prevContig = rec.getContig();
                }
                for (Iterator<Integer> rpos = pos2base.keySet().iterator(); rpos.hasNext(); ) {
                    final Integer pos = rpos.next();
                    if (pos.intValue() + this.max_clip_length >= rec.getUnclippedStart())
                        break;
                    baseConsumer.accept(prevContig, pos2base.get(pos));
                    rpos.remove();
                }
                final String rg = this.partition.getPartion(rec);
                if (StringUtils.isBlank(rg))
                    continue;
                for (final AlignmentBlock ab : rec.getAlignmentBlocks()) {
                    for (int n = 0; n < ab.getLength(); ++n) {
                    }
                }
                final Cigar cigar = rec.getCigar();
                int refPos = rec.getAlignmentStart();
                for (final CigarElement ce : cigar.getCigarElements()) {
                    final CigarOperator op = ce.getOperator();
                    if (op.consumesReferenceBases()) {
                        if (op.consumesReadBases()) {
                            for (int x = 0; x < ce.getLength(); ++x) {
                                final Gt gt = baseAt.apply(refPos + x).getGt(rg);
                                gt.noClip++;
                                gt.noClip_sum_mapq += rec.getMappingQuality();
                            }
                        } else if (op.equals(CigarOperator.D) || op.equals(CigarOperator.N)) {
                            baseAt.apply(refPos).getGt(rg).del++;
                            baseAt.apply(refPos + ce.getLength() - 1).getGt(rg).del++;
                        }
                        refPos += ce.getLength();
                    }
                }
                CigarElement ce = cigar.getFirstCigarElement();
                if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
                    baseAt.apply(rec.getStart() - 1).getGt(rg).leftClip++;
                }
                ce = cigar.getLastCigarElement();
                if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
                    baseAt.apply(rec.getEnd() + 1).getGt(rg).rightClip++;
                }
            }
        }
        // end of vcf writer
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) MultiBamLauncher(com.github.lindenb.jvarkit.jcommander.MultiBamLauncher) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) SAMFileHeader(htsjdk.samtools.SAMFileHeader) AlignmentBlock(htsjdk.samtools.AlignmentBlock) Map(java.util.Map) Path(java.nio.file.Path) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) Objects(java.util.Objects) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) SortedMap(java.util.SortedMap) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) HashMap(java.util.HashMap) Function(java.util.function.Function) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) BiConsumer(java.util.function.BiConsumer) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Iterator(java.util.Iterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) TreeMap(java.util.TreeMap) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) List(java.util.List) ArrayList(java.util.ArrayList) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) CigarOperator(htsjdk.samtools.CigarOperator) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) TreeMap(java.util.TreeMap) CigarElement(htsjdk.samtools.CigarElement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMRecord(htsjdk.samtools.SAMRecord) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) AlignmentBlock(htsjdk.samtools.AlignmentBlock) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Interval(htsjdk.samtools.util.Interval) Locatable(htsjdk.samtools.util.Locatable)

Aggregations

SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)71 ArrayList (java.util.ArrayList)49 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)47 List (java.util.List)47 Locatable (htsjdk.samtools.util.Locatable)46 Path (java.nio.file.Path)46 Parameter (com.beust.jcommander.Parameter)43 Program (com.github.lindenb.jvarkit.util.jcommander.Program)43 Logger (com.github.lindenb.jvarkit.util.log.Logger)43 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)39 Collectors (java.util.stream.Collectors)38 Set (java.util.Set)37 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)36 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)35 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)35 SAMFileHeader (htsjdk.samtools.SAMFileHeader)34 CloserUtil (htsjdk.samtools.util.CloserUtil)34 CloseableIterator (htsjdk.samtools.util.CloseableIterator)33 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)32 SamReader (htsjdk.samtools.SamReader)32