Search in sources :

Example 16 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class PlotSashimi method doWork.

@Override
public int doWork(final List<String> args) {
    ArchiveFactory archive = null;
    PrintWriter manifest = null;
    try {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        DocumentBuilder db = dbf.newDocumentBuilder();
        this.document = db.newDocument();
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (faidx != null) {
            srf.referenceSequence(this.faidx);
        }
        if (this.gtfPath != null) {
            try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
                if (this.faidx != null)
                    gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(SequenceDictionaryUtils.extractRequired(this.faidx)));
                gtfReader.getAllGenes().stream().forEach(G -> this.geneMap.put(new Interval(G), G));
            }
        }
        archive = ArchiveFactory.open(this.outputFile);
        manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(manifestFile));
        manifest.println("#chrom\tstart\tend\tbam\tGenes\tSamples\tsvg");
        for (final Path bam : IOUtils.unrollPaths(args)) {
            try (SamReader sr = srf.open(bam)) {
                if (!sr.hasIndex()) {
                    LOG.error("Bam is not indexed " + bam);
                    return -1;
                }
                final SAMFileHeader header = sr.getFileHeader();
                final ArchiveFactory final_archive = archive;
                final PrintWriter final_manifest = manifest;
                this.intervalListProvider.dictionary(header.getSequenceDictionary()).stream().forEach(R -> {
                    plotSashimi(final_archive, sr, R, bam, final_manifest);
                });
            }
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(manifest);
        CloserUtil.close(archive);
    }
}
Also used : Path(java.nio.file.Path) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SamReader(htsjdk.samtools.SamReader) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) SamReaderFactory(htsjdk.samtools.SamReaderFactory) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) DocumentBuilder(javax.xml.parsers.DocumentBuilder) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) SAMFileHeader(htsjdk.samtools.SAMFileHeader) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Interval(htsjdk.samtools.util.Interval)

Example 17 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class FindNewSpliceSites method doWork.

@Override
public int doWork(final List<String> args) {
    SamReader sfr = null;
    PrintWriter bedWriter = null;
    SortingCollection<Junction> junctionSorter = null;
    try {
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null) {
            srf.referenceSequence(this.faidx);
        }
        final String input = oneFileOrNull(args);
        sfr = input == null ? srf.open(SamInputResource.of(stdin())) : srf.open(SamInputResource.of(input));
        final SAMFileHeader header0 = sfr.getFileHeader();
        try (GtfReader gftReader = new GtfReader(this.gtfPath)) {
            SAMSequenceDictionary dict = header0.getSequenceDictionary();
            if (dict != null)
                gftReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
            gftReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.getExonCount() > 1).flatMap(T -> T.getIntrons().stream()).map(T -> T.toInterval()).forEach(T -> {
                this.intronMap.put(T, T);
            });
        }
        final SAMFileHeader header1 = header0.clone();
        final SAMProgramRecord p = header1.createProgramRecord();
        p.setCommandLine(getProgramCommandLine());
        p.setProgramVersion(getVersion());
        p.setProgramName(getProgramName());
        this.sfw = this.writingBamArgs.openSamWriter(outputFile, header1, true);
        final SAMFileHeader header2 = header0.clone();
        final SAMProgramRecord p2 = header2.createProgramRecord();
        p2.setCommandLine(getProgramCommandLine());
        p2.setProgramVersion(getVersion());
        p2.setProgramName(getProgramName());
        this.weird = this.writingBamArgs.createSAMFileWriterFactory().makeSAMWriter(header2, true, new NullOuputStream());
        if (this.bedOut != null) {
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sfr.getFileHeader());
            this.junctionComparator = new ContigDictComparator(dict).createLocatableComparator();
            junctionSorter = SortingCollection.newInstance(Junction.class, new JunctionCodec(), (A, B) -> A.compare2(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        }
        scan(sfr, p, p2, junctionSorter);
        sfr.close();
        if (this.bedOut != null) {
            junctionSorter.doneAdding();
            bedWriter = super.openPathOrStdoutAsPrintWriter(this.bedOut);
            final String sample = StringUtils.ifBlank(header0.getReadGroups().stream().map(RG -> RG.getSample()).filter(s -> !StringUtils.isBlank(s)).collect(Collectors.toCollection(TreeSet::new)).stream().collect(Collectors.joining(";")), ".");
            try (CloseableIterator<Junction> iter = junctionSorter.iterator()) {
                final EqualRangeIterator<Junction> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare1(B));
                while (eq.hasNext()) {
                    final List<Junction> row = eq.next();
                    final Junction first = row.get(0);
                    bedWriter.print(first.getContig());
                    bedWriter.print('\t');
                    bedWriter.print(first.getStart() - 1);
                    bedWriter.print('\t');
                    bedWriter.print(first.getEnd());
                    bedWriter.print('\t');
                    bedWriter.print(sample);
                    bedWriter.print('\t');
                    bedWriter.print(first.name);
                    bedWriter.print('\t');
                    bedWriter.print(row.size());
                    bedWriter.println();
                }
                eq.close();
            }
            bedWriter.flush();
            bedWriter.close();
            bedWriter = null;
            junctionSorter.cleanup();
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(sfr);
        CloserUtil.close(this.sfw);
        CloserUtil.close(this.weird);
        CloserUtil.close(bedWriter);
    }
}
Also used : DataInputStream(java.io.DataInputStream) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) DataOutputStream(java.io.DataOutputStream) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) SortingCollection(htsjdk.samtools.util.SortingCollection) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) IOException(java.io.IOException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SamReader(htsjdk.samtools.SamReader) EOFException(java.io.EOFException) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SamInputResource(htsjdk.samtools.SamInputResource) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Comparator(java.util.Comparator) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) SAMProgramRecord(htsjdk.samtools.SAMProgramRecord) IOException(java.io.IOException) EOFException(java.io.EOFException) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SamReader(htsjdk.samtools.SamReader) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) TreeSet(java.util.TreeSet) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) SAMFileHeader(htsjdk.samtools.SAMFileHeader) PrintWriter(java.io.PrintWriter)

Example 18 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class GtfRetroCopy method doWork.

@Override
public int doWork(final List<String> args) {
    VCFReader vcfFileReader = null;
    VariantContextWriter vcw0 = null;
    try {
        /* load the reference genome */
        /* create a contig name converter from the REF */
        final Set<String> knownGeneIds;
        if (this.knownPath != null) {
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.knownPath)) {
                knownGeneIds = br.lines().filter(L -> !StringUtils.isBlank(L)).map(S -> S.trim()).filter(S -> !(S.equals("-") || S.equals(".") || S.startsWith("#"))).collect(Collectors.toSet());
            }
        } else {
            knownGeneIds = Collections.emptySet();
        }
        // open the sam file
        final String input = oneAndOnlyOneFile(args);
        vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
        final VCFHeader header = vcfFileReader.getHeader();
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        final Comparator<String> contigCmp = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
        final Comparator<Gene> geneCmp = (A, B) -> {
            int i = contigCmp.compare(A.getContig(), B.getContig());
            if (i != 0)
                return i;
            i = Integer.compare(A.getStart(), B.getStart());
            if (i != 0)
                return i;
            return Integer.compare(A.getEnd(), B.getEnd());
        };
        final GtfReader gtfReader = new GtfReader(this.gtfPath);
        if (dict != null && !dict.isEmpty()) {
            this.writingVcf.dictionary(dict);
            gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
        }
        final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.getTranscripts().stream().count() > 0L).filter(G -> G.getTranscripts().stream().anyMatch(T -> T.getIntronCount() >= this.min_intron_count)).sorted(geneCmp).collect(Collectors.toList());
        gtfReader.close();
        /**
         * build vcf header
         */
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_BOUNDS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns boundaries"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_SIZES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Introns sizes"));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
            metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
        }
        // metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
        // metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
        // "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        metaData.add(new VCFFilterHeaderLine(ATT_KNOWN, "Known RetroGenes. " + (this.knownPath == null ? "" : " Source: " + this.knownPath)));
        final VCFHeader header2 = new VCFHeader(header);
        metaData.stream().forEach(H -> header2.addMetaDataLine(H));
        JVarkitVersion.getInstance().addMetaData(this, header2);
        final Allele ref = Allele.create((byte) 'N', true);
        final Allele alt = Allele.create("<RETROCOPY>", false);
        /* open vcf for writing*/
        vcw0 = this.writingVcf.open(this.outputFile);
        vcw0.writeHeader(header2);
        final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(dict).build();
        for (final Gene gene : genes) {
            progress.apply(gene);
            final List<VariantContext> variants = new ArrayList<>();
            final CloseableIterator<VariantContext> iter2 = vcfFileReader.query(gene.getContig(), gene.getStart(), gene.getEnd());
            while (iter2.hasNext()) {
                final VariantContext ctx = iter2.next();
                // SNV
                if (ctx.getStart() == ctx.getEnd())
                    continue;
                StructuralVariantType svType = ctx.getStructuralVariantType();
                if (StructuralVariantType.BND.equals(svType))
                    continue;
                if (StructuralVariantType.INS.equals(svType))
                    continue;
                variants.add(ctx);
            }
            iter2.close();
            if (variants.isEmpty())
                continue;
            for (final Transcript transcript : gene.getTranscripts()) {
                if (!transcript.hasIntron())
                    continue;
                if (transcript.getIntronCount() < this.min_intron_count)
                    continue;
                final Counter<String> samples = new Counter<>();
                for (final Intron intron : transcript.getIntrons()) {
                    for (final VariantContext ctx : variants) {
                        if (!isWithinDistance(intron.getStart(), ctx.getStart()))
                            continue;
                        if (!isWithinDistance(intron.getEnd(), ctx.getEnd()))
                            continue;
                        if (ctx.hasGenotypes()) {
                            for (final Genotype g : ctx.getGenotypes()) {
                                if (g.isNoCall() || g.isHomRef())
                                    continue;
                                samples.incr(g.getSampleName());
                            }
                        } else {
                            samples.incr("*");
                        }
                    }
                // end iter2
                }
                // end intron
                final long max_count = samples.stream().mapToLong(E -> E.getValue()).max().orElse(0L);
                if (max_count == 0)
                    continue;
                if (this.only_all_introns && max_count != transcript.getIntronCount())
                    continue;
                // ok good candidate
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(transcript.getContig());
                vcb.start(transcript.getStart());
                vcb.stop(transcript.getEnd());
                switch(this.idKey) {
                    case gene_name:
                        final String gn = transcript.getGene().getGeneName();
                        vcb.id(StringUtils.isBlank(gn) ? transcript.getId() : gn);
                        break;
                    case gene_id:
                        vcb.id(transcript.getGene().getId());
                        break;
                    case transcript_id:
                        vcb.id(transcript.getId());
                        break;
                    default:
                        throw new IllegalStateException();
                }
                final List<Allele> alleles = Arrays.asList(ref, alt);
                // vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY,2);
                // vcb.attribute(VCFConstants.ALLELE_COUNT_KEY,1);
                // vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY,0.5);
                vcb.attribute(VCFConstants.SVTYPE, "DEL");
                vcb.attribute(VCFConstants.END_KEY, transcript.getEnd());
                vcb.attribute("SVLEN", transcript.getLengthOnReference());
                vcb.attribute(ATT_INTRONS_BOUNDS, transcript.getIntrons().stream().map(S -> "" + S.getStart() + "-" + S.getEnd()).collect(Collectors.toList()));
                vcb.attribute(ATT_INTRONS_SIZES, transcript.getIntrons().stream().mapToInt(S -> S.getLengthOnReference()).toArray());
                for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
                    final String v = transcript.getProperties().get(att);
                    if (StringUtils.isBlank(v))
                        continue;
                    vcb.attribute(att, v);
                }
                vcb.alleles(alleles);
                boolean pass_filter = true;
                // introns sequences
                vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, max_count);
                vcb.attribute(ATT_INTRONS_COUNT, transcript.getIntronCount());
                vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, max_count / (float) transcript.getIntronCount());
                if (transcript.getIntronCount() != max_count) {
                    vcb.filter(ATT_NOT_ALL_INTRONS);
                    pass_filter = false;
                }
                if (knownGeneIds.contains(transcript.getGene().getId())) {
                    vcb.filter(ATT_KNOWN);
                    pass_filter = false;
                }
                if (header.hasGenotypingData()) {
                    final List<Genotype> genotypes = new ArrayList<>();
                    for (final String sn : header.getSampleNamesInOrder()) {
                        final List<Allele> gtalleles;
                        if (samples.count(sn) == 0L) {
                            gtalleles = Arrays.asList(ref, ref);
                        } else {
                            gtalleles = Arrays.asList(ref, alt);
                        }
                        final GenotypeBuilder gb = new GenotypeBuilder(sn, gtalleles);
                        genotypes.add(gb.make());
                    }
                    vcb.genotypes(genotypes);
                }
                if (pass_filter)
                    vcb.passFilters();
                vcw0.add(vcb.make());
            }
        }
        progress.close();
        vcw0.close();
        vcfFileReader.close();
        vcfFileReader = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(vcfFileReader);
        CloserUtil.close(vcw0);
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Counter(com.github.lindenb.jvarkit.util.Counter) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) VCFReader(htsjdk.variant.vcf.VCFReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType)

Example 19 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class KnownRetroCopy method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VCFIterator iterin, VariantContextWriter out) {
    try {
        /* load the reference genome */
        /* create a contig name converter from the REF */
        final Set<String> knownGeneIds;
        if (this.knownPath != null) {
            try (BufferedReader br = IOUtils.openPathForBufferedReading(this.knownPath)) {
                knownGeneIds = br.lines().filter(L -> !StringUtils.isBlank(L)).map(S -> S.trim()).filter(S -> !(S.equals("-") || S.equals(".") || S.startsWith("#"))).collect(Collectors.toSet());
            }
        } else {
            knownGeneIds = Collections.emptySet();
        }
        // open the sam file
        final VCFHeader header = iterin.getHeader();
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        final IntervalTreeMap<List<Intron>> intronMap = new IntervalTreeMap<>();
        final GtfReader gtfReader = new GtfReader(this.gtfPath);
        if (dict != null && !dict.isEmpty())
            gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
        gtfReader.getAllGenes().stream().filter(G -> G.getTranscripts().stream().count() > 0L).filter(G -> G.getTranscripts().stream().anyMatch(T -> T.getIntronCount() >= this.min_intron_count)).flatMap(G -> G.getTranscripts().stream()).flatMap(G -> G.getIntrons().stream()).forEach(INTRON -> {
            List<Intron> introns = intronMap.get(INTRON);
            if (introns == null) {
                introns = new ArrayList<>();
                intronMap.put(INTRON.toInterval(), introns);
            }
            introns.add(INTRON);
        });
        gtfReader.close();
        /**
         * build vcf header
         */
        final VCFHeader header2 = new VCFHeader(header);
        header2.addMetaDataLine(new VCFFilterHeaderLine(ATT_FILTER_INTRON, "variant could be a deleted intron from a retrocopy"));
        header2.addMetaDataLine(new VCFFilterHeaderLine(ATT_FILTER_KNOWN, "variant could be a deleted intron from a known retrocopy"));
        header2.addMetaDataLine(new VCFInfoHeaderLine(ATT_RETROCOPY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Identifiers for the retrocopies."));
        JVarkitVersion.getInstance().addMetaData(this, header2);
        out.writeHeader(header2);
        final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().logger(LOG).dictionary(dict).build();
        while (iterin.hasNext()) {
            final VariantContext ctx = progress.apply(iterin.next());
            if (ctx.getStart() == ctx.getEnd()) {
                out.add(ctx);
                continue;
            }
            final String svType = ctx.getAttributeAsString(VCFConstants.SVTYPE, "");
            if (svType.equals("BND") || svType.equals("INS")) {
                out.add(ctx);
                continue;
            }
            boolean known_flag = false;
            final Set<String> retrocopy_identifiers = new TreeSet<>();
            for (final Intron intron : intronMap.getOverlapping(ctx).stream().flatMap(L -> L.stream()).filter(I -> isWithinDistance(I.getStart(), ctx.getStart())).filter(I -> isWithinDistance(I.getEnd(), ctx.getEnd())).collect(Collectors.toList())) {
                if (knownGeneIds.contains(intron.getTranscript().getGene().getId())) {
                    known_flag = true;
                }
                retrocopy_identifiers.add(VCFUtils.escapeInfoField(intron.getTranscript().getGene().getId()));
                retrocopy_identifiers.add(VCFUtils.escapeInfoField(intron.getTranscript().getId()));
                String s = intron.getTranscript().getGene().getProperties().getOrDefault("gene_name", "");
                if (!StringUtils.isBlank(s))
                    retrocopy_identifiers.add(VCFUtils.escapeInfoField(s));
                s = intron.getTranscript().getProperties().getOrDefault("transcript_name", "");
                if (!StringUtils.isBlank(s))
                    retrocopy_identifiers.add(VCFUtils.escapeInfoField(s));
            }
            if (retrocopy_identifiers.isEmpty()) {
                out.add(ctx);
                continue;
            }
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.filter(ATT_FILTER_INTRON);
            if (known_flag)
                vcb.filter(ATT_FILTER_KNOWN);
            vcb.attribute(ATT_RETROCOPY, new ArrayList<>(retrocopy_identifiers));
            out.add(vcb.make());
        }
        progress.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) VCFConstants(htsjdk.variant.vcf.VCFConstants) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) List(java.util.List) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) BufferedReader(java.io.BufferedReader) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) TreeSet(java.util.TreeSet) BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) List(java.util.List) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap)

Example 20 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class VcfToPostscript method doWork.

@Override
public int doWork(List<String> args) {
    VCFIterator iter = null;
    BufferedReader r = null;
    try {
        iter = super.openVCFIterator(oneFileOrNull(args));
        this.outw = super.openPathOrStdoutAsPrintStream(this.outputFile);
        final SAMSequenceDictionary dict = iter.getHeader().getSequenceDictionary();
        if (this.gtfPath != null) {
            try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
                if (dict != null)
                    gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
                this.chrom2transcript.putAll(gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).collect(Collectors.groupingBy(X -> X.getContig())));
            }
        }
        final double ticksH = (fHeight / 2.0f) * 0.6f;
        final double ticksx = 20;
        this.outw.print("%!PS\n" + "%%Creator: Pierre Lindenbaum PhD plindenbaum@yahoo.fr http://plindenbaum.blogspot.com\n" + "%%Title: " + getClass().getSimpleName() + "\n" + "%%CreationDate: " + new Date() + "\n" + "%%EndComments\n" + "%%BoundingBox: 0 0 " + (this.pageDef.width + margin.left + margin.right) + " " + (this.pageDef.height + margin.top + margin.bottom) + "\n" + "/Courier findfont 9 scalefont setfont\n" + "/circle { 10 0 360 arc} bind def\n" + "/ticksF {\n" + (-ticksH) + " " + (ticksH) + " rmoveto\n" + ticksH + " " + (-ticksH) + " rlineto\n" + (-ticksH) + " " + (-ticksH) + " rlineto\n" + ticksH + " " + ticksH + " rmoveto\n" + "} bind def\n" + "/ticksR {\n" + (ticksH) + " " + (ticksH) + " rmoveto\n" + (-ticksH) + " " + (-ticksH) + " rlineto\n" + (ticksH) + " " + (-ticksH) + " rlineto\n" + (-ticksH) + " " + (ticksH) + " rmoveto\n" + "} bind def\n" + "/forticksF {2 dict begin\n" + "/x2 exch def\n" + "/x1 exch def\n" + "0 1 x2 x1 sub " + ticksx + " div {\n" + "ticksF " + ticksx + " 0 rmoveto\n" + "}for\n" + "} bind def\n" + "/forticksR {2 dict begin\n" + "/x2 exch def\n" + "/x1 exch def\n" + "0 1 x2 x1 sub " + ticksx + " div {\n" + " ticksR  " + ticksx + " 0 rmoveto\n" + "}for\n" + "} bind def\n" + "/box\n" + "{\n" + "4 dict begin\n" + "/height exch def\n" + "/width exch def\n" + "/y exch def\n" + "/x exch def\n" + "x y moveto\n" + "width 0 rlineto\n" + "0 height rlineto\n" + "width -1 mul 0 rlineto\n" + "0 height -1 mul rlineto\n" + "end\n" + "} bind def\n" + "/gradient\n" + "{\n" + "4 dict begin\n" + "/height exch def\n" + "/width exch def\n" + "/y exch def\n" + "/x exch def\n" + "/i 0 def\n" + "height 2 div /i exch def\n" + "\n" + "0 1 height 2 div {\n" + "	1 i height 2.0 div div sub setgray\n" + "	newpath\n" + "	x  \n" + "	y height 2 div i sub  add\n" + "	width\n" + "	i 2 mul\n" + "	box\n" + "	closepath\n" + "	fill\n" + "	i 1 sub /i exch def\n" + "	}for\n" + "newpath\n" + "0 setgray\n" + "0.4 setlinewidth\n" + "x y width height box\n" + "closepath\n" + "stroke\n" + "end\n" + "} bind def\n");
        run(iter);
        iter.close();
        this.outw.print("\n%%Trailer\n%%EOF\n");
        this.outw.flush();
        this.outw.close();
        this.outw = null;
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(iter);
        CloserUtil.close(this.outw);
    }
}
Also used : GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) BufferedReader(java.io.BufferedReader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFIterator(htsjdk.variant.vcf.VCFIterator) Date(java.util.Date)

Aggregations

GtfReader (com.github.lindenb.jvarkit.util.bio.structure.GtfReader)23 Path (java.nio.file.Path)22 Parameter (com.beust.jcommander.Parameter)20 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)20 Program (com.github.lindenb.jvarkit.util.jcommander.Program)20 Logger (com.github.lindenb.jvarkit.util.log.Logger)20 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)20 List (java.util.List)20 Collectors (java.util.stream.Collectors)18 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)15 Interval (htsjdk.samtools.util.Interval)15 ArrayList (java.util.ArrayList)15 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)14 Transcript (com.github.lindenb.jvarkit.util.bio.structure.Transcript)14 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)14 CloserUtil (htsjdk.samtools.util.CloserUtil)14 IntervalTreeMap (htsjdk.samtools.util.IntervalTreeMap)13 Locatable (htsjdk.samtools.util.Locatable)13 Set (java.util.Set)13 VariantContext (htsjdk.variant.variantcontext.VariantContext)12