Search in sources :

Example 6 with ReferenceGenomeFactory

use of com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory in project jvarkit by lindenb.

the class TestNg01 method testRefGenomeFactoryForDAS.

@Test
public void testRefGenomeFactoryForDAS() throws IOException {
    final ReferenceGenomeFactory rgf = new ReferenceGenomeFactory();
    rgf.setBufferSize(10);
    final ReferenceGenome ref = rgf.openDAS(new URL("http://genome.cse.ucsc.edu/cgi-bin/das/hg19"));
    Assert.assertTrue(ref.size() > 23);
    ReferenceContig contig = ref.getContig("1");
    Assert.assertNotNull(contig);
    Assert.assertEquals(contig.getContig(), "1");
    Assert.assertEquals(contig.length(), 249250621);
    contig = ref.getContig("M");
    Assert.assertNotNull(contig);
    Assert.assertEquals(contig.getContig(), "M");
    Assert.assertEquals(contig.length(), 16571);
    String dna = "gatcacaggtctatcacc";
    for (int i = 0; i < dna.length(); ++i) {
        Assert.assertEquals(dna.charAt(i), contig.charAt(i));
    }
    dna = "cttaaataagacatcacgatg";
    for (int i = 0; i < dna.length(); ++i) {
        Assert.assertEquals(dna.charAt(i), contig.charAt(contig.length() - dna.length() + i));
    }
    ref.close();
}
Also used : ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) ReferenceGenome(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) URL(java.net.URL) Test(org.testng.annotations.Test)

Example 7 with ReferenceGenomeFactory

use of com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory in project jvarkit by lindenb.

the class BackLocate method doWork.

@Override
public int doWork(List<String> args) {
    PrintStream out = null;
    try {
        if (StringUtil.isBlank(this.indexedRefUri)) {
            throw new JvarkitException.CommandLineError("Reference file was not provided");
        }
        this.referenceGenome = new ReferenceGenomeFactory().open(this.indexedRefUri);
        if (StringUtil.isBlank(this.knownGeneURI)) {
            throw new JvarkitException.CommandLineError("Undefined knwonGeneURI");
        }
        if (StringUtil.isBlank(this.kgXRef)) {
            throw new JvarkitException.CommandLineError("Undefined kgXref");
        }
        this.loadKnownGenesFromUri(knownGeneURI);
        this.loadkgXRefFromUri(kgXRef);
        out = this.openFileOrStdoutAsPrintStream(this.outputFile);
        out.print("#User.Gene");
        out.print('\t');
        out.print("AA1");
        out.print('\t');
        out.print("petide.pos.1");
        out.print('\t');
        out.print("AA2");
        out.print('\t');
        out.print("knownGene.name");
        out.print('\t');
        out.print("knownGene.strand");
        out.print('\t');
        out.print("knownGene.AA");
        out.print('\t');
        out.print("index0.in.rna");
        out.print('\t');
        out.print("wild.codon");
        out.print('\t');
        out.print("potential.var.codons");
        out.print('\t');
        out.print("base.in.rna");
        out.print('\t');
        out.print("chromosome");
        out.print('\t');
        out.print("index0.in.genomic");
        out.print('\t');
        out.print("exon");
        if (this.printSequences) {
            out.print('\t');
            out.print("mRNA");
            out.print('\t');
            out.print("protein");
        }
        out.println();
        if (args.isEmpty()) {
            LOG.info("reading from stdin");
            final LineIterator in = IOUtils.openStdinForLineIterator();
            this.run(out, in);
            CloserUtil.close(in);
        } else {
            for (final String filename : args) {
                LOG.info("reading from " + filename);
                final LineIterator in = IOUtils.openURIForLineIterator(filename);
                this.run(out, in);
                CloserUtil.close(in);
            }
        }
        return 0;
    } catch (final Exception e) {
        LOG.severe(e);
        return -1;
    } finally {
        CloserUtil.close(this.referenceGenome);
        this.referenceGenome = null;
        CloserUtil.close(out);
    }
}
Also used : PrintStream(java.io.PrintStream) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) LineIterator(htsjdk.tribble.readers.LineIterator) IOException(java.io.IOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException)

Example 8 with ReferenceGenomeFactory

use of com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory in project jvarkit by lindenb.

the class BamStats04 method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.bedFile == null || !this.bedFile.exists()) {
        LOG.error("undefined option -B (bed file)");
        return -1;
    }
    if (args.isEmpty()) {
        LOG.error("Bam files missing");
        return -1;
    }
    if (this.minCoverages.isEmpty()) {
        this.minCoverages.add(0);
    }
    final String NO_PARTITION = "N/A";
    BufferedReader bedIn = null;
    final List<SamReader> samReaders = new ArrayList<>(args.size());
    PrintWriter pw = null;
    ReferenceGenome referenceGenome = null;
    ReferenceContig referenceContig = null;
    try {
        final BedLineCodec codec = new BedLineCodec();
        final Set<String> all_partitions = new TreeSet<>();
        bedIn = IOUtils.openFileForBufferedReading(this.bedFile);
        SAMSequenceDictionary dict = null;
        for (final String filename : IOUtils.unrollFiles(args)) {
            LOG.info(filename);
            final SamReader samReader = super.openSamReader(filename);
            if (!samReader.hasIndex()) {
                LOG.error(filename + " is not indexed");
                samReader.close();
                return -1;
            }
            final SAMFileHeader samFileheader = samReader.getFileHeader();
            if (samFileheader == null) {
                LOG.error("SAM file is missing a header " + filename);
                return -1;
            }
            final List<SAMReadGroupRecord> readGroups = samFileheader.getReadGroups();
            if (readGroups == null || readGroups.isEmpty()) {
                LOG.warn("No Read group (RG) in the header of " + filename);
                all_partitions.add(NO_PARTITION);
            } else {
                for (final SAMReadGroupRecord rg : readGroups) {
                    all_partitions.add(this.partition.apply(rg, NO_PARTITION));
                }
            }
            final SAMSequenceDictionary d = samFileheader.getSequenceDictionary();
            if (d == null) {
                samReader.close();
                LOG.error(JvarkitException.BamDictionaryMissing.getMessage(filename));
                return -1;
            }
            samReaders.add(samReader);
            if (dict == null) {
                dict = d;
            } else if (SequenceUtil.areSequenceDictionariesEqual(d, dict)) {
                LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(d, dict));
                return -1;
            }
        }
        if (samReaders.isEmpty()) {
            LOG.error("No Bam defined");
            return -1;
        }
        if (!StringUtil.isBlank(this.faidxUri)) {
            referenceGenome = new ReferenceGenomeFactory().open(this.faidxUri);
        }
        pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        pw.print("#chrom\tstart\tend\tlength\t" + this.partition.name() + (referenceGenome == null ? "" : "\tgc_percent"));
        pw.print("\tmincov\tmaxcov");
        for (final int MIN_COVERAGE : this.minCoverages) {
            pw.print("\tmeancov_" + MIN_COVERAGE + "\tmediancov_" + MIN_COVERAGE + "\tnocoveragebp_" + MIN_COVERAGE + "\tpercentcovered_" + MIN_COVERAGE);
        }
        pw.println();
        String line = null;
        while ((line = bedIn.readLine()) != null) {
            if (line.isEmpty() || line.startsWith("#"))
                continue;
            final BedLine bedLine = codec.decode(line);
            if (bedLine == null)
                continue;
            if (dict.getSequence(bedLine.getContig()) == null) {
                LOG.error("Unknown contig in " + line);
                return -1;
            }
            if (bedLine.getStart() > bedLine.getEnd()) {
                LOG.info("ignoring " + bedLine);
                continue;
            }
            if (referenceGenome != null && (referenceContig == null || !referenceContig.hasName(bedLine.getContig()))) {
                referenceContig = referenceGenome.getContig(bedLine.getContig());
            }
            final Map<String, IntervalStat> sample2stats = new HashMap<>(all_partitions.size());
            for (final String rgId : all_partitions) {
                sample2stats.put(rgId, new IntervalStat(bedLine));
            }
            for (final SamReader samReader : samReaders) {
                /**
                 *     start - 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
                 *	   end - 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
                 */
                final SAMRecordIterator r = samReader.queryOverlapping(bedLine.getContig(), bedLine.getStart(), bedLine.getEnd());
                while (r.hasNext()) {
                    final SAMRecord rec = r.next();
                    if (rec.getReadUnmappedFlag())
                        continue;
                    if (this.filter.filterOut(rec))
                        continue;
                    if (!rec.getReferenceName().equals(bedLine.getContig()))
                        continue;
                    final String partition;
                    final SAMReadGroupRecord group = rec.getReadGroup();
                    if (group == null) {
                        partition = NO_PARTITION;
                    } else {
                        final String name = this.partition.apply(group);
                        partition = (StringUtil.isBlank(name) ? NO_PARTITION : name);
                    }
                    IntervalStat stat = sample2stats.get(partition);
                    if (stat == null) {
                        stat = new IntervalStat(bedLine);
                        sample2stats.put(partition, stat);
                    }
                    stat.visit(rec);
                }
                r.close();
            }
            // end of loop over sam Readers
            final OptionalInt gcPercentInt = (referenceContig == null ? OptionalInt.empty() : referenceContig.getGCPercent(bedLine.getStart() - 1, bedLine.getEnd()).getGCPercentAsInteger());
            for (final String partitionName : sample2stats.keySet()) {
                final IntervalStat stat = sample2stats.get(partitionName);
                Arrays.sort(stat.counts);
                pw.print(bedLine.getContig() + "\t" + (bedLine.getStart() - 1) + "\t" + (bedLine.getEnd()) + "\t" + stat.counts.length + "\t" + partitionName);
                if (referenceGenome != null) {
                    pw.print("\t");
                    if (gcPercentInt.isPresent())
                        pw.print(gcPercentInt.getAsInt());
                }
                pw.print("\t" + stat.counts[0] + "\t" + stat.counts[stat.counts.length - 1]);
                for (final int MIN_COVERAGE : this.minCoverages) {
                    /**
                     * map depth to 0 if depth <= MIN_COVERAGE
                     */
                    final IntUnaryOperator depthAdjuster = (D) -> (D <= MIN_COVERAGE ? 0 : D);
                    final int count_no_coverage = (int) Arrays.stream(stat.counts).filter(D -> depthAdjuster.applyAsInt(D) <= 0).count();
                    final double mean = Percentile.average().evaluate(Arrays.stream(stat.counts).map(depthAdjuster));
                    final double median_depth = Percentile.median().evaluate(Arrays.stream(stat.counts).map(depthAdjuster));
                    pw.print("\t" + mean + "\t" + median_depth + "\t" + count_no_coverage + "\t" + (int) (((stat.counts.length - count_no_coverage) / (double) stat.counts.length) * 100.0));
                }
                pw.println();
            }
        }
        pw.flush();
        pw.close();
        pw = null;
        LOG.info("done");
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(referenceGenome);
        CloserUtil.close(pw);
        CloserUtil.close(bedIn);
        CloserUtil.close(samReaders);
    }
}
Also used : Cigar(htsjdk.samtools.Cigar) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IntUnaryOperator(java.util.function.IntUnaryOperator) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) HashMap(java.util.HashMap) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) OptionalInt(java.util.OptionalInt) SAMFileHeader(htsjdk.samtools.SAMFileHeader) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) StringUtil(htsjdk.samtools.util.StringUtil) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) SamReader(htsjdk.samtools.SamReader) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Percentile(com.github.lindenb.jvarkit.math.stats.Percentile) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) List(java.util.List) SamRecordJEXLFilter(com.github.lindenb.jvarkit.util.samtools.SamRecordJEXLFilter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) BufferedReader(java.io.BufferedReader) ReferenceGenome(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) IntUnaryOperator(java.util.function.IntUnaryOperator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) TreeSet(java.util.TreeSet) PrintWriter(java.io.PrintWriter) ReferenceGenome(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) OptionalInt(java.util.OptionalInt) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Aggregations

ReferenceGenomeFactory (com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory)8 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)5 ReferenceContig (com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig)5 ReferenceGenome (com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome)5 SAMRecord (htsjdk.samtools.SAMRecord)4 File (java.io.File)4 ArrayList (java.util.ArrayList)4 Parameter (com.beust.jcommander.Parameter)3 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)3 Program (com.github.lindenb.jvarkit.util.jcommander.Program)3 Logger (com.github.lindenb.jvarkit.util.log.Logger)3 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)3 SAMRecordPartition (com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition)3 SamRecordJEXLFilter (com.github.lindenb.jvarkit.util.samtools.SamRecordJEXLFilter)3 Cigar (htsjdk.samtools.Cigar)3 CigarElement (htsjdk.samtools.CigarElement)3 CigarOperator (htsjdk.samtools.CigarOperator)3 SAMFileHeader (htsjdk.samtools.SAMFileHeader)3 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)3 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)3