Search in sources :

Example 1 with BedLineCodec

use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.

the class BamStats01 method run.

private void run(final String filename, SamReader samFileReader) throws IOException {
    Map<String, Histogram> sample2hist = new HashMap<String, BamStats01.Histogram>();
    SAMSequenceDictionary currDict = samFileReader.getFileHeader().getSequenceDictionary();
    if (samSequenceDictionary == null) {
        samSequenceDictionary = currDict;
    }
    if (this.bedFile != null) {
        if (!SequenceUtil.areSequenceDictionariesEqual(currDict, samSequenceDictionary)) {
            samFileReader.close();
            throw new IOException("incompatible sequence dictionaries." + filename);
        }
        if (intervalTreeMap == null) {
            final BedLineCodec bedCodec = new BedLineCodec();
            intervalTreeMap = new IntervalTreeMap<>();
            LOG.info("opening " + this.bedFile);
            String line;
            final BufferedReader bedIn = IOUtils.openFileForBufferedReading(bedFile);
            while ((line = bedIn.readLine()) != null) {
                final BedLine bedLine = bedCodec.decode(line);
                if (bedLine == null)
                    continue;
                int seqIndex = currDict.getSequenceIndex(bedLine.getContig());
                if (seqIndex == -1) {
                    throw new IOException("unknown chromosome from dict in  in " + line + " " + this.bedFile);
                }
                intervalTreeMap.put(bedLine.toInterval(), Boolean.TRUE);
            }
            bedIn.close();
            LOG.info("done reading " + this.bedFile);
        }
    }
    this.chrX_index = -1;
    this.chrY_index = -1;
    for (final SAMSequenceRecord rec : currDict.getSequences()) {
        final String chromName = rec.getSequenceName().toLowerCase();
        if (chromName.equals("x") || chromName.equals("chrx")) {
            this.chrX_index = rec.getSequenceIndex();
        } else if (chromName.equals("y") || chromName.equals("chry")) {
            this.chrY_index = rec.getSequenceIndex();
        }
    }
    final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(currDict);
    final SAMRecordIterator iter = samFileReader.iterator();
    while (iter.hasNext()) {
        final SAMRecord rec = progess.watch(iter.next());
        String sampleName = groupBy.getPartion(rec);
        if (sampleName == null || sampleName.isEmpty())
            sampleName = "undefined";
        Histogram hist = sample2hist.get(sampleName);
        if (hist == null) {
            hist = new Histogram();
            sample2hist.put(sampleName, hist);
        }
        hist.histograms[Category2.ALL.ordinal()].watch(rec);
        if (intervalTreeMap == null)
            continue;
        if (rec.getReadUnmappedFlag()) {
            continue;
        }
        if (!intervalTreeMap.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd()))) {
            hist.histograms[Category2.OFF_TARGET.ordinal()].watch(rec);
        } else {
            hist.histograms[Category2.IN_TARGET.ordinal()].watch(rec);
        }
    }
    progess.finish();
    samFileReader.close();
    samFileReader = null;
    for (final String sampleName : sample2hist.keySet()) {
        final Histogram hist = sample2hist.get(sampleName);
        out.print(filename + "\t" + sampleName);
        for (final Category2 cat2 : Category2.values()) {
            for (// je je suis libertineuuh, je suis une cat1
            final Category cat1 : // je je suis libertineuuh, je suis une cat1
            Category.values()) {
                out.print("\t");
                out.print(hist.histograms[cat2.ordinal()].counts[cat1.ordinal()]);
            }
            if (intervalTreeMap == null)
                break;
        }
        out.println();
    }
}
Also used : SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IOException(java.io.IOException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) Interval(htsjdk.samtools.util.Interval)

Example 2 with BedLineCodec

use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.

the class KnimeVariantHelper method parseBedAsBooleanIntervalTreeMap.

public IntervalTreeMap<Boolean> parseBedAsBooleanIntervalTreeMap(final String bedUri) throws IOException {
    if (bedUri == null || bedUri.isEmpty())
        throw new IllegalArgumentException("bad bed uri");
    BufferedReader r = null;
    final IntervalTreeMap<Boolean> treeMap = new IntervalTreeMap<>();
    try {
        r = IOUtils.openURIForBufferedReading(bedUri);
        final BedLineCodec codec = new BedLineCodec();
        r.lines().forEach(L -> {
            final BedLine bedline = codec.decode(L);
            if (bedline == null) {
                LOG.warn("Ignoring line in BED (doesn't look like a BED line):" + L);
                return;
            }
            treeMap.put(bedline.toInterval(), Boolean.TRUE);
        });
        return treeMap;
    } finally {
        CloserUtil.close(r);
    }
}
Also used : BufferedReader(java.io.BufferedReader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine)

Example 3 with BedLineCodec

use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.

the class Launcher method readBedFileAsBooleanIntervalTreeMap.

/**
 * reads a Bed file and convert it to a IntervalTreeMap<Boolean>
 */
protected IntervalTreeMap<Boolean> readBedFileAsBooleanIntervalTreeMap(final java.io.File file) throws java.io.IOException {
    java.io.BufferedReader r = null;
    try {
        final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<Boolean>();
        r = com.github.lindenb.jvarkit.io.IOUtils.openFileForBufferedReading(file);
        final BedLineCodec bedCodec = new BedLineCodec();
        r.lines().filter(line -> !(line.startsWith("#") || com.github.lindenb.jvarkit.util.bio.bed.BedLine.isBedHeader(line) || line.isEmpty())).map(line -> bedCodec.decode(line)).filter(B -> B != null).map(B -> B.toInterval()).filter(L -> L.getStart() < L.getEnd()).forEach(L -> {
            intervals.put(L, true);
        });
        return intervals;
    } finally {
        htsjdk.samtools.util.CloserUtil.close(r);
    }
}
Also used : Manifest(java.util.jar.Manifest) Transformer(javax.xml.transform.Transformer) Arrays(java.util.Arrays) ParameterException(com.beust.jcommander.ParameterException) Enumeration(java.util.Enumeration) URL(java.net.URL) VCFHeader(htsjdk.variant.vcf.VCFHeader) StreamResult(javax.xml.transform.stream.StreamResult) Random(java.util.Random) VariantContextWriterBuilder(htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder) SAMFileHeader(htsjdk.samtools.SAMFileHeader) IntegerConverter(com.beust.jcommander.converters.IntegerConverter) StringUtil(htsjdk.samtools.util.StringUtil) Locale(java.util.Locale) Document(org.w3c.dom.Document) Map(java.util.Map) IStringConverter(com.beust.jcommander.IStringConverter) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) ParameterDescription(com.beust.jcommander.ParameterDescription) IncludeSourceInJar(com.github.lindenb.jvarkit.annotproc.IncludeSourceInJar) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) SamFilterParser(com.github.lindenb.jvarkit.util.bio.samfilter.SamFilterParser) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Deflater(java.util.zip.Deflater) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) FileNotFoundException(java.io.FileNotFoundException) Dimension(java.awt.Dimension) List(java.util.List) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) Pattern(java.util.regex.Pattern) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) IValueValidator(com.beust.jcommander.IValueValidator) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DOMSource(javax.xml.transform.dom.DOMSource) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) HashMap(java.util.HashMap) Term(com.github.lindenb.semontology.Term) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) PrintStream(java.io.PrintStream) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) StringWriter(java.io.StringWriter) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) JCommander(com.beust.jcommander.JCommander) IOException(java.io.IOException) OutputKeys(javax.xml.transform.OutputKeys) InputStreamReader(java.io.InputStreamReader) File(java.io.File) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) URLEncoder(java.net.URLEncoder) Element(org.w3c.dom.Element) BufferedReader(java.io.BufferedReader) TransformerFactory(javax.xml.transform.TransformerFactory) IStringConverterFactory(com.beust.jcommander.IStringConverterFactory) Collections(java.util.Collections) InputStream(java.io.InputStream) BufferedReader(java.io.BufferedReader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec)

Example 4 with BedLineCodec

use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.

the class VcfGeneEpistasis method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.geneBed == null) {
        LOG.error("gene file bed undefined");
        return -1;
    }
    if (this.outputFile == null) {
        LOG.error("output file undefined");
        return -1;
    }
    CloseableIterator<VariantContext> iter = null;
    try {
        final File vcfFile = new File(oneAndOnlyOneFile(args));
        this.vcfFileReader = new VCFFileReader(vcfFile, true);
        final VCFHeader header = this.vcfFileReader.getFileHeader();
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            pedigree = new Pedigree.Parser().parse(header);
        }
        if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
            LOG.error("empty ped or no case/ctrl");
            return -1;
        }
        pedigree.verifyPersonsHaveUniqueNames();
        for (final Pedigree.Person p : pedigree.getPersons().stream().filter(P -> P.isAffected() || P.isUnaffected()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toSet())) {
            this.id2samples.put(p.getId(), p);
        }
        this.vcfTools = new VcfTools(header);
        List<Interval> geneList;
        if (!this.geneBed.exists()) {
            final Map<String, Interval> gene2interval = new HashMap<>(50000);
            LOG.info("building gene file" + this.geneBed);
            iter = this.vcfFileReader.iterator();
            // iter = this.vcfFileReader.query("chr3",1,300_000_000);
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
            while (iter.hasNext()) {
                final VariantContext ctx = progress.watch(iter.next());
                if (!accept(ctx))
                    continue;
                for (final String geneName : getGenes(ctx)) {
                    final Interval old = gene2interval.get(geneName);
                    if (old == null) {
                        gene2interval.put(geneName, new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, geneName));
                        LOG.info("adding " + geneName + ". number of genes: " + gene2interval.size());
                    } else if (!old.getContig().equals(ctx.getContig())) {
                        LOG.error("boum :" + geneName + ": on chrom " + ctx.getContig() + " vs " + old);
                        return -1;
                    } else {
                        gene2interval.put(geneName, new Interval(ctx.getContig(), Math.min(ctx.getStart(), old.getStart()), Math.max(ctx.getEnd(), old.getEnd()), false, geneName));
                    }
                }
            }
            iter.close();
            iter = null;
            progress.finish();
            geneList = new ArrayList<>(gene2interval.values());
            PrintWriter pw = new PrintWriter(this.geneBed);
            for (final Interval g : geneList) {
                pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + (g.getEnd()) + "\t" + g.getName());
            }
            pw.flush();
            pw.close();
            pw = null;
        } else {
            BedLineCodec codec = new BedLineCodec();
            BufferedReader r = IOUtil.openFileForBufferedReading(geneBed);
            geneList = r.lines().map(L -> codec.decode(L)).filter(B -> B != null).map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3))).collect(Collectors.toList());
            r.close();
        }
        if (geneList.isEmpty()) {
            LOG.error("gene List is empty");
            return -1;
        }
        final Comparator<VariantContext> ctxSorter = VCFUtils.createTidPosRefComparator(header.getSequenceDictionary());
        final Function<Interval, List<VariantContext>> loadVariants = (R) -> {
            List<VariantContext> L = new ArrayList<>();
            CloseableIterator<VariantContext> r = this.vcfFileReader.query(R.getContig(), R.getStart(), R.getEnd());
            while (r.hasNext()) {
                final VariantContext ctx = r.next();
                if (!accept(ctx))
                    continue;
                if (!getGenes(ctx).contains(R.getName()))
                    continue;
                L.add(ctx);
            }
            r.close();
            return L;
        };
        final SkatExecutor executor = this.skatFactory.build();
        Double bestSkat = null;
        LOG.info("number of genes : " + geneList.size());
        final int list_end_index = (this.user_end_index < 0 ? geneList.size() : Math.min(geneList.size(), this.user_end_index));
        for (int x = this.user_begin_index; x < list_end_index; ++x) {
            final Interval intervalx = geneList.get(x);
            final List<VariantContext> variantsx = loadVariants.apply(intervalx);
            if (variantsx.isEmpty())
                continue;
            for (int y = x; y < geneList.size(); /* pas list_end_index */
            ++y) {
                final Interval intervaly;
                final List<VariantContext> merge;
                if (y == x) {
                    // we-re testing gene 1 only
                    intervaly = intervalx;
                    merge = variantsx;
                } else {
                    intervaly = geneList.get(y);
                    if (intervaly.intersects(intervalx))
                        continue;
                    final List<VariantContext> variantsy = loadVariants.apply(intervaly);
                    if (variantsy.isEmpty())
                        continue;
                    merge = new MergedList<>(variantsx, variantsy);
                }
                LOG.info("testing : [" + x + "]" + intervalx + " [" + y + "]" + intervaly + " N:" + geneList.size() + " best: " + bestSkat);
                final Double skat = eval(executor, merge);
                if (skat == null)
                    continue;
                if (bestSkat == null || skat.compareTo(bestSkat) < 0) {
                    bestSkat = skat;
                    LOG.info("best " + bestSkat + " " + intervalx + " " + intervaly);
                    if (this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz")) {
                        final VCFHeader header2 = new VCFHeader(header);
                        header2.addMetaDataLine(new VCFHeaderLine(VcfGeneEpistasis.class.getName(), intervalx.getName() + " " + intervaly.getName() + " " + bestSkat));
                        final VariantContextWriter w = VCFUtils.createVariantContextWriter(outputFile);
                        w.writeHeader(header2);
                        merge.stream().sorted(ctxSorter).forEach(V -> w.add(V));
                        w.close();
                    } else {
                        final PrintWriter w = super.openFileOrStdoutAsPrintWriter(outputFile);
                        w.println(String.valueOf(bestSkat) + "\t" + intervalx.getName() + "\t" + intervaly.getName());
                        w.flush();
                        w.close();
                    }
                }
            }
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(this.vcfFileReader);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) Parameter(com.beust.jcommander.Parameter) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) AbstractList(java.util.AbstractList) HashMap(java.util.HashMap) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) SkatResult(com.github.lindenb.jvarkit.tools.skat.SkatFactory.SkatResult) StringUtil(htsjdk.samtools.util.StringUtil) SkatExecutor(com.github.lindenb.jvarkit.tools.skat.SkatFactory.SkatExecutor) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) Map(java.util.Map) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SkatFactory(com.github.lindenb.jvarkit.tools.skat.SkatFactory) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VariantContext(htsjdk.variant.variantcontext.VariantContext) AbstractList(java.util.AbstractList) ArrayList(java.util.ArrayList) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) VcfTools(com.github.lindenb.jvarkit.util.vcf.VcfTools) Pedigree(com.github.lindenb.jvarkit.util.Pedigree) BufferedReader(java.io.BufferedReader) SkatExecutor(com.github.lindenb.jvarkit.tools.skat.SkatFactory.SkatExecutor) File(java.io.File) Interval(htsjdk.samtools.util.Interval)

Example 5 with BedLineCodec

use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.

the class Biostar77828 method doWork.

@Override
public int doWork(List<String> args) {
    PrintStream pw = null;
    try {
        LOG.info("load BED");
        BufferedReader in = super.openBufferedReader(oneFileOrNull(args));
        final BedLineCodec codec = new BedLineCodec();
        String line;
        while ((line = in.readLine()) != null) {
            if (line.isEmpty() || line.startsWith("#"))
                continue;
            final BedLine bedLine = codec.decode(line);
            if (bedLine == null)
                continue;
            if (bedLine.getColumnCount() < 3)
                throw new IOException("bad BED input " + bedLine);
            final Segment seg = new Segment(bedLine.getContig(), bedLine.getStart() - 1, bedLine.getEnd());
            if (seg.size() == 0)
                continue;
            this.effective_genome_size += seg.size();
            this.all_segments.add(seg);
        }
        pw = super.openFileOrStdoutAsPrintStream(this.outputFile);
        Solution best = null;
        for (long generation = 0; generation < this.N_ITERATIONS; ++generation) {
            if (generation % 100000 == 0)
                LOG.info("generation:" + generation + "/" + this.N_ITERATIONS + " " + (int) ((generation / (double) this.N_ITERATIONS) * 100.0) + "% " + String.valueOf(best));
            Solution sol = createSolution();
            sol.generation = generation;
            if (best == null || sol.compareTo(best) < 0) {
                best = sol;
            /*
	    			if(LOG.isDebugEnabled())
	    				{
	    				LOG.info("%%generation:"+generation);
	    				best.print(stderr());
	    				}*/
            }
        }
        if (best != null) {
            best.print(pw);
        }
        pw.flush();
        pw.close();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(pw);
    }
}
Also used : PrintStream(java.io.PrintStream) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) IOException(java.io.IOException) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine)

Aggregations

BedLineCodec (com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec)22 BufferedReader (java.io.BufferedReader)17 BedLine (com.github.lindenb.jvarkit.util.bio.bed.BedLine)16 Interval (htsjdk.samtools.util.Interval)12 File (java.io.File)12 ArrayList (java.util.ArrayList)11 List (java.util.List)10 Logger (com.github.lindenb.jvarkit.util.log.Logger)9 CloserUtil (htsjdk.samtools.util.CloserUtil)9 Parameter (com.beust.jcommander.Parameter)8 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)8 Program (com.github.lindenb.jvarkit.util.jcommander.Program)8 SAMFileHeader (htsjdk.samtools.SAMFileHeader)8 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)8 SamReader (htsjdk.samtools.SamReader)8 HashMap (java.util.HashMap)8 Map (java.util.Map)8 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)7 StringUtil (htsjdk.samtools.util.StringUtil)7 VCFHeader (htsjdk.variant.vcf.VCFHeader)7