Search in sources :

Example 1 with SetFileRecord

use of com.github.lindenb.jvarkit.setfile.SetFileRecord in project jvarkit by lindenb.

the class SetFileTools method doInterBed.

/**
 * Prints every whole set record having at least one interval overlapping
 * the BED file. Records are printed as-is: there is no trimming.
 *
 * @param args exactly two inputs: the BED file, then the set file. One of
 *             them (not both) may be "-" to be read from stdin.
 * @return 0 on success, -1 when the options or the arguments are invalid
 * @throws IOException if an input cannot be read or the output cannot be written
 */
private int doInterBed(final List<String> args) throws IOException {
    // Invalid options are failures (the method returns -1), so they are
    // reported at error level like the other validation failures below.
    if (this.intersectBedPath != null) {
        LOG.error("intersectBedPath shouldn't be specified");
        return -1;
    }
    if (!this.intersectVcfPath.isEmpty()) {
        LOG.error("intersectVcfPath shouldn't be specified");
        return -1;
    }
    if (args.size() != 2) {
        LOG.error("expected 2 files but got " + args.size() + " " + args);
        return -1;
    }
    if (args.get(0).equals("-") && args.get(1).equals("-")) {
        LOG.error("cannot use both files on stdin");
        return -1;
    }
    // Load the whole BED file in memory before scanning the set file.
    // NOTE(review): a null path presumably makes openBedLineReader read stdin - confirm.
    final IntervalTreeMap<BedLine> peaksTreeMap;
    try (BedLineReader blr = openBedLineReader(args.get(0).equals("-") ? null : Paths.get(args.get(0)))) {
        peaksTreeMap = blr.toIntervalTreeMap();
    }
    // An empty argument list is passed for "-"; presumably openSetFileIterator
    // then falls back to stdin - confirm against oneFileOrNull.
    try (CloseableIterator<SetFileRecord> iter = openSetFileIterator((args.get(1).equals("-") ? Collections.emptyList() : args.subList(1, 2)))) {
        try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
            while (iter.hasNext()) {
                final SetFileRecord rec = iter.next();
                // skip the records having no interval overlapping the BED
                if (rec.stream().noneMatch(B -> peaksTreeMap.containsOverlapping(B)))
                    continue;
                print(pw, rec);
            }
            pw.flush();
        }
    }
    return 0;
}
Also used : SetFileRecord(com.github.lindenb.jvarkit.setfile.SetFileRecord) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) PrintWriter(java.io.PrintWriter)

Example 2 with SetFileRecord

use of com.github.lindenb.jvarkit.setfile.SetFileRecord in project jvarkit by lindenb.

the class SetFileTools method doStats.

/**
 * Prints statistics for a set file: number of records per chromosome, and
 * the average/median of the record size, item size, number of items and
 * distance between items.
 *
 * @param args the input arguments, forwarded to openSetFileIterator
 * @return always 0
 * @throws IOException if the input cannot be read or the output cannot be written
 */
private int doStats(final List<String> args) throws IOException {
    final Counter<String> chrom2count = new Counter<>();
    final DiscreteMedian<Integer> d_size = new DiscreteMedian<>();
    final DiscreteMedian<Integer> d_nitems = new DiscreteMedian<>();
    final DiscreteMedian<Integer> d_distance = new DiscreteMedian<>();
    final DiscreteMedian<Integer> d_item_size = new DiscreteMedian<>();
    // register every contig of the dictionary, plus the two special
    // categories, so that they are all reported in the output
    for (final SAMSequenceRecord ssr : this.theDict.getSequences()) {
        chrom2count.initializeIfNotExists(noChr(ssr.getSequenceName()));
    }
    chrom2count.initializeIfNotExists("*multiple*");
    chrom2count.initializeIfNotExists("*empty*");
    try (CloseableIterator<SetFileRecord> iter = openSetFileIterator(args)) {
        while (iter.hasNext()) {
            final SetFileRecord rec = iter.next();
            final Set<String> chroms = rec.getChromosomes();
            switch(chroms.size()) {
                case 0:
                    chrom2count.incr("*empty*");
                    break;
                case 1:
                    chrom2count.incr(noChr(chroms.iterator().next()));
                    break;
                default:
                    chrom2count.incr("*multiple*");
                    break;
            }
            // total length of the record = sum of the lengths of its items
            int len = rec.stream().mapToInt(B -> B.getLengthOnReference()).sum();
            d_size.add(len);
            d_nitems.add(rec.size());
            if (rec.size() > 0) {
                // mean item length for this record (integer division)
                len = len / rec.size();
                d_item_size.add(len);
            }
            // the distance between items is only computed for records
            // located on one chromosome
            if (rec.size() > 1 && chroms.size() == 1) {
                // NOTE(review): sortAndMerge presumably returns the intervals
                // sorted with the overlapping ones merged - confirm.
                final List<Locatable> merged = sortAndMerge(rec);
                // merging may leave one interval only: there is no gap to
                // measure, and the division below would be a division by zero
                if (merged.size() > 1) {
                    int d = 0;
                    // gaps are measured between consecutive items of the
                    // merged list, not of the original record
                    for (int i = 0; i + 1 < merged.size(); i++) {
                        d += (merged.get(i + 1).getStart() - merged.get(i).getEnd());
                    }
                    d = d / (merged.size() - 1);
                    d_distance.add(d);
                }
            }
        }
    }
    try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
        for (final String key : chrom2count.keySetDecreasing()) {
            pw.println("C\trecords-per-chrom\t" + key + "\t" + chrom2count.count(key));
        }
        // "." is printed when there is no value for a statistic
        pw.println("AS\taverage-size\t" + (d_size.isEmpty() ? "." : String.valueOf(d_size.getAverage().orElse(0.0))));
        pw.println("MS\tmedian-size\t" + (d_size.isEmpty() ? "." : String.valueOf(d_size.getMedian().orElse(0.0))));
        pw.println("AIS\taverage-item-size\t" + (d_item_size.isEmpty() ? "." : String.valueOf(d_item_size.getAverage().orElse(0.0))));
        pw.println("MIS\tmedian-item-size\t" + (d_item_size.isEmpty() ? "." : String.valueOf(d_item_size.getMedian().orElse(0.0))));
        pw.println("AN\taverage-nitems\t" + (d_nitems.isEmpty() ? "." : String.valueOf(d_nitems.getAverage().orElse(0.0))));
        pw.println("MN\tmedian-nitems\t" + (d_nitems.isEmpty() ? "." : String.valueOf(d_nitems.getMedian().orElse(0.0))));
        pw.println("AD\taverage-distance-between-items\t" + (d_distance.isEmpty() ? "." : String.valueOf(d_distance.getAverage().orElse(0.0))));
        pw.println("MD\tmedian-distance-between-items\t" + (d_distance.isEmpty() ? "." : String.valueOf(d_distance.getMedian().orElse(0.0))));
        pw.flush();
    }
    return 0;
}
Also used : Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFHeader(htsjdk.variant.vcf.VCFHeader) UnaryOperator(java.util.function.UnaryOperator) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) StringUtil(htsjdk.samtools.util.StringUtil) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) SetFileRecord(com.github.lindenb.jvarkit.setfile.SetFileRecord) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) AbstractCloseableIterator(com.github.lindenb.jvarkit.iterator.AbstractCloseableIterator) Collectors(java.util.stream.Collectors) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContext(htsjdk.variant.variantcontext.VariantContext) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SetFileReaderFactory(com.github.lindenb.jvarkit.setfile.SetFileReaderFactory) IntervalExtender(com.github.lindenb.jvarkit.samtools.util.IntervalExtender) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BufferedVCFReader(com.github.lindenb.jvarkit.variant.vcf.BufferedVCFReader) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) 
IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) LinkedList(java.util.LinkedList) Counter(com.github.lindenb.jvarkit.util.Counter) Locatable(htsjdk.samtools.util.Locatable) EqualIterator(com.github.lindenb.jvarkit.iterator.EqualIterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) Paths(java.nio.file.Paths) BufferedReader(java.io.BufferedReader) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) Collections(java.util.Collections) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Counter(com.github.lindenb.jvarkit.util.Counter) SetFileRecord(com.github.lindenb.jvarkit.setfile.SetFileRecord) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) Locatable(htsjdk.samtools.util.Locatable) PrintWriter(java.io.PrintWriter)

Example 3 with SetFileRecord

use of com.github.lindenb.jvarkit.setfile.SetFileRecord in project jvarkit by lindenb.

the class SetFileTools method toBed.

/**
 * Writes each interval of each set record as one tab-delimited line:
 * contig (passed through noChr), start minus one, end, and the name of
 * the record owning the interval.
 *
 * @param args the input arguments, forwarded to openSetFileIterator
 * @return always 0
 * @throws IOException if the input cannot be read or the output cannot be written
 */
private int toBed(final List<String> args) throws IOException {
    try (CloseableIterator<SetFileRecord> records = openSetFileIterator(args);
            PrintWriter out = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
        while (records.hasNext()) {
            final SetFileRecord current = records.next();
            for (final Locatable interval : current) {
                // presumably converts the 1-based start to the 0-based BED start
                final int bedStart = interval.getStart() - 1;
                out.println(noChr(interval.getContig()) + "\t" + bedStart + "\t" + interval.getEnd() + "\t" + current.getName());
            }
        }
        out.flush();
    }
    return 0;
}
Also used : SetFileRecord(com.github.lindenb.jvarkit.setfile.SetFileRecord) PrintWriter(java.io.PrintWriter) Locatable(htsjdk.samtools.util.Locatable)

Example 4 with SetFileRecord

use of com.github.lindenb.jvarkit.setfile.SetFileRecord in project jvarkit by lindenb.

the class SetFileTools method makeClusters.

/**
 * Distributes the non-empty set records into clusters and writes one file
 * per cluster into the output archive.
 *
 * Exactly one of the two modes must be selected:
 * with --jobs, a fixed number of clusters is created and each record goes
 * to the cluster giving the smallest total length; with --size, a record
 * goes to the first cluster that stays within the length limit, or to a
 * new cluster when there is none.
 *
 * @param args the input arguments, forwarded to openSetFileIterator
 * @return 0 on success, -1 when neither or both of --jobs/--size are set
 * @throws IOException if the input cannot be read or the archive cannot be written
 */
private int makeClusters(final List<String> args) throws IOException {
    if (this.number_of_jobs < 1 && this.long_length_per_bin < 1L) {
        LOG.error("at least --jobs or --size must be specified.");
        return -1;
    }
    if (this.number_of_jobs > 0 && this.long_length_per_bin > 0) {
        LOG.error(" --jobs OR --size must be specified. Not both.");
        return -1;
    }
    final List<Cluster> clusters = new ArrayList<>();
    try (CloseableIterator<SetFileRecord> iter = openSetFileIterator(args)) {
        // the longest records are processed first (decreasing total length)
        final List<SetFileRecord> records = iter.stream().filter(R -> !R.isEmpty()).sorted((A, B) -> Long.compare(B.getLongSumOfLengthOnReference(), A.getLongSumOfLengthOnReference())).collect(Collectors.toCollection(LinkedList::new));
        while (!records.isEmpty()) {
            final SetFileRecord first = records.remove(0);
            if (number_of_jobs > 0) {
                if (clusters.size() < this.number_of_jobs) {
                    // seed a new cluster until the requested number is reached
                    final Cluster c = new Cluster();
                    c.add(first);
                    // the cluster must be registered, otherwise the list stays
                    // empty and nothing is ever balanced nor written
                    clusters.add(c);
                } else {
                    // find the cluster with the smallest getSumLength(first).
                    // NOTE(review): getSumLength(first) presumably is the
                    // cluster length as if 'first' was added - confirm.
                    int best_idx = -1;
                    double best_length = -1;
                    for (int y = 0; y < clusters.size(); ++y) {
                        final double total_length = clusters.get(y).getSumLength(first);
                        if (best_idx == -1 || total_length < best_length) {
                            best_idx = y;
                            best_length = total_length;
                        }
                    }
                    clusters.get(best_idx).add(first);
                }
            } else {
                // first cluster that can take the record without exceeding the limit
                int y = 0;
                while (y < clusters.size()) {
                    final Cluster cluster = clusters.get(y);
                    if (cluster.getSumLength(first) <= this.long_length_per_bin) {
                        cluster.add(first);
                        break;
                    }
                    y++;
                }
                // no cluster found: create a new one for this record
                if (y == clusters.size()) {
                    final Cluster cluster = new Cluster();
                    cluster.add(first);
                    clusters.add(cluster);
                }
            }
        }
    // end while !records.isEmpty
    }
    // end open
    int clusterid = 0;
    try (final ArchiveFactory archive = ArchiveFactory.open(this.outputFile)) {
        for (final Cluster cluster : clusters) {
            // order the records by their first interval, then by name;
            // get(0) is safe because empty records were filtered out above
            Collections.sort(cluster.records, (A, B) -> {
                final Locatable s1 = A.get(0);
                final Locatable s2 = B.get(0);
                final int i = this.theSorter.compare(s1, s2);
                if (i != 0)
                    return i;
                return A.getName().compareTo(B.getName());
            });
            final String filename = String.format("cluster.%05d" + SetFileRecord.FILE_EXTENSION, clusterid);
            try (PrintWriter pw = archive.openWriter(filename)) {
                for (final SetFileRecord rec : cluster.records) {
                    print(pw, rec);
                }
                pw.flush();
            }
            LOG.info(filename + " " + cluster.getSumLength(null) + "bp");
            ++clusterid;
        }
    }
    return 0;
}
Also used : Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFHeader(htsjdk.variant.vcf.VCFHeader) UnaryOperator(java.util.function.UnaryOperator) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) StringUtil(htsjdk.samtools.util.StringUtil) DiscreteMedian(com.github.lindenb.jvarkit.math.DiscreteMedian) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) SetFileRecord(com.github.lindenb.jvarkit.setfile.SetFileRecord) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) AbstractCloseableIterator(com.github.lindenb.jvarkit.iterator.AbstractCloseableIterator) Collectors(java.util.stream.Collectors) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContext(htsjdk.variant.variantcontext.VariantContext) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SetFileReaderFactory(com.github.lindenb.jvarkit.setfile.SetFileReaderFactory) IntervalExtender(com.github.lindenb.jvarkit.samtools.util.IntervalExtender) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BufferedVCFReader(com.github.lindenb.jvarkit.variant.vcf.BufferedVCFReader) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) 
IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) LinkedList(java.util.LinkedList) Counter(com.github.lindenb.jvarkit.util.Counter) Locatable(htsjdk.samtools.util.Locatable) EqualIterator(com.github.lindenb.jvarkit.iterator.EqualIterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) Paths(java.nio.file.Paths) BufferedReader(java.io.BufferedReader) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) Collections(java.util.Collections) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) ArrayList(java.util.ArrayList) SetFileRecord(com.github.lindenb.jvarkit.setfile.SetFileRecord) Locatable(htsjdk.samtools.util.Locatable) PrintWriter(java.io.PrintWriter)

Example 5 with SetFileRecord

use of com.github.lindenb.jvarkit.setfile.SetFileRecord in project jvarkit by lindenb.

the class SetFileTools method openSetFileIterator.

/**
 * Opens the set file and wraps the iterator with the decorators selected
 * by the options, in this order: interval extension, BED intersection,
 * VCF intersection, unique names.
 *
 * @param args the input arguments; stdin is read when oneFileOrNull
 *             returns null for them, otherwise the returned URI is opened
 * @return the decorated iterator; the caller must close it
 * @throws IOException if the input cannot be opened
 */
private CloseableIterator<SetFileRecord> openSetFileIterator(final List<String> args) throws IOException {
    CloseableIterator<SetFileRecord> iter = null;
    final String input = oneFileOrNull(args);
    final SetFileReaderFactory srf = new SetFileReaderFactory(this.theDict);
    if (input == null) {
        iter = srf.open(IOUtils.openStdinForBufferedReader());
    } else {
        iter = srf.open(IOUtils.openURIForBufferedReading(input));
    }
    try {
        if (!StringUtils.isBlank(this.extendStr)) {
            final IntervalExtender xtExtender = IntervalExtender.of(this.theDict, this.extendStr);
            // no need to wrap when the extender does not change the intervals
            if (xtExtender.isChanging()) {
                iter = new ExtenderIterator(iter, xtExtender);
            }
        }
        if (intersectBedPath != null) {
            iter = new IntersectBedIterator(iter, this.intersectBedPath);
        }
        if (!intersectVcfPath.isEmpty()) {
            iter = new IntersectVcfIterator(iter, IOUtils.unrollPaths(this.intersectVcfPath));
        }
        if (!disable_uniq) {
            iter = new UniqNameIterator(iter);
        }
        return iter;
    } catch (final Throwable err) {
        // a decorator failed: the caller never receives the iterator, so
        // it must be closed here to avoid leaking the underlying reader
        try {
            iter.close();
        } catch (final Throwable closeErr) {
            err.addSuppressed(closeErr);
        }
        throw err;
    }
}
Also used : SetFileRecord(com.github.lindenb.jvarkit.setfile.SetFileRecord) IntervalExtender(com.github.lindenb.jvarkit.samtools.util.IntervalExtender) SetFileReaderFactory(com.github.lindenb.jvarkit.setfile.SetFileReaderFactory)

Aggregations

SetFileRecord (com.github.lindenb.jvarkit.setfile.SetFileRecord)7 PrintWriter (java.io.PrintWriter)6 Locatable (htsjdk.samtools.util.Locatable)4 BedLineReader (com.github.lindenb.jvarkit.bed.BedLineReader)3 IntervalExtender (com.github.lindenb.jvarkit.samtools.util.IntervalExtender)3 SetFileReaderFactory (com.github.lindenb.jvarkit.setfile.SetFileReaderFactory)3 BedLine (com.github.lindenb.jvarkit.util.bio.bed.BedLine)3 Parameter (com.beust.jcommander.Parameter)2 ArchiveFactory (com.github.lindenb.jvarkit.io.ArchiveFactory)2 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)2 AbstractCloseableIterator (com.github.lindenb.jvarkit.iterator.AbstractCloseableIterator)2 EqualIterator (com.github.lindenb.jvarkit.iterator.EqualIterator)2 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)2 DiscreteMedian (com.github.lindenb.jvarkit.math.DiscreteMedian)2 SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)2 Counter (com.github.lindenb.jvarkit.util.Counter)2 DistanceParser (com.github.lindenb.jvarkit.util.bio.DistanceParser)2 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)2 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)2 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)2