Search in sources :

Example 11 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class VcfGnomadPext method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VCFIterator iter, final VariantContextWriter out) {
    final JsonParser jsonParser = new JsonParser();
    final String[] standard_pext_header = new String[] { "chrom", "pos", "ref", "alt", "tx_annotation" };
    final VCFHeader h0 = iter.getHeader();
    final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(h0);
    if (!SequenceDictionaryUtils.isGRCh37(dict)) {
        LOG.error("Input is NOT GRCh37 ");
        return -1;
    }
    final OrderChecker<VariantContext> orderChecked = new OrderChecker<>(dict, false);
    final CharSplitter tab = CharSplitter.TAB;
    try (TabixReader gextFileReader = new TabixReader(this.pextDatabasePath)) {
        final String line1 = gextFileReader.readLine();
        if (StringUtils.isBlank(line1)) {
            LOG.error("Cannot read first line of " + this.pextDatabasePath);
            return -1;
        }
        if (!Arrays.equals(tab.split(line1), standard_pext_header)) {
            LOG.error("Bad header in " + this.pextDatabasePath + " expected " + String.join("(tab)", standard_pext_header) + " but got " + line1.replace("\t", "(tab)"));
            return -1;
        }
        final VCFHeader h2 = new VCFHeader(h0);
        final VCFInfoHeaderLine pexInfo = new VCFInfoHeaderLine("GNOMAD_PEXT", VCFHeaderLineCount.A, VCFHeaderLineType.String, "Gnomad pext Data from " + this.pextDatabasePath);
        h2.addMetaDataLine(pexInfo);
        JVarkitVersion.getInstance().addMetaData(this, h2);
        out.writeHeader(h2);
        while (iter.hasNext()) {
            final VariantContext ctx = orderChecked.apply(iter.next());
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.rmAttribute(pexInfo.getID());
            if (!ctx.isVariant() || (this.skipFiltered && ctx.isFiltered())) {
                out.add(vcb.make());
                continue;
            }
            final List<PextEntry> entries = findOverlapping(gextFileReader, ctx);
            if (entries.isEmpty()) {
                out.add(vcb.make());
                continue;
            }
            final List<String> altInfo = new ArrayList<>(ctx.getAlleles().size());
            for (int allele_idx = 1; /* 0 is ref */
            allele_idx < ctx.getAlleles().size(); allele_idx++) {
                final Allele alt = ctx.getAlleles().get(allele_idx);
                final PextEntry entry = entries.stream().filter(E -> E.alt.equals(alt)).findFirst().orElse(null);
                if (entry == null) {
                    altInfo.add(".");
                } else {
                    final JsonElement e = jsonParser.parse(entry.jsonStr);
                    if (!e.isJsonArray())
                        throw new IllegalStateException("not an array: " + entry.jsonStr);
                    final JsonArray array = e.getAsJsonArray();
                    final StringBuilder sb = new StringBuilder();
                    for (int x = 0; x < array.size(); ++x) {
                        if (x > 0)
                            sb.append("&");
                        if (!array.get(x).isJsonObject())
                            throw new IllegalStateException("not an array: " + entry.jsonStr);
                        final StringBuilder sb2 = new StringBuilder();
                        final JsonObject obj = array.get(x).getAsJsonObject();
                        for (final Map.Entry<String, JsonElement> kv : obj.entrySet()) {
                            String key = kv.getKey();
                            // "Brain_FrontalCortex_BA9_": 1.0,
                            if (key.endsWith("_"))
                                key = key.substring(0, key.length() - 1);
                            // as far as I can see, tissues start with a uppercase
                            if (Character.isUpperCase(key.charAt(0)) && !this.restrictTissues.isEmpty() && !this.restrictTissues.contains(key))
                                continue;
                            final JsonElement v = kv.getValue();
                            if (v.isJsonNull())
                                continue;
                            if (v.getAsJsonPrimitive().isString()) {
                                final String strv = v.getAsString();
                                if (strv.equals("NaN"))
                                    continue;
                            }
                            if (sb2.length() > 0)
                                sb2.append("|");
                            sb2.append(key);
                            sb2.append(":");
                            sb2.append(kv.getValue().getAsString());
                        }
                        sb.append(sb2.toString());
                    }
                    if (sb.length() == 0)
                        sb.append(".");
                    altInfo.add(sb.toString());
                }
            }
            // at least none is not '.'
            if (altInfo.stream().anyMatch(S -> !S.equals("."))) {
                vcb.attribute(pexInfo.getID(), altInfo);
            }
            out.add(vcb.make());
        }
        out.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : TabixReader(htsjdk.tribble.readers.TabixReader) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) JsonObject(com.google.gson.JsonObject) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFHeader(htsjdk.variant.vcf.VCFHeader) JsonParser(com.google.gson.JsonParser) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) OrderChecker(com.github.lindenb.jvarkit.dict.OrderChecker) JsonArray(com.google.gson.JsonArray) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) JsonElement(com.google.gson.JsonElement) Map(java.util.Map)

Example 12 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class ScanRetroCopy method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.min_depth < 1) {
        LOG.error("Bad min depth");
        return -1;
    }
    SamReader sr = null;
    VariantContextWriter vcw0 = null;
    CloseableIterator<SAMRecord> iter = null;
    SAMFileWriter sfw = null;
    try {
        /* load the reference genome */
        this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
        /* create a contig name converter from the REF */
        this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
        /* READ KNOWGENES FILES */
        LOG.info("Loading " + this.knownGeneUri);
        try (BufferedReader br = IOUtils.openURIForBufferedReading(this.knownGeneUri)) {
            String line;
            final CharSplitter tab = CharSplitter.TAB;
            while ((line = br.readLine()) != null) {
                if (StringUtils.isBlank(line))
                    continue;
                final String[] tokens = tab.split(line);
                final KnownGene kg = new KnownGene(tokens);
                if (kg.getExonCount() < 2)
                    continue;
                if (this.onlyCodingTranscript && kg.isNonCoding())
                    continue;
                final String ctg = this.refCtgNameConverter.apply(kg.getContig());
                if (StringUtils.isBlank(ctg))
                    continue;
                kg.setChrom(ctg);
                final Interval interval = new Interval(ctg, kg.getTxStart() + 1, kg.getTxEnd(), kg.isNegativeStrand(), kg.getName());
                List<KnownGene> L = this.knownGenesMap.get(interval);
                if (L == null) {
                    L = new ArrayList<KnownGene>();
                    this.knownGenesMap.put(interval, L);
                }
                L.add(kg);
            }
        }
        if (this.knownGenesMap.isEmpty()) {
            LOG.error("no gene found in " + this.knownGeneUri);
            return -1;
        }
        LOG.info("Number of transcripts: " + this.knownGenesMap.values().stream().flatMap(L -> L.stream()).count());
        // open the sam file
        final SamReaderFactory samReaderFactory = super.createSamReaderFactory();
        samReaderFactory.referenceSequence(this.faidx);
        sr = samReaderFactory.open(SamInputResource.of(oneFileOrNull(args)));
        final SAMFileHeader samFileHeader = sr.getFileHeader();
        // check it's sorted on coordinate
        if (!samFileHeader.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
            LOG.error("input is not sorted on coordinate but \"" + samFileHeader.getSortOrder() + "\"");
            return -1;
        }
        if (this.saveBamTo != null) {
            sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(samFileHeader, true, this.saveBamTo);
        }
        if (this.use_bai && !sr.hasIndex()) {
            LOG.warning("Cannot used bai because input is not indexed");
        }
        // if there is a bai, only query the bam in the regions of splicing
        if (this.use_bai && sr.hasIndex()) {
            LOG.info("building intervals...");
            final SAMSequenceDictionary samdict = SequenceDictionaryUtils.extractRequired(samFileHeader);
            final ContigNameConverter samConvert = ContigNameConverter.fromOneDictionary(samdict);
            final List<QueryInterval> intervalsL = this.knownGenesMap.values().stream().flatMap(K -> K.stream()).filter(KG -> samConvert.apply(KG.getContig()) != null).flatMap(KG -> KG.getExons().stream()).flatMap(exon -> {
                // we need the reads overlapping the exon bounds
                final int tid = samdict.getSequenceIndex(samConvert.apply(exon.getGene().getContig()));
                final QueryInterval q1 = new QueryInterval(tid, exon.getStart() + 1, exon.getStart() + 1);
                final QueryInterval q2 = new QueryInterval(tid, exon.getEnd(), exon.getEnd());
                return Arrays.stream(new QueryInterval[] { q1, q2 });
            }).sorted().collect(Collectors.toList());
            final QueryInterval[] intervals = QueryInterval.optimizeIntervals(intervalsL.toArray(new QueryInterval[intervalsL.size()]));
            // GC
            intervalsL.clear();
            LOG.debug("Query bam using " + intervals.length + " random access intervals. Please wait...");
            iter = sr.queryOverlapping(intervals);
        } else {
            iter = sr.iterator();
        }
        final Set<String> samples = this.partiton.getPartitions(samFileHeader);
        if (samples.isEmpty()) {
            LOG.error("No sample was defined in the read group of the input bam.");
            return -1;
        }
        /**
         * build vcf header
         */
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
        metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
        metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
        metaData.add(new VCFInfoHeaderLine(ATT_BEST_MATCHING_LENGTH, 1, VCFHeaderLineType.Integer, "Best Matching length"));
        metaData.add(new VCFFormatHeaderLine(ATT_BEST_MATCHING_LENGTH, 1, VCFHeaderLineType.Integer, "Best Matching length"));
        metaData.add(new VCFFormatHeaderLine(ATT_GT_INTRON, 1, VCFHeaderLineType.String, "Introns info: (intron-0-idx,valid,dp-5,dp-3,max-len-5,max-len-3,avg-5,avg-3)*"));
        // metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
        // metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
        // "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
        metaData.add(new VCFInfoHeaderLine(ATT_INSERTION, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Possible place of insertion:" + "chr:start-end|count-evidence|mate-genes|non-coding|distance"));
        metaData.add(new VCFInfoHeaderLine(ATT_KG_STRAND, 1, VCFHeaderLineType.String, "KnownGene strand."));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
        metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_INFO, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns found: chr|start|end|seq-left|seq-right"));
        metaData.add(new VCFFilterHeaderLine(ATT_FILTER_NONDOCODING, "Only non-coding transcripts"));
        metaData.add(new VCFInfoHeaderLine(ATT_SAMPLES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples found. partition:" + this.partiton.name()));
        metaData.add(new VCFFilterHeaderLine(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold, "Number of read is lower than :" + this.min_depth));
        metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
        final VCFHeader header = new VCFHeader(metaData, samples);
        JVarkitVersion.getInstance().addMetaData(this, header);
        header.setSequenceDictionary(refDict);
        /* open vcf for writing*/
        vcw0 = super.openVariantContextWriter(this.outputFile);
        final VariantContextWriter vcw = vcw0;
        vcw.writeHeader(header);
        /* save gene writer */
        if (this.saveBedPeTo != null) {
            this.saveInsertionsPw = super.openPathOrStdoutAsPrintWriter(this.saveBedPeTo);
        } else {
            this.saveInsertionsPw = new PrintWriter(new NullOuputStream());
        }
        final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(samFileHeader).logger(LOG).build();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.apply(iter.next());
            if (rec.getReadUnmappedFlag())
                continue;
            if (rec.getMappingQuality() < this.min_read_mapq)
                continue;
            if (rec.isSecondaryOrSupplementary())
                continue;
            if (rec.getDuplicateReadFlag())
                continue;
            final byte[] bases = rec.getReadBases();
            if (bases == null || bases == SAMRecord.NULL_SEQUENCE)
                continue;
            final Cigar cigar = rec.getCigar();
            if (cigar == null || cigar.numCigarElements() < 2)
                continue;
            final String refContig = this.refCtgNameConverter.apply(rec.getContig());
            if (StringUtils.isBlank(refContig))
                continue;
            boolean save_read_to_bam = false;
            /* get sample */
            final String sampleName = this.partiton.getPartion(rec, null);
            if (StringUtils.isBlank(sampleName))
                continue;
            /* this is a new reference sequence */
            if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(refContig)) {
                if (this.genomicSequence != null) {
                    /* DUMP things BEFORE changing the reference sequence!!! */
                    /* dump buffer */
                    dump(vcw, null);
                }
                /* map transcript-name to their  transcript */
                /*this.kgId2knownGenes.clear();
					this.knownGenesMap.values().
						stream().
						flatMap(L->L.stream()).
						filter(G->refContig.equals(G.getContig())).
						forEach(K->this.kgId2knownGenes.put(K.getName(), K));*/
                /* now, we can change genomicSequence */
                this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, refContig);
            }
            final CigarElement leftCigar = cigar.getCigarElement(0);
            final CigarElement rightCigar = cigar.getCigarElement(cigar.numCigarElements() - 1);
            /* both ends are not candidate */
            if (!isCandidateCigarElement(leftCigar) && !isCandidateCigarElement(rightCigar)) {
                continue;
            }
            final List<KnownGene> genes = this.knownGenesMap.getOverlapping(new Interval(refContig, rec.getUnclippedStart(), rec.getUnclippedEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
            /* time to time dump the buffer for the transcripts before the current ones*/
            if (!genes.isEmpty()) {
                // get all genes overlapping the current set of genes
                int minTxStart = genes.stream().mapToInt(K -> K.getTxStart()).min().getAsInt();
                final int maxTxStart = genes.stream().mapToInt(K -> K.getTxEnd()).max().getAsInt();
                // update minTxtStart to get the lowest gene overlapping the set of genes
                minTxStart = this.knownGenesMap.getOverlapping(new Interval(refContig, minTxStart, maxTxStart)).stream().flatMap(L -> L.stream()).mapToInt(K -> K.getStart()).min().getAsInt();
                // not max, because we only need the 5' side
                dump(vcw, new Interval(refContig, minTxStart, minTxStart));
            }
            /* test each side of the clipped read */
            for (int side = 0; side < 2 && !genes.isEmpty(); ++side) {
                final CigarElement ce_side = (side == 0 ? leftCigar : rightCigar);
                if (!isCandidateCigarElement(ce_side))
                    continue;
                for (final KnownGene knownGene : genes) {
                    for (int exonIndex = 0; exonIndex < knownGene.getExonCount(); exonIndex++) {
                        if (side == 0) /* looking at cigar string in 5' */
                        {
                            if (exonIndex == 0)
                                continue;
                            // first 'M' element
                            final CigarLocatable cigarM = new CigarLocatable(refContig, rec, 1);
                            // first cigar element
                            final CigarLocatable cigarS = new CigarLocatable(refContig, rec, 0);
                            // current exon
                            final ExonOne exonRight = new ExonOne(knownGene.getExon(exonIndex));
                            if (!cigarM.overlaps(exonRight))
                                continue;
                            if (!(exonRight.getStart() >= cigarM.getStart()))
                                continue;
                            // get exon on the left
                            final ExonOne exonLeft = new ExonOne(knownGene.getExon(exonIndex - 1));
                            if (exonLeft.getLengthOnReference() < this.minCigarSize)
                                continue;
                            /* end of cigar 'M' can have same bases than the prev exon. */
                            final int malus = exonRight.getStart() - cigarM.getStart();
                            int genomic1 = exonLeft.getEnd() - malus;
                            if (genomic1 < exonLeft.getStart() || genomic1 > exonLeft.getEnd()) {
                                continue;
                            }
                            int matchLength = (this.use_malus ? malus : 0);
                            int readIdx0 = cigarS.size() - 1;
                            // loop over sequence
                            while (readIdx0 >= 0 && genomic1 >= exonLeft.getStart()) {
                                final char read_base = cigarS.readBaseAt0(readIdx0);
                                final char genome_base = exonLeft.charAt1(genomic1);
                                if (read_base != genome_base) {
                                    break;
                                }
                                readIdx0--;
                                matchLength++;
                                genomic1--;
                            }
                            if (matchLength < this._priv_ignoreCigarSize)
                                continue;
                            final KnownGene.Intron intron = knownGene.getIntron(exonIndex - 1);
                            // find match or create new
                            final Match match = new Match(intron, sampleName, rec, SIDE_3_PRIME, matchLength);
                            this.intronBuffer.add(match);
                            save_read_to_bam = true;
                        } else /* test last cigar */
                        {
                            if (exonIndex + 1 >= knownGene.getExonCount())
                                continue;
                            // last 'M' element
                            final CigarLocatable cigarM = new CigarLocatable(refContig, rec, cigar.numCigarElements() - 2);
                            // last cigar element
                            final CigarLocatable cigarS = new CigarLocatable(refContig, rec, cigar.numCigarElements() - 1);
                            // current exon
                            final ExonOne exonLeft = new ExonOne(knownGene.getExon(exonIndex));
                            if (!cigarM.overlaps(exonLeft))
                                continue;
                            if (!(exonLeft.getEnd() <= cigarM.getEnd()))
                                continue;
                            // get next exon
                            final ExonOne exonRight = new ExonOne(knownGene.getExon(exonIndex + 1));
                            if (exonRight.getLengthOnReference() < this.minCigarSize)
                                continue;
                            /* end of cigar 'M' can have same bases than the next exon. */
                            final int malus = cigarM.getEnd() - exonLeft.getEnd();
                            int genomic1 = exonRight.getStart() + malus;
                            if (genomic1 < exonRight.getStart() || genomic1 > exonRight.getEnd()) {
                                continue;
                            }
                            int matchLength = (this.use_malus ? malus : 0);
                            int readIdx0 = 0;
                            // loop over sequence
                            while (readIdx0 < cigarS.size() && genomic1 <= exonRight.getEnd()) {
                                final char read_base = cigarS.readBaseAt0(readIdx0);
                                final char genome_base = exonRight.charAt1(genomic1);
                                if (read_base != genome_base) {
                                    break;
                                }
                                readIdx0++;
                                matchLength++;
                                genomic1++;
                            }
                            if (matchLength < this._priv_ignoreCigarSize)
                                continue;
                            // find match or create new
                            final KnownGene.Intron intron = knownGene.getIntron(exonIndex);
                            final Match match = new Match(intron, sampleName, rec, SIDE_5_PRIME, matchLength);
                            this.intronBuffer.add(match);
                            save_read_to_bam = true;
                        }
                    }
                }
            }
            // end for side
            if (save_read_to_bam && sfw != null)
                sfw.addAlignment(rec);
        }
        /* dump buffer */
        dump(vcw, null);
        progress.close();
        vcw.close();
        iter.close();
        iter = null;
        sr.close();
        sr = null;
        this.saveInsertionsPw.flush();
        this.saveInsertionsPw.close();
        this.saveInsertionsPw = null;
        if (sfw != null) {
            sfw.close();
            sfw = null;
        }
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sr);
        CloserUtil.close(vcw0);
        CloserUtil.close(sfw);
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(this.saveInsertionsPw);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) AbstractCharSequence(com.github.lindenb.jvarkit.lang.AbstractCharSequence) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) Stream(java.util.stream.Stream) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) IntStream(java.util.stream.IntStream) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) SAMUtils(htsjdk.samtools.SAMUtils) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) HashMap(java.util.HashMap) Function(java.util.function.Function) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) File(java.io.File) SamInputResource(htsjdk.samtools.SamInputResource) QueryInterval(htsjdk.samtools.QueryInterval) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) BufferedReader(java.io.BufferedReader) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) QueryInterval(htsjdk.samtools.QueryInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) PrintWriter(java.io.PrintWriter) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) CigarElement(htsjdk.samtools.CigarElement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval)

Example 13 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class PlateOptimizer method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        if (this.seed != -1) {
            this.random = new Random(this.seed);
        } else {
            this.random = new Random(System.currentTimeMillis());
        }
        final CharSplitter tab = CharSplitter.TAB;
        final String input = oneFileOrNull(args);
        try (BufferedReader br = (input == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(input))) {
            String line = br.readLine();
            if (line == null) {
                LOG.error("cannot read first line");
            }
            this.headers.addAll(Arrays.asList(tab.split(line)));
            for (int i = 0; i < this.headers.size(); ++i) this.column2index.put(this.headers.get(i), i);
            LOG.info("header: " + String.join(";", this.headers));
            while ((line = br.readLine()) != null) {
                if (StringUtils.isBlank(line))
                    continue;
                line = line.replaceAll("<e9>", "e");
                final String[] tokens = tab.split(line);
                if (tokens.length != this.headers.size())
                    throw new JvarkitException.TokenErrors(this.headers.size(), tokens);
                final Content content = new Content(this.all_contents.size(), tokens);
                this.all_contents.add(content);
            }
        }
        if (this.all_contents.isEmpty()) {
            LOG.error("no data defined");
            return -1;
        }
        Runtime.getRuntime().addShutdownHook(new Thread() {

            public void run() {
                ctrlc_catched_count--;
                if (ctrlc_catched_count == 0) {
                    LOG.warn("Shutting down ...");
                    Runtime.getRuntime().removeShutdownHook(this);
                } else {
                    LOG.info("press again " + ctrlc_catched_count + " times.");
                }
            }
        });
        long n_iteration = 0L;
        while (this.ctrlc_catched_count > 0) {
            iteration(++n_iteration);
        }
        ctrlc_catched_count = 0;
        LOG.info("done");
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Random(java.util.Random) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) BufferedReader(java.io.BufferedReader)

Example 14 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class VcfGeneOntology method readGOA.

protected void readGOA() throws IOException {
    if (this.name2go != null)
        return;
    this.name2go = new HashMap<String, Set<GOOntology.Term>>();
    LOG.info("read GOA " + GOA);
    final CharSplitter tab = CharSplitter.TAB;
    BufferedReader in = IOUtils.openURIForBufferedReading(GOA);
    String line;
    while ((line = in.readLine()) != null) {
        if (line.isEmpty() || line.startsWith("!"))
            continue;
        String[] tokens = tab.split(line, 6);
        if (tokens.length < 6)
            continue;
        GOOntology.Term term = this.goTree.getTermByAccession(tokens[4]);
        if (term == null) {
            continue;
        }
        Set<GOOntology.Term> set = this.name2go.get(tokens[2]);
        if (set == null) {
            set = new HashSet<GOOntology.Term>();
            this.name2go.put(tokens[2], set);
        }
        set.add(term);
    }
    in.close();
    LOG.info("GOA size:" + this.name2go.size());
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) BufferedReader(java.io.BufferedReader) GOOntology(com.github.lindenb.jvarkit.go.GOOntology)

Example 15 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class TestSupport method assertTsvTableIsConsitent.

public void assertTsvTableIsConsitent(final Path f, Predicate<String> ignoreLine) {
    final CharSplitter tab = CharSplitter.TAB;
    BufferedReader r = null;
    try {
        Assert.assertTrue(Files.exists(f), "file " + f + " should exist");
        r = IOUtils.openPathForBufferedReading(f);
        int nCols = -1;
        int nRow = 0;
        String line;
        while ((line = r.readLine()) != null) {
            if (ignoreLine != null && ignoreLine.test(line))
                continue;
            nRow++;
            final String[] tokens = tab.split(line);
            final int c = tokens.length;
            if (nRow == 1) {
                nCols = c;
            } else {
                if (nCols != c) {
                    for (int i = 0; i < tokens.length; i++) {
                        System.err.println("$" + (i + 1) + ":" + tokens[i]);
                    }
                }
                Assert.assertEquals(nCols, c, "Line " + line + " expected " + nCols + " tokens but got " + c);
            }
        }
    } catch (final Exception e) {
        e.printStackTrace();
        Assert.fail();
    } finally {
        CloserUtil.close(r);
    }
}
Also used : CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException)

Aggregations

CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter)28 BufferedReader (java.io.BufferedReader)19 IOException (java.io.IOException)12 List (java.util.List)11 Parameter (com.beust.jcommander.Parameter)10 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)10 Program (com.github.lindenb.jvarkit.util.jcommander.Program)10 Logger (com.github.lindenb.jvarkit.util.log.Logger)10 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)10 Set (java.util.Set)10 Path (java.nio.file.Path)9 ArrayList (java.util.ArrayList)9 HashSet (java.util.HashSet)9 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)8 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)7 CloserUtil (htsjdk.samtools.util.CloserUtil)7 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)7 VCFHeader (htsjdk.variant.vcf.VCFHeader)7 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)7 Map (java.util.Map)7