
Example 1 with CharSplitter

Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class TestSupport, method assertTsvTableIsConsitent.

public void assertTsvTableIsConsitent(final Path f, Predicate<String> ignoreLine) {
    final CharSplitter tab = CharSplitter.TAB;
    BufferedReader r = null;
    try {
        Assert.assertTrue(Files.exists(f), "file " + f + " should exist");
        r = IOUtils.openPathForBufferedReading(f);
        int nCols = -1;
        int nRow = 0;
        String line;
        while ((line = r.readLine()) != null) {
            if (ignoreLine != null && ignoreLine.test(line))
                continue;
            nRow++;
            final String[] tokens = tab.split(line);
            final int c = tokens.length;
            if (nRow == 1) {
                // the first data row defines the expected column count
                nCols = c;
            } else {
                if (nCols != c) {
                    for (int i = 0; i < tokens.length; i++) {
                        System.err.println("$" + (i + 1) + ":" + tokens[i]);
                    }
                }
                Assert.assertEquals(nCols, c, "Line " + line + " expected " + nCols + " tokens but got " + c);
            }
        }
    } catch (final Exception e) {
        e.printStackTrace();
        Assert.fail();
    } finally {
        CloserUtil.close(r);
    }
}
Also used: CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter), BufferedReader (java.io.BufferedReader), IOException (java.io.IOException)
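
For context, here is a minimal standalone sketch of the same column-count check: CharSplitter.TAB splits each line on tab characters and every data row is compared against the width of the first row. The in-memory lines below are hypothetical; the real method reads them from a file.

import com.github.lindenb.jvarkit.lang.CharSplitter;

public class TsvColumnCheckDemo {
    public static void main(final String[] args) {
        // hypothetical TSV content; assertTsvTableIsConsitent reads this from a Path instead
        final String[] lines = { "#CHROM\tPOS\tREF\tALT", "chr1\t100\tA\tT", "chr1\t200\tG\tC" };
        final CharSplitter tab = CharSplitter.TAB;
        int nCols = -1;
        for (final String line : lines) {
            final String[] tokens = tab.split(line);
            if (nCols == -1) {
                // the first row defines the expected column count
                nCols = tokens.length;
            } else if (nCols != tokens.length) {
                throw new IllegalStateException("Line " + line + " expected " + nCols + " tokens but got " + tokens.length);
            }
        }
        System.out.println("All rows have " + nCols + " columns");
    }
}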

Example 2 with CharSplitter

Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class VcfGnomadOld, method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        if (this.gnomadBufferSize < 10) {
            LOG.error("buffer size is too small " + this.gnomadBufferSize);
            return -1;
        }
        if (this.manifestFile == null) {
            LOG.info("Building default manifest file...");
            for (final OmeType ot : OmeType.values()) {
                if (this.useGenomeOnly && !ot.equals(OmeType.genome))
                    continue;
                for (int i = 1; i <= 24; ++i) {
                    final ManifestEntry entry = new ManifestEntry();
                    entry.omeType = ot;
                    switch(i) {
                        case 23:
                            entry.contig = "X";
                            break;
                        case 24:
                            entry.contig = "Y";
                            break;
                        default:
                            entry.contig = String.valueOf(i);
                            break;
                    }
                    switch(gnomadVersion) {
                        case v2_1:
                            if (ot == OmeType.genome) {
                                // no "chrY" file for the genomes in this version
                                if (i == 24)
                                    continue;
                                entry.uri = "https://storage.googleapis.com/gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr" + entry.contig + ".vcf.bgz";
                            } else {
                                entry.uri = "https://storage.googleapis.com/gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr" + entry.contig + ".vcf.bgz";
                            }
                            break;
                        default:
                            {
                                entry.close();
                                LOG.error("Building default manifest is not available for version: " + this.gnomadVersion);
                                return -1;
                            }
                    }
                    this.manifestEntries.add(entry);
                }
            }
            LOG.info("Building default manifest file... Done");
        } else {
            try {
                final CharSplitter tab = CharSplitter.TAB;
                Files.lines(this.manifestFile.toPath()).forEach(L -> {
                    if (L.startsWith("#") || StringUtil.isBlank(L))
                        return;
                    // each manifest line: ome-type <TAB> contig <TAB> uri
                    final String[] tokens = tab.split(L);
                    if (tokens.length < 3)
                        throw new JvarkitException.TokenErrors("Expected 3 words", tokens);
                    final ManifestEntry entry = new ManifestEntry();
                    entry.omeType = OmeType.valueOf(tokens[0]);
                    if (this.useGenomeOnly && !entry.omeType.equals(OmeType.genome)) {
                        entry.close();
                        return;
                    }
                    entry.contig = tokens[1].trim();
                    entry.uri = tokens[2].trim();
                    this.manifestEntries.add(entry);
                });
            } catch (final IOException err) {
                LOG.error(err);
                return -1;
            }
        }
        return doVcfToVcf(args, this.outputFile);
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used: JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException), CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter), RuntimeIOException (htsjdk.samtools.util.RuntimeIOException), IOException (java.io.IOException)
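
As the code above shows, a user-supplied manifest is parsed line by line with CharSplitter.TAB, expecting at least three tab-separated tokens: the ome type, the contig, and the URI. A minimal sketch of that parsing step on a single hypothetical manifest line (the URL below is illustrative only, not a real gnomAD path):

import com.github.lindenb.jvarkit.lang.CharSplitter;

public class ManifestLineDemo {
    public static void main(final String[] args) {
        // hypothetical manifest line: ome-type <TAB> contig <TAB> uri
        final String line = "genome\t1\thttps://example.org/gnomad.genomes.sites.chr1.vcf.bgz";
        if (line.startsWith("#") || line.trim().isEmpty()) {
            return; // comments and blank lines are skipped
        }
        final String[] tokens = CharSplitter.TAB.split(line);
        if (tokens.length < 3) {
            throw new IllegalArgumentException("Expected 3 words but got " + tokens.length + " in " + line);
        }
        final String omeType = tokens[0];
        final String contig = tokens[1].trim();
        final String uri = tokens[2].trim();
        System.out.println(omeType + " " + contig + " " + uri);
    }
}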

Example 3 with CharSplitter

Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class VcfGnomadPext, method findOverlapping.

/**
 * Find matching variants in the tabix file, using a buffer to avoid repeated random accesses.
 */
final List<PextEntry> findOverlapping(final TabixReader tabix, final VariantContext ctx) throws IOException {
    final String normContig = this.ensemblCtgConvert.apply(ctx.getContig());
    if (StringUtil.isBlank(normContig))
        return Collections.emptyList();
    if (!buffer.isEmpty() && !buffer.get(0).contig.equals(normContig)) {
        this.buffer.clear();
    }
    if (this.lastInterval == null || !this.lastInterval.getContig().equals(normContig) || !CoordMath.encloses(lastInterval.getStart(), lastInterval.getEnd(), ctx.getStart(), ctx.getEnd())) {
        final CharSplitter tab = CharSplitter.TAB;
        this.buffer.clear();
        this.lastInterval = new Interval(normContig, Math.max(0, ctx.getStart() - 10), ctx.getEnd() + VcfGnomadPext.this.gnomadBufferSize);
        final TabixReader.Iterator iter = tabix.query(this.lastInterval.getContig(), this.lastInterval.getStart(), this.lastInterval.getEnd());
        for (; ; ) {
            final String line = iter.next();
            if (line == null)
                break;
            final String[] tokens = tab.split(line);
            this.buffer.add(new PextEntry(tokens));
        }
    }
    return this.buffer.stream()
            .filter(V -> V.getStart() == ctx.getStart()
                    && V.getEnd() == ctx.getEnd()
                    && V.getContig().equals(normContig)
                    && V.ref.equals(ctx.getReference())
                    && ctx.getAlleles().contains(V.alt))
            .collect(Collectors.toList());
}
Also used: JsonObject (com.google.gson.JsonObject), Allele (htsjdk.variant.variantcontext.Allele), Arrays (java.util.Arrays), CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter), ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter), Program (com.github.lindenb.jvarkit.util.jcommander.Program), VCFIterator (htsjdk.variant.vcf.VCFIterator), Parameter (com.beust.jcommander.Parameter), VCFHeader (htsjdk.variant.vcf.VCFHeader), JsonParser (com.google.gson.JsonParser), ArrayList (java.util.ArrayList), HashSet (java.util.HashSet), JsonElement (com.google.gson.JsonElement), Interval (htsjdk.samtools.util.Interval), DistanceParser (com.github.lindenb.jvarkit.util.bio.DistanceParser), StringUtil (htsjdk.samtools.util.StringUtil), Map (java.util.Map), TabixReader (htsjdk.tribble.readers.TabixReader), SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils), Locatable (htsjdk.samtools.util.Locatable), VCFHeaderLineType (htsjdk.variant.vcf.VCFHeaderLineType), Logger (com.github.lindenb.jvarkit.util.log.Logger), SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary), Set (java.util.Set), IOException (java.io.IOException), JVarkitVersion (com.github.lindenb.jvarkit.util.JVarkitVersion), Collectors (java.util.stream.Collectors), List (java.util.List), JsonArray (com.google.gson.JsonArray), StringUtils (com.github.lindenb.jvarkit.lang.StringUtils), VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter), OrderChecker (com.github.lindenb.jvarkit.dict.OrderChecker), CoordMath (htsjdk.samtools.util.CoordMath), VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine), VariantContext (htsjdk.variant.variantcontext.VariantContext), OnePassVcfLauncher (com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher), VCFHeaderLineCount (htsjdk.variant.vcf.VCFHeaderLineCount), Collections (java.util.Collections), VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)
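
The method only re-queries the tabix index when the previously fetched window no longer encloses the current variant; otherwise it reuses the buffered, already-split lines. A minimal sketch of that reuse decision with hypothetical coordinates (the real code compares lastInterval against the incoming VariantContext):

import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.Interval;

public class BufferReuseDemo {
    public static void main(final String[] args) {
        // hypothetical window fetched by the previous tabix query
        final Interval lastInterval = new Interval("1", 990, 2000);
        // hypothetical coordinates of the current variant
        final String contig = "1";
        final int start = 1000;
        final int end = 1001;
        // reuse the buffer only when the previous window fully encloses the query
        final boolean reuse = lastInterval.getContig().equals(contig)
                && CoordMath.encloses(lastInterval.getStart(), lastInterval.getEnd(), start, end);
        System.out.println(reuse ? "reuse buffered entries" : "clear buffer and re-query tabix");
    }
}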

Example 4 with CharSplitter

Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class VcfGnomadPext, method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VCFIterator iter, final VariantContextWriter out) {
    final JsonParser jsonParser = new JsonParser();
    final String[] standard_pext_header = new String[] { "chrom", "pos", "ref", "alt", "tx_annotation" };
    final VCFHeader h0 = iter.getHeader();
    final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(h0);
    if (!SequenceDictionaryUtils.isGRCh37(dict)) {
        LOG.error("Input is NOT GRCh37 ");
        return -1;
    }
    final OrderChecker<VariantContext> orderChecked = new OrderChecker<>(dict, false);
    final CharSplitter tab = CharSplitter.TAB;
    try (TabixReader gextFileReader = new TabixReader(this.pextDatabasePath)) {
        final String line1 = gextFileReader.readLine();
        if (StringUtils.isBlank(line1)) {
            LOG.error("Cannot read first line of " + this.pextDatabasePath);
            return -1;
        }
        if (!Arrays.equals(tab.split(line1), standard_pext_header)) {
            LOG.error("Bad header in " + this.pextDatabasePath + " expected " + String.join("(tab)", standard_pext_header) + " but got " + line1.replace("\t", "(tab)"));
            return -1;
        }
        final VCFHeader h2 = new VCFHeader(h0);
        final VCFInfoHeaderLine pexInfo = new VCFInfoHeaderLine("GNOMAD_PEXT", VCFHeaderLineCount.A, VCFHeaderLineType.String, "Gnomad pext Data from " + this.pextDatabasePath);
        h2.addMetaDataLine(pexInfo);
        JVarkitVersion.getInstance().addMetaData(this, h2);
        out.writeHeader(h2);
        while (iter.hasNext()) {
            final VariantContext ctx = orderChecked.apply(iter.next());
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.rmAttribute(pexInfo.getID());
            if (!ctx.isVariant() || (this.skipFiltered && ctx.isFiltered())) {
                out.add(vcb.make());
                continue;
            }
            final List<PextEntry> entries = findOverlapping(gextFileReader, ctx);
            if (entries.isEmpty()) {
                out.add(vcb.make());
                continue;
            }
            final List<String> altInfo = new ArrayList<>(ctx.getAlleles().size());
            // index 0 is the REF allele
            for (int allele_idx = 1; allele_idx < ctx.getAlleles().size(); allele_idx++) {
                final Allele alt = ctx.getAlleles().get(allele_idx);
                final PextEntry entry = entries.stream().filter(E -> E.alt.equals(alt)).findFirst().orElse(null);
                if (entry == null) {
                    altInfo.add(".");
                } else {
                    final JsonElement e = jsonParser.parse(entry.jsonStr);
                    if (!e.isJsonArray())
                        throw new IllegalStateException("not an array: " + entry.jsonStr);
                    final JsonArray array = e.getAsJsonArray();
                    final StringBuilder sb = new StringBuilder();
                    for (int x = 0; x < array.size(); ++x) {
                        if (x > 0)
                            sb.append("&");
                        if (!array.get(x).isJsonObject())
                            throw new IllegalStateException("not a JSON object: " + entry.jsonStr);
                        final StringBuilder sb2 = new StringBuilder();
                        final JsonObject obj = array.get(x).getAsJsonObject();
                        for (final Map.Entry<String, JsonElement> kv : obj.entrySet()) {
                            String key = kv.getKey();
                            // "Brain_FrontalCortex_BA9_": 1.0,
                            if (key.endsWith("_"))
                                key = key.substring(0, key.length() - 1);
                            // as far as I can see, tissues start with a uppercase
                            if (Character.isUpperCase(key.charAt(0)) && !this.restrictTissues.isEmpty() && !this.restrictTissues.contains(key))
                                continue;
                            final JsonElement v = kv.getValue();
                            if (v.isJsonNull())
                                continue;
                            if (v.getAsJsonPrimitive().isString()) {
                                final String strv = v.getAsString();
                                if (strv.equals("NaN"))
                                    continue;
                            }
                            if (sb2.length() > 0)
                                sb2.append("|");
                            sb2.append(key);
                            sb2.append(":");
                            sb2.append(kv.getValue().getAsString());
                        }
                        sb.append(sb2.toString());
                    }
                    if (sb.length() == 0)
                        sb.append(".");
                    altInfo.add(sb.toString());
                }
            }
            // emit the INFO attribute only if at least one per-allele value is not '.'
            if (altInfo.stream().anyMatch(S -> !S.equals("."))) {
                vcb.attribute(pexInfo.getID(), altInfo);
            }
            out.add(vcb.make());
        }
        out.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used: TabixReader (htsjdk.tribble.readers.TabixReader), ArrayList (java.util.ArrayList), VariantContext (htsjdk.variant.variantcontext.VariantContext), JsonObject (com.google.gson.JsonObject), SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary), VCFHeader (htsjdk.variant.vcf.VCFHeader), JsonParser (com.google.gson.JsonParser), CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter), VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine), OrderChecker (com.github.lindenb.jvarkit.dict.OrderChecker), JsonArray (com.google.gson.JsonArray), Allele (htsjdk.variant.variantcontext.Allele), VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder), JsonElement (com.google.gson.JsonElement), Map (java.util.Map)
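
Before streaming any variants, the method validates the pext database header: the first line is split on tabs and must match the five expected column names exactly. A minimal sketch of that check against a hypothetical header line:

import java.util.Arrays;

import com.github.lindenb.jvarkit.lang.CharSplitter;

public class PextHeaderCheckDemo {
    public static void main(final String[] args) {
        final String[] expected = { "chrom", "pos", "ref", "alt", "tx_annotation" };
        // hypothetical first line of a pext database file
        final String line1 = "chrom\tpos\tref\talt\ttx_annotation";
        if (!Arrays.equals(CharSplitter.TAB.split(line1), expected)) {
            throw new IllegalStateException("Bad header: expected " + String.join("(tab)", expected)
                    + " but got " + line1.replace("\t", "(tab)"));
        }
        System.out.println("Header OK");
    }
}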

Example 5 with CharSplitter

Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class KnownGene, method loadUriAsIntervalTreeMap.

/**
 * Load a knownGene file/URI as an IntervalTreeMap. Intervals in the IntervalTreeMap are *1-based* (interval.start = kg.txStart + 1).
 */
public static IntervalTreeMap<List<KnownGene>> loadUriAsIntervalTreeMap(final String uri, final Predicate<KnownGene> filterOrNull) throws IOException {
    final IntervalTreeMap<List<KnownGene>> treeMap = new IntervalTreeMap<>();
    BufferedReader in = null;
    try {
        in = IOUtils.openURIForBufferedReading(uri);
        String line;
        final CharSplitter tab = CharSplitter.TAB;
        while ((line = in.readLine()) != null) {
            if (line.isEmpty())
                continue;
            final String[] tokens = tab.split(line);
            final KnownGene g = new KnownGene(tokens);
            if (filterOrNull != null && !filterOrNull.test(g))
                continue;
            final Interval interval = new Interval(g.getContig(), g.getTxStart() + 1, g.getTxEnd(), g.isNegativeStrand(), g.getName());
            List<KnownGene> L = treeMap.get(interval);
            if (L == null) {
                L = new ArrayList<>(2);
                treeMap.put(interval, L);
            }
            L.add(g);
        }
        in.close();
        in = null;
        return treeMap;
    } finally {
        CloserUtil.close(in);
    }
}
Also used: CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter), BufferedReader (java.io.BufferedReader), ArrayList (java.util.ArrayList), List (java.util.List), IntervalTreeMap (htsjdk.samtools.util.IntervalTreeMap), Interval (htsjdk.samtools.util.Interval)
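
The Javadoc above highlights the coordinate shift: knownGene stores a 0-based txStart, while the IntervalTreeMap keys are 1-based closed intervals, hence txStart + 1. A minimal sketch of the same insert-then-query pattern, with a plain String standing in for a parsed KnownGene and hypothetical coordinates:

import java.util.ArrayList;
import java.util.List;

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalTreeMap;

public class IntervalTreeMapDemo {
    public static void main(final String[] args) {
        final IntervalTreeMap<List<String>> treeMap = new IntervalTreeMap<>();
        // hypothetical transcript span, 0-based half-open as in knownGene: [999, 2000)
        final int txStart = 999;
        final int txEnd = 2000;
        // convert to a 1-based closed interval for the IntervalTreeMap key
        final Interval interval = new Interval("chr1", txStart + 1, txEnd);
        List<String> genes = treeMap.get(interval);
        if (genes == null) {
            genes = new ArrayList<>(2);
            treeMap.put(interval, genes);
        }
        genes.add("geneA");
        // query any 1-based interval that overlaps the transcript
        System.out.println(treeMap.getOverlapping(new Interval("chr1", 1500, 1600)));
    }
}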

Aggregations

CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter): 27 usages
BufferedReader (java.io.BufferedReader): 18 usages
IOException (java.io.IOException): 11 usages
List (java.util.List): 11 usages
Parameter (com.beust.jcommander.Parameter): 9 usages
JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException): 9 usages
Program (com.github.lindenb.jvarkit.util.jcommander.Program): 9 usages
Logger (com.github.lindenb.jvarkit.util.log.Logger): 9 usages
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 9 usages
ArrayList (java.util.ArrayList): 9 usages
Set (java.util.Set): 9 usages
Path (java.nio.file.Path): 8 usages
HashSet (java.util.HashSet): 8 usages
SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils): 7 usages
Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher): 7 usages
CloserUtil (htsjdk.samtools.util.CloserUtil): 7 usages
VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder): 7 usages
VCFHeader (htsjdk.variant.vcf.VCFHeader): 7 usages
VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine): 7 usages
Allele (htsjdk.variant.variantcontext.Allele): 6 usages