Example use of com.github.lindenb.jvarkit.lang.CharSplitter in the jvarkit project by lindenb.
From the class TestSupport, method assertTsvTableIsConsitent:
/**
 * Asserts that the tab-delimited file {@code f} exists and that every retained
 * line has the same number of tab-separated columns as the first retained line.
 * On a column-count mismatch the offending tokens are dumped to stderr before
 * the assertion fails.
 *
 * @param f          the TSV file to check
 * @param ignoreLine optional predicate; lines it accepts are skipped (may be null)
 */
public void assertTsvTableIsConsitent(final Path f, Predicate<String> ignoreLine) {
    final CharSplitter tab = CharSplitter.TAB;
    Assert.assertTrue(Files.exists(f), "file " + f + " should exist");
    // try-with-resources guarantees the reader is closed on every path
    try (BufferedReader r = IOUtils.openPathForBufferedReading(f)) {
        int nCols = -1; // column count of the first retained line; -1 = not yet seen
        int nRow = 0;
        String line;
        while ((line = r.readLine()) != null) {
            if (ignoreLine != null && ignoreLine.test(line))
                continue;
            nRow++;
            final String[] tokens = tab.split(line);
            final int c = tokens.length;
            if (nRow == 1) {
                nCols = c;
            } else {
                if (nCols != c) {
                    // dump the tokens of the offending line to help debugging
                    for (int i = 0; i < tokens.length; i++) {
                        System.err.println("$" + (i + 1) + ":" + tokens[i]);
                    }
                }
                Assert.assertEquals(nCols, c, "Line " + line + " expected " + nCols + " tokens but got " + c);
            }
        }
    } catch (final Exception e) {
        e.printStackTrace();
        // carry the exception message into the failure instead of failing silently
        Assert.fail(String.valueOf(e.getMessage()));
    }
}
Example use of com.github.lindenb.jvarkit.lang.CharSplitter in the jvarkit project by lindenb.
From the class VcfGnomadOld, method doWork:
/**
 * Entry point: validates the buffer size, builds the default manifest of
 * gnomad VCF URIs (or loads it from {@code manifestFile}), then runs the
 * VCF annotation.
 *
 * @param args input arguments forwarded to doVcfToVcf
 * @return 0 on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
    try {
        if (this.gnomadBufferSize < 10) {
            LOG.error("buffer size is too small " + this.gnomadBufferSize);
            return -1;
        }
        if (this.manifestFile == null) {
            // no manifest provided: build the default one for the selected gnomad version
            LOG.info("Building default manifest file...");
            for (final OmeType ot : OmeType.values()) {
                if (this.useGenomeOnly && !ot.equals(OmeType.genome))
                    continue;
                // 1..22 = autosomes, 23 = X, 24 = Y
                for (int i = 1; i <= 24; ++i) {
                    final ManifestEntry entry = new ManifestEntry();
                    entry.omeType = ot;
                    switch (i) {
                        case 23:
                            entry.contig = "X";
                            break;
                        case 24:
                            entry.contig = "Y";
                            break;
                        default:
                            entry.contig = String.valueOf(i);
                            break;
                    }
                    switch (gnomadVersion) {
                        case v2_1:
                            if (ot == OmeType.genome) {
                                // no "chrY" for this version for genome
                                if (i == 24)
                                    continue;
                                // FIX: the genome branch must point to the *genomes* release
                                // (the exomes/genomes URIs were swapped)
                                entry.uri = "https://storage.googleapis.com/gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr" + entry.contig + ".vcf.bgz";
                            } else {
                                entry.uri = "https://storage.googleapis.com/gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr" + entry.contig + ".vcf.bgz";
                            }
                            break;
                        default:
                            {
                                entry.close();
                                LOG.error("Building default manifest is not available for version: " + this.gnomadVersion);
                                return -1;
                            }
                    }
                    this.manifestEntries.add(entry);
                }
            }
            LOG.info("Building default manifest file... Done");
        } else {
            // parse the user-supplied manifest: OME-TYPE <tab> CONTIG <tab> URI
            // try-with-resources: Files.lines returns a Stream that must be closed
            try (final java.util.stream.Stream<String> lines = Files.lines(this.manifestFile.toPath())) {
                final CharSplitter tab = CharSplitter.TAB;
                lines.forEach(L -> {
                    if (L.startsWith("#") || StringUtil.isBlank(L))
                        return; // skip comments and blank lines
                    final String[] tokens = tab.split(L);
                    if (tokens.length < 3)
                        throw new JvarkitException.TokenErrors("Expected 3 words", tokens);
                    final ManifestEntry entry = new ManifestEntry();
                    entry.omeType = OmeType.valueOf(tokens[0]);
                    if (this.useGenomeOnly && !entry.omeType.equals(OmeType.genome)) {
                        entry.close();
                        return;
                    }
                    entry.contig = tokens[1].trim();
                    entry.uri = tokens[2].trim();
                    this.manifestEntries.add(entry);
                });
            } catch (final IOException err) {
                LOG.error(err);
                return -1;
            }
        }
        return doVcfToVcf(args, this.outputFile);
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Example use of com.github.lindenb.jvarkit.lang.CharSplitter in the jvarkit project by lindenb.
From the class VcfGnomadPext, method findOverlapping:
/**
 * find matching variant in tabix file, use a buffer to avoid multiple random accesses
 * <p>
 * Lines fetched from the tabix file are cached in {@code this.buffer} for the
 * genomic window {@code this.lastInterval}; the window is queried again only
 * when the contig changes or the current variant is not enclosed by it.
 *
 * @param tabix open reader on the tabix-indexed pext database
 * @param ctx   the variant to look up
 * @return entries matching the variant's contig, start, end, REF allele and one of its ALT alleles
 * @throws IOException if reading the tabix file fails
 */
final List<PextEntry> findOverlapping(final TabixReader tabix, final VariantContext ctx) throws IOException {
    // convert the VCF contig name to the naming used in the pext database
    final String normContig = this.ensemblCtgConvert.apply(ctx.getContig());
    if (StringUtil.isBlank(normContig))
        return Collections.emptyList(); // contig unknown to the database: nothing to find
    // moving to another contig invalidates the whole cache
    if (!buffer.isEmpty() && !buffer.get(0).contig.equals(normContig)) {
        this.buffer.clear();
    }
    // (re)load the cache when the current variant falls outside the buffered window
    if (this.lastInterval == null || !this.lastInterval.getContig().equals(normContig) || !CoordMath.encloses(lastInterval.getStart(), lastInterval.getEnd(), ctx.getStart(), ctx.getEnd())) {
        final CharSplitter tab = CharSplitter.TAB;
        this.buffer.clear();
        // window starts a few bases before the variant and extends gnomadBufferSize bases past its end
        this.lastInterval = new Interval(normContig, Math.max(0, ctx.getStart() - 10), ctx.getEnd() + VcfGnomadPext.this.gnomadBufferSize);
        final TabixReader.Iterator iter = tabix.query(this.lastInterval.getContig(), this.lastInterval.getStart(), this.lastInterval.getEnd());
        for (; ; ) {
            final String line = iter.next();
            if (line == null)
                break; // iterator exhausted
            final String[] tokens = tab.split(line);
            this.buffer.add(new PextEntry(tokens));
        }
    }
    // exact match on position, REF allele, and an ALT carried by the variant
    return this.buffer.stream().filter(V -> V.getStart() == ctx.getStart() && V.getEnd() == ctx.getEnd() && V.getContig().equals(normContig) && V.ref.equals(ctx.getReference()) && ctx.getAlleles().contains(V.alt)).collect(Collectors.toList());
}
Example use of com.github.lindenb.jvarkit.lang.CharSplitter in the jvarkit project by lindenb.
From the class VcfGnomadPext, method doVcfToVcf:
/**
 * Streams the input VCF and annotates each ALT allele with gnomad "pext" data:
 * the matching entry's tx_annotation JSON array is flattened into "key:value"
 * pairs ('|'-separated inside one object, objects joined with '&amp;') and
 * stored in the GNOMAD_PEXT INFO field, one value per ALT allele ('.' when no
 * entry matches).
 *
 * @param inputName name of the VCF input (not used in this implementation)
 * @param iter      the input VCF iterator
 * @param out       the VCF writer
 * @return 0 on success, -1 on error
 */
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator iter, final VariantContextWriter out) {
    final JsonParser jsonParser = new JsonParser();
    // expected tab-separated header line of the pext database
    final String[] standard_pext_header = new String[] { "chrom", "pos", "ref", "alt", "tx_annotation" };
    final VCFHeader h0 = iter.getHeader();
    final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(h0);
    // the pext database uses GRCh37 coordinates; refuse any other build
    if (!SequenceDictionaryUtils.isGRCh37(dict)) {
        LOG.error("Input is NOT GRCh37 ");
        return -1;
    }
    // enforces dictionary order of the input (findOverlapping's cache relies on it)
    final OrderChecker<VariantContext> orderChecked = new OrderChecker<>(dict, false);
    final CharSplitter tab = CharSplitter.TAB;
    try (TabixReader gextFileReader = new TabixReader(this.pextDatabasePath)) {
        // sanity-check the database header before doing any work
        final String line1 = gextFileReader.readLine();
        if (StringUtils.isBlank(line1)) {
            LOG.error("Cannot read first line of " + this.pextDatabasePath);
            return -1;
        }
        if (!Arrays.equals(tab.split(line1), standard_pext_header)) {
            LOG.error("Bad header in " + this.pextDatabasePath + " expected " + String.join("(tab)", standard_pext_header) + " but got " + line1.replace("\t", "(tab)"));
            return -1;
        }
        // declare the new INFO field in the output header
        final VCFHeader h2 = new VCFHeader(h0);
        final VCFInfoHeaderLine pexInfo = new VCFInfoHeaderLine("GNOMAD_PEXT", VCFHeaderLineCount.A, VCFHeaderLineType.String, "Gnomad pext Data from " + this.pextDatabasePath);
        h2.addMetaDataLine(pexInfo);
        JVarkitVersion.getInstance().addMetaData(this, h2);
        out.writeHeader(h2);
        while (iter.hasNext()) {
            final VariantContext ctx = orderChecked.apply(iter.next());
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            // remove any pre-existing annotation before (possibly) re-adding it
            vcb.rmAttribute(pexInfo.getID());
            // pass through non-variant sites and (optionally) filtered variants untouched
            if (!ctx.isVariant() || (this.skipFiltered && ctx.isFiltered())) {
                out.add(vcb.make());
                continue;
            }
            final List<PextEntry> entries = findOverlapping(gextFileReader, ctx);
            if (entries.isEmpty()) {
                out.add(vcb.make());
                continue;
            }
            // one flattened annotation string per ALT allele, in allele order
            final List<String> altInfo = new ArrayList<>(ctx.getAlleles().size());
            for (int allele_idx = 1; /* 0 is ref */
            allele_idx < ctx.getAlleles().size(); allele_idx++) {
                final Allele alt = ctx.getAlleles().get(allele_idx);
                final PextEntry entry = entries.stream().filter(E -> E.alt.equals(alt)).findFirst().orElse(null);
                if (entry == null) {
                    altInfo.add("."); // no pext entry for this ALT
                } else {
                    // tx_annotation is expected to be a JSON array of objects
                    final JsonElement e = jsonParser.parse(entry.jsonStr);
                    if (!e.isJsonArray())
                        throw new IllegalStateException("not an array: " + entry.jsonStr);
                    final JsonArray array = e.getAsJsonArray();
                    final StringBuilder sb = new StringBuilder();
                    for (int x = 0; x < array.size(); ++x) {
                        if (x > 0)
                            sb.append("&"); // '&' separates the objects of the array
                        if (!array.get(x).isJsonObject())
                            throw new IllegalStateException("not an array: " + entry.jsonStr);
                        final StringBuilder sb2 = new StringBuilder();
                        final JsonObject obj = array.get(x).getAsJsonObject();
                        for (final Map.Entry<String, JsonElement> kv : obj.entrySet()) {
                            String key = kv.getKey();
                            // strip the trailing underscore of keys such as "Brain_FrontalCortex_BA9_": 1.0,
                            if (key.endsWith("_"))
                                key = key.substring(0, key.length() - 1);
                            // as far as I can see, tissues start with a uppercase;
                            // the restrictTissues filter is applied only to such keys
                            if (Character.isUpperCase(key.charAt(0)) && !this.restrictTissues.isEmpty() && !this.restrictTissues.contains(key))
                                continue;
                            final JsonElement v = kv.getValue();
                            if (v.isJsonNull())
                                continue; // skip null values
                            if (v.getAsJsonPrimitive().isString()) {
                                final String strv = v.getAsString();
                                if (strv.equals("NaN"))
                                    continue; // skip literal "NaN" values
                            }
                            if (sb2.length() > 0)
                                sb2.append("|"); // '|' separates key:value pairs inside one object
                            sb2.append(key);
                            sb2.append(":");
                            sb2.append(kv.getValue().getAsString());
                        }
                        sb.append(sb2.toString());
                    }
                    if (sb.length() == 0)
                        sb.append("."); // everything was filtered out for this ALT
                    altInfo.add(sb.toString());
                }
            }
            // write the INFO field only if at least one allele got a non-'.' annotation
            if (altInfo.stream().anyMatch(S -> !S.equals("."))) {
                vcb.attribute(pexInfo.getID(), altInfo);
            }
            out.add(vcb.make());
        }
        out.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Example use of com.github.lindenb.jvarkit.lang.CharSplitter in the jvarkit project by lindenb.
From the class KnownGene, method loadUriAsIntervalTreeMap:
/**
 * load knownGene file/uri as an IntervalTreeMap. Intervals in the IntervalTreeMap are *1-based* (interval.start= kg.txStart+1)
 *
 * @param uri          file path or URL of the tab-delimited knownGene table
 * @param filterOrNull optional filter; genes it rejects are skipped (may be null)
 * @return map of 1-based transcript intervals to the genes located there
 * @throws IOException if the source cannot be opened or read
 */
public static IntervalTreeMap<List<KnownGene>> loadUriAsIntervalTreeMap(final String uri, final Predicate<KnownGene> filterOrNull) throws IOException {
    final IntervalTreeMap<List<KnownGene>> treeMap = new IntervalTreeMap<>();
    final CharSplitter tab = CharSplitter.TAB;
    // try-with-resources guarantees the reader is closed on every path
    try (BufferedReader in = IOUtils.openURIForBufferedReading(uri)) {
        String line;
        while ((line = in.readLine()) != null) {
            if (line.isEmpty())
                continue;
            final String[] tokens = tab.split(line);
            final KnownGene g = new KnownGene(tokens);
            if (filterOrNull != null && !filterOrNull.test(g))
                continue;
            // knownGene txStart is 0-based half-open; Interval is 1-based, hence the +1
            final Interval interval = new Interval(g.getContig(), g.getTxStart() + 1, g.getTxEnd(), g.isNegativeStrand(), g.getName());
            // group genes sharing the same interval (computeIfAbsent replaces the manual get/put)
            treeMap.computeIfAbsent(interval, K -> new ArrayList<>(2)).add(g);
        }
    }
    return treeMap;
}
Aggregations