Use of org.apache.jena.riot.tokens.Tokenizer in the Apache Jena project.
Class NodecSSE, method decode.
/**
 * Decode a {@link Node} from its stored byte form.
 * <p>
 * Ideally this would decode straight from the byte buffer, but currently
 * it goes bytes -&gt; string -&gt; node. Blank nodes and URIs are handled
 * directly; anything else is handed to the RIOT tokenizer.
 *
 * @param bb   buffer holding the UTF-8 encoded node
 * @param pmap prefix mapping (not used by this decoder)
 * @return the decoded node
 * @throws TDBException if the bytes do not decode to a node
 */
@Override
public Node decode(ByteBuffer bb, PrefixMapping pmap) {
    // Byte -> String
    String str = BlockUTF8.toString(bb);
    // Easy case: blank node.
    // Must be done this way: bnode labels can contain ":" from Jena,
    // which TokenizerText does not recognize.
    if (str.startsWith("_:")) {
        str = str.substring(2);
        return NodeFactory.createBlankNode(str);
    }
    // Easy case: URI - decoded directly (avoids the tokenizer; quicker).
    if (str.startsWith("<")) {
        str = str.substring(1, str.length() - 1);
        str = StrUtils.unescapeString(str);
        str = StrUtils.decodeHex(str, MarkerChar);
        return NodeFactory.createURI(str);
    }
    // General case: tokenize the string and take the single node token.
    Tokenizer tokenizer = TokenizerFactory.makeTokenizerString(str);
    if (!tokenizer.hasNext())
        throw new TDBException("Failed to tokenise: " + str);
    Token t = tokenizer.next();
    try {
        Node n = t.asNode();
        if (n == null)
            throw new TDBException("Not a node: " + str);
        return n;
    } catch (RiotException ex) {
        // Chain the cause so the underlying parse error is not lost.
        throw new TDBException("Bad string for node: " + str, ex);
    }
}
Use of org.apache.jena.riot.tokens.Tokenizer in the Apache Jena project.
Class TestLangNTuples, method tokenizer.
/**
 * Build a tokenizer over the UTF-8 bytes of the given string, honouring
 * the requested character space.
 *
 * @param charSpace ASCII or full UTF-8 tokenizing
 * @param string    text to tokenize
 * @return a tokenizer reading the encoded bytes of {@code string}
 */
protected static Tokenizer tokenizer(CharSpace charSpace, String string) {
    ByteArrayInputStream input = new ByteArrayInputStream(StrUtils.asUTF8bytes(string));
    if (charSpace == CharSpace.ASCII)
        return TokenizerFactory.makeTokenizerASCII(input);
    return TokenizerFactory.makeTokenizerUTF8(input);
}
Use of org.apache.jena.riot.tokens.Tokenizer in the Apache Jena project.
Class TestLangNTuples, method parseCount.
/**
 * Parse the given lines as N-Triples and return how many triples were seen.
 *
 * @param charSpace character space used when tokenizing
 * @param strings   input lines; joined with newlines before parsing
 * @return number of triples delivered to the counting sink
 */
protected final long parseCount(CharSpace charSpace, String... strings) {
    StreamRDFCounting counter = StreamRDFLib.count();
    Tokenizer tokens = tokenizer(charSpace, String.join("\n", strings));
    LangRIOT parser = RiotParsers.createParserNTriples(tokens, counter, parserProfile(new ErrorHandlerEx()));
    parser.parse();
    return counter.count();
}
Use of org.apache.jena.riot.tokens.Tokenizer in the Apache Jena project.
Class TestBindingStreams, method testRead.
/**
 * Parse {@code x} as a binding stream and assert it yields exactly the
 * expected bindings, in order. With no expected bindings, the stream is
 * simply drained to check that it parses.
 *
 * @param x        serialized binding stream text
 * @param bindings expected bindings, in order; may be empty
 */
static void testRead(String x, Binding... bindings) {
    Tokenizer tok = TokenizerFactory.makeTokenizerString(x);
    BindingInputStream input = new BindingInputStream(tok);
    if (bindings.length == 0) {
        // Nothing to compare against - just consume the stream.
        while (input.hasNext())
            input.next();
        return;
    }
    int count = 0;
    while (input.hasNext()) {
        Binding actual = input.next();
        assertTrue("Bindings do not match: expected=" + bindings[count] + " got=" + actual,
                   equalBindings(bindings[count], actual));
        count++;
    }
    assertEquals("Wrong length: expect= " + bindings.length + " got=" + count, bindings.length, count);
}
Use of org.apache.jena.riot.tokens.Tokenizer in the Apache Jena project.
Class CmdTokens, method tokens.
/**
 * Tokenize each named file ("-" or no arguments means stdin), optionally
 * printing every token and/or reporting count and throughput per file.
 *
 * @param print  if true, print each token to stdout
 * @param timing if true, print token count and timing after each file
 * @param args   file names to tokenize; a help flag prints usage and exits
 */
public static void tokens(final boolean print, final boolean timing, String... args) {
    if (args.length == 0)
        args = new String[] { "-" };
    String arg = args[0];
    if (arg.equals("--help") || arg.equals("-help") || arg.equals("-h") || arg.equals("--h")) {
        System.err.println("Usage: stdin | FILE ...");
        System.exit(1);
    }
    for (String filename : args) {
        InputStream in = IO.openFile(filename);
        Tokenizer tokenize = TokenizerFactory.makeTokenizerUTF8(in);
        Timer timer = new Timer();
        long count = 0;
        timer.startTimer();
        try {
            while (tokenize.hasNext()) {
                Token t = tokenize.next();
                if (print)
                    System.out.println(t);
                count++;
            }
        } finally {
            // Always release the tokenizer (and its underlying stream),
            // even if tokenizing or printing throws.
            tokenize.close();
        }
        long millis = timer.endTimer();
        if (timing) {
            if (millis == 0)
                // Avoid divide-by-zero in the rate calculation.
                System.out.printf("Tokens=%,d : Time=0.00s\n", count);
            else {
                double seconds = millis / 1000.0;
                System.out.printf("Tokens=%,d : Time=%,.2fs : Rate=%,.2f\n", count, seconds, count / seconds);
            }
        }
    }
}
Aggregations