Use of org.apache.hyracks.util.string.UTF8StringReader in the Apache AsterixDB project.
From class NGramTokenizerTest, method runTestNGramTokenizerWithUTF8Tokens:
/**
 * Runs the n-gram tokenizer over {@code inputBuffer} and compares each produced token,
 * in order, with the corresponding entry filled in by {@code getExpectedGrams}.
 *
 * Every token is serialized into a buffer and read back with a {@code UTF8StringReader}
 * before it is compared, so the check goes through the serialized form.
 *
 * @param prePost passed unchanged to both the tokenizer constructor and {@code getExpectedGrams}
 * @throws IOException propagated from the token serialization / deserialization calls
 */
void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
    UTF8NGramTokenFactory gramTokenFactory = new UTF8NGramTokenFactory();
    NGramUTF8StringBinaryTokenizer gramTokenizer =
            new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false, gramTokenFactory);
    gramTokenizer.reset(inputBuffer, 0, inputBuffer.length);

    ArrayList<String> expected = new ArrayList<String>();
    getExpectedGrams(str, gramLength, expected, prePost);

    // The loop index doubles as the position of the expected gram.
    for (int index = 0; gramTokenizer.hasNext(); index++) {
        gramTokenizer.next();

        // Write the current token into a fresh buffer.
        GrowableArray serialized = new GrowableArray();
        gramTokenizer.getToken().serializeToken(serialized);

        // Read the token back as a string from the serialized bytes.
        DataInput input = new DataInputStream(new ByteArrayInputStream(serialized.getByteArray()));
        String actualGram = new UTF8StringReader().readUTF(input);

        Assert.assertEquals(expected.get(index), actualGram);
    }
}
Use of org.apache.hyracks.util.string.UTF8StringReader in the Apache AsterixDB project.
From class WordTokenizerTest, method testWordTokenizerWithUTF8Tokens:
/**
 * Runs the delimited word tokenizer over {@code inputBuffer} and compares each produced
 * token, in order, with the corresponding entry of {@code expectedUTF8Tokens}.
 *
 * Every token is serialized into a buffer and read back with a {@code UTF8StringReader}
 * before it is compared, so the check goes through the serialized form.
 *
 * @throws IOException propagated from the token serialization / deserialization calls
 */
@Test
public void testWordTokenizerWithUTF8Tokens() throws IOException {
    UTF8WordTokenFactory wordTokenFactory = new UTF8WordTokenFactory();
    DelimitedUTF8StringBinaryTokenizer wordTokenizer =
            new DelimitedUTF8StringBinaryTokenizer(true, false, wordTokenFactory);
    wordTokenizer.reset(inputBuffer, 0, inputBuffer.length);

    // The loop index doubles as the position of the expected token.
    for (int index = 0; wordTokenizer.hasNext(); index++) {
        wordTokenizer.next();

        // Write the current token into a fresh buffer.
        GrowableArray serialized = new GrowableArray();
        wordTokenizer.getToken().serializeToken(serialized);

        // Read the token back as a string from the serialized bytes.
        DataInput input = new DataInputStream(new ByteArrayInputStream(serialized.getByteArray()));
        String actualToken = new UTF8StringReader().readUTF(input);

        Assert.assertEquals(expectedUTF8Tokens.get(index), actualToken);
    }
}
Aggregations