Example usage of org.apache.hyracks.data.std.util.GrowableArray in the Apache AsterixDB project.
Taken from class NGramTokenizerTest, method runTestNGramTokenizerWithCountedHashedUTF8Tokens.
/**
 * Tokenizes the shared test input ({@code inputBuffer}) into n-grams using a
 * hashed-token factory and verifies that each serialized token's hash matches
 * the expected hash computed from the gram string and its occurrence count.
 *
 * @param prePost whether the tokenizer adds pre/post padding grams
 * @throws IOException if deserializing a serialized token fails
 */
void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
    HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
    NGramUTF8StringBinaryTokenizer tokenizer =
            new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false, false, tokenFactory);
    tokenizer.reset(inputBuffer, 0, inputBuffer.length);
    ArrayList<String> expectedGrams = new ArrayList<String>();
    getExpectedGrams(str, gramLength, expectedGrams, prePost);
    ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
    HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
    for (String s : expectedGrams) {
        Integer count = gramCounts.get(s);
        if (count == null) {
            count = 1;
        } else {
            count++;
        }
        // BUG FIX: the incremented count must be stored back on every pass,
        // not only for the first occurrence. Previously a gram seen 3+ times
        // kept count 2 in the map, so its later occurrences hashed wrongly.
        gramCounts.put(s, count);
        int hash = tokenHash(s, count);
        expectedHashedGrams.add(hash);
    }
    int tokenCount = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        // serialize hashed token
        GrowableArray tokenData = new GrowableArray();
        IToken token = tokenizer.getToken();
        token.serializeToken(tokenData);
        // deserialize the 4-byte hash and compare against the expectation
        ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
        DataInput in = new DataInputStream(bais);
        Integer hashedGram = in.readInt();
        Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
        tokenCount++;
    }
}
Example usage of org.apache.hyracks.data.std.util.GrowableArray in the Apache AsterixDB project.
Taken from class WordTokenizerTest, method testWordTokenizerWithHashedUTF8Tokens.
@Test
public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
    // Build a delimiter-based tokenizer that emits hashed word tokens.
    HashedUTF8WordTokenFactory factory = new HashedUTF8WordTokenFactory();
    DelimitedUTF8StringBinaryTokenizer tokenizer =
            new DelimitedUTF8StringBinaryTokenizer(true, false, factory);
    tokenizer.reset(inputBuffer, 0, inputBuffer.length);
    int index = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        IToken token = tokenizer.getToken();
        // Round-trip the token: serialize into a growable buffer, then
        // read the 4-byte hash back out and compare with the expectation.
        GrowableArray serialized = new GrowableArray();
        token.serializeToken(serialized);
        DataInput in =
                new DataInputStream(new ByteArrayInputStream(serialized.getByteArray()));
        Integer hashedToken = in.readInt();
        Assert.assertEquals(expectedHashedUTF8Tokens.get(index), hashedToken);
        index++;
    }
}
Example usage of org.apache.hyracks.data.std.util.GrowableArray in the Apache AsterixDB project.
Taken from class WordTokenizerTest, method testWordTokenizerWithUTF8Tokens.
/**
 * Tokenizes the shared test input into plain (non-hashed) UTF-8 word tokens
 * and verifies that each token, after a serialize/deserialize round trip,
 * equals the corresponding expected string.
 *
 * @throws IOException if reading a serialized token back fails
 */
@Test
public void testWordTokenizerWithUTF8Tokens() throws IOException {
    UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
    DelimitedUTF8StringBinaryTokenizer tokenizer =
            new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
    tokenizer.reset(inputBuffer, 0, inputBuffer.length);
    // Hoisted out of the loop: the reader is stateless between calls and
    // creating one per token was loop-invariant work.
    UTF8StringReader reader = new UTF8StringReader();
    int tokenCount = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        // serialize token (fixed comment: these tokens are not hashed)
        GrowableArray tokenData = new GrowableArray();
        IToken token = tokenizer.getToken();
        token.serializeToken(tokenData);
        // deserialize token back into a Java String and compare
        ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
        DataInput in = new DataInputStream(bais);
        String strToken = reader.readUTF(in);
        Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
        tokenCount++;
    }
}
Aggregations