Search in sources :

Example 21 with GrowableArray

use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.

the class NGramTokenizerTest method runTestNGramTokenizerWithCountedHashedUTF8Tokens.

/**
 * Tokenizes {@code inputBuffer} into hashed n-grams and checks every token against the
 * hash expected for that gram.
 *
 * <p>The expected hash of each gram is {@code tokenHash(gram, k)}, where {@code k} is the
 * 1-based number of times that gram has been seen so far in the expected gram sequence.
 *
 * @param prePost forwarded to both the tokenizer constructor and {@code getExpectedGrams}
 *                (presumably toggles pre/post padding grams; confirm against the tokenizer)
 * @throws IOException if the serialized token cannot be read back
 */
void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
    HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
    NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false, false, tokenFactory);
    tokenizer.reset(inputBuffer, 0, inputBuffer.length);
    ArrayList<String> expectedGrams = new ArrayList<String>();
    getExpectedGrams(str, gramLength, expectedGrams, prePost);
    ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
    HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
    for (String s : expectedGrams) {
        Integer seen = gramCounts.get(s);
        int count = (seen == null) ? 1 : seen + 1;
        // Store the running count back on every occurrence. Incrementing only a local
        // boxed Integer leaves the map at 1, so a third occurrence would again be
        // hashed with count 2 instead of 3.
        gramCounts.put(s, count);
        int hash = tokenHash(s, count);
        expectedHashedGrams.add(hash);
    }
    int tokenCount = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        // serialize hashed token
        GrowableArray tokenData = new GrowableArray();
        IToken token = tokenizer.getToken();
        token.serializeToken(tokenData);
        // deserialize token
        ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
        DataInput in = new DataInputStream(bais);
        // Kept boxed so assertEquals resolves to the (Object, Object) overload.
        Integer hashedGram = in.readInt();
        Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
        tokenCount++;
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) GrowableArray(org.apache.hyracks.data.std.util.GrowableArray) DataInputStream(java.io.DataInputStream) DataInput(java.io.DataInput) ByteArrayInputStream(java.io.ByteArrayInputStream)

Example 22 with GrowableArray

use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.

the class WordTokenizerTest method testWordTokenizerWithHashedUTF8Tokens.

/**
 * Runs the delimited tokenizer with a hashed-token factory over {@code inputBuffer} and
 * verifies that the int serialized for each token equals the entry at the same position
 * in {@code expectedHashedUTF8Tokens}.
 *
 * @throws IOException if the serialized token cannot be read back
 */
@Test
public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
    DelimitedUTF8StringBinaryTokenizer wordTokenizer =
            new DelimitedUTF8StringBinaryTokenizer(true, false, new HashedUTF8WordTokenFactory());
    wordTokenizer.reset(inputBuffer, 0, inputBuffer.length);
    for (int position = 0; wordTokenizer.hasNext(); position++) {
        wordTokenizer.next();
        // Write the current token into a fresh buffer.
        GrowableArray serialized = new GrowableArray();
        wordTokenizer.getToken().serializeToken(serialized);
        // Read the leading int back out of the buffer and compare it with the expectation.
        DataInput input = new DataInputStream(new ByteArrayInputStream(serialized.getByteArray()));
        Integer actualHash = input.readInt();
        Assert.assertEquals(expectedHashedUTF8Tokens.get(position), actualHash);
    }
}
Also used : DataInput(java.io.DataInput) ByteArrayInputStream(java.io.ByteArrayInputStream) GrowableArray(org.apache.hyracks.data.std.util.GrowableArray) DataInputStream(java.io.DataInputStream) Test(org.junit.Test)

Example 23 with GrowableArray

use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.

the class WordTokenizerTest method testWordTokenizerWithUTF8Tokens.

/**
 * Runs the delimited tokenizer with a string-token factory over {@code inputBuffer} and
 * verifies that the string read back from each serialized token equals the entry at the
 * same position in {@code expectedUTF8Tokens}.
 *
 * @throws IOException if the serialized token cannot be read back
 */
@Test
public void testWordTokenizerWithUTF8Tokens() throws IOException {
    DelimitedUTF8StringBinaryTokenizer wordTokenizer =
            new DelimitedUTF8StringBinaryTokenizer(true, false, new UTF8WordTokenFactory());
    wordTokenizer.reset(inputBuffer, 0, inputBuffer.length);
    for (int position = 0; wordTokenizer.hasNext(); position++) {
        wordTokenizer.next();
        // Write the current token into a fresh buffer.
        GrowableArray serialized = new GrowableArray();
        wordTokenizer.getToken().serializeToken(serialized);
        // Decode the buffer contents as a string and compare with the expectation.
        DataInput input = new DataInputStream(new ByteArrayInputStream(serialized.getByteArray()));
        String actualToken = new UTF8StringReader().readUTF(input);
        Assert.assertEquals(expectedUTF8Tokens.get(position), actualToken);
    }
}
Also used : DataInput(java.io.DataInput) ByteArrayInputStream(java.io.ByteArrayInputStream) GrowableArray(org.apache.hyracks.data.std.util.GrowableArray) DataInputStream(java.io.DataInputStream) UTF8StringReader(org.apache.hyracks.util.string.UTF8StringReader) Test(org.junit.Test)

Aggregations

GrowableArray (org.apache.hyracks.data.std.util.GrowableArray)23 UTF8StringBuilder (org.apache.hyracks.data.std.util.UTF8StringBuilder)14 Test (org.junit.Test)12 ByteArrayInputStream (java.io.ByteArrayInputStream)7 DataInput (java.io.DataInput)7 DataInputStream (java.io.DataInputStream)7 IOException (java.io.IOException)6 DataOutput (java.io.DataOutput)5 IScalarEvaluator (org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator)5 IScalarEvaluatorFactory (org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory)5 IHyracksTaskContext (org.apache.hyracks.api.context.IHyracksTaskContext)5 HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException)5 IPointable (org.apache.hyracks.data.std.api.IPointable)5 VoidPointable (org.apache.hyracks.data.std.primitive.VoidPointable)5 ArrayBackedValueStorage (org.apache.hyracks.data.std.util.ArrayBackedValueStorage)5 IFrameTupleReference (org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference)5 TypeMismatchException (org.apache.asterix.runtime.exceptions.TypeMismatchException)4 UTF8StringPointable (org.apache.hyracks.data.std.primitive.UTF8StringPointable)4 ArrayList (java.util.ArrayList)3 RuntimeDataException (org.apache.asterix.common.exceptions.RuntimeDataException)2