Use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.
The class UTF8StringPointableTest, method testSubstr:
@Test
public void testSubstr() throws Exception {
    GrowableArray storage = new GrowableArray();
    UTF8StringBuilder builder = new UTF8StringBuilder();
    // Substring of length 127 starting at character offset 1 should equal the 127-char sample.
    STRING_LEN_128.substr(1, 127, builder, storage);
    UTF8StringPointable result = new UTF8StringPointable();
    result.set(storage.getByteArray(), 0, storage.getLength());
    assertEquals(0, STRING_LEN_127.compareTo(result));
    // Rewind the storage and reuse it for a second substring over mixed UTF-8 content.
    storage.reset();
    STRING_UTF8_MIX.substr(0, UTF8StringSample.STRING_UTF8_MIX.length(), builder, storage);
    result.set(storage.getByteArray(), 0, storage.getLength());
    assertEquals(0, STRING_UTF8_MIX.compareTo(result));
}
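The pattern in this test recurs in every example on this page: write into a GrowableArray, read the bytes back through getByteArray()/getLength(), then reset() the buffer for reuse. Below is a minimal, self-contained sketch of that round trip, using only the GrowableArray methods that appear in these examples plus the JDK, and assuming (as these tests do) that reset() rewinds the logical length to zero:

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import org.apache.hyracks.data.std.util.GrowableArray;

public class GrowableArrayRoundTrip {
    public static void main(String[] args) throws Exception {
        GrowableArray storage = new GrowableArray();
        // Write through the DataOutput view; the backing array grows as needed.
        storage.getDataOutput().writeInt(42);
        // Read back from the raw bytes, bounded by the logical length.
        DataInput in = new DataInputStream(
                new ByteArrayInputStream(storage.getByteArray(), 0, storage.getLength()));
        System.out.println(in.readInt()); // 42
        // reset() rewinds the logical length so the same backing array can be
        // reused for the next value, as the test above does between substr calls.
        storage.reset();
        System.out.println(storage.getLength()); // 0
    }
}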
Use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.
The class NGramTokenizerTest, method runTestNGramTokenizerWithHashedUTF8Tokens:
void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
    HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
    NGramUTF8StringBinaryTokenizer tokenizer =
            new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false, tokenFactory);
    tokenizer.reset(inputBuffer, 0, inputBuffer.length);
    // Compute the expected gram hashes up front.
    ArrayList<String> expectedGrams = new ArrayList<String>();
    getExpectedGrams(str, gramLength, expectedGrams, prePost);
    ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
    for (String s : expectedGrams) {
        int hash = tokenHash(s, 1);
        expectedHashedGrams.add(hash);
    }
    int tokenCount = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        // Serialize the hashed token into a fresh buffer.
        GrowableArray tokenData = new GrowableArray();
        IToken token = tokenizer.getToken();
        token.serializeToken(tokenData);
        // Deserialize the token; a hashed gram is written as a single int.
        ByteArrayInputStream bais =
                new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
        DataInput in = new DataInputStream(bais);
        Integer hashedGram = in.readInt();
        Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
        tokenCount++;
    }
}
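This test allocates a fresh GrowableArray per token, while getExpectedResults further down hoists one buffer out of the loop and reuses it via reset(). A hypothetical helper isolating that reuse pattern (the class TokenHashReader and its method are ours, not AsterixDB's; it assumes the IToken interface from the inverted-index tokenizers package used in these tests):

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import org.apache.hyracks.data.std.util.GrowableArray;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;

final class TokenHashReader {
    private final GrowableArray tokenData = new GrowableArray();

    // Serializes a hashed token into the reused buffer and reads back the
    // leading 4-byte int hash, exactly as the test loops above do by hand.
    int readHash(IToken token) throws IOException {
        tokenData.reset(); // rewind and reuse the backing array instead of reallocating
        token.serializeToken(tokenData);
        return new DataInputStream(new ByteArrayInputStream(
                tokenData.getByteArray(), 0, tokenData.getLength())).readInt();
    }
}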
Use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.
The class WordTokenizerTest, method testWordTokenizerWithCountedHashedUTF8Tokens:
@Test
public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
    HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
    DelimitedUTF8StringBinaryTokenizer tokenizer =
            new DelimitedUTF8StringBinaryTokenizer(false, false, tokenFactory);
    tokenizer.reset(inputBuffer, 0, inputBuffer.length);
    int tokenCount = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        // Serialize the hashed token.
        GrowableArray tokenData = new GrowableArray();
        IToken token = tokenizer.getToken();
        token.serializeToken(tokenData);
        // Deserialize the token and compare its hash against the expected one.
        ByteArrayInputStream bais =
                new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
        DataInput in = new DataInputStream(bais);
        Integer hashedToken = in.readInt();
        Assert.assertEquals(expectedCountedHashedUTF8Tokens.get(tokenCount), hashedToken);
        tokenCount++;
    }
}
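With the TokenHashReader helper sketched earlier, the body of either tokenizer test collapses to a few lines. A usage sketch against this test's fields (tokenizer and expectedCountedHashedUTF8Tokens as defined above):

TokenHashReader reader = new TokenHashReader();
int tokenCount = 0;
while (tokenizer.hasNext()) {
    tokenizer.next();
    int hashedToken = reader.readHash(tokenizer.getToken());
    Assert.assertEquals((int) expectedCountedHashedUTF8Tokens.get(tokenCount), hashedToken);
    tokenCount++;
}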
Use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.
The class LSMInvertedIndexTestUtils, method getExpectedResults:
@SuppressWarnings("unchecked")
public static void getExpectedResults(int[] scanCountArray, TreeSet<CheckTuple> checkTuples,
        ITupleReference searchDocument, IBinaryTokenizer tokenizer, ISerializerDeserializer tokenSerde,
        IInvertedIndexSearchModifier searchModifier, List<Integer> expectedResults,
        boolean isPartitioned) throws IOException {
    // Reset the scan count array.
    Arrays.fill(scanCountArray, 0);
    expectedResults.clear();
    GrowableArray tokenData = new GrowableArray();
    tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
            searchDocument.getFieldLength(0));
    // Run through the tokenizer once just to count the query tokens.
    int numQueryTokens = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        numQueryTokens++;
    }
    short numTokensLowerBound = -1;
    short numTokensUpperBound = -1;
    int invListElementField = 1;
    if (isPartitioned) {
        numTokensLowerBound = searchModifier.getNumTokensLowerBound((short) numQueryTokens);
        numTokensUpperBound = searchModifier.getNumTokensUpperBound((short) numQueryTokens);
        invListElementField = 2;
    }
    int occurrenceThreshold = searchModifier.getOccurrenceThreshold(numQueryTokens);
    tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
            searchDocument.getFieldLength(0));
    while (tokenizer.hasNext()) {
        tokenizer.next();
        IToken token = tokenizer.getToken();
        tokenData.reset();
        token.serializeToken(tokenData);
        ByteArrayInputStream inStream =
                new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
        DataInput dataIn = new DataInputStream(inStream);
        Comparable tokenObj = (Comparable) tokenSerde.deserialize(dataIn);
        CheckTuple lowKey;
        if (numTokensLowerBound < 0) {
            // Index is not partitioned, or no length filtering is possible for this search modifier.
            lowKey = new CheckTuple(1, 1);
            lowKey.appendField(tokenObj);
        } else {
            // Index is length partitioned, and the search modifier supports length filtering.
            lowKey = new CheckTuple(2, 2);
            lowKey.appendField(tokenObj);
            lowKey.appendField(Short.valueOf(numTokensLowerBound));
        }
        CheckTuple highKey;
        if (numTokensUpperBound < 0) {
            // Index is not partitioned, or no length filtering is possible for this search modifier.
            highKey = new CheckTuple(1, 1);
            highKey.appendField(tokenObj);
        } else {
            // Index is length partitioned, and the search modifier supports length filtering.
            highKey = new CheckTuple(2, 2);
            highKey.appendField(tokenObj);
            highKey.appendField(Short.valueOf(numTokensUpperBound));
        }
        // Get a view over the check tuples containing the inverted list for this token.
        SortedSet<CheckTuple> invList =
                OrderedIndexTestUtils.getPrefixExpectedSubset(checkTuples, lowKey, highKey);
        Iterator<CheckTuple> invListIter = invList.iterator();
        // Iterate over the inverted list and update the scan count array.
        while (invListIter.hasNext()) {
            CheckTuple checkTuple = invListIter.next();
            Integer element = (Integer) checkTuple.getField(invListElementField);
            scanCountArray[element]++;
        }
    }
    // Run through the scan count array and keep the elements that satisfy the occurrence threshold.
    for (int i = 0; i < scanCountArray.length; i++) {
        if (scanCountArray[i] >= occurrenceThreshold) {
            expectedResults.add(i);
        }
    }
}
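The heart of getExpectedResults is occurrence counting: each query token votes for every element on its inverted list, and an element becomes a result once its vote count reaches the search modifier's occurrence threshold. A self-contained restatement of that final filtering step, with a small worked example:

import java.util.ArrayList;
import java.util.List;

final class OccurrenceFilter {
    // Collects every element id whose scan count meets the occurrence
    // threshold, mirroring the final loop of getExpectedResults.
    static List<Integer> filter(int[] scanCountArray, int occurrenceThreshold) {
        List<Integer> results = new ArrayList<>();
        for (int i = 0; i < scanCountArray.length; i++) {
            if (scanCountArray[i] >= occurrenceThreshold) {
                results.add(i);
            }
        }
        return results;
    }

    public static void main(String[] args) {
        // Four candidate elements with threshold 2: elements 0 and 3 qualify.
        int[] counts = {2, 1, 0, 3};
        System.out.println(filter(counts, 2)); // [0, 3]
    }
}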
Use of org.apache.hyracks.data.std.util.GrowableArray in project asterixdb by apache.
The class TestTupleGenerator, method next:
public ITupleReference next() throws HyracksDataException {
    if (reuseObject) {
        // Rewind the existing field buffers instead of reallocating them.
        for (int i = 0; i < types.length; i++) {
            fields[i].reset();
        }
    } else {
        // Allocate fresh field buffers and a new tuple wrapping them.
        this.fields = new GrowableArray[types.length];
        for (int i = 0; i < types.length; i++) {
            fields[i] = new GrowableArray();
        }
        tuple = new TestTupleReference(fields);
    }
    // Fill each field with a random value of its declared type.
    for (int i = 0; i < types.length; i++) {
        FieldType type = types[i];
        switch (type) {
            case Boolean:
                Boolean aBoolean = random.nextBoolean();
                BooleanSerializerDeserializer.INSTANCE.serialize(aBoolean, fields[i].getDataOutput());
                break;
            case Double:
                double aDouble = random.nextDouble();
                DoubleSerializerDeserializer.INSTANCE.serialize(aDouble, fields[i].getDataOutput());
                break;
            case Integer64:
                long aLong = random.nextLong();
                Integer64SerializerDeserializer.INSTANCE.serialize(aLong, fields[i].getDataOutput());
                break;
            case String:
                String aString = RandomStringUtils.randomAlphanumeric(stringFieldSizes);
                stringSerde.serialize(aString, fields[i].getDataOutput());
                break;
            default:
                break;
        }
    }
    return tuple;
}
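The reuseObject branch is the interesting part: when enabled, next() recycles the same GrowableArray fields (and the TestTupleReference wrapping them) across calls, so each generated tuple overwrites the previous one instead of allocating. A hypothetical helper (names are ours) isolating that reuse-vs-reallocate choice:

import org.apache.hyracks.data.std.util.GrowableArray;

final class FieldBuffers {
    private final GrowableArray[] fields;

    FieldBuffers(int numFields) {
        fields = new GrowableArray[numFields];
        for (int i = 0; i < numFields; i++) {
            fields[i] = new GrowableArray();
        }
    }

    // Rewind every field buffer so the next tuple overwrites the previous one
    // without reallocating. Callers that hand tuples downstream must copy the
    // bytes out first, since the backing arrays are overwritten on reuse.
    void recycle() {
        for (GrowableArray field : fields) {
            field.reset();
        }
    }

    GrowableArray field(int i) {
        return fields[i];
    }
}

The trade-off mirrors the generator above: reuse avoids per-call allocation, but a returned ITupleReference is only valid until the next call to next().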