Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.
From the class BinaryTokenizerOperatorNodePushable, method nextFrame:
@Override
public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
    accessor.reset(buffer);
    int tupleCount = accessor.getTupleCount();
    for (int i = 0; i < tupleCount; i++) {
        short numTokens = 0;
        tokenizer.reset(accessor.getBuffer().array(),
                accessor.getTupleStartOffset(i) + accessor.getFieldSlotsLength()
                        + accessor.getFieldStartOffset(i, docField),
                accessor.getFieldLength(i, docField));
        if (addNumTokensKey) {
            // Get the total number of tokens.
            numTokens = tokenizer.getTokensCount();
        }
        // Write the fields in the order specified by the writeKeyFieldsFirst flag.
        while (tokenizer.hasNext()) {
            tokenizer.next();
            builder.reset();
            if (!writeKeyFieldsFirst) {
                // Writing order: token, number of tokens, keyfield1 ... n
                try {
                    IToken token = tokenizer.getToken();
                    token.serializeToken(builderData);
                    builder.addFieldEndOffset();
                    // Add the number of tokens if requested.
                    if (addNumTokensKey) {
                        builder.getDataOutput().writeShort(numTokens);
                        builder.addFieldEndOffset();
                    }
                } catch (IOException e) {
                    throw new HyracksDataException(e);
                }
                for (int k = 0; k < keyFields.length; k++) {
                    builder.addField(accessor, i, keyFields[k]);
                }
            } else {
                // Writing order: keyfield1 ... n, token, number of tokens
                for (int k = 0; k < keyFields.length; k++) {
                    builder.addField(accessor, i, keyFields[k]);
                }
                try {
                    IToken token = tokenizer.getToken();
                    token.serializeToken(builderData);
                    builder.addFieldEndOffset();
                    // Add the number of tokens if requested.
                    if (addNumTokensKey) {
                        builder.getDataOutput().writeShort(numTokens);
                        builder.addFieldEndOffset();
                    }
                } catch (IOException e) {
                    throw new HyracksDataException(e);
                }
            }
            FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(),
                    builder.getByteArray(), 0, builder.getSize());
        }
    }
}
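The loop above is the canonical IToken consumption protocol: reset the tokenizer over a byte range, then alternate hasNext()/next() and serialize each token before advancing, since tokenizer implementations typically reuse the token object. Below is a minimal sketch of that protocol in isolation; serializeAllTokens is a hypothetical helper, and only the reset/hasNext/next/getToken/serializeToken calls visible in the snippets on this page are assumed to exist.

// Hypothetical helper (not part of Hyracks): drives any IBinaryTokenizer over a
// byte range and collects each serialized token as its own byte[].
private static List<byte[]> serializeAllTokens(IBinaryTokenizer tokenizer, byte[] data, int start, int length)
        throws IOException {
    List<byte[]> serialized = new ArrayList<>();
    GrowableArray tokenData = new GrowableArray();
    tokenizer.reset(data, start, length);
    while (tokenizer.hasNext()) {
        tokenizer.next();
        IToken token = tokenizer.getToken();
        tokenData.reset();
        token.serializeToken(tokenData);
        // Copy the bytes out, since the GrowableArray is reused for every token.
        serialized.add(Arrays.copyOf(tokenData.getByteArray(), tokenData.getLength()));
    }
    return serialized;
}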
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.
From the class AbstractTOccurrenceSearcher, method tokenizeQuery:
protected void tokenizeQuery(InvertedIndexSearchPredicate searchPred) throws HyracksDataException {
    ITupleReference queryTuple = searchPred.getQueryTuple();
    int queryFieldIndex = searchPred.getQueryFieldIndex();
    IBinaryTokenizer queryTokenizer = searchPred.getQueryTokenizer();
    // Is this a full-text query? If so, the last argument is the conjunctive/disjunctive
    // search option rather than query text, so it must be excluded.
    boolean isFullTextSearchQuery = searchPred.getIsFullTextSearchQuery();
    // Get the type of the query tokenizer.
    TokenizerType queryTokenizerType = queryTokenizer.getTokenizerType();
    int tokenCountInOneField = 0;
    queryTokenAppender.reset(queryTokenFrame, true);
    queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
            queryTuple.getFieldLength(queryFieldIndex));
    while (queryTokenizer.hasNext()) {
        queryTokenizer.next();
        queryTokenBuilder.reset();
        tokenCountInOneField++;
        try {
            IToken token = queryTokenizer.getToken();
            // If the query is a list, it can contain multiple keywords, but each keyword
            // must not be a phrase.
            if (isFullTextSearchQuery) {
                if (queryTokenizerType == TokenizerType.STRING && tokenCountInOneField > 1) {
                    throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
                } else if (queryTokenizerType == TokenizerType.LIST) {
                    for (int j = 1; j < token.getTokenLength(); j++) {
                        if (DelimitedUTF8StringBinaryTokenizer
                                .isSeparator((char) token.getData()[token.getStartOffset() + j])) {
                            throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
                        }
                    }
                }
            }
            token.serializeToken(queryTokenBuilder.getFieldData());
            queryTokenBuilder.addFieldEndOffset();
            // WARNING: assumes one frame is big enough to hold all tokens.
            queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(),
                    0, queryTokenBuilder.getSize());
        } catch (IOException e) {
            throw new HyracksDataException(e);
        }
    }
}
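The LIST branch scans the token's raw bytes for a separator character in order to reject phrases inside list elements. That check can be isolated into a small predicate; the sketch below mirrors the loop above (including starting at offset 1, exactly as the original does) and assumes only the static DelimitedUTF8StringBinaryTokenizer.isSeparator(char) already used there. containsSeparator is a hypothetical helper name.

// Hypothetical helper: true if the token's bytes contain a separator,
// i.e. the keyword is actually a phrase.
private static boolean containsSeparator(IToken token) {
    for (int j = 1; j < token.getTokenLength(); j++) {
        if (DelimitedUTF8StringBinaryTokenizer.isSeparator((char) token.getData()[token.getStartOffset() + j])) {
            return true;
        }
    }
    return false;
}

With this helper, the LIST branch reduces to a single guard: if (containsSeparator(token)) throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);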
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.
From the class PartitionedInvertedIndexTokenizingTupleIterator, method next:
public void next() throws HyracksDataException {
    tokenizer.next();
    IToken token = tokenizer.getToken();
    tupleBuilder.reset();
    try {
        // Add token field.
        token.serializeToken(tupleBuilder.getFieldData());
        tupleBuilder.addFieldEndOffset();
        // Add field with number of tokens.
        tupleBuilder.getDataOutput().writeShort(numTokens);
        tupleBuilder.addFieldEndOffset();
    } catch (IOException e) {
        throw new HyracksDataException(e);
    }
    // Add inverted-list element fields.
    for (int i = 0; i < invListFieldCount; i++) {
        tupleBuilder.addField(inputTuple.getFieldData(i + 1), inputTuple.getFieldStart(i + 1),
                inputTuple.getFieldLength(i + 1));
    }
    // Reset tuple reference for insert operation.
    tupleReference.reset(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray());
}
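The partitioned iterator therefore emits tuples laid out as [token, numTokens, inverted-list fields...]; the non-partitioned variant shown further below omits the numTokens field. The sketch below rebuilds one such entry end to end. It assumes Hyracks' ArrayTupleBuilder and ArrayTupleReference as the concrete types behind tupleBuilder and tupleReference (the snippet itself only shows their interfaces), and buildPartitionedEntry is a hypothetical helper.

// Hypothetical helper: builds a [token, numTokens, element] tuple following the
// same field order as next() above. Assumes ArrayTupleBuilder/ArrayTupleReference.
static ITupleReference buildPartitionedEntry(IToken token, short numTokens, ITupleReference inputTuple)
        throws IOException {
    ArrayTupleBuilder tb = new ArrayTupleBuilder(3); // token + numTokens + one inverted-list field
    tb.reset();
    token.serializeToken(tb.getFieldData());  // field 0: serialized token bytes
    tb.addFieldEndOffset();
    tb.getDataOutput().writeShort(numTokens); // field 1: token count used for length partitioning
    tb.addFieldEndOffset();
    tb.addField(inputTuple.getFieldData(1), inputTuple.getFieldStart(1),
            inputTuple.getFieldLength(1));    // field 2: the inverted-list element
    ArrayTupleReference ref = new ArrayTupleReference();
    ref.reset(tb.getFieldEndOffsets(), tb.getByteArray());
    return ref;
}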
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.
From the class LSMInvertedIndexTestUtils, method getExpectedResults:
@SuppressWarnings("unchecked")
public static void getExpectedResults(int[] scanCountArray, TreeSet<CheckTuple> checkTuples,
        ITupleReference searchDocument, IBinaryTokenizer tokenizer, ISerializerDeserializer tokenSerde,
        IInvertedIndexSearchModifier searchModifier, List<Integer> expectedResults, boolean isPartitioned)
        throws IOException {
    // Reset scan count array.
    Arrays.fill(scanCountArray, 0);
    expectedResults.clear();
    GrowableArray tokenData = new GrowableArray();
    tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
            searchDocument.getFieldLength(0));
    // Run through the tokenizer to get the number of tokens.
    int numQueryTokens = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        numQueryTokens++;
    }
    short numTokensLowerBound = -1;
    short numTokensUpperBound = -1;
    int invListElementField = 1;
    if (isPartitioned) {
        numTokensLowerBound = searchModifier.getNumTokensLowerBound((short) numQueryTokens);
        numTokensUpperBound = searchModifier.getNumTokensUpperBound((short) numQueryTokens);
        invListElementField = 2;
    }
    int occurrenceThreshold = searchModifier.getOccurrenceThreshold(numQueryTokens);
    tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
            searchDocument.getFieldLength(0));
    while (tokenizer.hasNext()) {
        tokenizer.next();
        IToken token = tokenizer.getToken();
        tokenData.reset();
        token.serializeToken(tokenData);
        ByteArrayInputStream inStream =
                new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
        DataInput dataIn = new DataInputStream(inStream);
        Comparable tokenObj = (Comparable) tokenSerde.deserialize(dataIn);
        CheckTuple lowKey;
        if (numTokensLowerBound < 0) {
            // Index is not partitioned, or no length filtering is possible for this search modifier.
            lowKey = new CheckTuple(1, 1);
            lowKey.appendField(tokenObj);
        } else {
            // Index is length partitioned, and the search modifier supports length filtering.
            lowKey = new CheckTuple(2, 2);
            lowKey.appendField(tokenObj);
            lowKey.appendField(Short.valueOf(numTokensLowerBound));
        }
        CheckTuple highKey;
        if (numTokensUpperBound < 0) {
            // Index is not partitioned, or no length filtering is possible for this search modifier.
            highKey = new CheckTuple(1, 1);
            highKey.appendField(tokenObj);
        } else {
            // Index is length partitioned, and the search modifier supports length filtering.
            highKey = new CheckTuple(2, 2);
            highKey.appendField(tokenObj);
            highKey.appendField(Short.valueOf(numTokensUpperBound));
        }
        // Get a view over the check tuples containing the inverted list corresponding to the token.
        SortedSet<CheckTuple> invList = OrderedIndexTestUtils.getPrefixExpectedSubset(checkTuples, lowKey, highKey);
        Iterator<CheckTuple> invListIter = invList.iterator();
        // Iterate over the inverted list and update the scan count array.
        while (invListIter.hasNext()) {
            CheckTuple checkTuple = invListIter.next();
            Integer element = (Integer) checkTuple.getField(invListElementField);
            scanCountArray[element]++;
        }
    }
    // Run through the scan count array and collect elements that satisfy the occurrence threshold.
    expectedResults.clear();
    for (int i = 0; i < scanCountArray.length; i++) {
        if (scanCountArray[i] >= occurrenceThreshold) {
            expectedResults.add(i);
        }
    }
}
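The scan-count logic at the end is the standard T-occurrence step: an element is an expected result iff it appears in at least occurrenceThreshold of the query tokens' inverted lists. Stripped of the index machinery, the core computation looks like the sketch below (tOccurrence is a hypothetical name; the lists are plain int arrays of element ids).

// Hypothetical helper: plain-Java core of the T-occurrence counting above.
static List<Integer> tOccurrence(List<int[]> invertedLists, int numElements, int threshold) {
    int[] scanCount = new int[numElements];
    // One pass per inverted list, bumping the count of every element it contains.
    for (int[] list : invertedLists) {
        for (int element : list) {
            scanCount[element]++;
        }
    }
    // Keep every element that reached the threshold.
    List<Integer> results = new ArrayList<>();
    for (int i = 0; i < numElements; i++) {
        if (scanCount[i] >= threshold) {
            results.add(i);
        }
    }
    return results;
}

For example, lists {1, 2}, {2, 3}, {2} with threshold 2 yield only element 2.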
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.
From the class InvertedIndexTokenizingTupleIterator, method next:
public void next() throws HyracksDataException {
    tokenizer.next();
    IToken token = tokenizer.getToken();
    tupleBuilder.reset();
    // Add token field.
    try {
        token.serializeToken(tupleBuilder.getFieldData());
    } catch (IOException e) {
        throw new HyracksDataException(e);
    }
    tupleBuilder.addFieldEndOffset();
    // Add inverted-list element fields.
    for (int i = 0; i < invListFieldCount; i++) {
        tupleBuilder.addField(inputTuple.getFieldData(i + 1), inputTuple.getFieldStart(i + 1),
                inputTuple.getFieldLength(i + 1));
    }
    // Reset tuple reference for insert operation.
    tupleReference.reset(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray());
}
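Reading a token back out of its serialized form inverts serializeToken, exactly as getExpectedResults above does with tokenSerde. Below is a minimal sketch of that round trip, assuming an ISerializerDeserializer whose wire format matches the token's; roundTripToken is a hypothetical helper.

// Hypothetical helper: serializes a token into a GrowableArray and immediately
// deserializes it again, mirroring the ByteArrayInputStream/DataInputStream
// pattern used in getExpectedResults above.
static Object roundTripToken(IToken token, ISerializerDeserializer<?> serde) throws IOException {
    GrowableArray tokenData = new GrowableArray();
    token.serializeToken(tokenData);
    ByteArrayInputStream in = new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
    return serde.deserialize(new DataInputStream(in));
}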