use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer in project asterixdb by apache.
the class InMemoryInvertedIndexOpContext method setTokenizingTupleIterator.
/**
 * Creates a fresh tokenizer from the configured tokenizer factory and installs a new
 * {@link InvertedIndexTokenizingTupleIterator} in {@code tupleIter}.
 *
 * <p>The iterator is given the number of token comparator factories as its first count and
 * the remaining BTree fields (total field count minus that number) as its second count.
 */
protected void setTokenizingTupleIterator() {
    final IBinaryTokenizer freshTokenizer = getTokenizerFactory().createTokenizer();
    final int tokenFieldCount = tokenCmpFactories.length;
    // Every BTree field that is not a token field.
    final int remainingFieldCount = btree.getFieldCount() - tokenCmpFactories.length;
    tupleIter = new InvertedIndexTokenizingTupleIterator(tokenFieldCount, remainingFieldCount, freshTokenizer);
}
use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer in project asterixdb by apache.
the class AbstractTOccurrenceSearcher method tokenizeQuery.
/**
 * Tokenizes the query field of the given search predicate and appends each token, serialized
 * as a one-field tuple, to {@code queryTokenFrame} through {@code queryTokenAppender}.
 *
 * <p>When the predicate is flagged as a full-text search, phrases are rejected:
 * <ul>
 *   <li>with a {@code STRING} tokenizer, producing a second token raises
 *       {@code ErrorCode.FULLTEXT_PHRASE_FOUND};</li>
 *   <li>with a {@code LIST} tokenizer, any token that contains a separator character
 *       (checked from token offset 1 onwards) raises the same error.</li>
 * </ul>
 *
 * <p>NOTE(review): the phrase errors are thrown inside the {@code try} block; if
 * {@code HyracksDataException} is itself an {@code IOException} they will be caught and
 * re-wrapped by the {@code catch} below - confirm this is intended.
 *
 * @param searchPred predicate supplying the query tuple, the query field index, the query
 *                   tokenizer and the full-text flag
 * @throws HyracksDataException if a phrase is found in a full-text query, or wrapping any
 *                              {@code IOException} raised while serializing or appending a token
 */
protected void tokenizeQuery(InvertedIndexSearchPredicate searchPred) throws HyracksDataException {
    final ITupleReference tuple = searchPred.getQueryTuple();
    final int fieldIdx = searchPred.getQueryFieldIndex();
    final IBinaryTokenizer tokenizer = searchPred.getQueryTokenizer();
    // For a full-text query the last argument is the conjunctive/disjunctive search option,
    // not query text, so phrase detection is only relevant in that mode.
    final boolean fullText = searchPred.getIsFullTextSearchQuery();
    final TokenizerType tokenizerType = tokenizer.getTokenizerType();

    // Loop-invariant decisions; STRING and LIST are mutually exclusive, so at most one is true.
    final boolean singleTokenOnly = fullText && tokenizerType == TokenizerType.STRING;
    final boolean scanForSeparators = fullText && tokenizerType == TokenizerType.LIST;

    queryTokenAppender.reset(queryTokenFrame, true);
    tokenizer.reset(tuple.getFieldData(fieldIdx), tuple.getFieldStart(fieldIdx),
            tuple.getFieldLength(fieldIdx));

    // tokensSeen is the 1-based ordinal of the token handled in the current iteration.
    for (int tokensSeen = 1; tokenizer.hasNext(); tokensSeen++) {
        tokenizer.next();
        queryTokenBuilder.reset();
        try {
            final IToken current = tokenizer.getToken();

            // A plain string may contribute only one keyword; a second token means a phrase.
            if (singleTokenOnly && tokensSeen > 1) {
                throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
            }

            // A list may hold several keywords, but none of them may itself be a phrase.
            if (scanForSeparators) {
                // Scan starts at offset 1 - presumably skipping a leading header byte; TODO confirm.
                int offset = 1;
                while (offset < current.getTokenLength()) {
                    final char candidate = (char) current.getData()[current.getStartOffset() + offset];
                    if (DelimitedUTF8StringBinaryTokenizer.isSeparator(candidate)) {
                        throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
                    }
                    offset++;
                }
            }

            current.serializeToken(queryTokenBuilder.getFieldData());
            queryTokenBuilder.addFieldEndOffset();
            // WARNING: assuming one frame is big enough to hold all tokens
            // (the result of append() is not checked).
            queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0,
                    queryTokenBuilder.getSize());
        } catch (IOException e) {
            throw new HyracksDataException(e);
        }
    }
}
use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer in project asterixdb by apache.
the class HashedWordTokensDescriptor method createEvaluatorFactory.
/**
 * Builds the evaluator factory for this descriptor.
 *
 * <p>Each evaluator created by the returned factory is a {@link WordTokensEvaluator} backed
 * by a {@link DelimitedUTF8StringBinaryTokenizer} that uses a
 * {@link HashedUTF8WordTokenFactory}, with {@code BuiltinType.AINT32} passed as the final
 * constructor argument.
 *
 * @param args evaluator factories for the function arguments, handed unchanged to the evaluator
 * @return a serializable factory that creates a new tokenizer and evaluator on every call
 */
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
    final IScalarEvaluatorFactory hashedWordTokensFactory = new IScalarEvaluatorFactory() {
        private static final long serialVersionUID = 1L;

        @Override
        public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
            ITokenFactory hashedTokens = new HashedUTF8WordTokenFactory();
            // The two boolean flags are interpreted by the tokenizer constructor (not visible here).
            IBinaryTokenizer wordTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, true, hashedTokens);
            return new WordTokensEvaluator(args, ctx, wordTokenizer, BuiltinType.AINT32);
        }
    };
    return hashedWordTokensFactory;
}
use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer in project asterixdb by apache.
the class WordTokensDescriptor method createEvaluatorFactory.
/**
 * Builds the evaluator factory for this descriptor.
 *
 * <p>Each evaluator created by the returned factory is a {@link WordTokensEvaluator} backed
 * by a {@link DelimitedUTF8StringBinaryTokenizer} that uses a {@link UTF8WordTokenFactory},
 * with {@code BuiltinType.ASTRING} passed as the final constructor argument.
 *
 * @param args evaluator factories for the function arguments, handed unchanged to the evaluator
 * @return a serializable factory that creates a new tokenizer and evaluator on every call
 */
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
    final IScalarEvaluatorFactory wordTokensFactory = new IScalarEvaluatorFactory() {
        private static final long serialVersionUID = 1L;

        @Override
        public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
            ITokenFactory plainTokens = new UTF8WordTokenFactory();
            // The two boolean flags are interpreted by the tokenizer constructor (not visible here).
            IBinaryTokenizer wordTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, true, plainTokens);
            return new WordTokensEvaluator(args, ctx, wordTokenizer, BuiltinType.ASTRING);
        }
    };
    return wordTokensFactory;
}
use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer in project asterixdb by apache.
the class CountHashedWordTokensDescriptor method createEvaluatorFactory.
/**
 * Builds the evaluator factory for this descriptor.
 *
 * <p>Each evaluator created by the returned factory is a {@link WordTokensEvaluator} backed
 * by a {@link DelimitedUTF8StringBinaryTokenizer} that uses a
 * {@link HashedUTF8WordTokenFactory}, with {@code BuiltinType.AINT32} passed as the final
 * constructor argument. The tokenizer is constructed with the flags {@code (false, true)}.
 *
 * @param args evaluator factories for the function arguments, handed unchanged to the evaluator
 * @return a serializable factory that creates a new tokenizer and evaluator on every call
 */
@Override
public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
    final IScalarEvaluatorFactory countHashedWordTokensFactory = new IScalarEvaluatorFactory() {
        private static final long serialVersionUID = 1L;

        @Override
        public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
            ITokenFactory hashedTokens = new HashedUTF8WordTokenFactory();
            // The two boolean flags are interpreted by the tokenizer constructor (not visible here).
            IBinaryTokenizer wordTokenizer = new DelimitedUTF8StringBinaryTokenizer(false, true, hashedTokens);
            return new WordTokensEvaluator(args, ctx, wordTokenizer, BuiltinType.AINT32);
        }
    };
    return countHashedWordTokensFactory;
}
Aggregations