Search in sources :

Example 1 with TokenizerType

use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType in project asterixdb by apache.

the class AbstractTOccurrenceSearcher method tokenizeQuery.

/**
 * Tokenizes the query field of the given search predicate and appends each token, as a
 * one-field tuple, to {@code queryTokenAppender} (reset onto {@code queryTokenFrame}).
 *
 * <p>For a full-text search query every keyword must be a single word, not a phrase:
 * <ul>
 *   <li>with a {@code STRING} tokenizer, the query field may yield at most one token;</li>
 *   <li>with a {@code LIST} tokenizer, no token may contain a separator character
 *       (as decided by {@link DelimitedUTF8StringBinaryTokenizer#isSeparator}).</li>
 * </ul>
 *
 * @param searchPred the search predicate supplying the query tuple, the index of the query
 *                   field, the query tokenizer and the full-text flag
 * @throws HyracksDataException with {@code ErrorCode.FULLTEXT_PHRASE_FOUND} when a full-text
 *                              keyword turns out to be a phrase, or wrapping an
 *                              {@link IOException} raised while serializing/appending a token
 */
protected void tokenizeQuery(InvertedIndexSearchPredicate searchPred) throws HyracksDataException {
    ITupleReference queryTuple = searchPred.getQueryTuple();
    int queryFieldIndex = searchPred.getQueryFieldIndex();
    IBinaryTokenizer queryTokenizer = searchPred.getQueryTokenizer();
    // Is this a full-text query?
    // Then, the last argument is the conjunctive or disjunctive search option, not a query text.
    // Thus, we need to remove the last argument.
    boolean isFullTextSearchQuery = searchPred.getIsFullTextSearchQuery();
    // Get the type of query tokenizer.
    TokenizerType queryTokenizerType = queryTokenizer.getTokenizerType();
    int tokenCountInOneField = 0;
    queryTokenAppender.reset(queryTokenFrame, true);
    queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex), queryTuple.getFieldLength(queryFieldIndex));
    while (queryTokenizer.hasNext()) {
        queryTokenizer.next();
        queryTokenBuilder.reset();
        tokenCountInOneField++;
        IToken token = queryTokenizer.getToken();
        // The phrase validation is deliberately performed OUTSIDE the try block below.
        // NOTE(review): HyracksDataException is understood to be an IOException subclass in
        // Hyracks (confirm against the project version); if thrown inside the try, the
        // FULLTEXT_PHRASE_FOUND error would be caught and re-wrapped, hiding its error code.
        if (isFullTextSearchQuery) {
            // If it's a list, it can have multiple keywords in it. But, each keyword should not be a phrase.
            if (queryTokenizerType == TokenizerType.STRING && tokenCountInOneField > 1) {
                // A second token from a STRING tokenizer means the keyword was a phrase.
                throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
            } else if (queryTokenizerType == TokenizerType.LIST) {
                // The scan starts at offset 1, exactly as before; the first byte is not inspected.
                for (int j = 1; j < token.getTokenLength(); j++) {
                    if (DelimitedUTF8StringBinaryTokenizer.isSeparator((char) token.getData()[token.getStartOffset() + j])) {
                        throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
                    }
                }
            }
        }
        try {
            token.serializeToken(queryTokenBuilder.getFieldData());
            queryTokenBuilder.addFieldEndOffset();
            // WARNING: assuming one frame is big enough to hold all tokens
            queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0, queryTokenBuilder.getSize());
        } catch (IOException e) {
            throw new HyracksDataException(e);
        }
    }
}
Also used : IToken(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken) ITupleReference(org.apache.hyracks.dataflow.common.data.accessors.ITupleReference) TokenizerType(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType) IBinaryTokenizer(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer) IOException(java.io.IOException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException)

Aggregations

IOException (java.io.IOException)1 HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException)1 ITupleReference (org.apache.hyracks.dataflow.common.data.accessors.ITupleReference)1 IBinaryTokenizer (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer)1 IToken (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken)1 TokenizerType (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType)1