Search in sources :

Example 1 with IToken

use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.

The class BinaryTokenizerOperatorNodePushable, method nextFrame().

@Override
public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
    accessor.reset(buffer);
    int tupleCount = accessor.getTupleCount();
    for (int i = 0; i < tupleCount; i++) {
        short numTokens = 0;
        // Point the tokenizer at the document field of the current tuple.
        tokenizer.reset(accessor.getBuffer().array(), accessor.getTupleStartOffset(i) + accessor.getFieldSlotsLength() + accessor.getFieldStartOffset(i, docField), accessor.getFieldLength(i, docField));
        if (addNumTokensKey) {
            // Get the total number of tokens up front; it is emitted alongside each token.
            numTokens = tokenizer.getTokensCount();
        }
        // Emit one output tuple per token. Field order depends on writeKeyFieldsFirst.
        while (tokenizer.hasNext()) {
            tokenizer.next();
            builder.reset();
            if (!writeKeyFieldsFirst) {
                // Writing Order: token, number of tokens, keyfield1 ... n
                appendTokenFields(numTokens);
                appendKeyFields(i);
            } else {
                // Writing Order: keyfield1 ... n, token, number of tokens
                appendKeyFields(i);
                appendTokenFields(numTokens);
            }
            FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(), builder.getByteArray(), 0, builder.getSize());
        }
    }
}

/**
 * Appends the current token (and, if requested, the token count) to the tuple builder.
 */
private void appendTokenFields(short numTokens) throws HyracksDataException {
    try {
        IToken token = tokenizer.getToken();
        token.serializeToken(builderData);
        builder.addFieldEndOffset();
        // Add number of tokens if requested.
        if (addNumTokensKey) {
            builder.getDataOutput().writeShort(numTokens);
            builder.addFieldEndOffset();
        }
    } catch (IOException e) {
        // Preserve the original exception as the cause instead of dropping it
        // and keeping only its message.
        throw HyracksDataException.create(e);
    }
}

/**
 * Copies the configured key fields of input tuple {@code tupleIndex} into the tuple builder.
 */
private void appendKeyFields(int tupleIndex) throws HyracksDataException {
    for (int k = 0; k < keyFields.length; k++) {
        builder.addField(accessor, tupleIndex, keyFields[k]);
    }
}
Also used : IToken(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken) IOException(java.io.IOException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException)

Example 2 with IToken

use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.

The class AbstractTOccurrenceSearcher, method tokenizeQuery().

/**
 * Tokenizes the query field of the given search predicate and appends the
 * serialized tokens to {@code queryTokenAppender}. For full-text queries,
 * rejects phrase queries with {@link ErrorCode#FULLTEXT_PHRASE_FOUND}.
 */
protected void tokenizeQuery(InvertedIndexSearchPredicate searchPred) throws HyracksDataException {
    ITupleReference tuple = searchPred.getQueryTuple();
    int fieldIdx = searchPred.getQueryFieldIndex();
    IBinaryTokenizer tokenizer = searchPred.getQueryTokenizer();
    // For a full-text query, the last argument carries the conjunctive/disjunctive
    // search option rather than query text, so phrase checks below apply.
    boolean fullTextQuery = searchPred.getIsFullTextSearchQuery();
    // The tokenizer type determines which phrase check is performed.
    TokenizerType tokenizerType = tokenizer.getTokenizerType();
    queryTokenAppender.reset(queryTokenFrame, true);
    tokenizer.reset(tuple.getFieldData(fieldIdx), tuple.getFieldStart(fieldIdx), tuple.getFieldLength(fieldIdx));
    int tokensSeen = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        queryTokenBuilder.reset();
        tokensSeen++;
        try {
            IToken token = tokenizer.getToken();
            if (fullTextQuery) {
                // A STRING tokenizer yielding more than one token means the query text was a phrase.
                if (tokenizerType == TokenizerType.STRING && tokensSeen > 1) {
                    throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
                }
                // A LIST may hold multiple keywords, but no single keyword may itself be a phrase.
                if (tokenizerType == TokenizerType.LIST) {
                    byte[] data = token.getData();
                    int start = token.getStartOffset();
                    for (int j = 1; j < token.getTokenLength(); j++) {
                        if (DelimitedUTF8StringBinaryTokenizer.isSeparator((char) data[start + j])) {
                            throw HyracksDataException.create(ErrorCode.FULLTEXT_PHRASE_FOUND);
                        }
                    }
                }
            }
            token.serializeToken(queryTokenBuilder.getFieldData());
            queryTokenBuilder.addFieldEndOffset();
            // WARNING: assumes a single frame is big enough to hold all query tokens.
            queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0, queryTokenBuilder.getSize());
        } catch (IOException e) {
            throw new HyracksDataException(e);
        }
    }
}
Also used : IToken(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken) ITupleReference(org.apache.hyracks.dataflow.common.data.accessors.ITupleReference) TokenizerType(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType) IBinaryTokenizer(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer) IOException(java.io.IOException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException)

Example 3 with IToken

use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.

The class PartitionedInvertedIndexTokenizingTupleIterator, method next().

/**
 * Advances the tokenizer and rebuilds {@code tupleReference} as
 * [token, numTokens, inverted-list element fields...] for a length-partitioned index.
 */
public void next() throws HyracksDataException {
    tokenizer.next();
    IToken token = tokenizer.getToken();
    tupleBuilder.reset();
    try {
        // Token field first, then the token-count field used for length partitioning.
        token.serializeToken(tupleBuilder.getFieldData());
        tupleBuilder.addFieldEndOffset();
        tupleBuilder.getDataOutput().writeShort(numTokens);
        tupleBuilder.addFieldEndOffset();
    } catch (IOException e) {
        throw new HyracksDataException(e);
    }
    // Copy the inverted-list element fields (input field 0 is the document field).
    for (int field = 0; field < invListFieldCount; field++) {
        int src = field + 1;
        tupleBuilder.addField(inputTuple.getFieldData(src), inputTuple.getFieldStart(src), inputTuple.getFieldLength(src));
    }
    // Re-point the reusable tuple reference at the freshly built tuple for insertion.
    tupleReference.reset(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray());
}
Also used : IToken(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken) IOException(java.io.IOException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException)

Example 4 with IToken

use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.

The class LSMInvertedIndexTestUtils, method getExpectedResults().

/**
 * Computes the expected result set of an inverted-index occurrence search by
 * simulating it against the {@code checkTuples} reference data: tokenizes the
 * search document, counts per-element occurrences across the matching
 * inverted lists, and keeps every element meeting the modifier's occurrence
 * threshold.
 *
 * @param scanCountArray  per-element occurrence counters; reset on entry
 * @param checkTuples     reference tuples modeling the index contents
 * @param searchDocument  document whose field 0 is tokenized as the query
 * @param isPartitioned   whether the index is length-partitioned (shifts the
 *                        element field index and may enable length filtering)
 */
@SuppressWarnings("unchecked")
public static void getExpectedResults(int[] scanCountArray, TreeSet<CheckTuple> checkTuples, ITupleReference searchDocument, IBinaryTokenizer tokenizer, ISerializerDeserializer tokenSerde, IInvertedIndexSearchModifier searchModifier, List<Integer> expectedResults, boolean isPartitioned) throws IOException {
    // Reset scan count array and the output list.
    Arrays.fill(scanCountArray, 0);
    expectedResults.clear();
    GrowableArray tokenData = new GrowableArray();
    tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0), searchDocument.getFieldLength(0));
    // First pass through the tokenizer just counts the query tokens.
    int numQueryTokens = 0;
    while (tokenizer.hasNext()) {
        tokenizer.next();
        numQueryTokens++;
    }
    short numTokensLowerBound = -1;
    short numTokensUpperBound = -1;
    int invListElementField = 1;
    if (isPartitioned) {
        numTokensLowerBound = searchModifier.getNumTokensLowerBound((short) numQueryTokens);
        numTokensUpperBound = searchModifier.getNumTokensUpperBound((short) numQueryTokens);
        invListElementField = 2;
    }
    int occurrenceThreshold = searchModifier.getOccurrenceThreshold(numQueryTokens);
    // Second pass: for each token, scan its inverted list(s) and count occurrences.
    tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0), searchDocument.getFieldLength(0));
    while (tokenizer.hasNext()) {
        tokenizer.next();
        IToken token = tokenizer.getToken();
        tokenData.reset();
        token.serializeToken(tokenData);
        ByteArrayInputStream inStream = new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
        DataInput dataIn = new DataInputStream(inStream);
        Comparable tokenObj = (Comparable) tokenSerde.deserialize(dataIn);
        CheckTuple lowKey;
        if (numTokensLowerBound < 0) {
            // Index is not partitioned, or no length filtering is possible for this search modifier.
            lowKey = new CheckTuple(1, 1);
            lowKey.appendField(tokenObj);
        } else {
            // Index is length partitioned, and search modifier supports length filtering.
            lowKey = new CheckTuple(2, 2);
            lowKey.appendField(tokenObj);
            lowKey.appendField(Short.valueOf(numTokensLowerBound));
        }
        CheckTuple highKey;
        if (numTokensUpperBound < 0) {
            // Index is not partitioned, or no length filtering is possible for this search modifier.
            highKey = new CheckTuple(1, 1);
            highKey.appendField(tokenObj);
        } else {
            // Index is length partitioned, and search modifier supports length filtering.
            highKey = new CheckTuple(2, 2);
            highKey.appendField(tokenObj);
            highKey.appendField(Short.valueOf(numTokensUpperBound));
        }
        // Get view over check tuples containing the inverted list corresponding to this token.
        SortedSet<CheckTuple> invList = OrderedIndexTestUtils.getPrefixExpectedSubset(checkTuples, lowKey, highKey);
        // Iterate over the inverted list and update the scan count array.
        for (CheckTuple checkTuple : invList) {
            Integer element = (Integer) checkTuple.getField(invListElementField);
            scanCountArray[element]++;
        }
    }
    // Keep the elements that satisfy the occurrence threshold.
    // (The list was already cleared on entry; no second clear is needed.)
    for (int i = 0; i < scanCountArray.length; i++) {
        if (scanCountArray[i] >= occurrenceThreshold) {
            expectedResults.add(i);
        }
    }
}
Also used : DataInput(java.io.DataInput) CheckTuple(org.apache.hyracks.storage.am.common.CheckTuple) IToken(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken) ByteArrayInputStream(java.io.ByteArrayInputStream) GrowableArray(org.apache.hyracks.data.std.util.GrowableArray) DataInputStream(java.io.DataInputStream)

Example 5 with IToken

use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken in project asterixdb by apache.

The class InvertedIndexTokenizingTupleIterator, method next().

/**
 * Advances the tokenizer and rebuilds {@code tupleReference} as
 * [token, inverted-list element fields...] for insertion into the index.
 */
public void next() throws HyracksDataException {
    tokenizer.next();
    IToken token = tokenizer.getToken();
    tupleBuilder.reset();
    // Serialize the token into the builder's first field.
    try {
        token.serializeToken(tupleBuilder.getFieldData());
    } catch (IOException e) {
        throw new HyracksDataException(e);
    }
    tupleBuilder.addFieldEndOffset();
    // Copy the inverted-list element fields (input field 0 is the document field).
    for (int field = 0; field < invListFieldCount; field++) {
        int src = field + 1;
        tupleBuilder.addField(inputTuple.getFieldData(src), inputTuple.getFieldStart(src), inputTuple.getFieldLength(src));
    }
    // Re-point the reusable tuple reference at the freshly built tuple for insertion.
    tupleReference.reset(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray());
}
Also used : IToken(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken) IOException(java.io.IOException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException)

Aggregations

IToken (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken)5 IOException (java.io.IOException)4 HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException)4 ByteArrayInputStream (java.io.ByteArrayInputStream)1 DataInput (java.io.DataInput)1 DataInputStream (java.io.DataInputStream)1 GrowableArray (org.apache.hyracks.data.std.util.GrowableArray)1 ITupleReference (org.apache.hyracks.dataflow.common.data.accessors.ITupleReference)1 CheckTuple (org.apache.hyracks.storage.am.common.CheckTuple)1 IBinaryTokenizer (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer)1 TokenizerType (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType)1