Search in sources :

Example 6 with IBinaryTokenizer

Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer in the asterixdb project by apache.

From the class LSMInvertedIndexTestUtils, method testIndexSearch.

/**
 * Runs a series of search queries against the inverted index held by {@code testCtx} and fails
 * the test as soon as one query's results differ from the expected results.
 *
 * <p>{@code numDocQueries + numRandomQueries} queries are issued in total. The first
 * {@code numDocQueries} of them use a randomly chosen corpus document as the query; if the corpus
 * holds fewer documents than that, the surplus ones, like all remaining queries, use a tuple
 * produced by {@code tupleGen}. Queries for which the index raises
 * {@code ErrorCode.OCCURRENCE_THRESHOLD_PANIC_EXCEPTION} are skipped without verification.
 *
 * <p>Actual results are sorted before comparison because some search cursors may not deliver
 * them in order. The comparison is positional, so it presumably relies on
 * {@code getExpectedResults} producing its list in ascending order — confirm against that helper.
 *
 * @param testCtx          test context providing the index, tokenizer factory, document corpus,
 *                         check tuples, field serdes and index type
 * @param tupleGen         source of random query tuples
 * @param rnd              random source used to pick corpus documents
 * @param numDocQueries    number of queries to draw from the document corpus
 * @param numRandomQueries number of additional randomly generated queries
 * @param searchModifier   search modifier used for both the index search and the expected results
 * @param scanCountArray   array handed through to {@code getExpectedResults}
 * @throws IOException          declared by the signature for callers
 * @throws HyracksDataException if the index fails with anything other than the panic error code
 */
public static void testIndexSearch(LSMInvertedIndexTestContext testCtx, TupleGenerator tupleGen, Random rnd, int numDocQueries, int numRandomQueries, IInvertedIndexSearchModifier searchModifier, int[] scanCountArray) throws IOException, HyracksDataException {
    IInvertedIndex invIndex = testCtx.invIndex;
    IInvertedIndexAccessor accessor = (IInvertedIndexAccessor) invIndex.createAccessor(NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
    IBinaryTokenizer tokenizer = testCtx.getTokenizerFactory().createTokenizer();
    InvertedIndexSearchPredicate searchPred = new InvertedIndexSearchPredicate(tokenizer, searchModifier);
    List<ITupleReference> documentCorpus = testCtx.getDocumentCorpus();
    // Project away the primary-key field.
    int[] fieldPermutation = new int[] { 0 };
    PermutingTupleReference searchDocument = new PermutingTupleReference(fieldPermutation);
    int numQueries = numDocQueries + numRandomQueries;
    for (int i = 0; i < numQueries; i++) {
        // If number of documents in the corpus is less than numDocQueries, then replace the remaining ones with random queries.
        if (i >= numDocQueries || i >= documentCorpus.size()) {
            // Generate a random query.
            ITupleReference randomQuery = tupleGen.next();
            searchDocument.reset(randomQuery);
        } else {
            // Pick a random document from the corpus to use as the search query.
            int queryIndex = Math.abs(rnd.nextInt() % documentCorpus.size());
            searchDocument.reset(documentCorpus.get(queryIndex));
        }
        // Set query tuple in search predicate.
        searchPred.setQueryTuple(searchDocument);
        searchPred.setQueryFieldIndex(0);
        IIndexCursor resultCursor = accessor.createSearchCursor(false);
        // The search itself runs inside the try/finally so the cursor is closed on every exit path,
        // including a non-panic failure of search().
        try {
            // Consume cursor and deserialize results so we can sort them. Some search cursors may not deliver the result sorted (e.g., LSM search cursor).
            List<Integer> actualResults = new ArrayList<>();
            try {
                accessor.search(resultCursor, searchPred);
                while (resultCursor.hasNext()) {
                    resultCursor.next();
                    ITupleReference resultTuple = resultCursor.getTuple();
                    int actual = IntegerPointable.getInteger(resultTuple.getFieldData(0), resultTuple.getFieldStart(0));
                    actualResults.add(actual);
                }
            } catch (HyracksDataException e) {
                if (e.getErrorCode() == ErrorCode.OCCURRENCE_THRESHOLD_PANIC_EXCEPTION) {
                    // Ignore panic queries, whether raised while searching or while iterating.
                    continue;
                }
                throw e;
            }
            Collections.sort(actualResults);
            // Get expected results.
            List<Integer> expectedResults = new ArrayList<>();
            LSMInvertedIndexTestUtils.getExpectedResults(scanCountArray, testCtx.getCheckTuples(), searchDocument, tokenizer, testCtx.getFieldSerdes()[0], searchModifier, expectedResults, testCtx.getInvertedIndexType());
            Iterator<Integer> expectedIter = expectedResults.iterator();
            Iterator<Integer> actualIter = actualResults.iterator();
            while (expectedIter.hasNext() && actualIter.hasNext()) {
                int expected = expectedIter.next();
                int actual = actualIter.next();
                if (actual != expected) {
                    fail("Query results do not match. Encountered: " + actual + ". Expected: " + expected);
                }
            }
            // Whichever iterator still has elements tells us which side is longer.
            if (expectedIter.hasNext()) {
                fail("Query results do not match. Actual results missing.");
            }
            if (actualIter.hasNext()) {
                fail("Query results do not match. Actual contains too many results.");
            }
        } finally {
            resultCursor.close();
        }
    }
}
Also used : InvertedIndexSearchPredicate(org.apache.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate) ArrayList(java.util.ArrayList) IInvertedIndexAccessor(org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexAccessor) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) PermutingTupleReference(org.apache.hyracks.storage.am.common.tuples.PermutingTupleReference) ITupleReference(org.apache.hyracks.dataflow.common.data.accessors.ITupleReference) IIndexCursor(org.apache.hyracks.storage.common.IIndexCursor) IBinaryTokenizer(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer) IInvertedIndex(org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndex)

Example 7 with IBinaryTokenizer

Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer in the asterixdb project by apache.

From the class PartitionedInMemoryInvertedIndexOpContext, method setTokenizingTupleIterator.

/**
 * Builds a {@link PartitionedInvertedIndexTokenizingTupleIterator} around a newly created
 * tokenizer and installs it through {@code setTupleIter}.
 *
 * <p>The iterator receives the number of token comparator factories and the count of the
 * remaining b-tree fields (total field count minus the token comparator count).
 */
protected void setTokenizingTupleIterator() {
    // A fresh tokenizer is obtained from the factory on every call.
    final IBinaryTokenizer freshTokenizer = getTokenizerFactory().createTokenizer();
    final int tokenFieldCount = tokenCmpFactories.length;
    final int nonTokenFieldCount = btree.getFieldCount() - tokenCmpFactories.length;
    final PartitionedInvertedIndexTokenizingTupleIterator tokenizingIter =
            new PartitionedInvertedIndexTokenizingTupleIterator(tokenFieldCount, nonTokenFieldCount, freshTokenizer);
    setTupleIter(tokenizingIter);
}
Also used : IBinaryTokenizer(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer) PartitionedInvertedIndexTokenizingTupleIterator(org.apache.hyracks.storage.am.lsm.invertedindex.util.PartitionedInvertedIndexTokenizingTupleIterator)

Aggregations

IBinaryTokenizer (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer): 7; WordTokensEvaluator (org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator): 3; IScalarEvaluatorFactory (org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory): 3; IHyracksTaskContext (org.apache.hyracks.api.context.IHyracksTaskContext): 3; DelimitedUTF8StringBinaryTokenizer (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer): 3; ITokenFactory (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory): 3; HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException): 2; ITupleReference (org.apache.hyracks.dataflow.common.data.accessors.ITupleReference): 2; HashedUTF8WordTokenFactory (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory): 2; IOException (java.io.IOException): 1; ArrayList (java.util.ArrayList): 1; PermutingTupleReference (org.apache.hyracks.storage.am.common.tuples.PermutingTupleReference): 1; IInvertedIndex (org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndex): 1; IInvertedIndexAccessor (org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexAccessor): 1; InvertedIndexSearchPredicate (org.apache.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate): 1; IToken (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken): 1; TokenizerType (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType): 1; UTF8WordTokenFactory (org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory): 1; InvertedIndexTokenizingTupleIterator (org.apache.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTokenizingTupleIterator): 1; PartitionedInvertedIndexTokenizingTupleIterator (org.apache.hyracks.storage.am.lsm.invertedindex.util.PartitionedInvertedIndexTokenizingTupleIterator): 1