Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class InvertedIndexPOperator, method buildInvertedIndexRuntime:
public Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> buildInvertedIndexRuntime(
        MetadataProvider metadataProvider, JobGenContext context, JobSpecification jobSpec,
        AbstractUnnestMapOperator unnestMap, IOperatorSchema opSchema, boolean retainInput,
        boolean retainMissing, String datasetName, Dataset dataset, String indexName,
        ATypeTag searchKeyType, int[] keyFields, SearchModifierType searchModifierType,
        IAlgebricksConstantValue similarityThreshold, int[] minFilterFieldIndexes,
        int[] maxFilterFieldIndexes, boolean isFullTextSearchQuery) throws AlgebricksException {
    try {
        IAObject simThresh = ((AsterixConstantValue) similarityThreshold).getObject();
        int numPrimaryKeys = dataset.getPrimaryKeys().size();
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(metadataProvider.getMetadataTxnContext(),
                dataset.getDataverseName(), dataset.getDatasetName(), indexName);
        if (secondaryIndex == null) {
            throw new AlgebricksException(
                    "Code generation error: no index " + indexName + " for dataset " + datasetName);
        }
        IVariableTypeEnvironment typeEnv = context.getTypeEnvironment(unnestMap);
        RecordDescriptor outputRecDesc = JobGenHelper.mkRecordDescriptor(typeEnv, opSchema, context);
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> secondarySplitsAndConstraint =
                metadataProvider.getSplitProviderAndConstraints(dataset, indexName);
        // TODO: Here we assume there is only one search key field.
        int queryField = keyFields[0];
        // Get tokenizer and search modifier factories.
        IInvertedIndexSearchModifierFactory searchModifierFactory =
                InvertedIndexAccessMethod.getSearchModifierFactory(searchModifierType, simThresh, secondaryIndex);
        IBinaryTokenizerFactory queryTokenizerFactory =
                InvertedIndexAccessMethod.getBinaryTokenizerFactory(searchModifierType, searchKeyType, secondaryIndex);
        IIndexDataflowHelperFactory dataflowHelperFactory = new IndexDataflowHelperFactory(
                metadataProvider.getStorageComponentProvider().getStorageManager(),
                secondarySplitsAndConstraint.first);
        LSMInvertedIndexSearchOperatorDescriptor invIndexSearchOp = new LSMInvertedIndexSearchOperatorDescriptor(
                jobSpec, outputRecDesc, queryField, dataflowHelperFactory, queryTokenizerFactory,
                searchModifierFactory, retainInput, retainMissing, context.getMissingWriterFactory(),
                dataset.getSearchCallbackFactory(metadataProvider.getStorageComponentProvider(), secondaryIndex,
                        ((JobEventListenerFactory) jobSpec.getJobletEventListenerFactory()).getJobId(),
                        IndexOperation.SEARCH, null),
                minFilterFieldIndexes, maxFilterFieldIndexes, isFullTextSearchQuery, numPrimaryKeys, false);
        return new Pair<>(invIndexSearchOp, secondarySplitsAndConstraint.second);
    } catch (MetadataException e) {
        throw new AlgebricksException(e);
    }
}
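For reference, a minimal sketch of the kind of dispatch InvertedIndexAccessMethod.getSearchModifierFactory performs. The factory classes (ConjunctiveSearchModifierFactory, JaccardSearchModifierFactory, EditDistanceSearchModifierFactory) are the Hyracks-provided ones; the enum constants, the casts of simThresh, and the gramLength variable are assumptions for illustration, not the method's actual source:

    // Hypothetical sketch only; the real dispatch handles more modifier kinds
    // and derives gramLength and thresholds from the secondary index metadata.
    switch (searchModifierType) {
        case CONJUNCTIVE:
            return new ConjunctiveSearchModifierFactory();
        case JACCARD:
            // Assumes simThresh carries an AFloat Jaccard threshold.
            return new JaccardSearchModifierFactory(((AFloat) simThresh).getFloatValue());
        case EDIT_DISTANCE:
            // Assumes simThresh carries an AInt32 edit-distance threshold.
            return new EditDistanceSearchModifierFactory(gramLength, ((AInt32) simThresh).getIntegerValue());
        default:
            throw new AlgebricksException("Unsupported search modifier type: " + searchModifierType);
    }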
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class MetadataProvider, method getBinaryTokenizerRuntime:
// Gets a tokenizer for bulk-loading data into an n-gram or keyword index.
private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getBinaryTokenizerRuntime(
        String dataverseName, String datasetName, String indexName, IOperatorSchema inputSchema,
        IOperatorSchema propagatedSchema, List<LogicalVariable> primaryKeys,
        List<LogicalVariable> secondaryKeys, RecordDescriptor recordDesc, JobSpecification spec,
        IndexType indexType) throws AlgebricksException {
    // Sanity checks.
    if (primaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite primary key.");
    }
    if (secondaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite secondary key fields.");
    }
    boolean isPartitioned = indexType == IndexType.LENGTH_PARTITIONED_WORD_INVIX
            || indexType == IndexType.LENGTH_PARTITIONED_NGRAM_INVIX;
    // Number of keys that need to be propagated.
    int numKeys = inputSchema.getSize();
    // Collect the logical variables that are neither primary nor secondary keys;
    // these are propagated through the TokenizeOperator.
    List<LogicalVariable> otherKeys = new ArrayList<>();
    for (int k = 0; k < inputSchema.getSize(); k++) {
        LogicalVariable var = inputSchema.getVariable(k);
        if (!primaryKeys.contains(var) && !secondaryKeys.contains(var)) {
            otherKeys.add(var);
        }
    }
    // For tokenization, sorting, and loading: one token (+ an optional
    // length-partitioning field) + primary keys + secondary keys + other variables.
    // Secondary keys and other variables are simply passed through to the
    // IndexInsertDelete operator.
    int numTokenKeyPairFields = (!isPartitioned) ? 1 + numKeys : 2 + numKeys;
    // Generate field permutations for the input: PKs, then other variables, then SKs.
    int[] fieldPermutation = new int[numKeys];
    int[] modificationCallbackPrimaryKeyFields = new int[primaryKeys.size()];
    int i = 0;
    int j = 0;
    for (LogicalVariable varKey : primaryKeys) {
        fieldPermutation[i] = propagatedSchema.findVariable(varKey);
        modificationCallbackPrimaryKeyFields[j] = i;
        i++;
        j++;
    }
    for (LogicalVariable varKey : otherKeys) {
        fieldPermutation[i] = propagatedSchema.findVariable(varKey);
        i++;
    }
    for (LogicalVariable varKey : secondaryKeys) {
        fieldPermutation[i] = propagatedSchema.findVariable(varKey);
        i++;
    }
    Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataverseName, datasetName);
    String itemTypeName = dataset.getItemTypeName();
    IAType itemType;
    try {
        itemType = MetadataManager.INSTANCE
                .getDatatype(mdTxnCtx, dataset.getItemTypeDataverseName(), itemTypeName).getDatatype();
        if (itemType.getTypeTag() != ATypeTag.OBJECT) {
            throw new AlgebricksException("Only record types can be tokenized.");
        }
        ARecordType recType = (ARecordType) itemType;
        // Index parameters.
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(),
                dataset.getDatasetName(), indexName);
        List<List<String>> secondaryKeyExprs = secondaryIndex.getKeyFieldNames();
        List<IAType> secondaryKeyTypeEntries = secondaryIndex.getKeyFieldTypes();
        int numTokenFields = (!isPartitioned) ? secondaryKeys.size() : secondaryKeys.size() + 1;
        ITypeTraits[] tokenTypeTraits = new ITypeTraits[numTokenFields];
        ITypeTraits[] invListsTypeTraits = new ITypeTraits[primaryKeys.size()];
        // Find the key type of the secondary key. If it's a derived type, return the
        // derived type (e.g. UNORDERED LIST -> return the UNORDERED LIST type).
        Pair<IAType, Boolean> keyPairType = Index.getNonNullableOpenFieldType(
                secondaryKeyTypeEntries.get(0), secondaryKeyExprs.get(0), recType);
        IAType secondaryKeyType = keyPairType.first;
        List<List<String>> partitioningKeys = dataset.getPrimaryKeys();
        i = 0;
        for (List<String> partitioningKey : partitioningKeys) {
            IAType keyType = recType.getSubFieldType(partitioningKey);
            invListsTypeTraits[i] = TypeTraitProvider.INSTANCE.getTypeTrait(keyType);
            ++i;
        }
        tokenTypeTraits[0] = NonTaggedFormatUtil.getTokenTypeTrait(secondaryKeyType);
        if (isPartitioned) {
            // The partitioning field is hardcoded to be a short *without* an Asterix type tag.
            tokenTypeTraits[1] = ShortPointable.TYPE_TRAITS;
        }
        IBinaryTokenizerFactory tokenizerFactory = NonTaggedFormatUtil.getBinaryTokenizerFactory(
                secondaryKeyType.getTypeTag(), indexType, secondaryIndex.getGramLength());
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint =
                getSplitProviderAndConstraints(dataset, secondaryIndex.getIndexName());
        // Generate the output record format.
        ISerializerDeserializer<?>[] tokenKeyPairFields = new ISerializerDeserializer[numTokenKeyPairFields];
        ITypeTraits[] tokenKeyPairTypeTraits = new ITypeTraits[numTokenKeyPairFields];
        ISerializerDeserializerProvider serdeProvider = FormatUtils.getDefaultFormat().getSerdeProvider();
        // #1. Propagate all input variables.
        for (int k = 0; k < recordDesc.getFieldCount(); k++) {
            tokenKeyPairFields[k] = recordDesc.getFields()[k];
            tokenKeyPairTypeTraits[k] = recordDesc.getTypeTraits()[k];
        }
        int tokenOffset = recordDesc.getFieldCount();
        // #2. Specify the token type.
        tokenKeyPairFields[tokenOffset] = serdeProvider.getSerializerDeserializer(secondaryKeyType);
        tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[0];
        tokenOffset++;
        // #3. Specify the length-partitioning key: the number of tokens.
        if (isPartitioned) {
            tokenKeyPairFields[tokenOffset] = ShortSerializerDeserializer.INSTANCE;
            tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[1];
        }
        RecordDescriptor tokenKeyPairRecDesc = new RecordDescriptor(tokenKeyPairFields, tokenKeyPairTypeTraits);
        // Field to be tokenized: the SK (last field in the permutation).
        int docField = fieldPermutation[fieldPermutation.length - 1];
        // Fields to be propagated.
        int[] keyFields = new int[numKeys];
        for (int k = 0; k < keyFields.length; k++) {
            keyFields[k] = k;
        }
        IOperatorDescriptor tokenizerOp = new BinaryTokenizerOperatorDescriptor(spec, tokenKeyPairRecDesc,
                tokenizerFactory, docField, keyFields, isPartitioned, true);
        return new Pair<>(tokenizerOp, splitsAndConstraint.second);
    } catch (Exception e) {
        throw new AlgebricksException(e);
    }
}
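To make the field bookkeeping concrete, here is a worked illustration with hypothetical values: a LENGTH_PARTITIONED_WORD_INVIX index, one primary key, one secondary key, and no other propagated variables:

    // numKeys               = inputSchema.getSize() = 2   (PK + SK)
    // isPartitioned         = true
    // numTokenKeyPairFields = 2 + numKeys           = 4
    // fieldPermutation orders the propagated fields as [PK, other..., SK], so
    // docField (the tokenized field) is the last entry, i.e. the SK.
    // tokenKeyPairRecDesc then lays the tokenizer's output out as:
    //   [ propagated input fields (2) | token | token count (short, untagged) ]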
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class InvertedIndexResourceFactoryProvider, method getResourceFactory:
@Override
public IResourceFactory getResourceFactory(MetadataProvider mdProvider, Dataset dataset, Index index,
        ARecordType recordType, ARecordType metaType, ILSMMergePolicyFactory mergePolicyFactory,
        Map<String, String> mergePolicyProperties, ITypeTraits[] filterTypeTraits,
        IBinaryComparatorFactory[] filterCmpFactories) throws AlgebricksException {
    // Get basic info.
    List<List<String>> primaryKeys = dataset.getPrimaryKeys();
    List<List<String>> secondaryKeys = index.getKeyFieldNames();
    List<String> filterFieldName = DatasetUtil.getFilterField(dataset);
    int numPrimaryKeys = primaryKeys.size();
    int numSecondaryKeys = secondaryKeys.size();
    // Validate.
    if (dataset.getDatasetType() != DatasetType.INTERNAL) {
        throw new CompilationException(ErrorCode.COMPILATION_INDEX_TYPE_NOT_SUPPORTED_FOR_DATASET_TYPE,
                index.getIndexType().name(), dataset.getDatasetType());
    }
    if (numPrimaryKeys > 1) {
        throw new AsterixException("Cannot create inverted index on dataset with composite primary key.");
    }
    if (numSecondaryKeys > 1) {
        throw new AsterixException("Cannot create composite inverted index on multiple fields.");
    }
    boolean isPartitioned = index.getIndexType() == IndexType.LENGTH_PARTITIONED_WORD_INVIX
            || index.getIndexType() == IndexType.LENGTH_PARTITIONED_NGRAM_INVIX;
    int numTokenKeyPairFields = (!isPartitioned) ? 1 + numPrimaryKeys : 2 + numPrimaryKeys;
    int[] invertedIndexFields = null;
    int[] secondaryFilterFieldsForNonBulkLoadOps = null;
    int[] invertedIndexFieldsForNonBulkLoadOps = null;
    int[] secondaryFilterFields = null;
    if (filterFieldName != null) {
        invertedIndexFields = new int[numTokenKeyPairFields];
        for (int i = 0; i < invertedIndexFields.length; i++) {
            invertedIndexFields[i] = i;
        }
        secondaryFilterFieldsForNonBulkLoadOps = new int[filterFieldName.size()];
        secondaryFilterFieldsForNonBulkLoadOps[0] = numSecondaryKeys + numPrimaryKeys;
        invertedIndexFieldsForNonBulkLoadOps = new int[numSecondaryKeys + numPrimaryKeys];
        for (int i = 0; i < invertedIndexFieldsForNonBulkLoadOps.length; i++) {
            invertedIndexFieldsForNonBulkLoadOps[i] = i;
        }
        secondaryFilterFields = new int[filterFieldName.size()];
        // The filter field follows the token/key fields (the expression below
        // simplifies to numTokenKeyPairFields).
        secondaryFilterFields[0] = numTokenKeyPairFields - numPrimaryKeys + numPrimaryKeys;
    }
    IStorageComponentProvider storageComponentProvider = mdProvider.getStorageComponentProvider();
    IStorageManager storageManager = storageComponentProvider.getStorageManager();
    ILSMOperationTrackerFactory opTrackerFactory = dataset.getIndexOperationTrackerFactory(index);
    ILSMIOOperationCallbackFactory ioOpCallbackFactory = dataset.getIoOperationCallbackFactory(index);
    IMetadataPageManagerFactory metadataPageManagerFactory =
            storageComponentProvider.getMetadataPageManagerFactory();
    AsterixVirtualBufferCacheProvider vbcProvider = new AsterixVirtualBufferCacheProvider(dataset.getDatasetId());
    ILSMIOOperationSchedulerProvider ioSchedulerProvider =
            storageComponentProvider.getIoOperationSchedulerProvider();
    boolean durable = !dataset.isTemp();
    double bloomFilterFalsePositiveRate = mdProvider.getStorageProperties().getBloomFilterFalsePositiveRate();
    ITypeTraits[] typeTraits = getInvListTypeTraits(mdProvider, dataset, recordType, metaType);
    IBinaryComparatorFactory[] cmpFactories =
            getInvListComparatorFactories(mdProvider, dataset, recordType, metaType);
    ITypeTraits[] tokenTypeTraits = getTokenTypeTraits(dataset, index, recordType, metaType);
    IBinaryComparatorFactory[] tokenCmpFactories =
            getTokenComparatorFactories(dataset, index, recordType, metaType);
    IBinaryTokenizerFactory tokenizerFactory = getTokenizerFactory(dataset, index, recordType, metaType);
    return new LSMInvertedIndexLocalResourceFactory(storageManager, typeTraits, cmpFactories, filterTypeTraits,
            filterCmpFactories, secondaryFilterFields, opTrackerFactory, ioOpCallbackFactory,
            metadataPageManagerFactory, vbcProvider, ioSchedulerProvider, mergePolicyFactory,
            mergePolicyProperties, durable, tokenTypeTraits, tokenCmpFactories, tokenizerFactory, isPartitioned,
            invertedIndexFields, secondaryFilterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps,
            bloomFilterFalsePositiveRate);
}
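Again with hypothetical values (a LENGTH_PARTITIONED_NGRAM_INVIX index, numPrimaryKeys = 1, numSecondaryKeys = 1, one filter field), the index arrays computed above work out to:

    // numTokenKeyPairFields                  = 2 + 1 = 3   (token, token count, PK)
    // invertedIndexFields                    = {0, 1, 2}
    // secondaryFilterFieldsForNonBulkLoadOps = {2}         (numSecondaryKeys + numPrimaryKeys)
    // invertedIndexFieldsForNonBulkLoadOps   = {0, 1}
    // secondaryFilterFields                  = {3}         (numTokenKeyPairFields)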
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class LSMInvertedIndexTestUtils, method createHashedNGramInvIndexTestContext:
public static LSMInvertedIndexTestContext createHashedNGramInvIndexTestContext(
        LSMInvertedIndexTestHarness harness, InvertedIndexType invIndexType)
        throws IOException, HyracksDataException {
    ISerializerDeserializer[] fieldSerdes = getHashedIndexFieldSerdes(invIndexType);
    ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
    IBinaryTokenizerFactory tokenizerFactory =
            new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true, true, false, tokenFactory);
    return LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
            invIndexType, null, null, null, null, null, null);
}
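A brief usage sketch for a factory like the one above. This is an assumption-laden illustration, not part of the test utils: it presumes the tokenizer consumes a Hyracks-serialized UTF-8 string without a type tag (matching the sourceHasTypeTag = false flag) and that IBinaryTokenizer exposes reset/hasNext/next/getToken, as in the Hyracks tokenizer interfaces:

    // Hypothetical usage; assume the enclosing method declares IOException.
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    new UTF8StringSerializerDeserializer().serialize("database", new DataOutputStream(baos));
    byte[] data = baos.toByteArray();
    IBinaryTokenizer tokenizer = tokenizerFactory.createTokenizer();
    tokenizer.reset(data, 0, data.length);
    while (tokenizer.hasNext()) {
        tokenizer.next();
        IToken token = tokenizer.getToken();
        GrowableArray tokenData = new GrowableArray();
        token.serializeToken(tokenData); // hashed n-gram bytes land in tokenData
    }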
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class LSMInvertedIndexTestUtils, method createWordInvIndexTestContext:
public static LSMInvertedIndexTestContext createWordInvIndexTestContext(
        LSMInvertedIndexTestHarness harness, InvertedIndexType invIndexType)
        throws IOException, HyracksDataException {
    ISerializerDeserializer[] fieldSerdes = getNonHashedIndexFieldSerdes(invIndexType);
    ITokenFactory tokenFactory = new UTF8WordTokenFactory();
    IBinaryTokenizerFactory tokenizerFactory =
            new DelimitedUTF8StringBinaryTokenizerFactory(true, false, tokenFactory);
    return LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
            invIndexType, null, null, null, null, null, null);
}
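The word-index builder differs from the hashed n-gram one only in its token and tokenizer factories. A hypothetical helper (the name and the flag interpretations are assumptions, not part of LSMInvertedIndexTestUtils) makes the contrast explicit:

    // Illustrative only. Flag readings assumed from the constructor calls above:
    // word tokenizer:   (ignoreTokenCount, sourceHasTypeTag, tokenFactory);
    // n-gram tokenizer: (gramLength, usePrePost, ignoreTokenCount, sourceHasTypeTag, tokenFactory).
    static IBinaryTokenizerFactory testTokenizerFactory(boolean hashedNGram) {
        return hashedNGram
                ? new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true, true, false,
                        new HashedUTF8NGramTokenFactory())
                : new DelimitedUTF8StringBinaryTokenizerFactory(true, false, new UTF8WordTokenFactory());
    }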