Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class InvertedIndexPOperator, method buildInvertedIndexRuntime:
public Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> buildInvertedIndexRuntime(
        MetadataProvider metadataProvider, JobGenContext context, JobSpecification jobSpec,
        AbstractUnnestMapOperator unnestMap, IOperatorSchema opSchema, boolean retainInput,
        boolean retainMissing, String datasetName, Dataset dataset, String indexName,
        ATypeTag searchKeyType, int[] keyFields, SearchModifierType searchModifierType,
        IAlgebricksConstantValue similarityThreshold, int[] minFilterFieldIndexes,
        int[] maxFilterFieldIndexes, boolean isFullTextSearchQuery) throws AlgebricksException {
    try {
        IAObject simThresh = ((AsterixConstantValue) similarityThreshold).getObject();
        int numPrimaryKeys = dataset.getPrimaryKeys().size();
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(metadataProvider.getMetadataTxnContext(),
                dataset.getDataverseName(), dataset.getDatasetName(), indexName);
        if (secondaryIndex == null) {
            throw new AlgebricksException(
                    "Code generation error: no index " + indexName + " for dataset " + datasetName);
        }
        IVariableTypeEnvironment typeEnv = context.getTypeEnvironment(unnestMap);
        RecordDescriptor outputRecDesc = JobGenHelper.mkRecordDescriptor(typeEnv, opSchema, context);
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> secondarySplitsAndConstraint =
                metadataProvider.getSplitProviderAndConstraints(dataset, indexName);
        // TODO: Here we assume there is only one search key field.
        int queryField = keyFields[0];
        // Get tokenizer and search modifier factories.
        IInvertedIndexSearchModifierFactory searchModifierFactory =
                InvertedIndexAccessMethod.getSearchModifierFactory(searchModifierType, simThresh, secondaryIndex);
        IBinaryTokenizerFactory queryTokenizerFactory =
                InvertedIndexAccessMethod.getBinaryTokenizerFactory(searchModifierType, searchKeyType, secondaryIndex);
        IIndexDataflowHelperFactory dataflowHelperFactory = new IndexDataflowHelperFactory(
                metadataProvider.getStorageComponentProvider().getStorageManager(),
                secondarySplitsAndConstraint.first);
        LSMInvertedIndexSearchOperatorDescriptor invIndexSearchOp = new LSMInvertedIndexSearchOperatorDescriptor(
                jobSpec, outputRecDesc, queryField, dataflowHelperFactory, queryTokenizerFactory,
                searchModifierFactory, retainInput, retainMissing, context.getMissingWriterFactory(),
                dataset.getSearchCallbackFactory(metadataProvider.getStorageComponentProvider(), secondaryIndex,
                        ((JobEventListenerFactory) jobSpec.getJobletEventListenerFactory()).getJobId(),
                        IndexOperation.SEARCH, null),
                minFilterFieldIndexes, maxFilterFieldIndexes, isFullTextSearchQuery, numPrimaryKeys, false);
        return new Pair<>(invIndexSearchOp, secondarySplitsAndConstraint.second);
    } catch (MetadataException e) {
        throw new AlgebricksException(e);
    }
}
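For reference, a minimal sketch of the kind of dispatch InvertedIndexAccessMethod.getSearchModifierFactory performs. The factory classes (ConjunctiveSearchModifierFactory, JaccardSearchModifierFactory, EditDistanceSearchModifierFactory) are the Hyracks-provided ones; the enum constants, the casts of simThresh, and the gramLength variable are assumptions for illustration, not the method's actual source:

    // Hypothetical sketch only; the real dispatch handles more modifier kinds
    // and derives gramLength and thresholds from the secondary index metadata.
    switch (searchModifierType) {
        case CONJUNCTIVE:
            return new ConjunctiveSearchModifierFactory();
        case JACCARD:
            // Assumes simThresh carries an AFloat Jaccard threshold.
            return new JaccardSearchModifierFactory(((AFloat) simThresh).getFloatValue());
        case EDIT_DISTANCE:
            // Assumes simThresh carries an AInt32 edit-distance threshold.
            return new EditDistanceSearchModifierFactory(gramLength, ((AInt32) simThresh).getIntegerValue());
        default:
            throw new AlgebricksException("Unsupported search modifier type: " + searchModifierType);
    }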
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class MetadataProvider, method getBinaryTokenizerRuntime:
// Gets a tokenizer for bulk-loading data into an n-gram or keyword index.
private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getBinaryTokenizerRuntime(
        String dataverseName, String datasetName, String indexName, IOperatorSchema inputSchema,
        IOperatorSchema propagatedSchema, List<LogicalVariable> primaryKeys,
        List<LogicalVariable> secondaryKeys, RecordDescriptor recordDesc, JobSpecification spec,
        IndexType indexType) throws AlgebricksException {
    // Sanity checks.
    if (primaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite primary key.");
    }
    if (secondaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite secondary key fields.");
    }
    boolean isPartitioned = indexType == IndexType.LENGTH_PARTITIONED_WORD_INVIX
            || indexType == IndexType.LENGTH_PARTITIONED_NGRAM_INVIX;
    // Number of keys that need to be propagated.
    int numKeys = inputSchema.getSize();
    // Collect the logical variables that are neither primary nor secondary keys;
    // these are propagated through the TokenizeOperator.
    List<LogicalVariable> otherKeys = new ArrayList<>();
    for (int k = 0; k < inputSchema.getSize(); k++) {
        LogicalVariable var = inputSchema.getVariable(k);
        if (!primaryKeys.contains(var) && !secondaryKeys.contains(var)) {
            otherKeys.add(var);
        }
    }
    // For tokenization, sorting, and loading: one token (+ an optional
    // length-partitioning field) + primary keys + secondary keys + other variables.
    // Secondary keys and other variables are simply passed through to the
    // IndexInsertDelete operator.
    int numTokenKeyPairFields = (!isPartitioned) ? 1 + numKeys : 2 + numKeys;
    // Generate field permutations for the input: PKs, then other variables, then SKs.
    int[] fieldPermutation = new int[numKeys];
    int[] modificationCallbackPrimaryKeyFields = new int[primaryKeys.size()];
    int i = 0;
    int j = 0;
    for (LogicalVariable varKey : primaryKeys) {
        fieldPermutation[i] = propagatedSchema.findVariable(varKey);
        modificationCallbackPrimaryKeyFields[j] = i;
        i++;
        j++;
    }
    for (LogicalVariable varKey : otherKeys) {
        fieldPermutation[i] = propagatedSchema.findVariable(varKey);
        i++;
    }
    for (LogicalVariable varKey : secondaryKeys) {
        fieldPermutation[i] = propagatedSchema.findVariable(varKey);
        i++;
    }
    Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataverseName, datasetName);
    String itemTypeName = dataset.getItemTypeName();
    IAType itemType;
    try {
        itemType = MetadataManager.INSTANCE
                .getDatatype(mdTxnCtx, dataset.getItemTypeDataverseName(), itemTypeName).getDatatype();
        if (itemType.getTypeTag() != ATypeTag.OBJECT) {
            throw new AlgebricksException("Only record types can be tokenized.");
        }
        ARecordType recType = (ARecordType) itemType;
        // Index parameters.
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(),
                dataset.getDatasetName(), indexName);
        List<List<String>> secondaryKeyExprs = secondaryIndex.getKeyFieldNames();
        List<IAType> secondaryKeyTypeEntries = secondaryIndex.getKeyFieldTypes();
        int numTokenFields = (!isPartitioned) ? secondaryKeys.size() : secondaryKeys.size() + 1;
        ITypeTraits[] tokenTypeTraits = new ITypeTraits[numTokenFields];
        ITypeTraits[] invListsTypeTraits = new ITypeTraits[primaryKeys.size()];
        // Find the key type of the secondary key. If it's a derived type, return the
        // derived type (e.g. UNORDERED LIST -> return the UNORDERED LIST type).
        Pair<IAType, Boolean> keyPairType = Index.getNonNullableOpenFieldType(
                secondaryKeyTypeEntries.get(0), secondaryKeyExprs.get(0), recType);
        IAType secondaryKeyType = keyPairType.first;
        List<List<String>> partitioningKeys = dataset.getPrimaryKeys();
        i = 0;
        for (List<String> partitioningKey : partitioningKeys) {
            IAType keyType = recType.getSubFieldType(partitioningKey);
            invListsTypeTraits[i] = TypeTraitProvider.INSTANCE.getTypeTrait(keyType);
            ++i;
        }
        tokenTypeTraits[0] = NonTaggedFormatUtil.getTokenTypeTrait(secondaryKeyType);
        if (isPartitioned) {
            // The partitioning field is hardcoded to be a short *without* an Asterix type tag.
            tokenTypeTraits[1] = ShortPointable.TYPE_TRAITS;
        }
        IBinaryTokenizerFactory tokenizerFactory = NonTaggedFormatUtil.getBinaryTokenizerFactory(
                secondaryKeyType.getTypeTag(), indexType, secondaryIndex.getGramLength());
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint =
                getSplitProviderAndConstraints(dataset, secondaryIndex.getIndexName());
        // Generate the output record format.
        ISerializerDeserializer<?>[] tokenKeyPairFields = new ISerializerDeserializer[numTokenKeyPairFields];
        ITypeTraits[] tokenKeyPairTypeTraits = new ITypeTraits[numTokenKeyPairFields];
        ISerializerDeserializerProvider serdeProvider = FormatUtils.getDefaultFormat().getSerdeProvider();
        // #1. Propagate all input variables.
        for (int k = 0; k < recordDesc.getFieldCount(); k++) {
            tokenKeyPairFields[k] = recordDesc.getFields()[k];
            tokenKeyPairTypeTraits[k] = recordDesc.getTypeTraits()[k];
        }
        int tokenOffset = recordDesc.getFieldCount();
        // #2. Specify the token type.
        tokenKeyPairFields[tokenOffset] = serdeProvider.getSerializerDeserializer(secondaryKeyType);
        tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[0];
        tokenOffset++;
        // #3. Specify the length-partitioning key: the number of tokens.
        if (isPartitioned) {
            tokenKeyPairFields[tokenOffset] = ShortSerializerDeserializer.INSTANCE;
            tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[1];
        }
        RecordDescriptor tokenKeyPairRecDesc = new RecordDescriptor(tokenKeyPairFields, tokenKeyPairTypeTraits);
        // Field to be tokenized: the SK (last field in the permutation).
        int docField = fieldPermutation[fieldPermutation.length - 1];
        // Fields to be propagated.
        int[] keyFields = new int[numKeys];
        for (int k = 0; k < keyFields.length; k++) {
            keyFields[k] = k;
        }
        IOperatorDescriptor tokenizerOp = new BinaryTokenizerOperatorDescriptor(spec, tokenKeyPairRecDesc,
                tokenizerFactory, docField, keyFields, isPartitioned, true);
        return new Pair<>(tokenizerOp, splitsAndConstraint.second);
    } catch (Exception e) {
        throw new AlgebricksException(e);
    }
}
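To make the field bookkeeping concrete, here is a worked illustration with hypothetical values: a LENGTH_PARTITIONED_WORD_INVIX index, one primary key, one secondary key, and no other propagated variables:

    // numKeys               = inputSchema.getSize() = 2   (PK + SK)
    // isPartitioned         = true
    // numTokenKeyPairFields = 2 + numKeys           = 4
    // fieldPermutation orders the propagated fields as [PK, other..., SK], so
    // docField (the tokenized field) is the last entry, i.e. the SK.
    // tokenKeyPairRecDesc then lays the tokenizer's output out as:
    //   [ propagated input fields (2) | token | token count (short, untagged) ]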
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class InvertedIndexResourceFactoryProvider, method getResourceFactory:
@Override
public IResourceFactory getResourceFactory(MetadataProvider mdProvider, Dataset dataset, Index index,
        ARecordType recordType, ARecordType metaType, ILSMMergePolicyFactory mergePolicyFactory,
        Map<String, String> mergePolicyProperties, ITypeTraits[] filterTypeTraits,
        IBinaryComparatorFactory[] filterCmpFactories) throws AlgebricksException {
    // Get basic info.
    List<List<String>> primaryKeys = dataset.getPrimaryKeys();
    List<List<String>> secondaryKeys = index.getKeyFieldNames();
    List<String> filterFieldName = DatasetUtil.getFilterField(dataset);
    int numPrimaryKeys = primaryKeys.size();
    int numSecondaryKeys = secondaryKeys.size();
    // Validate.
    if (dataset.getDatasetType() != DatasetType.INTERNAL) {
        throw new CompilationException(ErrorCode.COMPILATION_INDEX_TYPE_NOT_SUPPORTED_FOR_DATASET_TYPE,
                index.getIndexType().name(), dataset.getDatasetType());
    }
    if (numPrimaryKeys > 1) {
        throw new AsterixException("Cannot create inverted index on dataset with composite primary key.");
    }
    if (numSecondaryKeys > 1) {
        throw new AsterixException("Cannot create composite inverted index on multiple fields.");
    }
    boolean isPartitioned = index.getIndexType() == IndexType.LENGTH_PARTITIONED_WORD_INVIX
            || index.getIndexType() == IndexType.LENGTH_PARTITIONED_NGRAM_INVIX;
    int numTokenKeyPairFields = (!isPartitioned) ? 1 + numPrimaryKeys : 2 + numPrimaryKeys;
    int[] invertedIndexFields = null;
    int[] secondaryFilterFieldsForNonBulkLoadOps = null;
    int[] invertedIndexFieldsForNonBulkLoadOps = null;
    int[] secondaryFilterFields = null;
    if (filterFieldName != null) {
        invertedIndexFields = new int[numTokenKeyPairFields];
        for (int i = 0; i < invertedIndexFields.length; i++) {
            invertedIndexFields[i] = i;
        }
        secondaryFilterFieldsForNonBulkLoadOps = new int[filterFieldName.size()];
        secondaryFilterFieldsForNonBulkLoadOps[0] = numSecondaryKeys + numPrimaryKeys;
        invertedIndexFieldsForNonBulkLoadOps = new int[numSecondaryKeys + numPrimaryKeys];
        for (int i = 0; i < invertedIndexFieldsForNonBulkLoadOps.length; i++) {
            invertedIndexFieldsForNonBulkLoadOps[i] = i;
        }
        secondaryFilterFields = new int[filterFieldName.size()];
        // The filter field follows the token/key fields (the expression below
        // simplifies to numTokenKeyPairFields).
        secondaryFilterFields[0] = numTokenKeyPairFields - numPrimaryKeys + numPrimaryKeys;
    }
    IStorageComponentProvider storageComponentProvider = mdProvider.getStorageComponentProvider();
    IStorageManager storageManager = storageComponentProvider.getStorageManager();
    ILSMOperationTrackerFactory opTrackerFactory = dataset.getIndexOperationTrackerFactory(index);
    ILSMIOOperationCallbackFactory ioOpCallbackFactory = dataset.getIoOperationCallbackFactory(index);
    IMetadataPageManagerFactory metadataPageManagerFactory =
            storageComponentProvider.getMetadataPageManagerFactory();
    AsterixVirtualBufferCacheProvider vbcProvider = new AsterixVirtualBufferCacheProvider(dataset.getDatasetId());
    ILSMIOOperationSchedulerProvider ioSchedulerProvider =
            storageComponentProvider.getIoOperationSchedulerProvider();
    boolean durable = !dataset.isTemp();
    double bloomFilterFalsePositiveRate = mdProvider.getStorageProperties().getBloomFilterFalsePositiveRate();
    ITypeTraits[] typeTraits = getInvListTypeTraits(mdProvider, dataset, recordType, metaType);
    IBinaryComparatorFactory[] cmpFactories =
            getInvListComparatorFactories(mdProvider, dataset, recordType, metaType);
    ITypeTraits[] tokenTypeTraits = getTokenTypeTraits(dataset, index, recordType, metaType);
    IBinaryComparatorFactory[] tokenCmpFactories =
            getTokenComparatorFactories(dataset, index, recordType, metaType);
    IBinaryTokenizerFactory tokenizerFactory = getTokenizerFactory(dataset, index, recordType, metaType);
    return new LSMInvertedIndexLocalResourceFactory(storageManager, typeTraits, cmpFactories, filterTypeTraits,
            filterCmpFactories, secondaryFilterFields, opTrackerFactory, ioOpCallbackFactory,
            metadataPageManagerFactory, vbcProvider, ioSchedulerProvider, mergePolicyFactory,
            mergePolicyProperties, durable, tokenTypeTraits, tokenCmpFactories, tokenizerFactory, isPartitioned,
            invertedIndexFields, secondaryFilterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps,
            bloomFilterFalsePositiveRate);
}
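Again with hypothetical values (a LENGTH_PARTITIONED_NGRAM_INVIX index, numPrimaryKeys = 1, numSecondaryKeys = 1, one filter field), the index arrays computed above work out to:

    // numTokenKeyPairFields                  = 2 + 1 = 3   (token, token count, PK)
    // invertedIndexFields                    = {0, 1, 2}
    // secondaryFilterFieldsForNonBulkLoadOps = {2}         (numSecondaryKeys + numPrimaryKeys)
    // invertedIndexFieldsForNonBulkLoadOps   = {0, 1}
    // secondaryFilterFields                  = {3}         (numTokenKeyPairFields)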
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class LSMInvertedIndexTestUtils, method createHashedNGramInvIndexTestContext:
public static LSMInvertedIndexTestContext createHashedNGramInvIndexTestContext(
        LSMInvertedIndexTestHarness harness, InvertedIndexType invIndexType)
        throws IOException, HyracksDataException {
    ISerializerDeserializer[] fieldSerdes = getHashedIndexFieldSerdes(invIndexType);
    ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
    IBinaryTokenizerFactory tokenizerFactory =
            new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true, true, false, tokenFactory);
    return LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
            invIndexType, null, null, null, null, null, null);
}
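A brief usage sketch for a factory like the one above. This is an assumption-laden illustration, not part of the test utils: it presumes the tokenizer consumes a Hyracks-serialized UTF-8 string without a type tag (matching the sourceHasTypeTag = false flag) and that IBinaryTokenizer exposes reset/hasNext/next/getToken, as in the Hyracks tokenizer interfaces:

    // Hypothetical usage; assume the enclosing method declares IOException.
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    new UTF8StringSerializerDeserializer().serialize("database", new DataOutputStream(baos));
    byte[] data = baos.toByteArray();
    IBinaryTokenizer tokenizer = tokenizerFactory.createTokenizer();
    tokenizer.reset(data, 0, data.length);
    while (tokenizer.hasNext()) {
        tokenizer.next();
        IToken token = tokenizer.getToken();
        GrowableArray tokenData = new GrowableArray();
        token.serializeToken(tokenData); // hashed n-gram bytes land in tokenData
    }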
Use of org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory in project asterixdb by apache.
Class LSMInvertedIndexTestUtils, method createWordInvIndexTestContext:
public static LSMInvertedIndexTestContext createWordInvIndexTestContext(
        LSMInvertedIndexTestHarness harness, InvertedIndexType invIndexType)
        throws IOException, HyracksDataException {
    ISerializerDeserializer[] fieldSerdes = getNonHashedIndexFieldSerdes(invIndexType);
    ITokenFactory tokenFactory = new UTF8WordTokenFactory();
    IBinaryTokenizerFactory tokenizerFactory =
            new DelimitedUTF8StringBinaryTokenizerFactory(true, false, tokenFactory);
    return LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
            invIndexType, null, null, null, null, null, null);
}
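The word-index builder differs from the hashed n-gram one only in its token and tokenizer factories. A hypothetical helper (the name and the flag interpretations are assumptions, not part of LSMInvertedIndexTestUtils) makes the contrast explicit:

    // Illustrative only. Flag readings assumed from the constructor calls above:
    // word tokenizer:   (ignoreTokenCount, sourceHasTypeTag, tokenFactory);
    // n-gram tokenizer: (gramLength, usePrePost, ignoreTokenCount, sourceHasTypeTag, tokenFactory).
    static IBinaryTokenizerFactory testTokenizerFactory(boolean hashedNGram) {
        return hashedNGram
                ? new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true, true, false,
                        new HashedUTF8NGramTokenFactory())
                : new DelimitedUTF8StringBinaryTokenizerFactory(true, false, new UTF8WordTokenFactory());
    }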