use of org.apache.hyracks.algebricks.data.ISerializerDeserializerProvider in project asterixdb by apache.
the class MetadataProvider method getBinaryTokenizerRuntime.
// Gets a tokenizer for bulk-loading data into an n-gram or keyword index.
private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getBinaryTokenizerRuntime(String dataverseName,
        String datasetName, String indexName, IOperatorSchema inputSchema, IOperatorSchema propagatedSchema,
        List<LogicalVariable> primaryKeys, List<LogicalVariable> secondaryKeys, RecordDescriptor recordDesc,
        JobSpecification spec, IndexType indexType) throws AlgebricksException {
    // Sanity checks.
    if (primaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite primary key.");
    }
    if (secondaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite secondary key fields.");
    }
    boolean isPartitioned = indexType == IndexType.LENGTH_PARTITIONED_WORD_INVIX
            || indexType == IndexType.LENGTH_PARTITIONED_NGRAM_INVIX;
    // Number of keys that need to be propagated.
    int numKeys = inputSchema.getSize();
    // Collect the logical variables that are neither primary nor secondary keys,
    // along with their positions. These variables are propagated through the
    // TokenizeOperator unchanged.
    List<LogicalVariable> otherKeys = new ArrayList<>();
    for (int k = 0; k < inputSchema.getSize(); k++) {
        boolean found = false;
        for (LogicalVariable varKey : primaryKeys) {
            if (varKey.equals(inputSchema.getVariable(k))) {
                found = true;
                break;
            }
        }
        if (!found) {
            for (LogicalVariable varKey : secondaryKeys) {
                if (varKey.equals(inputSchema.getVariable(k))) {
                    found = true;
                    break;
                }
            }
        }
        if (!found) {
            otherKeys.add(inputSchema.getVariable(k));
        }
    }
    // For tokenization, sorting, and loading:
    // one token (+ an optional partitioning field) + primary keys + secondary
    // keys + other variables. Secondary keys and other variables are passed
    // through to the IndexInsertDelete operator.
    int numTokenKeyPairFields = (!isPartitioned) ? 1 + numKeys : 2 + numKeys;
    // Generate field permutations for the input.
    int[] fieldPermutation = new int[numKeys];
    int[] modificationCallbackPrimaryKeyFields = new int[primaryKeys.size()];
    int i = 0;
    int j = 0;
    for (LogicalVariable varKey : primaryKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        modificationCallbackPrimaryKeyFields[j] = i;
        i++;
        j++;
    }
    for (LogicalVariable varKey : otherKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        i++;
    }
    for (LogicalVariable varKey : secondaryKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        i++;
    }
    Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataverseName, datasetName);
    String itemTypeName = dataset.getItemTypeName();
    IAType itemType;
    try {
        itemType = MetadataManager.INSTANCE
                .getDatatype(mdTxnCtx, dataset.getItemTypeDataverseName(), itemTypeName).getDatatype();
        if (itemType.getTypeTag() != ATypeTag.OBJECT) {
            throw new AlgebricksException("Only record types can be tokenized.");
        }
        ARecordType recType = (ARecordType) itemType;
        // Index parameters.
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(),
                dataset.getDatasetName(), indexName);
        List<List<String>> secondaryKeyExprs = secondaryIndex.getKeyFieldNames();
        List<IAType> secondaryKeyTypeEntries = secondaryIndex.getKeyFieldTypes();
        int numTokenFields = (!isPartitioned) ? secondaryKeys.size() : secondaryKeys.size() + 1;
        ITypeTraits[] tokenTypeTraits = new ITypeTraits[numTokenFields];
        ITypeTraits[] invListsTypeTraits = new ITypeTraits[primaryKeys.size()];
        // Find the key type of the secondary key. If it is a derived type,
        // return the derived type itself (e.g. UNORDERED LIST -> UNORDERED LIST type).
        Pair<IAType, Boolean> keyPairType = Index.getNonNullableOpenFieldType(secondaryKeyTypeEntries.get(0),
                secondaryKeyExprs.get(0), recType);
        IAType secondaryKeyType = keyPairType.first;
        List<List<String>> partitioningKeys = dataset.getPrimaryKeys();
        i = 0;
        for (List<String> partitioningKey : partitioningKeys) {
            IAType keyType = recType.getSubFieldType(partitioningKey);
            invListsTypeTraits[i] = TypeTraitProvider.INSTANCE.getTypeTrait(keyType);
            ++i;
        }
        tokenTypeTraits[0] = NonTaggedFormatUtil.getTokenTypeTrait(secondaryKeyType);
        if (isPartitioned) {
            // The partitioning field is hardcoded to be a short *without*
            // an Asterix type tag.
            tokenTypeTraits[1] = ShortPointable.TYPE_TRAITS;
        }
        IBinaryTokenizerFactory tokenizerFactory = NonTaggedFormatUtil.getBinaryTokenizerFactory(
                secondaryKeyType.getTypeTag(), indexType, secondaryIndex.getGramLength());
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint =
                getSplitProviderAndConstraints(dataset, secondaryIndex.getIndexName());
        // Generate the output record format.
        ISerializerDeserializer<?>[] tokenKeyPairFields = new ISerializerDeserializer[numTokenKeyPairFields];
        ITypeTraits[] tokenKeyPairTypeTraits = new ITypeTraits[numTokenKeyPairFields];
        ISerializerDeserializerProvider serdeProvider = FormatUtils.getDefaultFormat().getSerdeProvider();
        // #1. Propagate all input variables.
        for (int k = 0; k < recordDesc.getFieldCount(); k++) {
            tokenKeyPairFields[k] = recordDesc.getFields()[k];
            tokenKeyPairTypeTraits[k] = recordDesc.getTypeTraits()[k];
        }
        int tokenOffset = recordDesc.getFieldCount();
        // #2. Specify the token type.
        tokenKeyPairFields[tokenOffset] = serdeProvider.getSerializerDeserializer(secondaryKeyType);
        tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[0];
        tokenOffset++;
        // #3. Specify the length-partitioning key: the number of tokens.
        if (isPartitioned) {
            tokenKeyPairFields[tokenOffset] = ShortSerializerDeserializer.INSTANCE;
            tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[1];
        }
        RecordDescriptor tokenKeyPairRecDesc = new RecordDescriptor(tokenKeyPairFields, tokenKeyPairTypeTraits);
        // The key to be tokenized: the secondary key, which is the last field in the permutation.
        int docField = fieldPermutation[fieldPermutation.length - 1];
        // The keys to be propagated.
        int[] keyFields = new int[numKeys];
        for (int k = 0; k < keyFields.length; k++) {
            keyFields[k] = k;
        }
        IOperatorDescriptor tokenizerOp = new BinaryTokenizerOperatorDescriptor(spec, tokenKeyPairRecDesc,
                tokenizerFactory, docField, keyFields, isPartitioned, true);
        return new Pair<>(tokenizerOp, splitsAndConstraint.second);
    } catch (Exception e) {
        throw new AlgebricksException(e);
    }
}
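The output record construction above is easier to see in isolation. Below is a minimal, hedged sketch of the same three steps for the length-partitioned case; inputDesc and tokenType are hypothetical stand-ins for the incoming RecordDescriptor and the resolved secondary-key type.

// Hedged sketch (not AsterixDB source): building the tokenizer's output
// RecordDescriptor for a length-partitioned index.
// Layout: [ input fields 0..n-1 | token | token count (untagged short) ]
int n = inputDesc.getFieldCount();
ISerializerDeserializer<?>[] fields = new ISerializerDeserializer[n + 2];
ITypeTraits[] traits = new ITypeTraits[n + 2];
ISerializerDeserializerProvider serde = FormatUtils.getDefaultFormat().getSerdeProvider();
for (int k = 0; k < n; k++) {
    // #1: propagate every input field unchanged
    fields[k] = inputDesc.getFields()[k];
    traits[k] = inputDesc.getTypeTraits()[k];
}
// #2: the token produced by the tokenizer
fields[n] = serde.getSerializerDeserializer(tokenType);
traits[n] = NonTaggedFormatUtil.getTokenTypeTrait(tokenType);
// #3: the token count, a short without a type tag, which drives length partitioning
fields[n + 1] = ShortSerializerDeserializer.INSTANCE;
traits[n + 1] = ShortPointable.TYPE_TRAITS;
RecordDescriptor tokenizerOutput = new RecordDescriptor(fields, traits);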
use of org.apache.hyracks.algebricks.data.ISerializerDeserializerProvider in project asterixdb by apache.
the class Dataset method getPrimaryRecordDescriptor.
/**
 * Gets the record descriptor for primary records of this dataset.
 *
 * @param metadataProvider
 *            the metadata provider.
 * @return the record descriptor for primary records of this dataset.
 * @throws AlgebricksException
 */
public RecordDescriptor getPrimaryRecordDescriptor(MetadataProvider metadataProvider) throws AlgebricksException {
    List<List<String>> partitioningKeys = getPrimaryKeys();
    int numPrimaryKeys = partitioningKeys.size();
    ISerializerDeserializer[] primaryRecFields =
            new ISerializerDeserializer[numPrimaryKeys + 1 + (hasMetaPart() ? 1 : 0)];
    ITypeTraits[] primaryTypeTraits = new ITypeTraits[numPrimaryKeys + 1 + (hasMetaPart() ? 1 : 0)];
    ISerializerDeserializerProvider serdeProvider = metadataProvider.getFormat().getSerdeProvider();
    List<Integer> indicators = null;
    if (hasMetaPart()) {
        indicators = ((InternalDatasetDetails) getDatasetDetails()).getKeySourceIndicator();
    }
    ARecordType itemType = (ARecordType) metadataProvider.findType(this);
    ARecordType metaType = (ARecordType) metadataProvider.findMetaType(this);
    // Set the serde/traits for the primary keys.
    for (int i = 0; i < numPrimaryKeys; i++) {
        IAType keyType = (indicators == null || indicators.get(i) == 0)
                ? itemType.getSubFieldType(partitioningKeys.get(i))
                : metaType.getSubFieldType(partitioningKeys.get(i));
        primaryRecFields[i] = serdeProvider.getSerializerDeserializer(keyType);
        primaryTypeTraits[i] = TypeTraitProvider.INSTANCE.getTypeTrait(keyType);
    }
    // Set the serde/traits for the record field.
    primaryRecFields[numPrimaryKeys] = SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(itemType);
    primaryTypeTraits[numPrimaryKeys] = TypeTraitProvider.INSTANCE.getTypeTrait(itemType);
    if (hasMetaPart()) {
        // Set the serde/traits for the meta record field (metaType, not itemType).
        primaryRecFields[numPrimaryKeys + 1] =
                SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(metaType);
        primaryTypeTraits[numPrimaryKeys + 1] = TypeTraitProvider.INSTANCE.getTypeTrait(metaType);
    }
    return new RecordDescriptor(primaryRecFields, primaryTypeTraits);
}
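A hedged usage sketch follows; dataverseName and datasetName are hypothetical inputs, and findDataset is assumed to resolve the dataset through the same MetadataProvider.

// Hedged sketch: obtaining the primary record descriptor during compilation.
// Layout: [ PK_0 .. PK_{n-1} | record | meta record (only if hasMetaPart()) ]
Dataset dataset = metadataProvider.findDataset(dataverseName, datasetName);
RecordDescriptor primaryRecDesc = dataset.getPrimaryRecordDescriptor(metadataProvider);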
use of org.apache.hyracks.algebricks.data.ISerializerDeserializerProvider in project asterixdb by apache.
the class JobGenHelper method mkRecordDescriptor.
@SuppressWarnings("rawtypes")
public static RecordDescriptor mkRecordDescriptor(IVariableTypeEnvironment env, IOperatorSchema opSchema,
        JobGenContext context) throws AlgebricksException {
    ISerializerDeserializer[] fields = new ISerializerDeserializer[opSchema.getSize()];
    ITypeTraits[] typeTraits = new ITypeTraits[opSchema.getSize()];
    ISerializerDeserializerProvider sdp = context.getSerializerDeserializerProvider();
    ITypeTraitProvider ttp = context.getTypeTraitProvider();
    int i = 0;
    for (LogicalVariable var : opSchema) {
        Object t = env.getVarType(var);
        if (t == null) {
            LOGGER.warning("No type for variable " + var);
        }
        fields[i] = sdp.getSerializerDeserializer(t);
        typeTraits[i] = ttp.getTypeTrait(t);
        i++;
    }
    return new RecordDescriptor(fields, typeTraits);
}
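This helper is the common path by which Algebricks physical operators derive their outgoing frame layout. A hedged sketch of a typical call site; op and propagatedSchema stand for hypothetical locals of the calling operator during job generation.

// Hedged sketch: a physical operator deriving its output frame layout.
// The type environment supplies a type for each variable in the schema,
// which mkRecordDescriptor maps to a serde and type traits per field.
RecordDescriptor recDescriptor =
        JobGenHelper.mkRecordDescriptor(context.getTypeEnvironment(op), propagatedSchema, context);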
use of org.apache.hyracks.algebricks.data.ISerializerDeserializerProvider in project asterixdb by apache.
the class SecondaryIndexOperationsHelper method setPrimaryRecDescAndComparators.
protected void setPrimaryRecDescAndComparators() throws AlgebricksException {
    List<List<String>> partitioningKeys = dataset.getPrimaryKeys();
    ISerializerDeserializer[] primaryRecFields =
            new ISerializerDeserializer[numPrimaryKeys + 1 + (dataset.hasMetaPart() ? 1 : 0)];
    ITypeTraits[] primaryTypeTraits = new ITypeTraits[numPrimaryKeys + 1 + (dataset.hasMetaPart() ? 1 : 0)];
    primaryComparatorFactories = new IBinaryComparatorFactory[numPrimaryKeys];
    primaryBloomFilterKeyFields = new int[numPrimaryKeys];
    ISerializerDeserializerProvider serdeProvider = metadataProvider.getFormat().getSerdeProvider();
    List<Integer> indicators = null;
    if (dataset.hasMetaPart()) {
        indicators = ((InternalDatasetDetails) dataset.getDatasetDetails()).getKeySourceIndicator();
    }
    for (int i = 0; i < numPrimaryKeys; i++) {
        IAType keyType = (indicators == null || indicators.get(i) == 0)
                ? itemType.getSubFieldType(partitioningKeys.get(i))
                : metaType.getSubFieldType(partitioningKeys.get(i));
        primaryRecFields[i] = serdeProvider.getSerializerDeserializer(keyType);
        primaryComparatorFactories[i] =
                BinaryComparatorFactoryProvider.INSTANCE.getBinaryComparatorFactory(keyType, true);
        primaryTypeTraits[i] = TypeTraitProvider.INSTANCE.getTypeTrait(keyType);
        primaryBloomFilterKeyFields[i] = i;
    }
    primaryRecFields[numPrimaryKeys] = payloadSerde;
    primaryTypeTraits[numPrimaryKeys] = TypeTraitProvider.INSTANCE.getTypeTrait(itemType);
    if (dataset.hasMetaPart()) {
        primaryRecFields[numPrimaryKeys + 1] = payloadSerde;
        primaryTypeTraits[numPrimaryKeys + 1] = TypeTraitProvider.INSTANCE.getTypeTrait(metaType);
    }
    primaryRecDesc = new RecordDescriptor(primaryRecFields, primaryTypeTraits);
}
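Each per-key lookup above goes through a format-level provider singleton. A minimal, hedged illustration for a single BIGINT key; the type is chosen purely for illustration.

// Illustrative only: the three per-key lookups for one BIGINT primary key.
IAType keyType = BuiltinType.AINT64;
ISerializerDeserializer serde =
        SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(keyType);
IBinaryComparatorFactory ascCmp =
        BinaryComparatorFactoryProvider.INSTANCE.getBinaryComparatorFactory(keyType, true); // true = ascending
ITypeTraits traits = TypeTraitProvider.INSTANCE.getTypeTrait(keyType);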
use of org.apache.hyracks.algebricks.data.ISerializerDeserializerProvider in project asterixdb by apache.
the class SecondaryInvertedIndexOperationsHelper method setSecondaryRecDescAndComparators.
@Override
@SuppressWarnings("rawtypes")
protected void setSecondaryRecDescAndComparators() throws AlgebricksException {
    int numSecondaryKeys = index.getKeyFieldNames().size();
    IndexType indexType = index.getIndexType();
    boolean isEnforcingKeyTypes = index.isEnforcingKeyFileds();
    // Sanity checks.
    if (numPrimaryKeys > 1) {
        throw new CompilationException(ErrorCode.COMPILATION_ILLEGAL_INDEX_FOR_DATASET_WITH_COMPOSITE_PRIMARY_INDEX,
                indexType, RecordUtil.toFullyQualifiedName(dataset.getDataverseName(), dataset.getDatasetName()));
    }
    if (numSecondaryKeys > 1) {
        throw new CompilationException(ErrorCode.COMPILATION_ILLEGAL_INDEX_NUM_OF_FIELD, numSecondaryKeys,
                indexType, 1);
    }
    isPartitioned = indexType == IndexType.LENGTH_PARTITIONED_WORD_INVIX
            || indexType == IndexType.LENGTH_PARTITIONED_NGRAM_INVIX;
    // Prepare the record descriptor used in the assign op and the optional select op.
    secondaryFieldAccessEvalFactories = new IScalarEvaluatorFactory[numSecondaryKeys + numFilterFields];
    ISerializerDeserializer[] secondaryRecFields =
            new ISerializerDeserializer[numPrimaryKeys + numSecondaryKeys + numFilterFields];
    ISerializerDeserializer[] enforcedRecFields = new ISerializerDeserializer[1 + numPrimaryKeys + numFilterFields];
    secondaryTypeTraits = new ITypeTraits[numSecondaryKeys + numPrimaryKeys];
    ITypeTraits[] enforcedTypeTraits = new ITypeTraits[1 + numPrimaryKeys];
    ISerializerDeserializerProvider serdeProvider = FormatUtils.getDefaultFormat().getSerdeProvider();
    ITypeTraitProvider typeTraitProvider = FormatUtils.getDefaultFormat().getTypeTraitProvider();
    if (numSecondaryKeys > 0) {
        secondaryFieldAccessEvalFactories[0] = FormatUtils.getDefaultFormat().getFieldAccessEvaluatorFactory(
                isEnforcingKeyTypes ? enforcedItemType : itemType, index.getKeyFieldNames().get(0), numPrimaryKeys);
        Pair<IAType, Boolean> keyTypePair = Index.getNonNullableOpenFieldType(index.getKeyFieldTypes().get(0),
                index.getKeyFieldNames().get(0), itemType);
        secondaryKeyType = keyTypePair.first;
        anySecondaryKeyIsNullable = anySecondaryKeyIsNullable || keyTypePair.second;
        ISerializerDeserializer keySerde = serdeProvider.getSerializerDeserializer(secondaryKeyType);
        secondaryRecFields[0] = keySerde;
        secondaryTypeTraits[0] = typeTraitProvider.getTypeTrait(secondaryKeyType);
    }
    if (numFilterFields > 0) {
        secondaryFieldAccessEvalFactories[numSecondaryKeys] = FormatUtils.getDefaultFormat()
                .getFieldAccessEvaluatorFactory(itemType, filterFieldName, numPrimaryKeys);
        Pair<IAType, Boolean> keyTypePair = Index.getNonNullableKeyFieldType(filterFieldName, itemType);
        IAType type = keyTypePair.first;
        ISerializerDeserializer serde = serdeProvider.getSerializerDeserializer(type);
        secondaryRecFields[numPrimaryKeys + numSecondaryKeys] = serde;
    }
    secondaryRecDesc = new RecordDescriptor(secondaryRecFields);
    // Comparators and type traits for tokens.
    int numTokenFields = (!isPartitioned) ? numSecondaryKeys : numSecondaryKeys + 1;
    tokenComparatorFactories = new IBinaryComparatorFactory[numTokenFields];
    tokenTypeTraits = new ITypeTraits[numTokenFields];
    tokenComparatorFactories[0] = NonTaggedFormatUtil.getTokenBinaryComparatorFactory(secondaryKeyType);
    tokenTypeTraits[0] = NonTaggedFormatUtil.getTokenTypeTrait(secondaryKeyType);
    if (isPartitioned) {
        // The partitioning field is hardcoded to be a short *without* an Asterix type tag.
        tokenComparatorFactories[1] = PointableBinaryComparatorFactory.of(ShortPointable.FACTORY);
        tokenTypeTraits[1] = ShortPointable.TYPE_TRAITS;
    }
    // Set the tokenizer factory.
    // TODO: We might want to expose the hashing option at the AQL level
    // and add the choice to the index metadata.
    tokenizerFactory = NonTaggedFormatUtil.getBinaryTokenizerFactory(secondaryKeyType.getTypeTag(), indexType,
            index.getGramLength());
    // Type traits for inverted-list elements. Inverted lists contain primary keys.
    invListsTypeTraits = new ITypeTraits[numPrimaryKeys];
    if (numPrimaryKeys > 0) {
        invListsTypeTraits[0] = primaryRecDesc.getTypeTraits()[0];
        enforcedRecFields[0] = primaryRecDesc.getFields()[0];
        enforcedTypeTraits[0] = primaryRecDesc.getTypeTraits()[0];
    }
    enforcedRecFields[numPrimaryKeys] = serdeProvider.getSerializerDeserializer(itemType);
    enforcedRecDesc = new RecordDescriptor(enforcedRecFields, enforcedTypeTraits);
    // For tokenization, sorting, and loading:
    // one token (+ an optional partitioning field) + primary keys.
    numTokenKeyPairFields = (!isPartitioned) ? 1 + numPrimaryKeys : 2 + numPrimaryKeys;
    ISerializerDeserializer[] tokenKeyPairFields =
            new ISerializerDeserializer[numTokenKeyPairFields + numFilterFields];
    ITypeTraits[] tokenKeyPairTypeTraits = new ITypeTraits[numTokenKeyPairFields];
    tokenKeyPairComparatorFactories = new IBinaryComparatorFactory[numTokenKeyPairFields];
    tokenKeyPairFields[0] = serdeProvider.getSerializerDeserializer(secondaryKeyType);
    tokenKeyPairTypeTraits[0] = tokenTypeTraits[0];
    tokenKeyPairComparatorFactories[0] = NonTaggedFormatUtil.getTokenBinaryComparatorFactory(secondaryKeyType);
    int pkOff = 1;
    if (isPartitioned) {
        tokenKeyPairFields[1] = ShortSerializerDeserializer.INSTANCE;
        tokenKeyPairTypeTraits[1] = tokenTypeTraits[1];
        tokenKeyPairComparatorFactories[1] = PointableBinaryComparatorFactory.of(ShortPointable.FACTORY);
        pkOff = 2;
    }
    if (numPrimaryKeys > 0) {
        tokenKeyPairFields[pkOff] = primaryRecDesc.getFields()[0];
        tokenKeyPairTypeTraits[pkOff] = primaryRecDesc.getTypeTraits()[0];
        tokenKeyPairComparatorFactories[pkOff] = primaryComparatorFactories[0];
    }
    if (numFilterFields > 0) {
        tokenKeyPairFields[numPrimaryKeys + pkOff] = secondaryRecFields[numPrimaryKeys + numSecondaryKeys];
    }
    tokenKeyPairRecDesc = new RecordDescriptor(tokenKeyPairFields, tokenKeyPairTypeTraits);
    if (filterFieldName != null) {
        invertedIndexFields = new int[numTokenKeyPairFields];
        for (int i = 0; i < invertedIndexFields.length; i++) {
            invertedIndexFields[i] = i;
        }
        secondaryFilterFieldsForNonBulkLoadOps = new int[numFilterFields];
        secondaryFilterFieldsForNonBulkLoadOps[0] = numSecondaryKeys + numPrimaryKeys;
        invertedIndexFieldsForNonBulkLoadOps = new int[numSecondaryKeys + numPrimaryKeys];
        for (int i = 0; i < invertedIndexFieldsForNonBulkLoadOps.length; i++) {
            invertedIndexFieldsForNonBulkLoadOps[i] = i;
        }
    }
}
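To make the token-key-pair layout concrete, here is a hedged summary of the field offsets produced above for a length-partitioned index with one primary key and one filter field; the comments are illustrative, not source.

// Illustrative layout of tokenKeyPairFields in that configuration:
//   index 0: token        (serde via ISerializerDeserializerProvider)
//   index 1: token count  (untagged short; ShortSerializerDeserializer)
//   index 2: primary key  (copied from primaryRecDesc, pkOff == 2)
//   index 3: filter field (copied from secondaryRecFields)
// Note that tokenKeyPairTypeTraits and tokenKeyPairComparatorFactories cover
// only the first numTokenKeyPairFields entries; the trailing filter field is
// carried in the record but not sorted on.
RecordDescriptor desc = new RecordDescriptor(tokenKeyPairFields, tokenKeyPairTypeTraits); // as built above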