Use of org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor in project asterixdb by apache.
The class SecondaryBTreeOperationsHelper, method buildLoadingJobSpec.
@Override
public JobSpecification buildLoadingJobSpec() throws AlgebricksException {
JobSpecification spec = RuntimeUtils.createJobSpecification(metadataProvider.getApplicationContext());
boolean isEnforcingKeyTypes = index.isEnforcingKeyFileds();
int[] fieldPermutation = createFieldPermutationForBulkLoadOp(index.getKeyFieldNames().size());
IIndexDataflowHelperFactory dataflowHelperFactory = new IndexDataflowHelperFactory(metadataProvider.getStorageComponentProvider().getStorageManager(), secondaryFileSplitProvider);
if (dataset.getDatasetType() == DatasetType.EXTERNAL) {
/*
* In case of external data,
* this method is used to build loading jobs for both the initial load on index creation
* and the transaction load on dataset refresh.
*/
// Create external indexing scan operator
ExternalScanOperatorDescriptor primaryScanOp = createExternalIndexingOp(spec);
// Assign op.
AbstractOperatorDescriptor sourceOp = primaryScanOp;
if (isEnforcingKeyTypes && !enforcedItemType.equals(itemType)) {
sourceOp = createCastOp(spec, dataset.getDatasetType());
spec.connect(new OneToOneConnectorDescriptor(spec), primaryScanOp, 0, sourceOp, 0);
}
AlgebricksMetaOperatorDescriptor asterixAssignOp = createExternalAssignOp(spec, index.getKeyFieldNames().size(), secondaryRecDesc);
// If any of the secondary fields are nullable, then add a select op that filters nulls.
AlgebricksMetaOperatorDescriptor selectOp = null;
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
selectOp = createFilterNullsSelectOp(spec, index.getKeyFieldNames().size(), secondaryRecDesc);
}
// Sort by secondary keys.
ExternalSortOperatorDescriptor sortOp = createSortOp(spec, secondaryComparatorFactories, secondaryRecDesc);
// Create secondary BTree bulk load op.
AbstractSingleActivityOperatorDescriptor secondaryBulkLoadOp;
IOperatorDescriptor root;
if (externalFiles != null) {
// Transaction load
secondaryBulkLoadOp = createExternalIndexBulkModifyOp(spec, fieldPermutation, dataflowHelperFactory, GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
} else {
// Initial load
secondaryBulkLoadOp = createExternalIndexBulkLoadOp(spec, fieldPermutation, dataflowHelperFactory, GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
}
AlgebricksMetaOperatorDescriptor metaOp = new AlgebricksMetaOperatorDescriptor(spec, 1, 0, new IPushRuntimeFactory[] { new SinkRuntimeFactory() }, new RecordDescriptor[] { secondaryRecDesc });
spec.connect(new OneToOneConnectorDescriptor(spec), secondaryBulkLoadOp, 0, metaOp, 0);
root = metaOp;
spec.connect(new OneToOneConnectorDescriptor(spec), sourceOp, 0, asterixAssignOp, 0);
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, selectOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), selectOp, 0, sortOp, 0);
} else {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, sortOp, 0);
}
spec.connect(new OneToOneConnectorDescriptor(spec), sortOp, 0, secondaryBulkLoadOp, 0);
spec.addRoot(root);
spec.setConnectorPolicyAssignmentPolicy(new ConnectorPolicyAssignmentPolicy());
return spec;
} else {
// Create dummy key provider for feeding the primary index scan.
IOperatorDescriptor keyProviderOp = DatasetUtil.createDummyKeyProviderOp(spec, dataset, metadataProvider);
JobId jobId = IndexUtil.bindJobEventListener(spec, metadataProvider);
// Create primary index scan op.
IOperatorDescriptor primaryScanOp = DatasetUtil.createPrimaryIndexScanOp(spec, metadataProvider, dataset, jobId);
// Assign op.
IOperatorDescriptor sourceOp = primaryScanOp;
if (isEnforcingKeyTypes && !enforcedItemType.equals(itemType)) {
sourceOp = createCastOp(spec, dataset.getDatasetType());
spec.connect(new OneToOneConnectorDescriptor(spec), primaryScanOp, 0, sourceOp, 0);
}
AlgebricksMetaOperatorDescriptor asterixAssignOp = createAssignOp(spec, index.getKeyFieldNames().size(), secondaryRecDesc);
// If any of the secondary fields are nullable, then add a select op that filters nulls.
AlgebricksMetaOperatorDescriptor selectOp = null;
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
selectOp = createFilterNullsSelectOp(spec, index.getKeyFieldNames().size(), secondaryRecDesc);
}
// Sort by secondary keys.
ExternalSortOperatorDescriptor sortOp = createSortOp(spec, secondaryComparatorFactories, secondaryRecDesc);
// Create secondary BTree bulk load op.
TreeIndexBulkLoadOperatorDescriptor secondaryBulkLoadOp = createTreeIndexBulkLoadOp(spec, fieldPermutation, dataflowHelperFactory, GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
AlgebricksMetaOperatorDescriptor metaOp = new AlgebricksMetaOperatorDescriptor(spec, 1, 0, new IPushRuntimeFactory[] { new SinkRuntimeFactory() }, new RecordDescriptor[] { secondaryRecDesc });
// Connect the operators.
spec.connect(new OneToOneConnectorDescriptor(spec), keyProviderOp, 0, primaryScanOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), sourceOp, 0, asterixAssignOp, 0);
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, selectOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), selectOp, 0, sortOp, 0);
} else {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, sortOp, 0);
}
spec.connect(new OneToOneConnectorDescriptor(spec), sortOp, 0, secondaryBulkLoadOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), secondaryBulkLoadOp, 0, metaOp, 0);
spec.addRoot(metaOp);
spec.setConnectorPolicyAssignmentPolicy(new ConnectorPolicyAssignmentPolicy());
return spec;
}
}
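Both branches above end in the same sort, bulk-load, sink tail; only the bulk-load operator differs. The following is an editorial sketch of that choice, assuming the same helper methods and fields (dataset, externalFiles, spec, fieldPermutation, dataflowHelperFactory) are in scope; it is a summary, not code from the class.
// Editorial sketch: how buildLoadingJobSpec picks the bulk operator (not AsterixDB source).
AbstractSingleActivityOperatorDescriptor bulkOp;
if (dataset.getDatasetType() == DatasetType.EXTERNAL) {
    bulkOp = (externalFiles != null)
            // transaction load on dataset refresh
            ? createExternalIndexBulkModifyOp(spec, fieldPermutation, dataflowHelperFactory,
                    GlobalConfig.DEFAULT_TREE_FILL_FACTOR)
            // initial load on index creation
            : createExternalIndexBulkLoadOp(spec, fieldPermutation, dataflowHelperFactory,
                    GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
} else {
    // internal dataset: plain secondary B-tree bulk load
    bulkOp = createTreeIndexBulkLoadOp(spec, fieldPermutation, dataflowHelperFactory,
            GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
}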
Use of org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor in project asterixdb by apache.
The class SecondaryRTreeOperationsHelper, method buildLoadingJobSpec.
@Override
public JobSpecification buildLoadingJobSpec() throws AsterixException, AlgebricksException {
/***************************************************
* [ About PointMBR Optimization ]
* Instead of storing an MBR (4 doubles) for a point (2 doubles) in an RTree leaf node,
* the PointMBR concept is introduced.
* PointMBR is a way to store a point as 2 doubles in an RTree leaf node,
* which roughly halves the RTree index size.
* In order to fully benefit from the PointMBR concept, besides the RTree,
* the external sort operator used during bulk loading (from either data loading or index creation)
* must deal with a point as 2 doubles instead of 4 doubles. Otherwise, the external sort will handle
* twice as many doubles as it actually requires. For this purpose,
* PointMBR-specific optimization logic is added as follows:
* 1) The CreateMBR function in the assign operator generates 2 doubles instead of 4 doubles.
* 2) The external sort operator sorts points represented with 2 doubles.
* 3) Bulk loading in the RTree takes 4 doubles by reading the 2 doubles twice and then
* does the same work as in the non-point MBR case.
***************************************************/
JobSpecification spec = RuntimeUtils.createJobSpecification(metadataProvider.getApplicationContext());
int[] fieldPermutation = createFieldPermutationForBulkLoadOp(numNestedSecondaryKeyFields);
int numNestedSecondaryKeFieldsConsideringPointMBR = isPointMBR ? numNestedSecondaryKeyFields / 2 : numNestedSecondaryKeyFields;
RecordDescriptor secondaryRecDescConsideringPointMBR = isPointMBR ? secondaryRecDescForPointMBR : secondaryRecDesc;
boolean isEnforcingKeyTypes = index.isEnforcingKeyFileds();
IIndexDataflowHelperFactory indexDataflowHelperFactory = new IndexDataflowHelperFactory(metadataProvider.getStorageComponentProvider().getStorageManager(), secondaryFileSplitProvider);
if (dataset.getDatasetType() == DatasetType.INTERNAL) {
// Create dummy key provider for feeding the primary index scan.
IOperatorDescriptor keyProviderOp = DatasetUtil.createDummyKeyProviderOp(spec, dataset, metadataProvider);
JobId jobId = IndexUtil.bindJobEventListener(spec, metadataProvider);
// Create primary index scan op.
IOperatorDescriptor primaryScanOp = DatasetUtil.createPrimaryIndexScanOp(spec, metadataProvider, dataset, jobId);
// Assign op.
IOperatorDescriptor sourceOp = primaryScanOp;
if (isEnforcingKeyTypes && !enforcedItemType.equals(itemType)) {
sourceOp = createCastOp(spec, dataset.getDatasetType());
spec.connect(new OneToOneConnectorDescriptor(spec), primaryScanOp, 0, sourceOp, 0);
}
AlgebricksMetaOperatorDescriptor asterixAssignOp = createAssignOp(spec, numNestedSecondaryKeFieldsConsideringPointMBR, secondaryRecDescConsideringPointMBR);
// If any of the secondary fields are nullable, then add a select op that filters nulls.
AlgebricksMetaOperatorDescriptor selectOp = null;
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
selectOp = createFilterNullsSelectOp(spec, numNestedSecondaryKeFieldsConsideringPointMBR, secondaryRecDescConsideringPointMBR);
}
// Sort by secondary keys.
ExternalSortOperatorDescriptor sortOp = createSortOp(spec, new IBinaryComparatorFactory[] { MetadataProvider.proposeLinearizer(keyType, secondaryComparatorFactories.length) }, isPointMBR ? secondaryRecDescForPointMBR : secondaryRecDesc);
// Create secondary RTree bulk load op.
TreeIndexBulkLoadOperatorDescriptor secondaryBulkLoadOp = createTreeIndexBulkLoadOp(spec, fieldPermutation, indexDataflowHelperFactory, GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
AlgebricksMetaOperatorDescriptor metaOp = new AlgebricksMetaOperatorDescriptor(spec, 1, 0, new IPushRuntimeFactory[] { new SinkRuntimeFactory() }, new RecordDescriptor[] {});
// Connect the operators.
spec.connect(new OneToOneConnectorDescriptor(spec), keyProviderOp, 0, primaryScanOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), sourceOp, 0, asterixAssignOp, 0);
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, selectOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), selectOp, 0, sortOp, 0);
} else {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, sortOp, 0);
}
spec.connect(new OneToOneConnectorDescriptor(spec), sortOp, 0, secondaryBulkLoadOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), secondaryBulkLoadOp, 0, metaOp, 0);
spec.addRoot(metaOp);
spec.setConnectorPolicyAssignmentPolicy(new ConnectorPolicyAssignmentPolicy());
} else {
// External dataset
/*
* In case of external data, this method is used to build loading jobs for both
* the initial load on index creation
* and the transaction load on dataset refresh.
*/
// Create external indexing scan operator
ExternalScanOperatorDescriptor primaryScanOp = createExternalIndexingOp(spec);
AbstractOperatorDescriptor sourceOp = primaryScanOp;
if (isEnforcingKeyTypes && !enforcedItemType.equals(itemType)) {
sourceOp = createCastOp(spec, dataset.getDatasetType());
spec.connect(new OneToOneConnectorDescriptor(spec), primaryScanOp, 0, sourceOp, 0);
}
// Assign op.
AlgebricksMetaOperatorDescriptor asterixAssignOp = createExternalAssignOp(spec, numNestedSecondaryKeFieldsConsideringPointMBR, secondaryRecDescConsideringPointMBR);
// If any of the secondary fields are nullable, then add a select op that filters nulls.
AlgebricksMetaOperatorDescriptor selectOp = null;
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
selectOp = createFilterNullsSelectOp(spec, numNestedSecondaryKeFieldsConsideringPointMBR, secondaryRecDescConsideringPointMBR);
}
// Sort by secondary keys.
ExternalSortOperatorDescriptor sortOp = createSortOp(spec, new IBinaryComparatorFactory[] { MetadataProvider.proposeLinearizer(keyType, secondaryComparatorFactories.length) }, isPointMBR ? secondaryRecDescForPointMBR : secondaryRecDesc);
// Create secondary RTree bulk load op.
IOperatorDescriptor root;
AbstractSingleActivityOperatorDescriptor secondaryBulkLoadOp;
if (externalFiles != null) {
// Transaction load
secondaryBulkLoadOp = createExternalIndexBulkModifyOp(spec, fieldPermutation, indexDataflowHelperFactory, GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
} else {
// Initial load
secondaryBulkLoadOp = createExternalIndexBulkLoadOp(spec, fieldPermutation, indexDataflowHelperFactory, GlobalConfig.DEFAULT_TREE_FILL_FACTOR);
}
AlgebricksMetaOperatorDescriptor metaOp = new AlgebricksMetaOperatorDescriptor(spec, 1, 0, new IPushRuntimeFactory[] { new SinkRuntimeFactory() }, new RecordDescriptor[] { secondaryRecDesc });
spec.connect(new OneToOneConnectorDescriptor(spec), secondaryBulkLoadOp, 0, metaOp, 0);
root = metaOp;
spec.connect(new OneToOneConnectorDescriptor(spec), sourceOp, 0, asterixAssignOp, 0);
if (anySecondaryKeyIsNullable || isEnforcingKeyTypes) {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, selectOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), selectOp, 0, sortOp, 0);
} else {
spec.connect(new OneToOneConnectorDescriptor(spec), asterixAssignOp, 0, sortOp, 0);
}
spec.connect(new OneToOneConnectorDescriptor(spec), sortOp, 0, secondaryBulkLoadOp, 0);
spec.addRoot(root);
spec.setConnectorPolicyAssignmentPolicy(new ConnectorPolicyAssignmentPolicy());
}
return spec;
}
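The PointMBR comment above can be made concrete with a small, self-contained illustration; the field count and sizes below are assumptions for a 2-D point index, not values taken from the class.
// Editorial illustration of the PointMBR saving for a 2-D point index
// (numbers are assumptions, not taken from SecondaryRTreeOperationsHelper).
public class PointMbrSizeExample {
    public static void main(String[] args) {
        int numNestedSecondaryKeyFields = 4;   // full MBR: minX, minY, maxX, maxY
        boolean isPointMBR = true;
        int keyFieldsConsideringPointMBR =
                isPointMBR ? numNestedSecondaryKeyFields / 2 : numNestedSecondaryKeyFields;
        long mbrBytes = numNestedSecondaryKeyFields * Double.BYTES;      // 32 bytes per key without the optimization
        long pointBytes = keyFieldsConsideringPointMBR * Double.BYTES;   // 16 bytes per key with it
        System.out.println("sort/leaf payload per key: " + pointBytes + " vs " + mbrBytes + " bytes");
    }
}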
Use of org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor in project asterixdb by apache.
The class SecondaryIndexOperationsHelper, method createTreeIndexBulkLoadOp.
protected TreeIndexBulkLoadOperatorDescriptor createTreeIndexBulkLoadOp(JobSpecification spec, int[] fieldPermutation, IIndexDataflowHelperFactory dataflowHelperFactory, float fillFactor) throws AlgebricksException {
TreeIndexBulkLoadOperatorDescriptor treeIndexBulkLoadOp = new TreeIndexBulkLoadOperatorDescriptor(spec, secondaryRecDesc, fieldPermutation, fillFactor, false, numElementsHint, false, dataflowHelperFactory);
AlgebricksPartitionConstraintHelper.setPartitionConstraintInJobSpec(spec, treeIndexBulkLoadOp, secondaryPartitionConstraint);
return treeIndexBulkLoadOp;
}
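The positional boolean and long arguments of this constructor are easy to misread. The restatement below annotates each argument with its assumed role, inferred from the call sites on this page rather than from the Hyracks parameter names.
// Editorial restatement of the constructor call above; comments describe the assumed
// role of each positional argument (role names are not from the Hyracks source).
TreeIndexBulkLoadOperatorDescriptor treeIndexBulkLoadOp = new TreeIndexBulkLoadOperatorDescriptor(
        spec,                    // job specification being built
        secondaryRecDesc,        // record descriptor of the sorted input tuples
        fieldPermutation,        // maps input tuple fields to index fields
        fillFactor,              // target page fill factor, e.g. GlobalConfig.DEFAULT_TREE_FILL_FACTOR
        false,                   // assumed: verify that input arrives in index order
        numElementsHint,         // cardinality hint for the load
        false,                   // assumed: check that the target index is empty
        dataflowHelperFactory);  // opens the on-disk index on each partition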
Use of org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor in project asterixdb by apache.
The class AbstractRTreeOperatorTest, method loadSecondaryIndex.
protected void loadSecondaryIndex() throws Exception {
JobSpecification spec = new JobSpecification();
// build dummy tuple containing nothing
ArrayTupleBuilder tb = new ArrayTupleBuilder(primaryKeyFieldCount * 2);
DataOutput dos = tb.getDataOutput();
tb.reset();
new UTF8StringSerializerDeserializer().serialize("0", dos);
tb.addFieldEndOffset();
ISerializerDeserializer[] keyRecDescSers = { new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() };
RecordDescriptor keyRecDesc = new RecordDescriptor(keyRecDescSers);
ConstantTupleSourceOperatorDescriptor keyProviderOp = new ConstantTupleSourceOperatorDescriptor(spec, keyRecDesc, tb.getFieldEndOffsets(), tb.getByteArray(), tb.getSize());
PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, keyProviderOp, NC1_ID);
// - infinity
int[] lowKeyFields = null;
// + infinity
int[] highKeyFields = null;
// scan primary index
BTreeSearchOperatorDescriptor primarySearchOp = new BTreeSearchOperatorDescriptor(spec, primaryRecDesc, lowKeyFields, highKeyFields, true, true, primaryHelperFactory, false, false, null, NoOpOperationCallbackFactory.INSTANCE, null, null, false);
PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, primarySearchOp, NC1_ID);
// load secondary index
int[] fieldPermutation = { 6, 7, 8, 9, 0 };
TreeIndexBulkLoadOperatorDescriptor secondaryBulkLoad = new TreeIndexBulkLoadOperatorDescriptor(spec, secondaryRecDesc, fieldPermutation, 0.7f, false, 1000L, true, secondaryHelperFactory);
PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, secondaryBulkLoad, NC1_ID);
NullSinkOperatorDescriptor nsOpDesc = new NullSinkOperatorDescriptor(spec);
PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, nsOpDesc, NC1_ID);
spec.connect(new OneToOneConnectorDescriptor(spec), keyProviderOp, 0, primarySearchOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), primarySearchOp, 0, secondaryBulkLoad, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), secondaryBulkLoad, 0, nsOpDesc, 0);
spec.addRoot(nsOpDesc);
runTest(spec);
}
Use of org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor in project asterixdb by apache.
The class MetadataProvider, method getBTreeRuntime.
private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getBTreeRuntime(String dataverseName, String datasetName, String indexName, IOperatorSchema propagatedSchema, List<LogicalVariable> primaryKeys, List<LogicalVariable> secondaryKeys, List<LogicalVariable> additionalNonKeyFields, AsterixTupleFilterFactory filterFactory, RecordDescriptor inputRecordDesc, JobGenContext context, JobSpecification spec, IndexOperation indexOp, boolean bulkload, List<LogicalVariable> prevSecondaryKeys, List<LogicalVariable> prevAdditionalFilteringKeys) throws AlgebricksException {
Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataverseName, datasetName);
boolean temp = dataset.getDatasetDetails().isTemp();
isTemporaryDatasetWriteJob = isTemporaryDatasetWriteJob && temp;
int numKeys = primaryKeys.size() + secondaryKeys.size();
int numFilterFields = DatasetUtil.getFilterField(dataset) == null ? 0 : 1;
// generate field permutations
int[] fieldPermutation = new int[numKeys + numFilterFields];
int[] modificationCallbackPrimaryKeyFields = new int[primaryKeys.size()];
int i = 0;
int j = 0;
for (LogicalVariable varKey : secondaryKeys) {
int idx = propagatedSchema.findVariable(varKey);
fieldPermutation[i] = idx;
i++;
}
for (LogicalVariable varKey : primaryKeys) {
int idx = propagatedSchema.findVariable(varKey);
fieldPermutation[i] = idx;
modificationCallbackPrimaryKeyFields[j] = i;
i++;
j++;
}
if (numFilterFields > 0) {
int idx = propagatedSchema.findVariable(additionalNonKeyFields.get(0));
fieldPermutation[numKeys] = idx;
}
int[] prevFieldPermutation = null;
if (indexOp == IndexOperation.UPSERT) {
// generate field permutations for prev record
prevFieldPermutation = new int[numKeys + numFilterFields];
int k = 0;
for (LogicalVariable varKey : prevSecondaryKeys) {
int idx = propagatedSchema.findVariable(varKey);
prevFieldPermutation[k] = idx;
k++;
}
for (LogicalVariable varKey : primaryKeys) {
int idx = propagatedSchema.findVariable(varKey);
prevFieldPermutation[k] = idx;
k++;
}
// Filter can only be one field!
if (numFilterFields > 0) {
int idx = propagatedSchema.findVariable(prevAdditionalFilteringKeys.get(0));
prevFieldPermutation[numKeys] = idx;
}
}
try {
// Index parameters.
Index secondaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(), dataset.getDatasetName(), indexName);
Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint = getSplitProviderAndConstraints(dataset, secondaryIndex.getIndexName());
// prepare callback
JobId jobId = ((JobEventListenerFactory) spec.getJobletEventListenerFactory()).getJobId();
IModificationOperationCallbackFactory modificationCallbackFactory = dataset.getModificationCallbackFactory(storaegComponentProvider, secondaryIndex, jobId, indexOp, modificationCallbackPrimaryKeyFields);
IIndexDataflowHelperFactory idfh = new IndexDataflowHelperFactory(storaegComponentProvider.getStorageManager(), splitsAndConstraint.first);
IOperatorDescriptor op;
if (bulkload) {
long numElementsHint = getCardinalityPerPartitionHint(dataset);
op = new TreeIndexBulkLoadOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, GlobalConfig.DEFAULT_TREE_FILL_FACTOR, false, numElementsHint, false, idfh);
} else if (indexOp == IndexOperation.UPSERT) {
op = new LSMSecondaryUpsertOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, idfh, filterFactory, modificationCallbackFactory, prevFieldPermutation);
} else {
op = new LSMTreeInsertDeleteOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, indexOp, idfh, filterFactory, false, modificationCallbackFactory);
}
return new Pair<>(op, splitsAndConstraint.second);
} catch (Exception e) {
throw new AlgebricksException(e);
}
}
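The permutation built above always lays out secondary keys first, then primary keys, then the optional filter field, while modificationCallbackPrimaryKeyFields records where the primary keys land inside that permutation. The following self-contained illustration uses invented schema positions; only the ordering rule matters.
// Hypothetical illustration of the permutation layout built in getBTreeRuntime
// (schema positions are invented; only the ordering rule matters).
import java.util.Arrays;

public class FieldPermutationExample {
    public static void main(String[] args) {
        int[] secondaryKeyPositions = { 3, 4 }; // where the secondary keys sit in the propagated schema
        int[] primaryKeyPositions = { 0 };      // where the primary key sits
        int filterPosition = 5;                 // the single optional filter field

        int numKeys = secondaryKeyPositions.length + primaryKeyPositions.length;
        int[] fieldPermutation = new int[numKeys + 1];
        int[] callbackPrimaryKeyFields = new int[primaryKeyPositions.length];
        int i = 0;
        for (int pos : secondaryKeyPositions) {
            fieldPermutation[i++] = pos;        // secondary keys first
        }
        for (int j = 0; j < primaryKeyPositions.length; j++) {
            callbackPrimaryKeyFields[j] = i;    // remember where each primary key lands
            fieldPermutation[i++] = primaryKeyPositions[j];
        }
        fieldPermutation[numKeys] = filterPosition; // filter field last

        System.out.println(Arrays.toString(fieldPermutation));         // [3, 4, 0, 5]
        System.out.println(Arrays.toString(callbackPrimaryKeyFields)); // [2]
    }
}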