Search in sources :

Example 1 with IOperatorDescriptor

use of org.apache.hyracks.api.dataflow.IOperatorDescriptor in project asterixdb by apache.

the class FeedOperations method combineIntakeCollectJobs.

private static JobSpecification combineIntakeCollectJobs(MetadataProvider metadataProvider, Feed feed, JobSpecification intakeJob, List<JobSpecification> jobsList, List<FeedConnection> feedConnections, String[] intakeLocations) throws AlgebricksException, HyracksDataException {
    JobSpecification jobSpec = new JobSpecification(intakeJob.getFrameSize());
    // copy ingestor
    FeedIntakeOperatorDescriptor firstOp = (FeedIntakeOperatorDescriptor) intakeJob.getOperatorMap().get(new OperatorDescriptorId(0));
    FeedIntakeOperatorDescriptor ingestionOp;
    if (firstOp.getAdaptorFactory() == null) {
        ingestionOp = new FeedIntakeOperatorDescriptor(jobSpec, feed, firstOp.getAdaptorLibraryName(), firstOp.getAdaptorFactoryClassName(), firstOp.getAdapterOutputType(), firstOp.getPolicyAccessor(), firstOp.getOutputRecordDescriptors()[0]);
    } else {
        ingestionOp = new FeedIntakeOperatorDescriptor(jobSpec, feed, firstOp.getAdaptorFactory(), firstOp.getAdapterOutputType(), firstOp.getPolicyAccessor(), firstOp.getOutputRecordDescriptors()[0]);
    }
    // create replicator
    ReplicateOperatorDescriptor replicateOp = new ReplicateOperatorDescriptor(jobSpec, ingestionOp.getOutputRecordDescriptors()[0], jobsList.size());
    jobSpec.connect(new OneToOneConnectorDescriptor(jobSpec), ingestionOp, 0, replicateOp, 0);
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, ingestionOp, intakeLocations);
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, replicateOp, intakeLocations);
    // Loop over the jobs to copy operators and connections
    Map<OperatorDescriptorId, OperatorDescriptorId> operatorIdMapping = new HashMap<>();
    Map<ConnectorDescriptorId, ConnectorDescriptorId> connectorIdMapping = new HashMap<>();
    Map<OperatorDescriptorId, List<LocationConstraint>> operatorLocations = new HashMap<>();
    Map<OperatorDescriptorId, Integer> operatorCounts = new HashMap<>();
    List<JobId> jobIds = new ArrayList<>();
    FeedMetaOperatorDescriptor metaOp;
    for (int iter1 = 0; iter1 < jobsList.size(); iter1++) {
        FeedConnection curFeedConnection = feedConnections.get(iter1);
        JobSpecification subJob = jobsList.get(iter1);
        operatorIdMapping.clear();
        Map<OperatorDescriptorId, IOperatorDescriptor> operatorsMap = subJob.getOperatorMap();
        String datasetName = feedConnections.get(iter1).getDatasetName();
        FeedConnectionId feedConnectionId = new FeedConnectionId(ingestionOp.getEntityId(), datasetName);
        FeedPolicyEntity feedPolicyEntity = FeedMetadataUtil.validateIfPolicyExists(curFeedConnection.getDataverseName(), curFeedConnection.getPolicyName(), metadataProvider.getMetadataTxnContext());
        for (Map.Entry<OperatorDescriptorId, IOperatorDescriptor> entry : operatorsMap.entrySet()) {
            IOperatorDescriptor opDesc = entry.getValue();
            OperatorDescriptorId oldId = opDesc.getOperatorId();
            OperatorDescriptorId opId = null;
            if (opDesc instanceof LSMTreeInsertDeleteOperatorDescriptor && ((LSMTreeInsertDeleteOperatorDescriptor) opDesc).isPrimary()) {
                metaOp = new FeedMetaOperatorDescriptor(jobSpec, feedConnectionId, opDesc, feedPolicyEntity.getProperties(), FeedRuntimeType.STORE);
                opId = metaOp.getOperatorId();
                opDesc.setOperatorId(opId);
            } else {
                if (opDesc instanceof AlgebricksMetaOperatorDescriptor) {
                    AlgebricksMetaOperatorDescriptor algOp = (AlgebricksMetaOperatorDescriptor) opDesc;
                    IPushRuntimeFactory[] runtimeFactories = algOp.getPipeline().getRuntimeFactories();
                    // Tweak AssignOp to work with messages
                    if (runtimeFactories[0] instanceof AssignRuntimeFactory && runtimeFactories.length > 1) {
                        IConnectorDescriptor connectorDesc = subJob.getOperatorInputMap().get(opDesc.getOperatorId()).get(0);
                        // anything on the network interface needs to be message compatible
                        if (connectorDesc instanceof MToNPartitioningConnectorDescriptor) {
                            metaOp = new FeedMetaOperatorDescriptor(jobSpec, feedConnectionId, opDesc, feedPolicyEntity.getProperties(), FeedRuntimeType.COMPUTE);
                            opId = metaOp.getOperatorId();
                            opDesc.setOperatorId(opId);
                        }
                    }
                }
                if (opId == null) {
                    opId = jobSpec.createOperatorDescriptorId(opDesc);
                }
            }
            operatorIdMapping.put(oldId, opId);
        }
        // copy connectors
        connectorIdMapping.clear();
        for (Entry<ConnectorDescriptorId, IConnectorDescriptor> entry : subJob.getConnectorMap().entrySet()) {
            IConnectorDescriptor connDesc = entry.getValue();
            ConnectorDescriptorId newConnId;
            if (connDesc instanceof MToNPartitioningConnectorDescriptor) {
                MToNPartitioningConnectorDescriptor m2nConn = (MToNPartitioningConnectorDescriptor) connDesc;
                connDesc = new MToNPartitioningWithMessageConnectorDescriptor(jobSpec, m2nConn.getTuplePartitionComputerFactory());
                newConnId = connDesc.getConnectorId();
            } else {
                newConnId = jobSpec.createConnectorDescriptor(connDesc);
            }
            connectorIdMapping.put(entry.getKey(), newConnId);
        }
        // make connections between operators
        for (Entry<ConnectorDescriptorId, Pair<Pair<IOperatorDescriptor, Integer>, Pair<IOperatorDescriptor, Integer>>> entry : subJob.getConnectorOperatorMap().entrySet()) {
            ConnectorDescriptorId newId = connectorIdMapping.get(entry.getKey());
            IConnectorDescriptor connDesc = jobSpec.getConnectorMap().get(newId);
            Pair<IOperatorDescriptor, Integer> leftOp = entry.getValue().getLeft();
            Pair<IOperatorDescriptor, Integer> rightOp = entry.getValue().getRight();
            IOperatorDescriptor leftOpDesc = jobSpec.getOperatorMap().get(leftOp.getLeft().getOperatorId());
            IOperatorDescriptor rightOpDesc = jobSpec.getOperatorMap().get(rightOp.getLeft().getOperatorId());
            if (leftOp.getLeft() instanceof FeedCollectOperatorDescriptor) {
                jobSpec.connect(new OneToOneConnectorDescriptor(jobSpec), replicateOp, iter1, leftOpDesc, leftOp.getRight());
            }
            jobSpec.connect(connDesc, leftOpDesc, leftOp.getRight(), rightOpDesc, rightOp.getRight());
        }
        // prepare for setting partition constraints
        operatorLocations.clear();
        operatorCounts.clear();
        for (Constraint constraint : subJob.getUserConstraints()) {
            LValueConstraintExpression lexpr = constraint.getLValue();
            ConstraintExpression cexpr = constraint.getRValue();
            OperatorDescriptorId opId;
            switch(lexpr.getTag()) {
                case PARTITION_COUNT:
                    opId = ((PartitionCountExpression) lexpr).getOperatorDescriptorId();
                    operatorCounts.put(operatorIdMapping.get(opId), (int) ((ConstantExpression) cexpr).getValue());
                    break;
                case PARTITION_LOCATION:
                    opId = ((PartitionLocationExpression) lexpr).getOperatorDescriptorId();
                    IOperatorDescriptor opDesc = jobSpec.getOperatorMap().get(operatorIdMapping.get(opId));
                    List<LocationConstraint> locations = operatorLocations.get(opDesc.getOperatorId());
                    if (locations == null) {
                        locations = new ArrayList<>();
                        operatorLocations.put(opDesc.getOperatorId(), locations);
                    }
                    String location = (String) ((ConstantExpression) cexpr).getValue();
                    LocationConstraint lc = new LocationConstraint(location, ((PartitionLocationExpression) lexpr).getPartition());
                    locations.add(lc);
                    break;
                default:
                    break;
            }
        }
        // set absolute location constraints
        for (Entry<OperatorDescriptorId, List<LocationConstraint>> entry : operatorLocations.entrySet()) {
            IOperatorDescriptor opDesc = jobSpec.getOperatorMap().get(entry.getKey());
            // why do we need to sort?
            Collections.sort(entry.getValue(), (LocationConstraint o1, LocationConstraint o2) -> {
                return o1.partition - o2.partition;
            });
            String[] locations = new String[entry.getValue().size()];
            for (int j = 0; j < locations.length; ++j) {
                locations[j] = entry.getValue().get(j).location;
            }
            PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, opDesc, locations);
        }
        // set count constraints
        for (Entry<OperatorDescriptorId, Integer> entry : operatorCounts.entrySet()) {
            IOperatorDescriptor opDesc = jobSpec.getOperatorMap().get(entry.getKey());
            if (!operatorLocations.keySet().contains(entry.getKey())) {
                PartitionConstraintHelper.addPartitionCountConstraint(jobSpec, opDesc, entry.getValue());
            }
        }
        // roots
        for (OperatorDescriptorId root : subJob.getRoots()) {
            jobSpec.addRoot(jobSpec.getOperatorMap().get(operatorIdMapping.get(root)));
        }
        jobIds.add(((JobEventListenerFactory) subJob.getJobletEventListenerFactory()).getJobId());
    }
    // jobEventListenerFactory
    jobSpec.setJobletEventListenerFactory(new MultiTransactionJobletEventListenerFactory(jobIds, true));
    // useConnectorSchedulingPolicy
    jobSpec.setUseConnectorPolicyForScheduling(jobsList.get(0).isUseConnectorPolicyForScheduling());
    // connectorAssignmentPolicy
    jobSpec.setConnectorPolicyAssignmentPolicy(jobsList.get(0).getConnectorPolicyAssignmentPolicy());
    return jobSpec;
}
Also used : HashMap(java.util.HashMap) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) AlgebricksAbsolutePartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint) Constraint(org.apache.hyracks.api.constraints.Constraint) LocationConstraint(org.apache.asterix.metadata.feeds.LocationConstraint) ConstantExpression(org.apache.hyracks.api.constraints.expressions.ConstantExpression) ConnectorDescriptorId(org.apache.hyracks.api.dataflow.ConnectorDescriptorId) ArrayList(java.util.ArrayList) OneToOneConnectorDescriptor(org.apache.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor) LocationConstraint(org.apache.asterix.metadata.feeds.LocationConstraint) List(java.util.List) ArrayList(java.util.ArrayList) AlgebricksMetaOperatorDescriptor(org.apache.hyracks.algebricks.runtime.operators.meta.AlgebricksMetaOperatorDescriptor) LValueConstraintExpression(org.apache.hyracks.api.constraints.expressions.LValueConstraintExpression) ReplicateOperatorDescriptor(org.apache.hyracks.dataflow.std.misc.ReplicateOperatorDescriptor) Map(java.util.Map) HashMap(java.util.HashMap) FeedCollectOperatorDescriptor(org.apache.asterix.external.operators.FeedCollectOperatorDescriptor) FeedPolicyEntity(org.apache.asterix.metadata.entities.FeedPolicyEntity) JobSpecification(org.apache.hyracks.api.job.JobSpecification) FeedIntakeOperatorDescriptor(org.apache.asterix.external.operators.FeedIntakeOperatorDescriptor) JobId(org.apache.asterix.common.transactions.JobId) Pair(org.apache.commons.lang3.tuple.Pair) IConnectorDescriptor(org.apache.hyracks.api.dataflow.IConnectorDescriptor) MToNPartitioningWithMessageConnectorDescriptor(org.apache.hyracks.dataflow.std.connectors.MToNPartitioningWithMessageConnectorDescriptor) OperatorDescriptorId(org.apache.hyracks.api.dataflow.OperatorDescriptorId) LValueConstraintExpression(org.apache.hyracks.api.constraints.expressions.LValueConstraintExpression) ConstraintExpression(org.apache.hyracks.api.constraints.expressions.ConstraintExpression) FeedConnection(org.apache.asterix.metadata.entities.FeedConnection) MultiTransactionJobletEventListenerFactory(org.apache.asterix.runtime.job.listener.MultiTransactionJobletEventListenerFactory) MToNPartitioningConnectorDescriptor(org.apache.hyracks.dataflow.std.connectors.MToNPartitioningConnectorDescriptor) AssignRuntimeFactory(org.apache.hyracks.algebricks.runtime.operators.std.AssignRuntimeFactory) IPushRuntimeFactory(org.apache.hyracks.algebricks.runtime.base.IPushRuntimeFactory) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) AlgebricksAbsolutePartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint) Constraint(org.apache.hyracks.api.constraints.Constraint) LocationConstraint(org.apache.asterix.metadata.feeds.LocationConstraint) FeedMetaOperatorDescriptor(org.apache.asterix.external.operators.FeedMetaOperatorDescriptor) IOperatorDescriptor(org.apache.hyracks.api.dataflow.IOperatorDescriptor) FeedConnectionId(org.apache.asterix.external.feed.management.FeedConnectionId) LSMTreeInsertDeleteOperatorDescriptor(org.apache.asterix.common.dataflow.LSMTreeInsertDeleteOperatorDescriptor)

Example 2 with IOperatorDescriptor

use of org.apache.hyracks.api.dataflow.IOperatorDescriptor in project asterixdb by apache.

the class MetadataProvider method getBTreeRuntime.

private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getBTreeRuntime(String dataverseName, String datasetName, String indexName, IOperatorSchema propagatedSchema, List<LogicalVariable> primaryKeys, List<LogicalVariable> secondaryKeys, List<LogicalVariable> additionalNonKeyFields, AsterixTupleFilterFactory filterFactory, RecordDescriptor inputRecordDesc, JobGenContext context, JobSpecification spec, IndexOperation indexOp, boolean bulkload, List<LogicalVariable> prevSecondaryKeys, List<LogicalVariable> prevAdditionalFilteringKeys) throws AlgebricksException {
    Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataverseName, datasetName);
    boolean temp = dataset.getDatasetDetails().isTemp();
    isTemporaryDatasetWriteJob = isTemporaryDatasetWriteJob && temp;
    int numKeys = primaryKeys.size() + secondaryKeys.size();
    int numFilterFields = DatasetUtil.getFilterField(dataset) == null ? 0 : 1;
    // generate field permutations
    int[] fieldPermutation = new int[numKeys + numFilterFields];
    int[] modificationCallbackPrimaryKeyFields = new int[primaryKeys.size()];
    int i = 0;
    int j = 0;
    for (LogicalVariable varKey : secondaryKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        i++;
    }
    for (LogicalVariable varKey : primaryKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        modificationCallbackPrimaryKeyFields[j] = i;
        i++;
        j++;
    }
    if (numFilterFields > 0) {
        int idx = propagatedSchema.findVariable(additionalNonKeyFields.get(0));
        fieldPermutation[numKeys] = idx;
    }
    int[] prevFieldPermutation = null;
    if (indexOp == IndexOperation.UPSERT) {
        // generate field permutations for prev record
        prevFieldPermutation = new int[numKeys + numFilterFields];
        int k = 0;
        for (LogicalVariable varKey : prevSecondaryKeys) {
            int idx = propagatedSchema.findVariable(varKey);
            prevFieldPermutation[k] = idx;
            k++;
        }
        for (LogicalVariable varKey : primaryKeys) {
            int idx = propagatedSchema.findVariable(varKey);
            prevFieldPermutation[k] = idx;
            k++;
        }
        // Filter can only be one field!
        if (numFilterFields > 0) {
            int idx = propagatedSchema.findVariable(prevAdditionalFilteringKeys.get(0));
            prevFieldPermutation[numKeys] = idx;
        }
    }
    try {
        // Index parameters.
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(), dataset.getDatasetName(), indexName);
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint = getSplitProviderAndConstraints(dataset, secondaryIndex.getIndexName());
        // prepare callback
        JobId jobId = ((JobEventListenerFactory) spec.getJobletEventListenerFactory()).getJobId();
        IModificationOperationCallbackFactory modificationCallbackFactory = dataset.getModificationCallbackFactory(storaegComponentProvider, secondaryIndex, jobId, indexOp, modificationCallbackPrimaryKeyFields);
        IIndexDataflowHelperFactory idfh = new IndexDataflowHelperFactory(storaegComponentProvider.getStorageManager(), splitsAndConstraint.first);
        IOperatorDescriptor op;
        if (bulkload) {
            long numElementsHint = getCardinalityPerPartitionHint(dataset);
            op = new TreeIndexBulkLoadOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, GlobalConfig.DEFAULT_TREE_FILL_FACTOR, false, numElementsHint, false, idfh);
        } else if (indexOp == IndexOperation.UPSERT) {
            op = new LSMSecondaryUpsertOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, idfh, filterFactory, modificationCallbackFactory, prevFieldPermutation);
        } else {
            op = new LSMTreeInsertDeleteOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, indexOp, idfh, filterFactory, false, modificationCallbackFactory);
        }
        return new Pair<>(op, splitsAndConstraint.second);
    } catch (Exception e) {
        throw new AlgebricksException(e);
    }
}
Also used : LogicalVariable(org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable) Dataset(org.apache.asterix.metadata.entities.Dataset) IFileSplitProvider(org.apache.hyracks.dataflow.std.file.IFileSplitProvider) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) Index(org.apache.asterix.metadata.entities.Index) IDataSourceIndex(org.apache.hyracks.algebricks.core.algebra.metadata.IDataSourceIndex) JobEventListenerFactory(org.apache.asterix.runtime.job.listener.JobEventListenerFactory) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) DatasetCardinalityHint(org.apache.asterix.metadata.dataset.hints.DatasetHints.DatasetCardinalityHint) AlgebricksAbsolutePartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint) MetadataException(org.apache.asterix.metadata.MetadataException) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) CompilationException(org.apache.asterix.common.exceptions.CompilationException) IOException(java.io.IOException) AsterixException(org.apache.asterix.common.exceptions.AsterixException) LSMSecondaryUpsertOperatorDescriptor(org.apache.asterix.runtime.operators.LSMSecondaryUpsertOperatorDescriptor) IIndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory) IOperatorDescriptor(org.apache.hyracks.api.dataflow.IOperatorDescriptor) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) IModificationOperationCallbackFactory(org.apache.hyracks.storage.am.common.api.IModificationOperationCallbackFactory) LSMTreeInsertDeleteOperatorDescriptor(org.apache.asterix.common.dataflow.LSMTreeInsertDeleteOperatorDescriptor) TreeIndexBulkLoadOperatorDescriptor(org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor) IndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IndexDataflowHelperFactory) IIndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory) JobId(org.apache.asterix.common.transactions.JobId) Pair(org.apache.hyracks.algebricks.common.utils.Pair)

Example 3 with IOperatorDescriptor

use of org.apache.hyracks.api.dataflow.IOperatorDescriptor in project asterixdb by apache.

the class MetadataProvider method getRTreeRuntime.

private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getRTreeRuntime(String dataverseName, String datasetName, String indexName, IOperatorSchema propagatedSchema, List<LogicalVariable> primaryKeys, List<LogicalVariable> secondaryKeys, List<LogicalVariable> additionalNonKeyFields, AsterixTupleFilterFactory filterFactory, RecordDescriptor recordDesc, JobGenContext context, JobSpecification spec, IndexOperation indexOp, boolean bulkload, List<LogicalVariable> prevSecondaryKeys, List<LogicalVariable> prevAdditionalFilteringKeys) throws AlgebricksException {
    try {
        Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataverseName, datasetName);
        boolean temp = dataset.getDatasetDetails().isTemp();
        isTemporaryDatasetWriteJob = isTemporaryDatasetWriteJob && temp;
        String itemTypeName = dataset.getItemTypeName();
        IAType itemType = MetadataManager.INSTANCE.getDatatype(mdTxnCtx, dataset.getItemTypeDataverseName(), itemTypeName).getDatatype();
        validateRecordType(itemType);
        ARecordType recType = (ARecordType) itemType;
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(), dataset.getDatasetName(), indexName);
        List<List<String>> secondaryKeyExprs = secondaryIndex.getKeyFieldNames();
        List<IAType> secondaryKeyTypes = secondaryIndex.getKeyFieldTypes();
        Pair<IAType, Boolean> keyPairType = Index.getNonNullableOpenFieldType(secondaryKeyTypes.get(0), secondaryKeyExprs.get(0), recType);
        IAType spatialType = keyPairType.first;
        int dimension = NonTaggedFormatUtil.getNumDimensions(spatialType.getTypeTag());
        int numSecondaryKeys = dimension * 2;
        int numPrimaryKeys = primaryKeys.size();
        int numKeys = numSecondaryKeys + numPrimaryKeys;
        int numFilterFields = DatasetUtil.getFilterField(dataset) == null ? 0 : 1;
        int[] fieldPermutation = new int[numKeys + numFilterFields];
        int[] modificationCallbackPrimaryKeyFields = new int[primaryKeys.size()];
        int i = 0;
        int j = 0;
        for (LogicalVariable varKey : secondaryKeys) {
            int idx = propagatedSchema.findVariable(varKey);
            fieldPermutation[i] = idx;
            i++;
        }
        for (LogicalVariable varKey : primaryKeys) {
            int idx = propagatedSchema.findVariable(varKey);
            fieldPermutation[i] = idx;
            modificationCallbackPrimaryKeyFields[j] = i;
            i++;
            j++;
        }
        if (numFilterFields > 0) {
            int idx = propagatedSchema.findVariable(additionalNonKeyFields.get(0));
            fieldPermutation[numKeys] = idx;
        }
        int[] prevFieldPermutation = null;
        if (indexOp == IndexOperation.UPSERT) {
            // Get field permutation for previous value
            prevFieldPermutation = new int[numKeys + numFilterFields];
            i = 0;
            // Get field permutation for new value
            for (LogicalVariable varKey : prevSecondaryKeys) {
                int idx = propagatedSchema.findVariable(varKey);
                prevFieldPermutation[i] = idx;
                i++;
            }
            for (int k = 0; k < numPrimaryKeys; k++) {
                prevFieldPermutation[k + i] = fieldPermutation[k + i];
                i++;
            }
            if (numFilterFields > 0) {
                int idx = propagatedSchema.findVariable(prevAdditionalFilteringKeys.get(0));
                prevFieldPermutation[numKeys] = idx;
            }
        }
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint = getSplitProviderAndConstraints(dataset, secondaryIndex.getIndexName());
        // prepare callback
        JobId jobId = ((JobEventListenerFactory) spec.getJobletEventListenerFactory()).getJobId();
        IModificationOperationCallbackFactory modificationCallbackFactory = dataset.getModificationCallbackFactory(storaegComponentProvider, secondaryIndex, jobId, indexOp, modificationCallbackPrimaryKeyFields);
        IIndexDataflowHelperFactory indexDataflowHelperFactory = new IndexDataflowHelperFactory(storaegComponentProvider.getStorageManager(), splitsAndConstraint.first);
        IOperatorDescriptor op;
        if (bulkload) {
            long numElementsHint = getCardinalityPerPartitionHint(dataset);
            op = new TreeIndexBulkLoadOperatorDescriptor(spec, recordDesc, fieldPermutation, GlobalConfig.DEFAULT_TREE_FILL_FACTOR, false, numElementsHint, false, indexDataflowHelperFactory);
        } else if (indexOp == IndexOperation.UPSERT) {
            op = new LSMSecondaryUpsertOperatorDescriptor(spec, recordDesc, fieldPermutation, indexDataflowHelperFactory, filterFactory, modificationCallbackFactory, prevFieldPermutation);
        } else {
            op = new LSMTreeInsertDeleteOperatorDescriptor(spec, recordDesc, fieldPermutation, indexOp, indexDataflowHelperFactory, filterFactory, false, modificationCallbackFactory);
        }
        return new Pair<>(op, splitsAndConstraint.second);
    } catch (MetadataException e) {
        throw new AlgebricksException(e);
    }
}
Also used : IFileSplitProvider(org.apache.hyracks.dataflow.std.file.IFileSplitProvider) Index(org.apache.asterix.metadata.entities.Index) IDataSourceIndex(org.apache.hyracks.algebricks.core.algebra.metadata.IDataSourceIndex) MetadataException(org.apache.asterix.metadata.MetadataException) LSMSecondaryUpsertOperatorDescriptor(org.apache.asterix.runtime.operators.LSMSecondaryUpsertOperatorDescriptor) LockList(org.apache.asterix.metadata.lock.LockList) ArrayList(java.util.ArrayList) List(java.util.List) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) TreeIndexBulkLoadOperatorDescriptor(org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor) IndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IndexDataflowHelperFactory) IIndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory) JobId(org.apache.asterix.common.transactions.JobId) Pair(org.apache.hyracks.algebricks.common.utils.Pair) LogicalVariable(org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable) Dataset(org.apache.asterix.metadata.entities.Dataset) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) JobEventListenerFactory(org.apache.asterix.runtime.job.listener.JobEventListenerFactory) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) DatasetCardinalityHint(org.apache.asterix.metadata.dataset.hints.DatasetHints.DatasetCardinalityHint) AlgebricksAbsolutePartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint) IIndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory) IOperatorDescriptor(org.apache.hyracks.api.dataflow.IOperatorDescriptor) IModificationOperationCallbackFactory(org.apache.hyracks.storage.am.common.api.IModificationOperationCallbackFactory) LSMTreeInsertDeleteOperatorDescriptor(org.apache.asterix.common.dataflow.LSMTreeInsertDeleteOperatorDescriptor) ARecordType(org.apache.asterix.om.types.ARecordType) IAType(org.apache.asterix.om.types.IAType)

Example 4 with IOperatorDescriptor

use of org.apache.hyracks.api.dataflow.IOperatorDescriptor in project asterixdb by apache.

the class MetadataProvider method getBinaryTokenizerRuntime.

// Get a Tokenizer for the bulk-loading data into a n-gram or keyword index.
private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getBinaryTokenizerRuntime(String dataverseName, String datasetName, String indexName, IOperatorSchema inputSchema, IOperatorSchema propagatedSchema, List<LogicalVariable> primaryKeys, List<LogicalVariable> secondaryKeys, RecordDescriptor recordDesc, JobSpecification spec, IndexType indexType) throws AlgebricksException {
    // Sanity checks.
    if (primaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite primary key.");
    }
    if (secondaryKeys.size() > 1) {
        throw new AlgebricksException("Cannot tokenize composite secondary key fields.");
    }
    boolean isPartitioned;
    if (indexType == IndexType.LENGTH_PARTITIONED_WORD_INVIX || indexType == IndexType.LENGTH_PARTITIONED_NGRAM_INVIX) {
        isPartitioned = true;
    } else {
        isPartitioned = false;
    }
    // Number of Keys that needs to be propagated
    int numKeys = inputSchema.getSize();
    // Get the rest of Logical Variables that are not (PK or SK) and each
    // variable's positions.
    // These variables will be propagated through TokenizeOperator.
    List<LogicalVariable> otherKeys = new ArrayList<>();
    if (inputSchema.getSize() > 0) {
        for (int k = 0; k < inputSchema.getSize(); k++) {
            boolean found = false;
            for (LogicalVariable varKey : primaryKeys) {
                if (varKey.equals(inputSchema.getVariable(k))) {
                    found = true;
                    break;
                } else {
                    found = false;
                }
            }
            if (!found) {
                for (LogicalVariable varKey : secondaryKeys) {
                    if (varKey.equals(inputSchema.getVariable(k))) {
                        found = true;
                        break;
                    } else {
                        found = false;
                    }
                }
            }
            if (!found) {
                otherKeys.add(inputSchema.getVariable(k));
            }
        }
    }
    // For tokenization, sorting and loading.
    // One token (+ optional partitioning field) + primary keys + secondary
    // keys + other variables
    // secondary keys and other variables will be just passed to the
    // IndexInsertDelete Operator.
    int numTokenKeyPairFields = (!isPartitioned) ? 1 + numKeys : 2 + numKeys;
    // generate field permutations for the input
    int[] fieldPermutation = new int[numKeys];
    int[] modificationCallbackPrimaryKeyFields = new int[primaryKeys.size()];
    int i = 0;
    int j = 0;
    for (LogicalVariable varKey : primaryKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        modificationCallbackPrimaryKeyFields[j] = i;
        i++;
        j++;
    }
    for (LogicalVariable varKey : otherKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        i++;
    }
    for (LogicalVariable varKey : secondaryKeys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        i++;
    }
    Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataverseName, datasetName);
    String itemTypeName = dataset.getItemTypeName();
    IAType itemType;
    try {
        itemType = MetadataManager.INSTANCE.getDatatype(mdTxnCtx, dataset.getItemTypeDataverseName(), itemTypeName).getDatatype();
        if (itemType.getTypeTag() != ATypeTag.OBJECT) {
            throw new AlgebricksException("Only record types can be tokenized.");
        }
        ARecordType recType = (ARecordType) itemType;
        // Index parameters.
        Index secondaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(), dataset.getDatasetName(), indexName);
        List<List<String>> secondaryKeyExprs = secondaryIndex.getKeyFieldNames();
        List<IAType> secondaryKeyTypeEntries = secondaryIndex.getKeyFieldTypes();
        int numTokenFields = (!isPartitioned) ? secondaryKeys.size() : secondaryKeys.size() + 1;
        ITypeTraits[] tokenTypeTraits = new ITypeTraits[numTokenFields];
        ITypeTraits[] invListsTypeTraits = new ITypeTraits[primaryKeys.size()];
        // Find the key type of the secondary key. If it's a derived type,
        // return the derived type.
        // e.g. UNORDERED LIST -> return UNORDERED LIST type
        IAType secondaryKeyType;
        Pair<IAType, Boolean> keyPairType = Index.getNonNullableOpenFieldType(secondaryKeyTypeEntries.get(0), secondaryKeyExprs.get(0), recType);
        secondaryKeyType = keyPairType.first;
        List<List<String>> partitioningKeys = dataset.getPrimaryKeys();
        i = 0;
        for (List<String> partitioningKey : partitioningKeys) {
            IAType keyType = recType.getSubFieldType(partitioningKey);
            invListsTypeTraits[i] = TypeTraitProvider.INSTANCE.getTypeTrait(keyType);
            ++i;
        }
        tokenTypeTraits[0] = NonTaggedFormatUtil.getTokenTypeTrait(secondaryKeyType);
        if (isPartitioned) {
            // The partitioning field is hardcoded to be a short *without*
            // an Asterix type tag.
            tokenTypeTraits[1] = ShortPointable.TYPE_TRAITS;
        }
        IBinaryTokenizerFactory tokenizerFactory = NonTaggedFormatUtil.getBinaryTokenizerFactory(secondaryKeyType.getTypeTag(), indexType, secondaryIndex.getGramLength());
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint = getSplitProviderAndConstraints(dataset, secondaryIndex.getIndexName());
        // Generate Output Record format
        ISerializerDeserializer<?>[] tokenKeyPairFields = new ISerializerDeserializer[numTokenKeyPairFields];
        ITypeTraits[] tokenKeyPairTypeTraits = new ITypeTraits[numTokenKeyPairFields];
        ISerializerDeserializerProvider serdeProvider = FormatUtils.getDefaultFormat().getSerdeProvider();
        // #1. propagate all input variables
        for (int k = 0; k < recordDesc.getFieldCount(); k++) {
            tokenKeyPairFields[k] = recordDesc.getFields()[k];
            tokenKeyPairTypeTraits[k] = recordDesc.getTypeTraits()[k];
        }
        int tokenOffset = recordDesc.getFieldCount();
        // #2. Specify the token type
        tokenKeyPairFields[tokenOffset] = serdeProvider.getSerializerDeserializer(secondaryKeyType);
        tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[0];
        tokenOffset++;
        // #3. Specify the length-partitioning key: number of token
        if (isPartitioned) {
            tokenKeyPairFields[tokenOffset] = ShortSerializerDeserializer.INSTANCE;
            tokenKeyPairTypeTraits[tokenOffset] = tokenTypeTraits[1];
        }
        RecordDescriptor tokenKeyPairRecDesc = new RecordDescriptor(tokenKeyPairFields, tokenKeyPairTypeTraits);
        IOperatorDescriptor tokenizerOp;
        // Keys to be tokenized : SK
        int docField = fieldPermutation[fieldPermutation.length - 1];
        // Keys to be propagated
        int[] keyFields = new int[numKeys];
        for (int k = 0; k < keyFields.length; k++) {
            keyFields[k] = k;
        }
        tokenizerOp = new BinaryTokenizerOperatorDescriptor(spec, tokenKeyPairRecDesc, tokenizerFactory, docField, keyFields, isPartitioned, true);
        return new Pair<>(tokenizerOp, splitsAndConstraint.second);
    } catch (Exception e) {
        throw new AlgebricksException(e);
    }
}
Also used : IFileSplitProvider(org.apache.hyracks.dataflow.std.file.IFileSplitProvider) RecordDescriptor(org.apache.hyracks.api.dataflow.value.RecordDescriptor) ArrayList(java.util.ArrayList) Index(org.apache.asterix.metadata.entities.Index) IDataSourceIndex(org.apache.hyracks.algebricks.core.algebra.metadata.IDataSourceIndex) ISerializerDeserializerProvider(org.apache.hyracks.algebricks.data.ISerializerDeserializerProvider) IBinaryTokenizerFactory(org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory) LockList(org.apache.asterix.metadata.lock.LockList) ArrayList(java.util.ArrayList) List(java.util.List) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) Pair(org.apache.hyracks.algebricks.common.utils.Pair) LogicalVariable(org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable) ITypeTraits(org.apache.hyracks.api.dataflow.value.ITypeTraits) Dataset(org.apache.asterix.metadata.entities.Dataset) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) DatasetCardinalityHint(org.apache.asterix.metadata.dataset.hints.DatasetHints.DatasetCardinalityHint) AlgebricksAbsolutePartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint) ISerializerDeserializer(org.apache.hyracks.api.dataflow.value.ISerializerDeserializer) MetadataException(org.apache.asterix.metadata.MetadataException) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) CompilationException(org.apache.asterix.common.exceptions.CompilationException) IOException(java.io.IOException) AsterixException(org.apache.asterix.common.exceptions.AsterixException) BinaryTokenizerOperatorDescriptor(org.apache.hyracks.storage.am.lsm.invertedindex.dataflow.BinaryTokenizerOperatorDescriptor) IOperatorDescriptor(org.apache.hyracks.api.dataflow.IOperatorDescriptor) ARecordType(org.apache.asterix.om.types.ARecordType) IAType(org.apache.asterix.om.types.IAType)

Example 5 with IOperatorDescriptor

use of org.apache.hyracks.api.dataflow.IOperatorDescriptor in project asterixdb by apache.

the class MetadataProvider method getInsertOrDeleteRuntime.

private Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getInsertOrDeleteRuntime(IndexOperation indexOp, IDataSource<DataSourceId> dataSource, IOperatorSchema propagatedSchema, List<LogicalVariable> keys, LogicalVariable payload, List<LogicalVariable> additionalNonKeyFields, RecordDescriptor inputRecordDesc, JobGenContext context, JobSpecification spec, boolean bulkload, List<LogicalVariable> additionalNonFilteringFields) throws AlgebricksException {
    String datasetName = dataSource.getId().getDatasourceName();
    Dataset dataset = MetadataManagerUtil.findExistingDataset(mdTxnCtx, dataSource.getId().getDataverseName(), datasetName);
    boolean temp = dataset.getDatasetDetails().isTemp();
    isTemporaryDatasetWriteJob = isTemporaryDatasetWriteJob && temp;
    int numKeys = keys.size();
    int numFilterFields = DatasetUtil.getFilterField(dataset) == null ? 0 : 1;
    // Move key fields to front.
    int[] fieldPermutation = new int[numKeys + 1 + numFilterFields + (additionalNonFilteringFields == null ? 0 : additionalNonFilteringFields.size())];
    int[] bloomFilterKeyFields = new int[numKeys];
    int i = 0;
    for (LogicalVariable varKey : keys) {
        int idx = propagatedSchema.findVariable(varKey);
        fieldPermutation[i] = idx;
        bloomFilterKeyFields[i] = i;
        i++;
    }
    fieldPermutation[i++] = propagatedSchema.findVariable(payload);
    if (numFilterFields > 0) {
        int idx = propagatedSchema.findVariable(additionalNonKeyFields.get(0));
        fieldPermutation[i++] = idx;
    }
    if (additionalNonFilteringFields != null) {
        for (LogicalVariable variable : additionalNonFilteringFields) {
            int idx = propagatedSchema.findVariable(variable);
            fieldPermutation[i++] = idx;
        }
    }
    try {
        Index primaryIndex = MetadataManager.INSTANCE.getIndex(mdTxnCtx, dataset.getDataverseName(), dataset.getDatasetName(), dataset.getDatasetName());
        Pair<IFileSplitProvider, AlgebricksPartitionConstraint> splitsAndConstraint = getSplitProviderAndConstraints(dataset);
        // prepare callback
        int[] primaryKeyFields = new int[numKeys];
        for (i = 0; i < numKeys; i++) {
            primaryKeyFields[i] = i;
        }
        IModificationOperationCallbackFactory modificationCallbackFactory = dataset.getModificationCallbackFactory(storaegComponentProvider, primaryIndex, jobId, indexOp, primaryKeyFields);
        IIndexDataflowHelperFactory idfh = new IndexDataflowHelperFactory(storaegComponentProvider.getStorageManager(), splitsAndConstraint.first);
        IOperatorDescriptor op;
        if (bulkload) {
            long numElementsHint = getCardinalityPerPartitionHint(dataset);
            op = new TreeIndexBulkLoadOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, GlobalConfig.DEFAULT_TREE_FILL_FACTOR, true, numElementsHint, true, idfh);
        } else {
            op = new LSMTreeInsertDeleteOperatorDescriptor(spec, inputRecordDesc, fieldPermutation, indexOp, idfh, null, true, modificationCallbackFactory);
        }
        return new Pair<>(op, splitsAndConstraint.second);
    } catch (MetadataException me) {
        throw new AlgebricksException(me);
    }
}
Also used : LogicalVariable(org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable) Dataset(org.apache.asterix.metadata.entities.Dataset) IFileSplitProvider(org.apache.hyracks.dataflow.std.file.IFileSplitProvider) AlgebricksException(org.apache.hyracks.algebricks.common.exceptions.AlgebricksException) Index(org.apache.asterix.metadata.entities.Index) IDataSourceIndex(org.apache.hyracks.algebricks.core.algebra.metadata.IDataSourceIndex) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) DatasetCardinalityHint(org.apache.asterix.metadata.dataset.hints.DatasetHints.DatasetCardinalityHint) AlgebricksAbsolutePartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint) MetadataException(org.apache.asterix.metadata.MetadataException) IIndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory) IOperatorDescriptor(org.apache.hyracks.api.dataflow.IOperatorDescriptor) AlgebricksPartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint) IModificationOperationCallbackFactory(org.apache.hyracks.storage.am.common.api.IModificationOperationCallbackFactory) LSMTreeInsertDeleteOperatorDescriptor(org.apache.asterix.common.dataflow.LSMTreeInsertDeleteOperatorDescriptor) TreeIndexBulkLoadOperatorDescriptor(org.apache.hyracks.storage.am.common.dataflow.TreeIndexBulkLoadOperatorDescriptor) IndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IndexDataflowHelperFactory) IIndexDataflowHelperFactory(org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory) Pair(org.apache.hyracks.algebricks.common.utils.Pair)

Aggregations

IOperatorDescriptor (org.apache.hyracks.api.dataflow.IOperatorDescriptor)89 JobSpecification (org.apache.hyracks.api.job.JobSpecification)61 RecordDescriptor (org.apache.hyracks.api.dataflow.value.RecordDescriptor)52 IFileSplitProvider (org.apache.hyracks.dataflow.std.file.IFileSplitProvider)51 OneToOneConnectorDescriptor (org.apache.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor)48 ConstantFileSplitProvider (org.apache.hyracks.dataflow.std.file.ConstantFileSplitProvider)48 Test (org.junit.Test)41 UTF8StringSerializerDeserializer (org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer)37 IConnectorDescriptor (org.apache.hyracks.api.dataflow.IConnectorDescriptor)34 FileScanOperatorDescriptor (org.apache.hyracks.dataflow.std.file.FileScanOperatorDescriptor)34 DelimitedDataTupleParserFactory (org.apache.hyracks.dataflow.std.file.DelimitedDataTupleParserFactory)33 ManagedFileSplit (org.apache.hyracks.api.io.ManagedFileSplit)30 FileSplit (org.apache.hyracks.api.io.FileSplit)28 AlgebricksPartitionConstraint (org.apache.hyracks.algebricks.common.constraints.AlgebricksPartitionConstraint)26 IValueParserFactory (org.apache.hyracks.dataflow.common.data.parsers.IValueParserFactory)26 ResultSetId (org.apache.hyracks.api.dataset.ResultSetId)24 ResultWriterOperatorDescriptor (org.apache.hyracks.dataflow.std.result.ResultWriterOperatorDescriptor)23 ILogicalOperator (org.apache.hyracks.algebricks.core.algebra.base.ILogicalOperator)19 PlainFileWriterOperatorDescriptor (org.apache.hyracks.dataflow.std.file.PlainFileWriterOperatorDescriptor)19 FieldHashPartitionComputerFactory (org.apache.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFactory)18