
Example 11 with FieldOperation

Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project cdap by caskdata.

The class FieldLineageProcessor, method validateAndConvert.

public Set<Operation> validateAndConvert(Map<String, List<FieldOperation>> allStageOperations) {
    Map<String, List<FieldOperation>> allOperations = new HashMap<>(allStageOperations);
    // Set of stages for which no implicit merge operation is required even if the
    // stage has multiple inputs, for example join stages.
    Set<String> noMergeRequiredStages = new HashSet<>();
    for (StageSpec stageSpec : pipelineSpec.getStages()) {
        if (BatchJoiner.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            noMergeRequiredStages.add(stageSpec.getName());
        }
    }
    // validate the stage operations
    Map<String, InvalidFieldOperations> stageInvalids = new HashMap<>();
    Map<String, Map<String, List<String>>> stageRedundants = new HashMap<>();
    for (StageSpec stageSpec : pipelineSpec.getStages()) {
        Map<String, Schema> inputSchemas = stageSpec.getInputSchemas();
        // TODO: CDAP-16428 populate the schema if macro is enabled to avoid this
        if (inputSchemas == null) {
            LOG.warn("Field lineage will not be recorded since the input schema is not set. ");
            return Collections.emptySet();
        }
        // If the current stage is a join, qualify each input field as inputStageName.fieldName
        List<String> stageInputs = new ArrayList<>();
        List<String> stageOutputs = new ArrayList<>();
        if (BatchJoiner.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            for (Map.Entry<String, Schema> entry : inputSchemas.entrySet()) {
                Schema schema = entry.getValue();
                if (schema != null && schema.getFields() != null) {
                    stageInputs.addAll(schema.getFields().stream().map(field -> entry.getKey() + "." + field.getName()).collect(Collectors.toList()));
                }
            }
        } else {
            for (Map.Entry<String, Schema> entry : inputSchemas.entrySet()) {
                Schema schema = entry.getValue();
                if (schema != null && schema.getFields() != null) {
                    stageInputs.addAll(schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
                }
            }
        }
        Schema outputSchema = stageSpec.getOutputSchema();
        if (outputSchema != null && outputSchema.getFields() != null) {
            stageOutputs.addAll(outputSchema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
        }
        String stageName = stageSpec.getName();
        // only auto-generate operations for stages that have both an input and an output schema
        if (!stageInputs.isEmpty() && !stageOutputs.isEmpty()) {
            allOperations.compute(stageName, (stage, fieldOperations) -> {
                // no operations were recorded by the stage; auto-generate a single
                // transform operation mapping all input fields to all output fields
                if (fieldOperations == null || fieldOperations.isEmpty()) {
                    return Collections.singletonList(new FieldTransformOperation("Transform", "", stageInputs, stageOutputs));
                }
                return fieldOperations;
            });
        }
        List<FieldOperation> fieldOperations = allOperations.computeIfAbsent(stageName, stage -> Collections.emptyList());
        StageOperationsValidator.Builder builder = new StageOperationsValidator.Builder(fieldOperations);
        builder.addStageInputs(stageInputs);
        builder.addStageOutputs(stageOutputs);
        StageOperationsValidator stageOperationsValidator = builder.build();
        stageOperationsValidator.validate();
        LOG.trace("Stage Name: {}", stageName);
        LOG.trace("Stage Operations {}", GSON.toJson(fieldOperations));
        LOG.trace("Stage inputs: {}", stageInputs);
        LOG.trace("Stage outputs: {}", stageOutputs);
        InvalidFieldOperations invalidFieldOperations = stageOperationsValidator.getStageInvalids();
        if (invalidFieldOperations != null) {
            stageInvalids.put(stageName, invalidFieldOperations);
        }
        if (!stageOperationsValidator.getRedundantOutputs().isEmpty()) {
            stageRedundants.put(stageName, stageOperationsValidator.getRedundantOutputs());
        }
    }
    if (!stageRedundants.isEmpty()) {
        LOG.debug("The pipeline has redundant operations {} and they will be ignored", stageRedundants);
    }
    if (!stageInvalids.isEmpty()) {
        // Do not throw; just log the exception message for the validation failure.
        // Once most plugins are updated to write lineage, the exception can be thrown.
        LOG.debug(new InvalidLineageException(stageInvalids).getMessage());
    }
    LineageOperationsProcessor processor = new LineageOperationsProcessor(pipelineSpec.getConnections(), allOperations, noMergeRequiredStages);
    return processor.process();
}
Also used : HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) GsonBuilder(com.google.gson.GsonBuilder) ArrayList(java.util.ArrayList) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) List(java.util.List) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) HashSet(java.util.HashSet) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) Map(java.util.Map)
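
The map handed to validateAndConvert is typically assembled from the operations each stage recorded during prepareRun. The sketch below only illustrates the expected shape of that input; the stage names, field names, and descriptions are made up, the usual java.util imports are assumed, and processor is assumed to be a FieldLineageProcessor built from the pipeline spec.

// Hypothetical input for validateAndConvert: one entry per stage, each holding
// the FieldOperations that stage recorded. All names below are illustrative.
Map<String, List<FieldOperation>> allStageOperations = new HashMap<>();
allStageOperations.put("csvParser", Collections.singletonList(
    new FieldTransformOperation("Parse", "Parsed the raw record into columns",
                                Collections.singletonList("body"),
                                Arrays.asList("id", "name", "price"))));
allStageOperations.put("rename", Collections.singletonList(
    new FieldTransformOperation("Rename", "Renamed price to amount",
                                Collections.singletonList("price"),
                                Collections.singletonList("amount"))));
// The processor validates the stage-level operations and converts them into platform Operations.
Set<Operation> platformOperations = processor.validateAndConvert(allStageOperations);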

Example 12 with FieldOperation

Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project cdap by caskdata.

The class SmartWorkflow, method destroy.

@Override
public void destroy() {
    WorkflowContext workflowContext = getContext();
    PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
    // Execute the post actions only if pipeline is not running in preview mode.
    if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
        for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
            String name = endingActionEntry.getKey();
            PostAction action = endingActionEntry.getValue();
            StageSpec stageSpec = stageSpecs.get(name);
            BatchActionContext context = new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
            try {
                action.run(context);
            } catch (Throwable t) {
                LOG.error("Error while running post action {}.", name, t);
            }
        }
    }
    Map<String, String> connectorDatasets = GSON.fromJson(workflowContext.getWorkflowSpecification().getProperty(Constants.CONNECTOR_DATASETS), STAGE_DATASET_MAP);
    // publish all alerts
    for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
        String stageName = alertPublisherEntry.getKey();
        AlertPublisher alertPublisher = alertPublisherEntry.getValue();
        FileSet alertConnector = workflowContext.getDataset(connectorDatasets.get(stageName));
        try (CloseableIterator<Alert> alerts = new AlertReader(alertConnector)) {
            if (!alerts.hasNext()) {
                continue;
            }
            StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, stageName);
            StageSpec stageSpec = stageSpecs.get(stageName);
            AlertPublisherContext alertContext = new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext, workflowContext.getAdmin());
            alertPublisher.initialize(alertContext);
            TrackedIterator<Alert> trackedIterator = new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
            alertPublisher.publish(trackedIterator);
        } catch (Exception e) {
            LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", stageName, e);
        } finally {
            try {
                alertPublisher.destroy();
            } catch (Exception e) {
                LOG.warn("Error destroying alert publisher for stage {}", stageName, e);
            }
        }
    }
    ProgramStatus status = getContext().getState().getStatus();
    if (status == ProgramStatus.FAILED) {
        WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
    } else {
        WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(), status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
    }
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), workflowContext.getLogicalStartTime(), workflowContext, workflowContext, workflowContext.getNamespace());
    // Get resolved plugin properties
    Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
    for (StageSpec spec : stageSpecs.values()) {
        String stageName = spec.getName();
        resolvedProperties.put(stageName, workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
    }
    // Add resolved plugin properties to workflow token as a JSON String
    workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
    // record only if the Workflow is successful
    if (status != ProgramStatus.COMPLETED) {
        return;
    }
    // Collect field operations from each phase
    WorkflowToken token = workflowContext.getToken();
    List<NodeValue> allNodeValues = token.getAll(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN);
    if (allNodeValues.isEmpty()) {
        // no field lineage recorded by any stage
        return;
    }
    Map<String, List<FieldOperation>> allStageOperations = new HashMap<>();
    for (StageSpec stageSpec : stageSpecs.values()) {
        allStageOperations.put(stageSpec.getName(), new ArrayList<>());
    }
    for (NodeValue nodeValue : allNodeValues) {
        Map<String, List<FieldOperation>> stageOperations = GSON.fromJson(nodeValue.getValue().toString(), STAGE_OPERATIONS_MAP);
        for (Map.Entry<String, List<FieldOperation>> entry : stageOperations.entrySet()) {
            // only merge operations for stages that belong to this pipeline; ignore the rest
            if (allStageOperations.containsKey(entry.getKey())) {
                allStageOperations.get(entry.getKey()).addAll(entry.getValue());
            }
        }
    }
    FieldLineageProcessor processor = new FieldLineageProcessor(spec);
    Set<Operation> processedOperations = processor.validateAndConvert(allStageOperations);
    if (!processedOperations.isEmpty()) {
        workflowContext.record(processedOperations);
    }
}
Also used : NodeValue(io.cdap.cdap.api.workflow.NodeValue) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) BatchActionContext(io.cdap.cdap.etl.api.batch.BatchActionContext) WorkflowBackedActionContext(io.cdap.cdap.etl.batch.WorkflowBackedActionContext) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) AlertReader(io.cdap.cdap.etl.batch.connector.AlertReader) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) AlertPublisherContext(io.cdap.cdap.etl.api.AlertPublisherContext) DefaultAlertPublisherContext(io.cdap.cdap.etl.common.DefaultAlertPublisherContext) FieldLineageProcessor(io.cdap.cdap.etl.lineage.FieldLineageProcessor) AlertPublisher(io.cdap.cdap.etl.api.AlertPublisher) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) TrackedIterator(io.cdap.cdap.etl.common.TrackedIterator) WorkflowContext(io.cdap.cdap.api.workflow.WorkflowContext) DisjointConnectionsException(io.cdap.cdap.etl.planner.DisjointConnectionsException) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) Alert(io.cdap.cdap.etl.api.Alert) PostAction(io.cdap.cdap.etl.api.batch.PostAction) Map(java.util.Map) ProgramStatus(io.cdap.cdap.api.ProgramStatus)
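
The stage operations merged above were written into the workflow token as JSON by each phase. A rough sketch of that serialization round trip is shown below, assuming Gson with a TypeToken matching STAGE_OPERATIONS_MAP (the exact constant lives elsewhere in SmartWorkflow); the stage and field names are illustrative, and java.lang.reflect.Type plus com.google.gson.reflect.TypeToken are assumed to be imported.

// Assumed shape of the token payload: a JSON map of stage name to recorded operations.
Type stageOperationsMapType = new TypeToken<Map<String, List<FieldOperation>>>() { }.getType();
Gson gson = new GsonBuilder().create();
Map<String, List<FieldOperation>> recorded = new HashMap<>();
recorded.put("parser", Collections.singletonList(
    new FieldTransformOperation("Parse", "Parsed CSV",
                                Collections.singletonList("body"),
                                Arrays.asList("first", "last"))));
// A phase writes the JSON into the token under FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN...
String json = gson.toJson(recorded, stageOperationsMapType);
// ...and destroy() reads every node's value back before merging the operations per stage.
Map<String, List<FieldOperation>> roundTripped = gson.fromJson(json, stageOperationsMapType);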

Example 13 with FieldOperation

Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project hydrator-plugins by cdapio.

The class CSVParser, method prepareRun.

@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
    super.prepareRun(context);
    FailureCollector collector = context.getFailureCollector();
    config.validate(collector);
    collector.getOrThrowException();
    // Read from config.field and output to fields
    init();
    if (fields != null) {
        FieldOperation operation = new FieldTransformOperation("Parse", "Parsed CSV data from expected field.", Collections.singletonList(config.field), fields.stream().map(Schema.Field::getName).collect(Collectors.toList()));
        context.record(Collections.singletonList(operation));
    }
}
Also used : Field(io.cdap.cdap.api.data.schema.Schema.Field) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)
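
With concrete values, the single operation recorded above maps the configured source field to every field of the parsed output schema. The following is only a hedged illustration, using made-up values in place of config.field and the parser's output fields.

// Hypothetical values: config.field = "body", parsed output fields = first, last, age.
FieldOperation operation = new FieldTransformOperation(
    "Parse",
    "Parsed CSV data from expected field.",
    Collections.singletonList("body"),
    Arrays.asList("first", "last", "age"));
context.record(Collections.singletonList(operation));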

Example 14 with FieldOperation

Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project hydrator-plugins by cdapio.

The class Hasher, method prepareRun.

@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
    FailureCollector failureCollector = context.getFailureCollector();
    config.validate(context.getInputSchema(), failureCollector);
    failureCollector.getOrThrowException();
    if (context.getInputSchema() == null || context.getInputSchema().getFields() == null) {
        return;
    }
    // Record a hash operation for each configured field in the input schema whose type is string,
    // and an identity operation for every other field present in the output.
    List<String> hashedFields = context.getInputSchema().getFields().stream().filter(field -> config.getFields().contains(field.getName()) && field.getSchema().getType() == Schema.Type.STRING).map(Schema.Field::getName).collect(Collectors.toList());
    List<String> identityFields = TransformLineageRecorderUtils.getFields(context.getInputSchema());
    identityFields.removeAll(hashedFields);
    List<FieldOperation> output = new ArrayList<>();
    output.addAll(TransformLineageRecorderUtils.generateOneToOnes(hashedFields, "hash", "Used the digest algorithm to hash the fields."));
    output.addAll(TransformLineageRecorderUtils.generateOneToOnes(identityFields, "identity", TransformLineageRecorderUtils.IDENTITY_TRANSFORM_DESCRIPTION));
    context.record(output);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)
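
TransformLineageRecorderUtils.generateOneToOnes is not shown in this example. Judging only from how it is called, it presumably produces one field-to-itself transform operation per field; the hand-rolled equivalent below is offered purely as an assumption about its behavior, not as the actual implementation.

// Assumed equivalent of generateOneToOnes(fields, name, description):
// one FieldTransformOperation per field, mapping the field to itself.
private static List<FieldOperation> oneToOne(List<String> fields, String name, String description) {
    List<FieldOperation> operations = new ArrayList<>();
    for (String field : fields) {
        operations.add(new FieldTransformOperation(name + " " + field, description,
                                                   Collections.singletonList(field),
                                                   Collections.singletonList(field)));
    }
    return operations;
}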

Example 15 with FieldOperation

Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project cdap by cdapio.

The class SmartWorkflow, method destroy.

@Override
public void destroy() {
    WorkflowContext workflowContext = getContext();
    PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
    // Execute the post actions only if pipeline is not running in preview mode.
    if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
        for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
            String name = endingActionEntry.getKey();
            PostAction action = endingActionEntry.getValue();
            StageSpec stageSpec = stageSpecs.get(name);
            BatchActionContext context = new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
            try {
                action.run(context);
            } catch (Throwable t) {
                LOG.error("Error while running post action {}.", name, t);
            }
        }
    }
    Map<String, String> connectorDatasets = GSON.fromJson(workflowContext.getWorkflowSpecification().getProperty(Constants.CONNECTOR_DATASETS), STAGE_DATASET_MAP);
    // publish all alerts
    for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
        String stageName = alertPublisherEntry.getKey();
        AlertPublisher alertPublisher = alertPublisherEntry.getValue();
        FileSet alertConnector = workflowContext.getDataset(connectorDatasets.get(stageName));
        try (CloseableIterator<Alert> alerts = new AlertReader(alertConnector)) {
            if (!alerts.hasNext()) {
                continue;
            }
            StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, stageName);
            StageSpec stageSpec = stageSpecs.get(stageName);
            AlertPublisherContext alertContext = new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext, workflowContext.getAdmin());
            alertPublisher.initialize(alertContext);
            TrackedIterator<Alert> trackedIterator = new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
            alertPublisher.publish(trackedIterator);
        } catch (Exception e) {
            LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", stageName, e);
        } finally {
            try {
                alertPublisher.destroy();
            } catch (Exception e) {
                LOG.warn("Error destroying alert publisher for stage {}", stageName, e);
            }
        }
    }
    ProgramStatus status = getContext().getState().getStatus();
    if (status == ProgramStatus.FAILED) {
        WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
    } else {
        WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(), status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
    }
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), workflowContext.getLogicalStartTime(), workflowContext, workflowContext, workflowContext.getNamespace());
    // Get resolved plugin properties
    Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
    for (StageSpec spec : stageSpecs.values()) {
        String stageName = spec.getName();
        resolvedProperties.put(stageName, workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
    }
    // Add resolved plugin properties to workflow token as a JSON String
    workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
    // record only if the Workflow is successful
    if (status != ProgramStatus.COMPLETED) {
        return;
    }
    // Collect field operations from each phase
    WorkflowToken token = workflowContext.getToken();
    List<NodeValue> allNodeValues = token.getAll(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN);
    if (allNodeValues.isEmpty()) {
        // no field lineage recorded by any stage
        return;
    }
    Map<String, List<FieldOperation>> allStageOperations = new HashMap<>();
    for (StageSpec stageSpec : stageSpecs.values()) {
        allStageOperations.put(stageSpec.getName(), new ArrayList<>());
    }
    for (NodeValue nodeValue : allNodeValues) {
        Map<String, List<FieldOperation>> stageOperations = GSON.fromJson(nodeValue.getValue().toString(), STAGE_OPERATIONS_MAP);
        for (Map.Entry<String, List<FieldOperation>> entry : stageOperations.entrySet()) {
            // only merge operations for stages that belong to this pipeline; ignore the rest
            if (allStageOperations.containsKey(entry.getKey())) {
                allStageOperations.get(entry.getKey()).addAll(entry.getValue());
            }
        }
    }
    FieldLineageProcessor processor = new FieldLineageProcessor(spec);
    Set<Operation> processedOperations = processor.validateAndConvert(allStageOperations);
    if (!processedOperations.isEmpty()) {
        workflowContext.record(processedOperations);
    }
}
Also used : NodeValue(io.cdap.cdap.api.workflow.NodeValue) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) BatchActionContext(io.cdap.cdap.etl.api.batch.BatchActionContext) WorkflowBackedActionContext(io.cdap.cdap.etl.batch.WorkflowBackedActionContext) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) AlertReader(io.cdap.cdap.etl.batch.connector.AlertReader) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) AlertPublisherContext(io.cdap.cdap.etl.api.AlertPublisherContext) DefaultAlertPublisherContext(io.cdap.cdap.etl.common.DefaultAlertPublisherContext) FieldLineageProcessor(io.cdap.cdap.etl.lineage.FieldLineageProcessor) AlertPublisher(io.cdap.cdap.etl.api.AlertPublisher) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) TrackedIterator(io.cdap.cdap.etl.common.TrackedIterator) WorkflowContext(io.cdap.cdap.api.workflow.WorkflowContext) DisjointConnectionsException(io.cdap.cdap.etl.planner.DisjointConnectionsException) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) Alert(io.cdap.cdap.etl.api.Alert) PostAction(io.cdap.cdap.etl.api.batch.PostAction) Map(java.util.Map) ProgramStatus(io.cdap.cdap.api.ProgramStatus)

Aggregations

FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)63 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)53 ArrayList (java.util.ArrayList)47 Test (org.junit.Test)41 List (java.util.List)39 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)37 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)36 HashMap (java.util.HashMap)36 HashSet (java.util.HashSet)34 ImmutableList (com.google.common.collect.ImmutableList)32 Operation (io.cdap.cdap.api.lineage.field.Operation)30 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)28 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)28 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)28 Connection (io.cdap.cdap.etl.proto.Connection)26 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)20 Schema (io.cdap.cdap.api.data.schema.Schema)13 FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)8 Map (java.util.Map)7 JoinField (io.cdap.cdap.etl.api.join.JoinField)6