Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project cdap by caskdata.
The class FieldLineageProcessor, method validateAndConvert.
public Set<Operation> validateAndConvert(Map<String, List<FieldOperation>> allStageOperations) {
  Map<String, List<FieldOperation>> allOperations = new HashMap<>(allStageOperations);
  // Set of stages for which no implicit merge operation is required even if the
  // stage has multiple inputs, for example join stages.
  Set<String> noMergeRequiredStages = new HashSet<>();
  for (StageSpec stageSpec : pipelineSpec.getStages()) {
    if (BatchJoiner.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      noMergeRequiredStages.add(stageSpec.getName());
    }
  }
  // Validate the stage operations.
  Map<String, InvalidFieldOperations> stageInvalids = new HashMap<>();
  Map<String, Map<String, List<String>>> stageRedundants = new HashMap<>();
  for (StageSpec stageSpec : pipelineSpec.getStages()) {
    Map<String, Schema> inputSchemas = stageSpec.getInputSchemas();
    // TODO: CDAP-16428 populate the schema if macro is enabled to avoid this
    if (inputSchemas == null) {
      LOG.warn("Field lineage will not be recorded since the input schema is not set.");
      return Collections.emptySet();
    }
    // If the current stage is a joiner, add input fields as inputStageName.fieldName.
    List<String> stageInputs = new ArrayList<>();
    List<String> stageOutputs = new ArrayList<>();
    if (BatchJoiner.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      for (Map.Entry<String, Schema> entry : inputSchemas.entrySet()) {
        Schema schema = entry.getValue();
        if (schema != null && schema.getFields() != null) {
          stageInputs.addAll(schema.getFields().stream()
                               .map(field -> entry.getKey() + "." + field.getName())
                               .collect(Collectors.toList()));
        }
      }
    } else {
      for (Map.Entry<String, Schema> entry : inputSchemas.entrySet()) {
        Schema schema = entry.getValue();
        if (schema != null && schema.getFields() != null) {
          stageInputs.addAll(schema.getFields().stream()
                               .map(Schema.Field::getName)
                               .collect(Collectors.toList()));
        }
      }
    }
    Schema outputSchema = stageSpec.getOutputSchema();
    if (outputSchema != null && outputSchema.getFields() != null) {
      stageOutputs.addAll(outputSchema.getFields().stream()
                            .map(Schema.Field::getName)
                            .collect(Collectors.toList()));
    }
    String stageName = stageSpec.getName();
    // Only auto-generate operations for stages that have both an input and an output schema.
    if (!stageInputs.isEmpty() && !stageOutputs.isEmpty()) {
      allOperations.compute(stageName, (stage, fieldOperations) -> {
        // If the stage recorded no operations, generate a single catch-all transform
        // from all input fields to all output fields.
        if (fieldOperations == null || fieldOperations.isEmpty()) {
          return Collections.singletonList(
            new FieldTransformOperation("Transform", "", stageInputs, stageOutputs));
        }
        return fieldOperations;
      });
    }
    List<FieldOperation> fieldOperations =
      allOperations.computeIfAbsent(stageName, stage -> Collections.emptyList());
    StageOperationsValidator.Builder builder = new StageOperationsValidator.Builder(fieldOperations);
    builder.addStageInputs(stageInputs);
    builder.addStageOutputs(stageOutputs);
    StageOperationsValidator stageOperationsValidator = builder.build();
    stageOperationsValidator.validate();
    LOG.trace("Stage Name: {}", stageName);
    LOG.trace("Stage Operations {}", GSON.toJson(fieldOperations));
    LOG.trace("Stage inputs: {}", stageInputs);
    LOG.trace("Stage outputs: {}", stageOutputs);
    InvalidFieldOperations invalidFieldOperations = stageOperationsValidator.getStageInvalids();
    if (invalidFieldOperations != null) {
      stageInvalids.put(stageName, invalidFieldOperations);
    }
    if (!stageOperationsValidator.getRedundantOutputs().isEmpty()) {
      stageRedundants.put(stageName, stageOperationsValidator.getRedundantOutputs());
    }
  }
  if (!stageRedundants.isEmpty()) {
    LOG.debug("The pipeline has redundant operations {} and they will be ignored", stageRedundants);
  }
  if (!stageInvalids.isEmpty()) {
    // Do not throw; just log the message for the validation failure.
    // Once most plugins are updated to write lineage, the exception can be thrown instead.
    LOG.debug(new InvalidLineageException(stageInvalids).getMessage());
  }
  LineageOperationsProcessor processor =
    new LineageOperationsProcessor(pipelineSpec.getConnections(), allOperations, noMergeRequiredStages);
  return processor.process();
}
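For orientation, here is a minimal usage sketch of validateAndConvert. It is illustrative only: the stage name "parser", the field names, and the pipelineSpec variable are hypothetical placeholders, not taken from the snippet above; only the FieldTransformOperation and FieldLineageProcessor signatures it uses appear in the snippets on this page.

  // Hypothetical stage "parser" splitting a "body" field into "first_name" and "last_name".
  Map<String, List<FieldOperation>> stageOps = new HashMap<>();
  stageOps.put("parser", Collections.singletonList(
    new FieldTransformOperation("Parse", "Split the body field",
                                Collections.singletonList("body"),
                                Arrays.asList("first_name", "last_name"))));
  // pipelineSpec is assumed to be an already-built spec for the same pipeline.
  FieldLineageProcessor processor = new FieldLineageProcessor(pipelineSpec);
  Set<Operation> operations = processor.validateAndConvert(stageOps);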
Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project cdap by caskdata.
The class SmartWorkflow, method destroy.
@Override
public void destroy() {
  WorkflowContext workflowContext = getContext();
  PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
  // Execute the post actions only if the pipeline is not running in preview mode.
  if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
    for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
      String name = endingActionEntry.getKey();
      PostAction action = endingActionEntry.getValue();
      StageSpec stageSpec = stageSpecs.get(name);
      BatchActionContext context =
        new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
      try {
        action.run(context);
      } catch (Throwable t) {
        LOG.error("Error while running post action {}.", name, t);
      }
    }
  }
  Map<String, String> connectorDatasets = GSON.fromJson(
    workflowContext.getWorkflowSpecification().getProperty(Constants.CONNECTOR_DATASETS),
    STAGE_DATASET_MAP);
  // Publish all alerts.
  for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
    String stageName = alertPublisherEntry.getKey();
    AlertPublisher alertPublisher = alertPublisherEntry.getValue();
    FileSet alertConnector = workflowContext.getDataset(connectorDatasets.get(stageName));
    try (CloseableIterator<Alert> alerts = new AlertReader(alertConnector)) {
      if (!alerts.hasNext()) {
        continue;
      }
      StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, stageName);
      StageSpec stageSpec = stageSpecs.get(stageName);
      AlertPublisherContext alertContext =
        new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext,
                                         workflowContext.getAdmin());
      alertPublisher.initialize(alertContext);
      TrackedIterator<Alert> trackedIterator =
        new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
      alertPublisher.publish(trackedIterator);
    } catch (Exception e) {
      LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", stageName, e);
    } finally {
      try {
        alertPublisher.destroy();
      } catch (Exception e) {
        LOG.warn("Error destroying alert publisher for stage {}", stageName, e);
      }
    }
  }
  ProgramStatus status = getContext().getState().getStatus();
  if (status == ProgramStatus.FAILED) {
    WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
  } else {
    WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(),
                       status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
  }
  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
                                                            workflowContext.getLogicalStartTime(),
                                                            workflowContext, workflowContext,
                                                            workflowContext.getNamespace());
  // Get resolved plugin properties.
  Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
  for (StageSpec spec : stageSpecs.values()) {
    String stageName = spec.getName();
    resolvedProperties.put(stageName,
                           workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
  }
  // Add resolved plugin properties to the workflow token as a JSON string.
  workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
  // Record field lineage only if the workflow completed successfully.
  if (status != ProgramStatus.COMPLETED) {
    return;
  }
  // Collect field operations from each phase.
  WorkflowToken token = workflowContext.getToken();
  List<NodeValue> allNodeValues = token.getAll(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN);
  if (allNodeValues.isEmpty()) {
    // No field lineage was recorded by any stage.
    return;
  }
  Map<String, List<FieldOperation>> allStageOperations = new HashMap<>();
  for (StageSpec stageSpec : stageSpecs.values()) {
    allStageOperations.put(stageSpec.getName(), new ArrayList<>());
  }
  for (NodeValue nodeValue : allNodeValues) {
    Map<String, List<FieldOperation>> stageOperations =
      GSON.fromJson(nodeValue.getValue().toString(), STAGE_OPERATIONS_MAP);
    for (Map.Entry<String, List<FieldOperation>> entry : stageOperations.entrySet()) {
      // Ignore operations recorded for stages that are not part of the pipeline spec.
      if (allStageOperations.containsKey(entry.getKey())) {
        allStageOperations.get(entry.getKey()).addAll(entry.getValue());
      }
    }
  }
  FieldLineageProcessor processor = new FieldLineageProcessor(spec);
  Set<Operation> processedOperations = processor.validateAndConvert(allStageOperations);
  if (!processedOperations.isEmpty()) {
    workflowContext.record(processedOperations);
  }
}
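The STAGE_OPERATIONS_MAP type used with GSON.fromJson above is not shown in the snippet. A plausible definition, assuming the standard Gson TypeToken pattern (the real class may configure Gson with additional type adapters), is:

  // Assumed, not shown in the snippet above: the Type used to deserialize the
  // per-stage operations that each phase stored in the workflow token.
  private static final Gson GSON = new Gson();
  private static final Type STAGE_OPERATIONS_MAP =
    new TypeToken<Map<String, List<FieldOperation>>>() { }.getType();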
Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project hydrator-plugins by cdapio.
The class CSVParser, method prepareRun.
@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
  super.prepareRun(context);
  FailureCollector collector = context.getFailureCollector();
  config.validate(collector);
  collector.getOrThrowException();
  // Read from config.field and output to fields.
  init();
  if (fields != null) {
    FieldOperation operation = new FieldTransformOperation(
      "Parse", "Parsed CSV data from expected field.",
      Collections.singletonList(config.field),
      fields.stream().map(Schema.Field::getName).collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }
}
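As a concrete illustration of the record above (with hypothetical names: config.field set to "body" and an output schema containing "id" and "name"), the stage effectively records:

  // Hypothetical instantiation of the operation recorded by prepareRun, assuming
  // config.field is "body" and the parsed output fields are "id" and "name".
  FieldOperation operation = new FieldTransformOperation(
    "Parse", "Parsed CSV data from expected field.",
    Collections.singletonList("body"),
    Arrays.asList("id", "name"));
  context.record(Collections.singletonList(operation));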
Use of io.cdap.cdap.etl.api.lineage.field.FieldOperation in project hydrator-plugins by cdapio.
The class Hasher, method prepareRun.
@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
  FailureCollector failureCollector = context.getFailureCollector();
  config.validate(context.getInputSchema(), failureCollector);
  failureCollector.getOrThrowException();
  if (context.getInputSchema() == null || context.getInputSchema().getFields() == null) {
    return;
  }
  // Record a hash operation for the configured string fields in the input schema,
  // and an identity operation for every other field present in the output.
  List<String> hashedFields = context.getInputSchema().getFields().stream()
    .filter(field -> config.getFields().contains(field.getName())
      && field.getSchema().getType() == Schema.Type.STRING)
    .map(Schema.Field::getName)
    .collect(Collectors.toList());
  List<String> identityFields = TransformLineageRecorderUtils.getFields(context.getInputSchema());
  identityFields.removeAll(hashedFields);
  List<FieldOperation> output = new ArrayList<>();
  output.addAll(TransformLineageRecorderUtils.generateOneToOnes(hashedFields, "hash",
    "Used the digest algorithm to hash the fields."));
  output.addAll(TransformLineageRecorderUtils.generateOneToOnes(identityFields, "identity",
    TransformLineageRecorderUtils.IDENTITY_TRANSFORM_DESCRIPTION));
  context.record(output);
}
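TransformLineageRecorderUtils.generateOneToOnes is a hydrator-plugins helper whose source is not shown here. A plausible sketch of an equivalent, built only from the FieldTransformOperation API used in the other snippets on this page (the real helper's operation naming may differ), is:

  // Sketch of a one-to-one lineage helper: one single-field transform per field,
  // mapping each field to itself. The "<name> <field>" operation naming is an assumption.
  static List<FieldOperation> generateOneToOnes(List<String> fields, String name, String description) {
    List<FieldOperation> operations = new ArrayList<>();
    for (String field : fields) {
      operations.add(new FieldTransformOperation(name + " " + field, description,
                                                 Collections.singletonList(field),
                                                 Collections.singletonList(field)));
    }
    return operations;
  }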