Use of io.cdap.cdap.etl.validation.LoggingFailureCollector in project cdap by caskdata.
The class PipelinePhasePreparer, method validateAutoJoiner.
private void validateAutoJoiner(AutoJoiner autoJoiner, StageSpec stageSpec) {
  // Validate that the join definition is not null. It could be null at configure time
  // because macros had not been evaluated, but at this point all macros should be
  // evaluated and the definition should be non-null.
  String stageName = stageSpec.getName();
  String pluginName = stageSpec.getPlugin().getName();
  FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
  AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
  JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
  failureCollector.getOrThrowException();
  if (joinDefinition == null) {
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' did not provide a join definition. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName));
  }
  // Validate that the stages mentioned in the join definition are actually inputs into the joiner.
  Set<String> inputStages = stageSpec.getInputSchemas().keySet();
  Set<String> joinStages = joinDefinition.getStages().stream()
    .map(JoinStage::getStageName)
    .collect(Collectors.toSet());
  Set<String> missingInputs = Sets.difference(inputStages, joinStages);
  if (!missingInputs.isEmpty()) {
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' did not include input stage %s in the join. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName, String.join(", ", missingInputs)));
  }
  Set<String> extraInputs = Sets.difference(joinStages, inputStages);
  if (!extraInputs.isEmpty()) {
    // Note: the message must interpolate extraInputs here, not missingInputs.
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' is trying to join stage %s, which is not an input. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName, String.join(", ", extraInputs)));
  }
}
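The method relies on the FailureCollector contract: failures accumulate silently while the plugin's define() runs, and getOrThrowException() then surfaces all of them at once (LoggingFailureCollector, as the name suggests, also logs what it collected). A minimal sketch of that contract, assuming the import paths below and an empty input-schema map; the stage name and failure text are illustrative:

import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.validation.ValidationException;
import io.cdap.cdap.etl.validation.LoggingFailureCollector;
import java.util.Collections;

public class FailureCollectorSketch {
  public static void main(String[] args) {
    FailureCollector collector = new LoggingFailureCollector("joinStage", Collections.emptyMap());
    // Failures are recorded rather than thrown, so a plugin can report several problems in one pass.
    collector.addFailure("Join key 'id' is missing from stage 'users'.",
                         "Add an 'id' field to the output schema of 'users'.");
    try {
      // Throws because one failure was collected; with nothing collected it returns normally.
      collector.getOrThrowException();
    } catch (ValidationException e) {
      System.out.println("Validation failed with " + e.getFailures().size() + " failure(s).");
    }
  }
}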
Use of io.cdap.cdap.etl.validation.LoggingFailureCollector in project cdap by caskdata.
The class SparkPipelineRunner, method handleJoin.
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections,
                                             PipelinePhase pipelinePhase,
                                             PluginFunctionContext pluginFunctionContext,
                                             StageSpec stageSpec,
                                             FunctionCache.Factory functionCacheFactory,
                                             Object plugin,
                                             Integer numPartitions,
                                             StageStatisticsCollector collector,
                                             Set<String> shufflers) throws Exception {
  String stageName = stageSpec.getName();
  if (plugin instanceof BatchJoiner) {
    BatchJoiner<Object, Object, Object> joiner = (BatchJoiner<Object, Object, Object>) plugin;
    BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
    joiner.initialize(joinerRuntimeContext);
    shufflers.add(stageName);
    return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
  } else if (plugin instanceof AutoJoiner) {
    AutoJoiner autoJoiner = (AutoJoiner) plugin;
    Map<String, Schema> inputSchemas = new HashMap<>();
    for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
      StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
      inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
    }
    FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
    // joinDefinition will always be non-null because it is checked by
    // PipelinePhasePreparer at the start of the run.
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    failureCollector.getOrThrowException();
    if (joinDefinition.getStages().stream().noneMatch(JoinStage::isBroadcast)) {
      shufflers.add(stageName);
    }
    return handleAutoJoin(stageName, joinDefinition, inputDataCollections, numPartitions);
  } else {
    // Should never happen unless there is a bug in the code; the stage should have failed validation during deployment.
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s",
                                                  stageName, plugin.getClass().getName()));
  }
}
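The runtime assumes define() returns non-null because PipelinePhasePreparer already validated it; the plugin side is only allowed to return null at configure time, when macros leave input schemas unresolved. A sketch of what such a plugin might look like, assuming the shape of CDAP's auto-join builder API (JoinDefinition.builder(), JoinStage, JoinField, JoinKey, JoinCondition.onKeys()); the exact builder method names, the getInputStages()/getSchema() accessors, and the 'users'/'purchases' stage names are assumptions for illustration, not a verified listing:

import java.util.Collections;
import io.cdap.cdap.etl.api.batch.BatchAutoJoiner; // package paths assumed
import io.cdap.cdap.etl.api.join.AutoJoinerContext;
import io.cdap.cdap.etl.api.join.JoinCondition;
import io.cdap.cdap.etl.api.join.JoinDefinition;
import io.cdap.cdap.etl.api.join.JoinField;
import io.cdap.cdap.etl.api.join.JoinKey;
import io.cdap.cdap.etl.api.join.JoinStage;

public class UserPurchaseJoiner extends BatchAutoJoiner {
  @Override
  public JoinDefinition define(AutoJoinerContext context) {
    JoinStage users = context.getInputStages().get("users");
    JoinStage purchases = context.getInputStages().get("purchases");
    if (users.getSchema() == null || purchases.getSchema() == null) {
      // Configure time with unevaluated macros: schemas unknown, nothing to define yet.
      // At runtime the schemas are resolved, so the null branch is never taken there.
      return null;
    }
    return JoinDefinition.builder()
      .select(new JoinField("users", "id"), new JoinField("purchases", "price"))
      .from(users, purchases)
      .on(JoinCondition.onKeys()
            .addKey(new JoinKey("users", Collections.singletonList("id")))
            .addKey(new JoinKey("purchases", Collections.singletonList("user_id")))
            .build())
      .build();
  }
}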
Use of io.cdap.cdap.etl.validation.LoggingFailureCollector in project cdap by caskdata.
The class MapReduceTransformExecutorFactory, method getTransformation.
@SuppressWarnings("unchecked")
@Override
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
  String stageName = stageSpec.getName();
  String pluginType = stageSpec.getPluginType();
  StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
  TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
  StageStatisticsCollector collector = collectStageStatistics
    ? new MapReduceStageStatisticsCollector(stageName, taskAttemptContext)
    : new NoopStageStatisticsCollector();
  if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
    Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    BatchAggregator<?, ?, ?> batchAggregator;
    if (plugin instanceof BatchReducibleAggregator) {
      BatchReducibleAggregator<?, ?, ?, ?> reducibleAggregator = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
      batchAggregator = new AggregatorBridge<>(reducibleAggregator);
    } else {
      batchAggregator = (BatchAggregator<?, ?, ?>) plugin;
    }
    BatchRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
    batchAggregator.initialize(runtimeContext);
    if (isMapPhase) {
      return getTrackedEmitKeyStep(new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
                                   stageMetrics, getDataTracer(stageName), collector);
    } else {
      return getTrackedAggregateStep(new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
                                     stageMetrics, getDataTracer(stageName), collector);
    }
  } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
    Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    BatchJoiner<?, ?, ?> batchJoiner;
    Set<String> filterNullKeyStages = new HashSet<>();
    if (plugin instanceof BatchAutoJoiner) {
      BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
      FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
      DefaultAutoJoinerContext context = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
      // The definition will be non-null due to validation by PipelinePhasePreparer at the start of the run.
      JoinDefinition joinDefinition = autoJoiner.define(context);
      JoinCondition condition = joinDefinition.getCondition();
      // Should never happen as it's checked at deployment time, but add this to be safe.
      if (condition.getOp() != JoinCondition.Op.KEY_EQUALITY) {
        failureCollector.addFailure(
          String.format("Join stage '%s' uses a %s condition, which is not supported with the MapReduce engine.",
                        stageName, condition.getOp()),
          "Switch to a different execution engine.");
      }
      failureCollector.getOrThrowException();
      batchJoiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
      // This is the same as filtering out records that have a null key if they are from an optional stage.
      if (condition.getOp() == JoinCondition.Op.KEY_EQUALITY && !((JoinCondition.OnKeys) condition).isNullSafe()) {
        filterNullKeyStages = joinDefinition.getStages().stream()
          .filter(s -> !s.isRequired())
          .map(JoinStage::getStageName)
          .collect(Collectors.toSet());
      }
    } else {
      batchJoiner = (BatchJoiner<?, ?, ?>) plugin;
    }
    BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
    batchJoiner.initialize(runtimeContext);
    if (isMapPhase) {
      return getTrackedEmitKeyStep(new MapperJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, filterNullKeyStages),
                                   stageMetrics, getDataTracer(stageName), collector);
    } else {
      return getTrackedMergeStep(new ReducerJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, runtimeContext.getInputSchemas().size()),
                                 stageMetrics, getDataTracer(stageName), collector);
    }
  }
  return super.getTransformation(stageSpec);
}
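The filterNullKeyStages logic encodes a small piece of join semantics: under a non-null-safe key-equality condition, a null key can never match anything, so records with null keys coming from optional (outer) stages can be dropped before the shuffle without changing the result. A plain-Java illustration of the two equality rules (not CDAP API):

import java.util.function.BiPredicate;

public class NullSafeSketch {
  public static void main(String[] args) {
    // Non-null-safe equality (SQL-style '='): null never equals anything, including null.
    BiPredicate<String, String> nonNullSafe = (a, b) -> a != null && b != null && a.equals(b);
    // Null-safe equality (SQL's 'IS NOT DISTINCT FROM'): two nulls compare equal.
    BiPredicate<String, String> nullSafe = (a, b) -> a == null ? b == null : a.equals(b);

    System.out.println(nonNullSafe.test(null, null)); // false: a null key can never join
    System.out.println(nullSafe.test(null, null));    // true: null keys join to each other
  }
}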
Use of io.cdap.cdap.etl.validation.LoggingFailureCollector in project cdap by caskdata.
The class SparkStreamingPipelineRunner, method handleJoin.
@Override
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections,
                                             PipelinePhase pipelinePhase,
                                             PluginFunctionContext pluginFunctionContext,
                                             StageSpec stageSpec,
                                             FunctionCache.Factory functionCacheFactory,
                                             Object plugin,
                                             Integer numPartitions,
                                             StageStatisticsCollector collector,
                                             Set<String> shufflers) throws Exception {
  String stageName = stageSpec.getName();
  BatchJoiner<?, ?, ?> joiner;
  if (plugin instanceof BatchAutoJoiner) {
    BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
    Map<String, Schema> inputSchemas = new HashMap<>();
    for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
      StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
      inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
    }
    FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    // Surface failures reported during define(). Calling getOrThrowException() before
    // define(), as the other runners avoid, would miss them: nothing has been collected yet.
    failureCollector.getOrThrowException();
    if (joinDefinition == null) {
      throw new IllegalStateException(String.format(
        "Joiner stage '%s' did not specify a join definition. "
          + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
    }
    joiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
  } else if (plugin instanceof BatchJoiner) {
    joiner = (BatchJoiner<?, ?, ?>) plugin;
  } else {
    // Should never happen unless there is a bug in the code; the stage should have failed validation during deployment.
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s",
                                                  stageName, plugin.getClass().getName()));
  }
  BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
  joiner.initialize(joinerRuntimeContext);
  shufflers.add(stageName);
  return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
}
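The type dispatch above (BatchAutoJoiner bridged via JoinerBridge, BatchJoiner used directly, anything else an error) also appears in the MapReduce factory. A hypothetical helper like the following, which is not in the cdap sources, would centralize it; JoinerBridge's constructor and the exception message are taken from the snippet above, while the helper name and placement are assumptions:

// Hypothetical helper, not part of cdap: resolve whichever joiner flavor the
// plugin implements, bridging auto-joiners to the BatchJoiner API.
private static BatchJoiner<?, ?, ?> resolveJoiner(String stageName, Object plugin, JoinDefinition joinDefinition) {
  if (plugin instanceof BatchAutoJoiner) {
    // JoinerBridge adapts an auto-joiner plus its JoinDefinition to the BatchJoiner API.
    return new JoinerBridge(stageName, (BatchAutoJoiner) plugin, joinDefinition);
  }
  if (plugin instanceof BatchJoiner) {
    return (BatchJoiner<?, ?, ?>) plugin;
  }
  // Should never happen unless there is a bug: plugin types are validated at deployment.
  throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s",
                                                stageName, plugin.getClass().getName()));
}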