use of io.cdap.cdap.etl.api.join.JoinDefinition in project cdap by caskdata.
the class JoinMergeFunction method createInitializedJoiner.
/**
 * Creates the joiner plugin for this stage and returns it as a {@link BatchJoiner}.
 *
 * <p>A {@link BatchAutoJoiner} plugin is asked for its {@link JoinDefinition} and wrapped in a
 * {@link JoinerBridge}. Any other plugin is cast to {@link BatchJoiner} and initialized with a
 * freshly created batch runtime context before being returned.
 *
 * @return the joiner to use for this stage
 * @throws IllegalStateException if an auto-joiner returns a null join definition
 * @throws Exception propagated from plugin creation, failure validation or joiner initialization
 */
private <K, V, O> BatchJoiner<K, V, O> createInitializedJoiner() throws Exception {
  Object plugin = pluginFunctionContext.createPlugin();

  // Explicit joiners only need to be initialized with a runtime context.
  if (!(plugin instanceof BatchAutoJoiner)) {
    BatchJoiner<K, V, O> explicitJoiner = (BatchJoiner<K, V, O>) plugin;
    explicitJoiner.initialize(pluginFunctionContext.createBatchRuntimeContext());
    return explicitJoiner;
  }

  // Auto joiners describe the join declaratively; the bridge adapts that to the BatchJoiner API.
  String name = pluginFunctionContext.getStageName();
  BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
  AutoJoinerContext definitionContext = pluginFunctionContext.createAutoJoinerContext();
  JoinDefinition definition = autoJoiner.define(definitionContext);
  // Surface any failures the plugin recorded while defining the join.
  definitionContext.getFailureCollector().getOrThrowException();
  if (definition == null) {
    throw new IllegalStateException(String.format(
      "Join stage '%s' did not specify a join definition. "
        + "Check with the plugin developer to ensure it is implemented correctly.", name));
  }
  return new JoinerBridge(name, autoJoiner, definition);
}
use of io.cdap.cdap.etl.api.join.JoinDefinition in project cdap by caskdata.
the class JoinOnFunction method createInitializedJoinOnTransform.
/**
 * Creates the joiner plugin for this stage and wraps it in a {@link JoinOnTransform} for the
 * input stage this function handles.
 *
 * <p>For a {@link BatchAutoJoiner}, the join definition is obtained and bridged to the
 * {@link BatchJoiner} API, and null-key filtering is enabled when the join is a non-null-safe key
 * equality join and this input stage is not required. Any other plugin is cast to
 * {@link BatchJoiner}, initialized, and used without null-key filtering.
 *
 * @return the transform that computes join keys for records of the input stage
 * @throws IllegalStateException if an auto-joiner returns a null join definition
 * @throws Exception propagated from plugin creation, failure validation or joiner initialization
 */
private JoinOnTransform<INPUT_RECORD, JOIN_KEY> createInitializedJoinOnTransform() throws Exception {
  Object plugin = pluginFunctionContext.createPlugin();

  // Explicit joiners are initialized directly and never filter null keys.
  if (!(plugin instanceof BatchAutoJoiner)) {
    BatchJoiner<JOIN_KEY, INPUT_RECORD, Object> explicitJoiner =
      (BatchJoiner<JOIN_KEY, INPUT_RECORD, Object>) plugin;
    explicitJoiner.initialize(pluginFunctionContext.createBatchRuntimeContext());
    return new JoinOnTransform<>(explicitJoiner, inputStageName, false);
  }

  BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
  AutoJoinerContext definitionContext = pluginFunctionContext.createAutoJoinerContext();
  JoinDefinition definition = autoJoiner.define(definitionContext);
  // Surface any failures the plugin recorded while defining the join.
  definitionContext.getFailureCollector().getOrThrowException();
  String name = pluginFunctionContext.getStageName();
  if (definition == null) {
    throw new IllegalStateException(String.format(
      "Join stage '%s' did not specify a join definition. "
        + "Check with the plugin developer to ensure it is implemented correctly.", name));
  }

  /*
     Records from an optional stage are dropped when their key is null, or when any field of the
     key is null. For example, given a left outer join on:

       A (id, name)  = (0, alice), (null, bob)
       B (id, email) = (0, alice@example.com), (null, placeholder@example.com)

     the expected output is:

       joined (A.id, A.name, B.email) = (0, alice, alice@example.com), (null, bob, null)

     i.e. the bob record must not be joined to the placeholder@example.com email even though both
     ids are null.
   */
  boolean dropNullKeys = false;
  JoinCondition joinCondition = definition.getCondition();
  if (joinCondition.getOp() == JoinCondition.Op.KEY_EQUALITY) {
    JoinCondition.OnKeys onKeys = (JoinCondition.OnKeys) joinCondition;
    if (!onKeys.isNullSafe()) {
      // Only filter when the stage feeding this function is one of the optional stages.
      dropNullKeys = definition.getStages().stream()
        .anyMatch(stage -> !stage.isRequired() && stage.getStageName().equals(inputStageName));
    }
  }

  BatchJoiner<JOIN_KEY, INPUT_RECORD, Object> bridgedJoiner = new JoinerBridge(name, autoJoiner, definition);
  return new JoinOnTransform<>(bridgedJoiner, inputStageName, dropNullKeys);
}
use of io.cdap.cdap.etl.api.join.JoinDefinition in project cdap by caskdata.
the class MapReduceTransformExecutorFactory method getTransformation.
/**
 * Builds the tracked transformation for a stage.
 *
 * <p>Aggregator and joiner stages are handled here: the plugin is instantiated, adapted through
 * {@link AggregatorBridge} or {@link JoinerBridge} where needed, initialized, and wrapped in the
 * mapper-side or reducer-side transformation depending on {@code isMapPhase}. Every other plugin
 * type is delegated to the parent implementation.
 *
 * @param stageSpec specification of the stage to build a transformation for
 * @return the tracked transformation for the stage
 * @throws Exception propagated from plugin instantiation, validation or initialization
 */
@SuppressWarnings("unchecked")
@Override
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
  String stageName = stageSpec.getName();
  String pluginType = stageSpec.getPluginType();
  StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
  TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();

  StageStatisticsCollector collector;
  if (collectStageStatistics) {
    collector = new MapReduceStageStatisticsCollector(stageName, taskAttemptContext);
  } else {
    collector = new NoopStageStatisticsCollector();
  }

  if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
    Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    BatchAggregator<?, ?, ?> aggregator;
    if (!(plugin instanceof BatchReducibleAggregator)) {
      aggregator = (BatchAggregator<?, ?, ?>) plugin;
    } else {
      // Reducible aggregators are adapted to the plain aggregator API.
      BatchReducibleAggregator<?, ?, ?, ?> reducible = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
      aggregator = new AggregatorBridge<>(reducible);
    }
    BatchRuntimeContext aggregatorContext = createRuntimeContext(stageSpec);
    aggregator.initialize(aggregatorContext);

    if (!isMapPhase) {
      return getTrackedAggregateStep(
        new ReducerAggregatorTransformation(aggregator, mapOutputKeyClassName, mapOutputValClassName),
        stageMetrics, getDataTracer(stageName), collector);
    }
    return getTrackedEmitKeyStep(
      new MapperAggregatorTransformation(aggregator, mapOutputKeyClassName, mapOutputValClassName),
      stageMetrics, getDataTracer(stageName), collector);
  }

  if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
    Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    BatchJoiner<?, ?, ?> joiner;
    // Stages whose null-keyed records must be dropped on the map side; empty unless set below.
    Set<String> nullKeyFilteredStages = new HashSet<>();
    if (!(plugin instanceof BatchAutoJoiner)) {
      joiner = (BatchJoiner<?, ?, ?>) plugin;
    } else {
      BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
      FailureCollector failures = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
      DefaultAutoJoinerContext joinerContext =
        DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failures);
      // The definition is expected to be non-null here because PipelinePhasePreparer validates it
      // at the start of the run.
      JoinDefinition definition = autoJoiner.define(joinerContext);
      JoinCondition joinCondition = definition.getCondition();
      // Should never happen since it is checked at deployment time; kept as a safety net.
      if (joinCondition.getOp() != JoinCondition.Op.KEY_EQUALITY) {
        failures.addFailure(
          String.format("Join stage '%s' uses a %s condition, which is not supported with the MapReduce engine.",
                        stageName, joinCondition.getOp()),
          "Switch to a different execution engine.");
      }
      failures.getOrThrowException();
      joiner = new JoinerBridge(stageName, autoJoiner, definition);
      // Equivalent to dropping records with a null key when they come from an optional stage.
      if (joinCondition.getOp() == JoinCondition.Op.KEY_EQUALITY
        && !((JoinCondition.OnKeys) joinCondition).isNullSafe()) {
        nullKeyFilteredStages = definition.getStages().stream()
          .filter(stage -> !stage.isRequired())
          .map(JoinStage::getStageName)
          .collect(Collectors.toSet());
      }
    }
    BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
    joiner.initialize(runtimeContext);

    if (!isMapPhase) {
      return getTrackedMergeStep(
        new ReducerJoinerTransformation(joiner, mapOutputKeyClassName, mapOutputValClassName,
                                        runtimeContext.getInputSchemas().size()),
        stageMetrics, getDataTracer(stageName), collector);
    }
    return getTrackedEmitKeyStep(
      new MapperJoinerTransformation(joiner, mapOutputKeyClassName, mapOutputValClassName,
                                     nullKeyFilteredStages),
      stageMetrics, getDataTracer(stageName), collector);
  }

  return super.getTransformation(stageSpec);
}
use of io.cdap.cdap.etl.api.join.JoinDefinition in project cdap by caskdata.
the class SparkStreamingPipelineRunner method handleJoin.
/**
 * Resolves the joiner plugin for a stage, initializes it, and delegates to the joiner-based
 * {@code handleJoin} overload.
 *
 * <p>A {@link BatchAutoJoiner} is given a context built from the output schemas of the stage's
 * inputs, asked for its {@link JoinDefinition}, and wrapped in a {@link JoinerBridge}. A plain
 * {@link BatchJoiner} is used as-is. The stage name is added to {@code shufflers} before
 * delegating.
 *
 * @param inputDataCollections collections for each input stage, passed through to the overload
 * @param pipelinePhase phase used to look up the stage's inputs and their output schemas
 * @param pluginFunctionContext used to create the joiner's batch runtime context
 * @param stageSpec specification of the join stage
 * @param functionCacheFactory passed through to the overload
 * @param plugin the instantiated joiner plugin
 * @param numPartitions passed through to the overload
 * @param collector passed through to the overload
 * @param shufflers set that the join stage's name is added to
 * @return the collection produced by the join
 * @throws IllegalStateException if the auto-joiner returns a null definition or the plugin is
 *   neither a {@link BatchAutoJoiner} nor a {@link BatchJoiner}
 * @throws Exception propagated from failure validation, joiner initialization or the join itself
 */
@Override
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections,
                                             PipelinePhase pipelinePhase,
                                             PluginFunctionContext pluginFunctionContext,
                                             StageSpec stageSpec,
                                             FunctionCache.Factory functionCacheFactory,
                                             Object plugin,
                                             Integer numPartitions,
                                             StageStatisticsCollector collector,
                                             Set<String> shufflers) throws Exception {
  String stageName = stageSpec.getName();
  BatchJoiner<?, ?, ?> joiner;
  if (plugin instanceof BatchAutoJoiner) {
    BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
    // The auto-joiner context is built from the output schema of every input stage.
    Map<String, Schema> inputSchemas = new HashMap<>();
    for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
      StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
      inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
    }
    FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    // Check for failures only after define() has run, so that problems the plugin reports while
    // defining the join are raised instead of being silently dropped.
    failureCollector.getOrThrowException();
    if (joinDefinition == null) {
      throw new IllegalStateException(String.format(
        "Joiner stage '%s' did not specify a join definition. "
          + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
    }
    joiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
  } else if (plugin instanceof BatchJoiner) {
    joiner = (BatchJoiner<?, ?, ?>) plugin;
  } else {
    // should never happen unless there is a bug in the code. should have failed during deployment
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s",
                                                  stageName, plugin.getClass().getName()));
  }
  BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
  joiner.initialize(joinerRuntimeContext);
  shufflers.add(stageName);
  return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
}
use of io.cdap.cdap.etl.api.join.JoinDefinition in project cdap by cdapio.
the class PipelinePhasePreparer method validateAutoJoiner.
/**
 * Validates the join definition produced by an auto-joiner against the inputs of its stage.
 *
 * <p>The definition must be non-null, and the set of stages it joins must be exactly the set of
 * input stages declared in the stage specification.
 *
 * @param autoJoiner the auto-joiner plugin to validate
 * @param stageSpec specification of the joiner stage, providing its name, plugin and input schemas
 * @throws IllegalArgumentException if the definition is null, omits an input stage, or references
 *   a stage that is not an input
 */
private void validateAutoJoiner(AutoJoiner autoJoiner, StageSpec stageSpec) {
  // validate that the join definition is not null
  // it could be null at configure time due to macros not being evaluated, but at this
  // point all macros should be evaluated and the definition should be non-null.
  String stageName = stageSpec.getName();
  String pluginName = stageSpec.getPlugin().getName();
  FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
  AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
  JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
  failureCollector.getOrThrowException();
  if (joinDefinition == null) {
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' did not provide a join definition. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName));
  }

  // validate that the stages mentioned in the join definition are actually inputs into the joiner.
  Set<String> inputStages = stageSpec.getInputSchemas().keySet();
  Set<String> joinStages = joinDefinition.getStages().stream()
    .map(JoinStage::getStageName)
    .collect(Collectors.toSet());

  // inputs that the definition failed to mention
  Set<String> missingInputs = Sets.difference(inputStages, joinStages);
  if (!missingInputs.isEmpty()) {
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' did not include input stage %s in the join. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName, String.join(", ", missingInputs)));
  }

  // stages the definition mentions that are not inputs of this joiner
  Set<String> extraInputs = Sets.difference(joinStages, inputStages);
  if (!extraInputs.isEmpty()) {
    // Report the extra stages themselves; missingInputs is necessarily empty at this point.
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' is trying to join stage %s, which is not an input. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName, String.join(", ", extraInputs)));
  }
}
Aggregations