Usage example of io.cdap.cdap.etl.api.join.JoinDefinition in the project cdap by caskdata.
From the class BatchSparkPipelineDriverTest, method testShouldJoinOnSQLEngineWithBroadcastAndAlreadyPushedCollection.
@Test
public void testShouldJoinOnSQLEngineWithBroadcastAndAlreadyPushedCollection() {
  // Stages "a" and "b" are regular join inputs; "c" is marked for broadcast.
  JoinStage stageA = JoinStage.builder("a", null).setBroadcast(false).build();
  JoinStage stageB = JoinStage.builder("b", null).setBroadcast(false).build();
  JoinStage stageC = JoinStage.builder("c", null).setBroadcast(true).build();
  List<JoinStage> stages = Arrays.asList(stageA, stageB, stageC);

  JoinDefinition joinDefinition = mock(JoinDefinition.class);
  doReturn(stages).when(joinDefinition).getStages();

  // Input "a" has already been pushed to the SQL engine; the rest are plain RDDs.
  Map<String, SparkCollection<Object>> inputCollections = new HashMap<>();
  inputCollections.put("a", mock(SQLEngineCollection.class));
  inputCollections.put("b", mock(RDDCollection.class));
  inputCollections.put("c", mock(RDDCollection.class));

  // A broadcast stage does not block the SQL engine join when another input
  // is already pushed down, so the join should still run on the SQL engine.
  Assert.assertTrue(driver.canJoinOnSQLEngine(STAGE_NAME, joinDefinition, inputCollections));
}
Usage example of io.cdap.cdap.etl.api.join.JoinDefinition in the project cdap by caskdata.
From the class BatchSparkPipelineDriverTest, method testSQLEngineDoesNotSupportJoin.
@Test
public void testSQLEngineDoesNotSupportJoin() {
  // The engine adapter reports that it cannot execute this join.
  when(adapter.canJoin(anyString(), any(JoinDefinition.class))).thenReturn(false);

  // None of the three input stages is broadcast.
  JoinStage stageA = JoinStage.builder("a", null).setBroadcast(false).build();
  JoinStage stageB = JoinStage.builder("b", null).setBroadcast(false).build();
  JoinStage stageC = JoinStage.builder("c", null).setBroadcast(false).build();
  List<JoinStage> stages = Arrays.asList(stageA, stageB, stageC);

  JoinDefinition joinDefinition = mock(JoinDefinition.class);
  doReturn(stages).when(joinDefinition).getStages();

  // All inputs are plain RDD collections (nothing pushed down yet).
  Map<String, SparkCollection<Object>> inputCollections = new HashMap<>();
  inputCollections.put("a", mock(RDDCollection.class));
  inputCollections.put("b", mock(RDDCollection.class));
  inputCollections.put("c", mock(RDDCollection.class));

  // Since the adapter declines the join, it must not run on the SQL engine.
  Assert.assertFalse(driver.canJoinOnSQLEngine(STAGE_NAME, joinDefinition, inputCollections));
}
Usage example of io.cdap.cdap.etl.api.join.JoinDefinition in the project cdap by caskdata.
From the class PipelineSpecGenerator, method configureAutoJoiner.
/**
 * Configures an {@link AutoJoiner} stage at pipeline configure time.
 *
 * <p>Asks the plugin for its {@link JoinDefinition}, validates the join condition, sets the
 * stage output schema, and verifies that the set of stages referenced by the join definition
 * exactly matches the set of input stages wired into the joiner. Any mismatch is reported to
 * the {@link FailureCollector} rather than thrown, so multiple failures can be surfaced at once.
 *
 * @param stageName       name of the joiner stage being configured
 * @param autoJoiner      the auto-joiner plugin instance
 * @param stageConfigurer configurer that holds the input schemas and receives the output schema
 * @param collector       collector that accumulates validation failures
 */
private void configureAutoJoiner(String stageName, AutoJoiner autoJoiner, DefaultStageConfigurer stageConfigurer, FailureCollector collector) {
  AutoJoinerContext autoContext = DefaultAutoJoinerContext.from(stageConfigurer.getInputSchemas(), collector);
  JoinDefinition joinDefinition = autoJoiner.define(autoContext);
  if (joinDefinition == null) {
    // Definition can legitimately be null at configure time (e.g. unevaluated macros);
    // full validation happens later, at prepare time.
    return;
  }
  validateJoinCondition(stageName, joinDefinition.getCondition(), collector);
  stageConfigurer.setOutputSchema(joinDefinition.getOutputSchema());
  Set<String> inputStages = stageConfigurer.getInputSchemas().keySet();
  Set<String> joinStages = joinDefinition.getStages().stream().map(JoinStage::getStageName).collect(Collectors.toSet());
  // Inputs wired to the joiner that the plugin's definition never mentions.
  Set<String> missingInputs = Sets.difference(inputStages, joinStages);
  if (!missingInputs.isEmpty()) {
    collector.addFailure(String.format("Joiner stage '%s' did not include input stage %s in the join.", stageName, String.join(", ", missingInputs)), "Check with the plugin developer to make sure it is implemented correctly.");
  }
  // Stages the definition references that are not actually inputs to the joiner.
  Set<String> extraInputs = Sets.difference(joinStages, inputStages);
  if (!extraInputs.isEmpty()) {
    // BUGFIX: this message previously interpolated missingInputs instead of extraInputs,
    // naming the wrong (possibly empty) set of stages in the failure.
    collector.addFailure(String.format("Joiner stage '%s' is trying to join stage %s, which is not an input.", stageName, String.join(", ", extraInputs)), "Check with the plugin developer to make sure it is implemented correctly.");
  }
}
Usage example of io.cdap.cdap.etl.api.join.JoinDefinition in the project cdap by caskdata.
From the class PipelinePhasePreparer, method validateAutoJoiner.
/**
 * Validates an {@link AutoJoiner} stage at prepare (run) time.
 *
 * <p>By this point all macros have been evaluated, so the plugin must produce a non-null
 * {@link JoinDefinition}, and the stages it references must exactly match the joiner's
 * wired inputs. Violations are programming errors in the plugin, so they are thrown as
 * {@link IllegalArgumentException} rather than collected.
 *
 * @param autoJoiner the auto-joiner plugin instance
 * @param stageSpec  spec of the joiner stage, providing name, plugin name, and input schemas
 * @throws IllegalArgumentException if the definition is null, omits an input stage,
 *                                  or references a stage that is not an input
 */
private void validateAutoJoiner(AutoJoiner autoJoiner, StageSpec stageSpec) {
  // validate that the join definition is not null
  // it could be null at configure time due to macros not being evaluated, but at this
  // point all macros should be evaluated and the definition should be non-null.
  String stageName = stageSpec.getName();
  String pluginName = stageSpec.getPlugin().getName();
  FailureCollector failureCollector = new LoggingFailureCollector(stageSpec.getName(), stageSpec.getInputSchemas());
  AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
  JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
  failureCollector.getOrThrowException();
  if (joinDefinition == null) {
    throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' did not provide a join definition. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName));
  }
  // validate that the stages mentioned in the join definition are actually inputs into the joiner.
  Set<String> inputStages = stageSpec.getInputSchemas().keySet();
  Set<String> joinStages = joinDefinition.getStages().stream().map(JoinStage::getStageName).collect(Collectors.toSet());
  // Inputs wired to the joiner that the plugin's definition never mentions.
  Set<String> missingInputs = Sets.difference(inputStages, joinStages);
  if (!missingInputs.isEmpty()) {
    throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' did not include input stage %s in the join. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName, String.join(", ", missingInputs)));
  }
  // Stages the definition references that are not actually inputs to the joiner.
  Set<String> extraInputs = Sets.difference(joinStages, inputStages);
  if (!extraInputs.isEmpty()) {
    // BUGFIX: this message previously interpolated missingInputs instead of extraInputs,
    // naming the wrong (and at this point provably empty-or-unrelated) set of stages.
    throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' is trying to join stage %s, which is not an input. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName, String.join(", ", extraInputs)));
  }
}
Usage example of io.cdap.cdap.etl.api.join.JoinDefinition in the project cdap by caskdata.
From the class SparkPipelineRunner, method handleJoin.
/**
 * Dispatches a join stage to the appropriate implementation based on the plugin type.
 *
 * <p>A {@link BatchJoiner} is initialized and handled via the typed join path; an
 * {@link AutoJoiner} has its {@link JoinDefinition} materialized from the input schemas and
 * is handled via the auto-join path. Stages expected to shuffle data are recorded in
 * {@code shufflers}.
 *
 * @param inputDataCollections input collections keyed by input stage name
 * @param pipelinePhase        the pipeline phase, used to look up input stages and schemas
 * @param pluginFunctionContext context used to create the joiner runtime context
 * @param stageSpec            spec of the join stage
 * @param functionCacheFactory factory for function caches passed to the typed join path
 * @param plugin               the joiner plugin instance; must be a BatchJoiner or AutoJoiner
 * @param numPartitions        number of partitions to use for the join
 * @param collector            statistics collector for the stage
 * @param shufflers            mutated in place: receives the stage name if the join shuffles
 * @return the joined output collection
 * @throws Exception if joiner initialization or join execution fails
 * @throws IllegalStateException if the plugin is neither a BatchJoiner nor an AutoJoiner
 */
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections, PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext, StageSpec stageSpec, FunctionCache.Factory functionCacheFactory, Object plugin, Integer numPartitions, StageStatisticsCollector collector, Set<String> shufflers) throws Exception {
  String stageName = stageSpec.getName();
  if (plugin instanceof BatchJoiner) {
    BatchJoiner<Object, Object, Object> joiner = (BatchJoiner<Object, Object, Object>) plugin;
    BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
    joiner.initialize(joinerRuntimeContext);
    // A classic BatchJoiner always shuffles.
    shufflers.add(stageName);
    return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
  } else if (plugin instanceof AutoJoiner) {
    AutoJoiner autoJoiner = (AutoJoiner) plugin;
    // Rebuild the input-stage -> schema map from the pipeline phase for this stage.
    Map<String, Schema> inputSchemas = new HashMap<>();
    for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
      StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
      inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
    }
    FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
    // joinDefinition will always be non-null because
    // it is checked by PipelinePhasePreparer at the start of the run.
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    failureCollector.getOrThrowException();
    // Only record a shuffle if no stage is broadcast; a broadcast join avoids the shuffle.
    if (joinDefinition.getStages().stream().noneMatch(JoinStage::isBroadcast)) {
      shufflers.add(stageName);
    }
    return handleAutoJoin(stageName, joinDefinition, inputDataCollections, numPartitions);
  } else {
    // should never happen unless there is a bug in the code. should have failed during deployment
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
  }
}
Aggregations