Example usage of io.cdap.cdap.etl.api.join.JoinStage in the project cdap by cdapio.
From the class MockAutoJoiner, method define:
@Nullable
@Override
public JoinDefinition define(AutoJoinerContext context) {
if (conf.containsMacro(Conf.STAGES) || conf.containsMacro(Conf.KEY) || conf.containsMacro(Conf.REQUIRED) || conf.containsMacro(Conf.SELECT)) {
return null;
}
Map<String, JoinStage> inputStages = context.getInputStages();
List<JoinStage> from = new ArrayList<>(inputStages.size());
Set<String> required = new HashSet<>(conf.getRequired());
Set<String> broadcast = new HashSet<>(conf.getBroadcast());
List<JoinField> selectedFields = conf.getSelect();
boolean shouldGenerateSelected = selectedFields.isEmpty();
JoinCondition condition = conf.getJoinConditionExpr();
JoinCondition.OnKeys.Builder conditionBuilder = condition != null ? null : JoinCondition.onKeys().setNullSafe(conf.isNullSafe());
for (String stageName : conf.getStages()) {
JoinStage.Builder stageBuilder = JoinStage.builder(inputStages.get(stageName));
if (!required.contains(stageName)) {
stageBuilder.isOptional();
}
if (broadcast.contains(stageName)) {
stageBuilder.setBroadcast(true);
}
JoinStage stage = stageBuilder.build();
from.add(stage);
if (conditionBuilder != null) {
conditionBuilder.addKey(new JoinKey(stageName, conf.getKey()));
}
Schema stageSchema = stage.getSchema();
if (!shouldGenerateSelected || stageSchema == null) {
continue;
}
for (Schema.Field field : stageSchema.getFields()) {
// alias everything to stage_field
selectedFields.add(new JoinField(stageName, field.getName(), String.format("%s_%s", stageName, field.getName())));
}
}
condition = condition == null ? conditionBuilder.build() : condition;
JoinDefinition.Builder builder = JoinDefinition.builder().select(selectedFields).on(condition).from(from).setOutputSchemaName(String.join(".", conf.getStages()));
Schema outputSchema = conf.getSchema();
if (outputSchema != null) {
builder.setOutputSchema(outputSchema);
}
if (conf.getDistributionName() != null && conf.getDistributionSize() != null) {
builder.setDistributionFactor(conf.getDistributionSize(), conf.getDistributionName());
}
return builder.build();
}
Example usage of io.cdap.cdap.etl.api.join.JoinStage in the project cdap by cdapio.
From the class BatchSparkPipelineDriverTest, method testShouldJoinOnSQLEngineWithoutBroadcast:
@Test
public void testShouldJoinOnSQLEngineWithoutBroadcast() {
List<JoinStage> noneBroadcast = Arrays.asList(JoinStage.builder("a", null).setBroadcast(false).build(), JoinStage.builder("b", null).setBroadcast(false).build(), JoinStage.builder("c", null).setBroadcast(false).build());
JoinDefinition joinDefinition = mock(JoinDefinition.class);
doReturn(noneBroadcast).when(joinDefinition).getStages();
Map<String, SparkCollection<Object>> collections = new HashMap<>();
collections.put("a", mock(RDDCollection.class));
collections.put("b", mock(RDDCollection.class));
collections.put("c", mock(RDDCollection.class));
Assert.assertTrue(driver.canJoinOnSQLEngine(STAGE_NAME, joinDefinition, collections));
}
Example usage of io.cdap.cdap.etl.api.join.JoinStage in the project cdap by cdapio.
From the class BatchSparkPipelineDriverTest, method testShouldJoinOnSQLEngineWithBroadcastAndAlreadyPushedCollection:
@Test
public void testShouldJoinOnSQLEngineWithBroadcastAndAlreadyPushedCollection() {
List<JoinStage> noneBroadcast = Arrays.asList(JoinStage.builder("a", null).setBroadcast(false).build(), JoinStage.builder("b", null).setBroadcast(false).build(), JoinStage.builder("c", null).setBroadcast(true).build());
JoinDefinition joinDefinition = mock(JoinDefinition.class);
doReturn(noneBroadcast).when(joinDefinition).getStages();
Map<String, SparkCollection<Object>> collections = new HashMap<>();
collections.put("a", mock(SQLEngineCollection.class));
collections.put("b", mock(RDDCollection.class));
collections.put("c", mock(RDDCollection.class));
Assert.assertTrue(driver.canJoinOnSQLEngine(STAGE_NAME, joinDefinition, collections));
}
Example usage of io.cdap.cdap.etl.api.join.JoinStage in the project cdap by cdapio.
From the class BatchSparkPipelineDriverTest, method testShouldNotJoinOnSQLEngineWithBroadcast:
@Test
public void testShouldNotJoinOnSQLEngineWithBroadcast() {
List<JoinStage> noneBroadcast = Arrays.asList(JoinStage.builder("a", null).setBroadcast(false).build(), JoinStage.builder("b", null).setBroadcast(false).build(), JoinStage.builder("c", null).setBroadcast(true).build());
JoinDefinition joinDefinition = mock(JoinDefinition.class);
doReturn(noneBroadcast).when(joinDefinition).getStages();
Map<String, SparkCollection<Object>> collections = new HashMap<>();
collections.put("a", mock(RDDCollection.class));
collections.put("b", mock(RDDCollection.class));
collections.put("c", mock(RDDCollection.class));
Assert.assertFalse(driver.canJoinOnSQLEngine(STAGE_NAME, joinDefinition, collections));
}
Example usage of io.cdap.cdap.etl.api.join.JoinStage in the project cdap by cdapio.
From the class BatchSparkPipelineDriver, method handleAutoJoin:
@Override
@SuppressWarnings("unchecked")
protected SparkCollection<Object> handleAutoJoin(String stageName, JoinDefinition joinDefinition, Map<String, SparkCollection<Object>> inputDataCollections, @Nullable Integer numPartitions) {
if (sqlEngineAdapter != null && canJoinOnSQLEngine(stageName, joinDefinition, inputDataCollections)) {
// collections representing data that has been pushed to the SQL engine.
for (JoinStage joinStage : joinDefinition.getStages()) {
String joinStageName = joinStage.getStageName();
// If the input collection is already a SQL Engine collection, there's no need to push.
if (inputDataCollections.get(joinStageName) instanceof SQLBackedCollection) {
continue;
}
SparkCollection<Object> collection = inputDataCollections.get(joinStage.getStageName());
SQLEngineJob<SQLDataset> pushJob = sqlEngineAdapter.push(joinStageName, joinStage.getSchema(), collection);
inputDataCollections.put(joinStageName, new SQLEngineCollection<>(sec, functionCacheFactory, jsc, new SQLContext(jsc), datasetContext, sinkFactory, collection, joinStageName, sqlEngineAdapter, pushJob));
}
}
return super.handleAutoJoin(stageName, joinDefinition, inputDataCollections, numPartitions);
}
Aggregations