Use of io.cdap.cdap.etl.api.join.JoinKey in project cdap by caskdata.
From the class MockAutoJoiner, method define.
/**
 * Builds the join definition described by this joiner's configuration.
 *
 * <p>Every stage listed in the configuration becomes a {@link JoinStage}; stages that are not
 * listed as required are marked optional, and stages listed as broadcast are flagged as such.
 * When the configuration supplies no join condition expression, an on-keys condition is built
 * using the configured key for every stage. When no fields are selected in the configuration,
 * every field of every stage with a known schema is selected and aliased to
 * {@code <stage>_<field>}.
 *
 * @param context supplies the input stages, keyed by stage name
 * @return the join definition, or {@code null} if the stages, key, required or select
 *     properties contain a macro
 */
@Nullable
@Override
public JoinDefinition define(AutoJoinerContext context) {
  boolean hasMacro = conf.containsMacro(Conf.STAGES)
    || conf.containsMacro(Conf.KEY)
    || conf.containsMacro(Conf.REQUIRED)
    || conf.containsMacro(Conf.SELECT);
  if (hasMacro) {
    return null;
  }

  Map<String, JoinStage> stagesByName = context.getInputStages();
  List<JoinStage> joinStages = new ArrayList<>(stagesByName.size());
  Set<String> requiredStages = new HashSet<>(conf.getRequired());
  Set<String> broadcastStages = new HashSet<>(conf.getBroadcast());
  List<JoinField> fields = conf.getSelect();
  // Decided once, up front: the list grows inside the loop when fields are generated.
  boolean generateFields = fields.isEmpty();

  // An on-keys builder is only needed when no explicit condition expression is configured.
  JoinCondition explicitCondition = conf.getJoinConditionExpr();
  JoinCondition.OnKeys.Builder keysBuilder = null;
  if (explicitCondition == null) {
    keysBuilder = JoinCondition.onKeys().setNullSafe(conf.isNullSafe());
  }

  for (String name : conf.getStages()) {
    JoinStage.Builder stageBuilder = JoinStage.builder(stagesByName.get(name));
    if (!requiredStages.contains(name)) {
      stageBuilder.isOptional();
    }
    if (broadcastStages.contains(name)) {
      stageBuilder.setBroadcast(true);
    }
    JoinStage joinStage = stageBuilder.build();
    joinStages.add(joinStage);

    if (keysBuilder != null) {
      keysBuilder.addKey(new JoinKey(name, conf.getKey()));
    }

    Schema schema = joinStage.getSchema();
    if (generateFields && schema != null) {
      for (Schema.Field field : schema.getFields()) {
        // alias everything to stage_field
        String alias = String.format("%s_%s", name, field.getName());
        fields.add(new JoinField(name, field.getName(), alias));
      }
    }
  }

  JoinCondition joinCondition = explicitCondition != null ? explicitCondition : keysBuilder.build();
  JoinDefinition.Builder definition = JoinDefinition.builder()
    .select(fields)
    .on(joinCondition)
    .from(joinStages)
    .setOutputSchemaName(String.join(".", conf.getStages()));

  Schema configuredSchema = conf.getSchema();
  if (configuredSchema != null) {
    definition.setOutputSchema(configuredSchema);
  }

  // The distribution factor is applied only when both its name and size are configured.
  if (conf.getDistributionName() != null && conf.getDistributionSize() != null) {
    definition.setDistributionFactor(conf.getDistributionSize(), conf.getDistributionName());
  }
  return definition.build();
}
Use of io.cdap.cdap.etl.api.join.JoinKey in project cdap by caskdata.
From the class PipelineSpecGeneratorTest, method testAutoJoin.
/**
 * Checks the pipeline spec generated for a source that fans out into two transforms whose
 * outputs are combined by an auto-joiner before reaching the sink.
 */
@Test
public void testAutoJoin() {
  /*
   *           ---- transformA --------|
   *           |                       |
   * source ---|                       |-- autojoin --- sink
   *           |                       |
   *           ---- transformABC ------|
   */
  ETLBatchConfig config = ETLBatchConfig.builder()
    .setTimeSchedule("* * * * *")
    .addStage(new ETLStage("source", MOCK_SOURCE))
    .addStage(new ETLStage("tA", MOCK_TRANSFORM_A))
    .addStage(new ETLStage("tABC", MOCK_TRANSFORM_ABC))
    .addStage(new ETLStage("autojoin", MOCK_AUTO_JOINER))
    .addStage(new ETLStage("sink", MOCK_SINK))
    .addConnection("source", "tA")
    .addConnection("source", "tABC")
    .addConnection("tA", "autojoin")
    .addConnection("tABC", "autojoin")
    .addConnection("autojoin", "sink")
    .setNumOfRecordsPreview(100)
    .build();

  // joinDefinition is not declared in this method; it is assigned before the spec is generated.
  // Join tA (required) with tABC (optional) on field "a" of each stage.
  JoinStage requiredStage = JoinStage.builder("tA", SCHEMA_A).isRequired().build();
  JoinStage optionalStage = JoinStage.builder("tABC", SCHEMA_ABC).isOptional().build();
  JoinCondition onFieldA = JoinCondition.onKeys()
    .addKey(new JoinKey("tA", Collections.singletonList("a")))
    .addKey(new JoinKey("tABC", Collections.singletonList("a")))
    .build();
  joinDefinition = JoinDefinition.builder()
    .select(new JoinField("tA", "a"), new JoinField("tABC", "b"), new JoinField("tABC", "c"))
    .from(requiredStage, optionalStage)
    .on(onFieldA)
    .setOutputSchemaName("abc.joined")
    .build();

  // Expected join output: "a" is a plain string, while "b" and "c" are nullable.
  Schema joinSchema = Schema.recordOf(
    "abc.joined",
    Schema.Field.of("a", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("b", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("c", Schema.nullableOf(Schema.of(Schema.Type.INT))));

  Map<String, String> noProperties = new HashMap<>();

  PluginSpec sourcePlugin = new PluginSpec(BatchSource.PLUGIN_TYPE, "mocksource", noProperties, ARTIFACT_ID);
  StageSpec sourceSpec = StageSpec.builder("source", sourcePlugin)
    .addOutput(SCHEMA_A, "tA", "tABC")
    .build();

  PluginSpec transformAPlugin = new PluginSpec(Transform.PLUGIN_TYPE, "mockA", noProperties, ARTIFACT_ID);
  StageSpec transformASpec = StageSpec.builder("tA", transformAPlugin)
    .addInputSchema("source", SCHEMA_A)
    .addOutput(SCHEMA_A, "autojoin")
    .setErrorSchema(SCHEMA_B)
    .build();

  PluginSpec transformABCPlugin = new PluginSpec(Transform.PLUGIN_TYPE, "mockABC", noProperties, ARTIFACT_ID);
  StageSpec transformABCSpec = StageSpec.builder("tABC", transformABCPlugin)
    .addInputSchema("source", SCHEMA_A)
    .addOutput(SCHEMA_ABC, "autojoin")
    .setErrorSchema(SCHEMA_A)
    .build();

  PluginSpec joinerPlugin = new PluginSpec(BatchJoiner.PLUGIN_TYPE, "mockautojoiner", noProperties, ARTIFACT_ID);
  StageSpec joinerSpec = StageSpec.builder("autojoin", joinerPlugin)
    .addInputSchema("tA", SCHEMA_A)
    .addInputSchema("tABC", SCHEMA_ABC)
    .addOutput(joinSchema, "sink")
    .setErrorSchema(SCHEMA_ABC)
    .build();

  PluginSpec sinkPlugin = new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", noProperties, ARTIFACT_ID);
  StageSpec sinkSpec = StageSpec.builder("sink", sinkPlugin)
    .addInputSchema("autojoin", joinSchema)
    .setErrorSchema(joinSchema)
    .build();

  PipelineSpec expected = BatchPipelineSpec.builder()
    .addStage(sourceSpec)
    .addStage(transformASpec)
    .addStage(transformABCSpec)
    .addStage(joinerSpec)
    .addStage(sinkSpec)
    .addConnections(config.getConnections())
    .setResources(config.getResources())
    .setDriverResources(config.getDriverResources())
    .setClientResources(config.getClientResources())
    .setStageLoggingEnabled(config.isStageLoggingEnabled())
    .setNumOfRecordsPreview(config.getNumOfRecordsPreview())
    .build();

  Assert.assertEquals(expected, specGenerator.generateSpec(config));
}
Aggregations