Use of io.cdap.cdap.etl.api.join.JoinStage in project hydrator-plugins by cdapio.
In class JoinerConfigTest, the method testJoinerConfigWithInvalidJoinKeys:
@Test
public void testJoinerConfigWithInvalidJoinKeys() {
  String selectedFields = "film.film_id, film.film_name, "
    + "filmActor.actor_name as renamed_actor, filmCategory.category_name as renamed_category";
  Schema filmCategorySchema = Schema.recordOf(
    "filmCategory",
    Schema.Field.of("film_id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("film_name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("category_name", Schema.of(Schema.Type.STRING)));
  JoinerConfig config = new JoinerConfig(
    "film.film_id=filmActor.film_id=filmCategory.film_id&"
      + "film.film_name=filmActor.film_name=filmCategory.film_name",
    selectedFields, "film,filmActor,filmCategory");
  Joiner joiner = new Joiner(config);
  FailureCollector collector = new MockFailureCollector();
  Map<String, JoinStage> inputStages = new HashMap<>();
  inputStages.put("film", JoinStage.builder("film", FILM_SCHEMA).build());
  inputStages.put("filmActor", JoinStage.builder("filmActor", FILM_ACTOR_SCHEMA).build());
  // registered under "fileCategory" rather than "filmCategory", so the join keys
  // referencing "filmCategory" cannot be resolved against the input stages
  inputStages.put("fileCategory", JoinStage.builder("filmCategory", filmCategorySchema).build());
  AutoJoinerContext autoJoinerContext = new MockAutoJoinerContext(inputStages, collector);
  try {
    joiner.define(autoJoinerContext);
    Assert.fail("Expected a ValidationException for the unresolvable join keys");
  } catch (ValidationException e) {
    Assert.assertEquals(1, e.getFailures().size());
    Assert.assertEquals(1, e.getFailures().get(0).getCauses().size());
    // the expected cause points at the joinKeys property of the stage config
    Cause expectedCause = new Cause();
    expectedCause.addAttribute(CauseAttributes.STAGE_CONFIG, JoinerConfig.JOIN_KEYS);
    expectedCause.addAttribute("stage", "mockstage");
    Assert.assertEquals(JoinerConfig.JOIN_KEYS,
      e.getFailures().get(0).getCauses().get(0).getAttribute(CauseAttributes.STAGE_CONFIG));
  }
}
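
For contrast, a minimal happy-path sketch built from the same test helpers. This is a hypothetical companion test, not code from the project; it assumes FILM_SCHEMA and FILM_ACTOR_SCHEMA contain the fields referenced below, as in the test above.

@Test
public void testJoinerConfigWithValidJoinKeys() {
  // hypothetical: every stage is registered under the name the join keys reference
  JoinerConfig config = new JoinerConfig(
    "film.film_id=filmActor.film_id",
    "film.film_id, film.film_name, filmActor.actor_name as renamed_actor",
    "film,filmActor");
  Joiner joiner = new Joiner(config);
  FailureCollector collector = new MockFailureCollector();
  Map<String, JoinStage> inputStages = new HashMap<>();
  inputStages.put("film", JoinStage.builder("film", FILM_SCHEMA).build());
  inputStages.put("filmActor", JoinStage.builder("filmActor", FILM_ACTOR_SCHEMA).build());
  AutoJoinerContext autoJoinerContext = new MockAutoJoinerContext(inputStages, collector);
  // all referenced stages resolve, so define should not throw a ValidationException
  joiner.define(autoJoinerContext);
}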
Use of io.cdap.cdap.etl.api.join.JoinStage in project cdap by caskdata.
In class SparkPipelineRunner, the method handleAutoJoinWithSQL:
/*
 * Implements a join by generating a SQL query that Spark will execute.
 * Joins on key equality are not implemented this way because they have special repartitioning logic
 * that allows a different number of partitions to be specified for different joins in the same pipeline.
 * When Spark handles a SQL query, it uses spark.sql.shuffle.partitions partitions, a global
 * setting that applies to every SQL join in the pipeline.
 */
private SparkCollection<Object> handleAutoJoinWithSQL(String stageName, JoinDefinition joinDefinition,
                                                      Map<String, SparkCollection<Object>> inputDataCollections) {
  JoinCondition.OnExpression condition = (JoinCondition.OnExpression) joinDefinition.getCondition();
  Map<String, String> aliases = condition.getDatasetAliases();
  // earlier validation ensures there are exactly 2 inputs being joined
  JoinStage leftStage = joinDefinition.getStages().get(0);
  JoinStage rightStage = joinDefinition.getStages().get(1);
  String leftStageName = leftStage.getStageName();
  String rightStageName = rightStage.getStageName();
  SparkCollection<Object> leftData = inputDataCollections.get(leftStageName);
  JoinCollection leftCollection = new JoinCollection(
    leftStageName, inputDataCollections.get(leftStageName), leftStage.getSchema(),
    Collections.emptyList(), leftStage.isRequired(), leftStage.isBroadcast());
  JoinCollection rightCollection = new JoinCollection(
    rightStageName, inputDataCollections.get(rightStageName), rightStage.getSchema(),
    Collections.emptyList(), rightStage.isRequired(), rightStage.isBroadcast());
  JoinExpressionRequest joinRequest = new JoinExpressionRequest(
    stageName, joinDefinition.getSelectedFields(), leftCollection, rightCollection,
    condition, joinDefinition.getOutputSchema(), joinDefinition);
  return leftData.join(joinRequest);
}
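
The comment above hinges on spark.sql.shuffle.partitions being a single session-wide knob. A minimal standalone sketch of that setting using the plain Spark API (demo code, not from this project):

import org.apache.spark.sql.SparkSession;

public class ShufflePartitionsDemo {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
      .appName("shuffle-partitions-demo")
      .master("local[*]")
      .getOrCreate();
    // every SQL join in this session shuffles into this many partitions...
    spark.conf().set("spark.sql.shuffle.partitions", "64");
    // ...whereas key-equality joins handled outside SQL can repartition per join
    System.out.println(spark.conf().get("spark.sql.shuffle.partitions"));
    spark.stop();
  }
}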
Use of io.cdap.cdap.etl.api.join.JoinStage in project cdap by caskdata.
In class BatchSparkPipelineDriverTest, the method testShouldNotJoinOnSQLEngineWithBroadcast:
@Test
public void testShouldNotJoinOnSQLEngineWithBroadcast() {
  // one of the three stages is marked for broadcast, so the join must stay in Spark
  List<JoinStage> withBroadcast = Arrays.asList(
    JoinStage.builder("a", null).setBroadcast(false).build(),
    JoinStage.builder("b", null).setBroadcast(false).build(),
    JoinStage.builder("c", null).setBroadcast(true).build());
  JoinDefinition joinDefinition = mock(JoinDefinition.class);
  doReturn(withBroadcast).when(joinDefinition).getStages();
  Map<String, SparkCollection<Object>> collections = new HashMap<>();
  collections.put("a", mock(RDDCollection.class));
  collections.put("b", mock(RDDCollection.class));
  collections.put("c", mock(RDDCollection.class));
  Assert.assertFalse(driver.canJoinOnSQLEngine(STAGE_NAME, joinDefinition, collections));
}
Use of io.cdap.cdap.etl.api.join.JoinStage in project cdap by caskdata.
In class BatchSparkPipelineDriverTest, the method testShouldJoinOnSQLEngineWithoutBroadcast:
@Test
public void testShouldJoinOnSQLEngineWithoutBroadcast() {
  // no stage is broadcast, so the join is eligible for SQL engine push-down
  List<JoinStage> noneBroadcast = Arrays.asList(
    JoinStage.builder("a", null).setBroadcast(false).build(),
    JoinStage.builder("b", null).setBroadcast(false).build(),
    JoinStage.builder("c", null).setBroadcast(false).build());
  JoinDefinition joinDefinition = mock(JoinDefinition.class);
  doReturn(noneBroadcast).when(joinDefinition).getStages();
  Map<String, SparkCollection<Object>> collections = new HashMap<>();
  collections.put("a", mock(RDDCollection.class));
  collections.put("b", mock(RDDCollection.class));
  collections.put("c", mock(RDDCollection.class));
  Assert.assertTrue(driver.canJoinOnSQLEngine(STAGE_NAME, joinDefinition, collections));
}
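
Taken together, the two tests pin down one rule of canJoinOnSQLEngine: a single broadcast stage disables SQL engine push-down. A hypothetical sketch of just that rule (the real method checks more than this; the class and method names below are invented for illustration):

import java.util.List;
import io.cdap.cdap.etl.api.join.JoinStage;

final class BroadcastRule {
  private BroadcastRule() { }

  // illustrative only: one plausible reading of the broadcast check the tests exercise
  static boolean eligibleForSqlEngine(List<JoinStage> stages) {
    // any broadcast stage keeps the join in Spark, where the broadcast hint is honored
    return stages.stream().noneMatch(JoinStage::isBroadcast);
  }
}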
Use of io.cdap.cdap.etl.api.join.JoinStage in project cdap by caskdata.
In class MockAutoJoiner, the method define:
@Nullable
@Override
public JoinDefinition define(AutoJoinerContext context) {
  if (conf.containsMacro(Conf.STAGES) || conf.containsMacro(Conf.KEY)
      || conf.containsMacro(Conf.REQUIRED) || conf.containsMacro(Conf.SELECT)) {
    // macros are not yet evaluated, so the join cannot be defined until runtime
    return null;
  }
  Map<String, JoinStage> inputStages = context.getInputStages();
  List<JoinStage> from = new ArrayList<>(inputStages.size());
  Set<String> required = new HashSet<>(conf.getRequired());
  Set<String> broadcast = new HashSet<>(conf.getBroadcast());
  List<JoinField> selectedFields = conf.getSelect();
  boolean shouldGenerateSelected = selectedFields.isEmpty();
  JoinCondition condition = conf.getJoinConditionExpr();
  // an explicit expression condition takes precedence; otherwise join on keys
  JoinCondition.OnKeys.Builder conditionBuilder =
    condition != null ? null : JoinCondition.onKeys().setNullSafe(conf.isNullSafe());
  for (String stageName : conf.getStages()) {
    JoinStage.Builder stageBuilder = JoinStage.builder(inputStages.get(stageName));
    if (!required.contains(stageName)) {
      stageBuilder.isOptional();
    }
    if (broadcast.contains(stageName)) {
      stageBuilder.setBroadcast(true);
    }
    JoinStage stage = stageBuilder.build();
    from.add(stage);
    if (conditionBuilder != null) {
      conditionBuilder.addKey(new JoinKey(stageName, conf.getKey()));
    }
    Schema stageSchema = stage.getSchema();
    if (!shouldGenerateSelected || stageSchema == null) {
      continue;
    }
    for (Schema.Field field : stageSchema.getFields()) {
      // alias everything to stage_field
      selectedFields.add(new JoinField(stageName, field.getName(),
        String.format("%s_%s", stageName, field.getName())));
    }
  }
  condition = condition == null ? conditionBuilder.build() : condition;
  JoinDefinition.Builder builder = JoinDefinition.builder()
    .select(selectedFields)
    .on(condition)
    .from(from)
    .setOutputSchemaName(String.join(".", conf.getStages()));
  Schema outputSchema = conf.getSchema();
  if (outputSchema != null) {
    builder.setOutputSchema(outputSchema);
  }
  if (conf.getDistributionName() != null && conf.getDistributionSize() != null) {
    builder.setDistributionFactor(conf.getDistributionSize(), conf.getDistributionName());
  }
  return builder.build();
}
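
Condensing the loop above, a hypothetical two-stage, key-equality join assembled with the same builders. The stage names, field names, and schemas below are invented for illustration; the builder calls mirror the ones used in define:

// hypothetical inputs
Schema purchasesSchema = Schema.recordOf("purchases",
  Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
  Schema.Field.of("user_id", Schema.of(Schema.Type.STRING)));
Schema usersSchema = Schema.recordOf("users",
  Schema.Field.of("id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
JoinStage purchases = JoinStage.builder("purchases", purchasesSchema).build();
JoinStage.Builder usersBuilder = JoinStage.builder("users", usersSchema);
usersBuilder.isOptional(); // the users side is not required, i.e. an outer join
JoinStage users = usersBuilder.build();
// join on purchases.user_id = users.id
JoinCondition.OnKeys.Builder conditionBuilder = JoinCondition.onKeys().setNullSafe(false);
conditionBuilder.addKey(new JoinKey("purchases", Collections.singletonList("user_id")));
conditionBuilder.addKey(new JoinKey("users", Collections.singletonList("id")));
List<JoinField> selected = Arrays.asList(
  new JoinField("purchases", "purchase_id", "purchases_purchase_id"),
  new JoinField("users", "name", "users_name"));
JoinDefinition definition = JoinDefinition.builder()
  .select(selected)
  .on(conditionBuilder.build())
  .from(Arrays.asList(purchases, users))
  .setOutputSchemaName("purchases.users")
  .build();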