Use of co.cask.cdap.etl.batch.BatchPipelineSpec in project cdap by caskdata.
From class PipelineSpecGeneratorTest, method testGenerateSpec:
@Test
public void testGenerateSpec() {
/*
 *           ---- t1 ------------
 *           |      |           |
 * source ---       |           |--- t3 --- sink1
 *           |      |           |
 *           ------------ t2 --------------- sink2
 *                  |                          |
 *                  |                          |
 *                  ----------------------------
 */
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MOCK_SOURCE))
  .addStage(new ETLStage("sink1", MOCK_SINK))
  .addStage(new ETLStage("sink2", MOCK_SINK))
  .addStage(new ETLStage("t1", MOCK_TRANSFORM_A))
  .addStage(new ETLStage("t2", MOCK_TRANSFORM_A))
  .addStage(new ETLStage("t3", MOCK_TRANSFORM_B))
  .addConnection("source", "t1")
  .addConnection("source", "t2")
  .addConnection("source", "sink2")
  .addConnection("t1", "t2")
  .addConnection("t1", "t3")
  .addConnection("t1", "sink2")
  .addConnection("t2", "sink2")
  .addConnection("t2", "t3")
  .addConnection("t3", "sink1")
  .build();
// Verify the generated spec is correct, with the right input/output schemas and artifact information.
BatchPipelineSpec actual = specGenerator.generateSpec(etlConfig);
Map<String, String> emptyMap = ImmutableMap.of();
PipelineSpec expected = BatchPipelineSpec.builder()
  .addStage(StageSpec.builder("source", new PluginSpec(BatchSource.PLUGIN_TYPE, "mocksource", emptyMap, ARTIFACT_ID))
              .addOutputSchema(SCHEMA_A, "t1", "t2", "sink2")
              .build())
  .addStage(StageSpec.builder("sink1", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", emptyMap, ARTIFACT_ID))
              .addInputSchema("t3", SCHEMA_B)
              .setErrorSchema(SCHEMA_B)
              .build())
  .addStage(StageSpec.builder("sink2", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", emptyMap, ARTIFACT_ID))
              .addInputSchemas(ImmutableMap.of("t1", SCHEMA_A, "t2", SCHEMA_A, "source", SCHEMA_A))
              .setErrorSchema(SCHEMA_A)
              .build())
  .addStage(StageSpec.builder("t1", new PluginSpec(Transform.PLUGIN_TYPE, "mockA", emptyMap, ARTIFACT_ID))
              .addInputSchema("source", SCHEMA_A)
              .addOutputSchema(SCHEMA_A, "t2", "t3", "sink2")
              .setErrorSchema(SCHEMA_B)
              .build())
  .addStage(StageSpec.builder("t2", new PluginSpec(Transform.PLUGIN_TYPE, "mockA", emptyMap, ARTIFACT_ID))
              .addInputSchemas(ImmutableMap.of("source", SCHEMA_A, "t1", SCHEMA_A))
              .addOutputSchema(SCHEMA_A, "t3", "sink2")
              .setErrorSchema(SCHEMA_B)
              .build())
  .addStage(StageSpec.builder("t3", new PluginSpec(Transform.PLUGIN_TYPE, "mockB", emptyMap, ARTIFACT_ID))
              .addInputSchemas(ImmutableMap.of("t1", SCHEMA_A, "t2", SCHEMA_A))
              .addOutputSchema(SCHEMA_B, "sink1")
              .setErrorSchema(SCHEMA_A)
              .build())
  .addConnections(etlConfig.getConnections())
  .setResources(etlConfig.getResources())
  .setDriverResources(new Resources(1024, 1))
  .setClientResources(new Resources(1024, 1))
  .setStageLoggingEnabled(etlConfig.isStageLoggingEnabled())
  .build();
Assert.assertEquals(expected, actual);
}
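The fixtures referenced above (specGenerator, the MOCK_* plugin constants, SCHEMA_A/SCHEMA_B, ARTIFACT_ID) are defined elsewhere in the test class. As a follow-on illustration, the generated spec's DAG could be dumped edge by edge. This is a minimal sketch, not part of the original test: it assumes PipelineSpec exposes getConnections() (implied by the addConnections(...) builder call above) and that Connection has getFrom()/getTo() accessors.

// Illustrative sketch only -- assumes the accessor names noted above.
// Connection here is co.cask.cdap.etl.proto.Connection.
for (Connection conn : actual.getConnections()) {
  // Prints each edge of the pipeline DAG, e.g. "source -> t1".
  System.out.println(conn.getFrom() + " -> " + conn.getTo());
}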
Use of co.cask.cdap.etl.batch.BatchPipelineSpec in project cdap by caskdata.
From class DataPipelineApp, method configure:
@Override
public void configure() {
ETLBatchConfig config = getConfig();
setDescription(Objects.firstNonNull(config.getDescription(), DEFAULT_DESCRIPTION));
BatchPipelineSpec spec = new BatchPipelineSpecGenerator<>(
  getConfigurer(),
  ImmutableSet.of(BatchSource.PLUGIN_TYPE),
  ImmutableSet.of(BatchSink.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE, AlertPublisher.PLUGIN_TYPE),
  config.getEngine()).generateSpec(config);
addWorkflow(new SmartWorkflow(spec, supportedPluginTypes, getConfigurer(), config.getEngine()));
ScheduleBuilder scheduleBuilder = buildSchedule(SCHEDULE_NAME, ProgramType.WORKFLOW, SmartWorkflow.NAME)
  .setDescription("Data pipeline schedule");
Integer maxConcurrentRuns = config.getMaxConcurrentRuns();
if (maxConcurrentRuns != null) {
// Reassign: withConcurrency returns a new builder rather than mutating in place.
scheduleBuilder = scheduleBuilder.withConcurrency(maxConcurrentRuns);
}
schedule(scheduleBuilder.triggerByTime(config.getSchedule()));
}
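For a sense of what configure() consumes: the ETLBatchConfig it receives is the same kind of object built in the test above. Below is a minimal, hypothetical config that would exercise this method, reusing only the builder calls already demonstrated; the cron expression and stage names are placeholders, and MOCK_SOURCE/MOCK_SINK stand in for real plugin definitions.

// Illustrative config sketch -- placeholder stages and schedule.
ETLBatchConfig config = ETLBatchConfig.builder("0 * * * *")
  .addStage(new ETLStage("source", MOCK_SOURCE))
  .addStage(new ETLStage("sink", MOCK_SINK))
  .addConnection("source", "sink")
  .build();

With such a config, configure() generates the pipeline spec, adds the SmartWorkflow, and schedules it on the hourly time trigger; if getMaxConcurrentRuns() returns a non-null value, the branch above also attaches a concurrency constraint to that schedule.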