Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
The class ETLBatchApplication, method configure.
@Override
public void configure() {
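// Convert any older (v1) pipeline config into the current v2 ETLBatchConfig before generating the spec.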
ETLBatchConfig config = getConfig().convertOldConfig();
setDescription(DEFAULT_DESCRIPTION);
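// The spec generator accepts BatchSource as the only source plugin type and BatchSink as the only sink plugin type;
// error records are written to an Avro-backed TimePartitionedFileSet using the error schema.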
PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> specGenerator = new BatchPipelineSpecGenerator(
  getConfigurer(),
  ImmutableSet.of(BatchSource.PLUGIN_TYPE),
  ImmutableSet.of(BatchSink.PLUGIN_TYPE),
  TimePartitionedFileSet.class,
  FileSetProperties.builder()
    .setInputFormat(AvroKeyInputFormat.class)
    .setOutputFormat(AvroKeyOutputFormat.class)
    .setEnableExploreOnCreate(true)
    .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
    .setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString())
    .build(),
  config.getEngine());
BatchPipelineSpec spec = specGenerator.generateSpec(config);
int sourceCount = 0;
for (StageSpec stageSpec : spec.getStages()) {
if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
sourceCount++;
}
}
if (sourceCount != 1) {
throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
}
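// Plan the pipeline into execution phases; with a single source the plan should contain exactly one phase.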
PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(), ImmutableSet.<String>of(), ImmutableSet.<String>of());
PipelinePlan plan = planner.plan(spec);
if (plan.getPhases().size() != 1) {
// should never happen if there is only one source
throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
}
PipelinePhase pipeline = plan.getPhases().values().iterator().next();
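// Register the program that executes the single phase, depending on the configured engine.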
switch(config.getEngine()) {
case MAPREDUCE:
BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(
  ETLMapReduce.NAME, pipeline,
  config.getResources(), config.getDriverResources(), config.getClientResources(),
  config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
  new HashMap<String, String>(), config.getNumOfRecordsPreview(), config.getProperties());
addMapReduce(new ETLMapReduce(batchPhaseSpec));
break;
case SPARK:
batchPhaseSpec = new BatchPhaseSpec(
  ETLSpark.class.getSimpleName(), pipeline,
  config.getResources(), config.getDriverResources(), config.getClientResources(),
  config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
  new HashMap<String, String>(), config.getNumOfRecordsPreview(), config.getProperties());
addSpark(new ETLSpark(batchPhaseSpec));
break;
default:
throw new IllegalArgumentException(String.format("Invalid execution engine '%s'. Must be one of %s.", config.getEngine(), Joiner.on(',').join(Engine.values())));
}
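// Wrap the batch program in a workflow and schedule it with the cron expression from the config.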
addWorkflow(new ETLWorkflow(spec, config.getEngine()));
scheduleWorkflow(Schedules.builder(SCHEDULE_NAME).setDescription("ETL Batch schedule").createTimeSchedule(config.getSchedule()), ETLWorkflow.NAME);
}
Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
The class DataPipelineTest, method testExternalSparkProgramPipelines.
@Test
public void testExternalSparkProgramPipelines() throws Exception {
File testDir = TMP_FOLDER.newFolder("sparkProgramTest");
File input = new File(testDir, "poem.txt");
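// Write a small input file for both external Spark programs to read.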
try (PrintWriter writer = new PrintWriter(input.getAbsolutePath())) {
writer.println("this");
writer.println("is");
writer.println("a");
writer.println("poem");
writer.println("it");
writer.println("is");
writer.println("a");
writer.println("bad");
writer.println("poem");
}
File wordCountOutput = new File(testDir, "poem_counts");
File filterOutput = new File(testDir, "poem_filtered");
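// The word count program takes its input and output paths via program.args, while the filter program is configured through plugin properties.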
String args = String.format("%s %s", input.getAbsolutePath(), wordCountOutput.getAbsolutePath());
Map<String, String> wordCountProperties = ImmutableMap.of("program.args", args);
Map<String, String> filterProperties = ImmutableMap.of("inputPath", input.getAbsolutePath(), "outputPath", filterOutput.getAbsolutePath(), "filterStr", "bad");
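// Chain the two external Spark program stages: wordcount runs first and filter runs after it.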
ETLBatchConfig etlConfig = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("wordcount", new ETLPlugin(WORDCOUNT_PLUGIN, SPARK_TYPE, wordCountProperties, null)))
  .addStage(new ETLStage("filter", new ETLPlugin(FILTER_PLUGIN, SPARK_TYPE, filterProperties, null)))
  .addConnection("wordcount", "filter")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("sparkProgramTest");
ApplicationManager appManager = deployApplication(appId, appRequest);
WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
manager.start();
manager.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);
// check wordcount output
/*
this is a poem
it is a bad poem
*/
Map<String, Integer> expected = new HashMap<>();
expected.put("this", 1);
expected.put("is", 2);
expected.put("a", 2);
expected.put("poem", 2);
expected.put("it", 1);
expected.put("bad", 1);
Map<String, Integer> counts = new HashMap<>();
File[] files = wordCountOutput.listFiles();
Assert.assertNotNull("No output files for wordcount found.", files);
for (File file : files) {
String fileName = file.getName();
if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
continue;
}
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
String line;
while ((line = reader.readLine()) != null) {
String[] fields = line.split(" ");
counts.put(fields[0], Integer.parseInt(fields[1]));
}
}
}
Assert.assertEquals(expected, counts);
// check filter output
files = filterOutput.listFiles();
Assert.assertNotNull("No output files for filter program found.", files);
List<String> expectedLines = ImmutableList.of("this", "is", "a", "poem", "it", "is", "a", "poem");
List<String> actualLines = new ArrayList<>();
for (File file : files) {
String fileName = file.getName();
if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
continue;
}
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
String line;
while ((line = reader.readLine()) != null) {
actualLines.add(line);
}
}
}
Assert.assertEquals(expectedLines, actualLines);
}
Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
The class DataPipelineTest, method testNoMacroMapReduce.
/**
* Tests that if no macro is provided to the dataset name property, datasets will be created at config time.
*/
@Test
public void testNoMacroMapReduce() throws Exception {
/*
* Trivial MapReduce pipeline from batch source to batch sink.
*
* source --------- sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "configTimeMockSourceDataset")))
  .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "configTimeMockSinkDataset")))
  .addConnection("source", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("MRApp");
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
// set runtime arguments for macro substitution
Map<String, String> runtimeArguments = ImmutableMap.of("runtime", "mockRuntime", "sink", "SinkDataset", "source", "Source", "runtimeSource", "mockRuntimeSourceDataset");
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
// make sure the datasets were created at configure time
Assert.assertNotNull(getDataset("configTimeMockSourceDataset").get());
Assert.assertNotNull(getDataset("configTimeMockSinkDataset").get());
workflowManager.setRuntimeArgs(runtimeArguments);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
}
Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
The class DataPipelineTest, method testSequentialAggregators.
private void testSequentialAggregators(Engine engine) throws Exception {
String sourceName = "linearAggInput-" + engine.name();
String sinkName = "linearAggOutput-" + engine.name();
/*
* source --> filter1 --> aggregator1 --> aggregator2 --> filter2 --> sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .setEngine(engine)
  .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "bob")))
  .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "jane")))
  .addStage(new ETLStage("aggregator1", IdentityAggregator.getPlugin()))
  .addStage(new ETLStage("aggregator2", IdentityAggregator.getPlugin()))
  .addConnection("source", "filter1")
  .addConnection("filter1", "aggregator1")
  .addConnection("aggregator1", "aggregator2")
  .addConnection("aggregator2", "filter2")
  .addConnection("filter2", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("LinearAggApp-" + engine);
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
// write the test records to the source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(sourceName));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check output
DataSetManager<Table> sinkManager = getDataset(sinkName);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
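// Verify per-stage record counts: filter1 drops 'bob' (3 -> 2) and filter2 drops 'jane' (2 -> 1).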
validateMetric(3, appId, "source.records.out");
validateMetric(3, appId, "filter1.records.in");
validateMetric(2, appId, "filter1.records.out");
validateMetric(2, appId, "aggregator1.records.in");
validateMetric(2, appId, "aggregator1.records.out");
validateMetric(2, appId, "aggregator2.records.in");
validateMetric(2, appId, "aggregator2.records.out");
validateMetric(2, appId, "filter2.records.in");
validateMetric(1, appId, "filter2.records.out");
validateMetric(1, appId, "sink.records.out");
}