use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class PipelineTest method testTextFileSinkAndDeletePostAction.
@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
// create the pipeline config
String inputName = "sinkTestInput";
String outputName = "sinkTestOutput";
String outputDirName = "users";
ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
Map<String, String> sinkProperties = new HashMap<>();
sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
Map<String, String> actionProperties = new HashMap<>();
actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
// MapReduce writes multiple files to the output directory. Along with the actual output,
// there are .crc and _SUCCESS marker files that do not contain any of the output content.
actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(sink)
  .addPostAction(postAction)
  .addConnection(source.getName(), sink.getName())
  .build();
// create the pipeline
ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
// write some data to the input fileset
Schema inputSchema = Schema.recordOf("test",
  Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
Map<String, String> users = new HashMap<>();
users.put("samuel", "wallet");
users.put("dwayne", "rock");
users.put("christopher", "cowbell");
List<StructuredRecord> inputRecords = new ArrayList<>();
for (Map.Entry<String, String> userEntry : users.entrySet()) {
String name = userEntry.getKey();
String item = userEntry.getValue();
inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
}
DataSetManager<Table> inputManager = getDataset(inputName);
MockSource.writeInput(inputManager, inputRecords);
// run the pipeline
Map<String, String> runtimeArgs = new HashMap<>();
// the ${dir} macro will be substituted with "users" for our pipeline run
runtimeArgs.put("dir", outputDirName);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(runtimeArgs);
workflowManager.waitForFinish(4, TimeUnit.MINUTES);
// check the pipeline output
DataSetManager<FileSet> outputManager = getDataset(outputName);
FileSet output = outputManager.get();
Location outputDir = output.getBaseLocation().append(outputDirName);
Map<String, String> actual = new HashMap<>();
for (Location outputFile : outputDir.list()) {
if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
Assert.fail("Post action did not delete file " + outputFile.getName());
}
try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
String line;
while ((line = reader.readLine()) != null) {
String[] parts = line.split("\\|");
actual.put(parts[0], parts[1]);
}
}
}
Assert.assertEquals(users, actual);
}
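The delete regex configured above is a plain java.util.regex pattern. As a standalone sketch (not part of the test), the following checks it against hypothetical MapReduce output file names to show which ones the post action would remove:
import java.util.regex.Pattern;

public class DeleteRegexCheck {
  public static void main(String[] args) {
    // Standalone sketch: the post action's regex applied to hypothetical file names.
    Pattern deletePattern = Pattern.compile(".*\\.crc|_SUCCESS");
    System.out.println(deletePattern.matcher("part-m-00000.crc").matches()); // true  -> deleted
    System.out.println(deletePattern.matcher("_SUCCESS").matches());         // true  -> deleted
    System.out.println(deletePattern.matcher("part-m-00000").matches());     // false -> kept
  }
}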
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class ETLBatchApplication method configure.
@Override
public void configure() {
ETLBatchConfig config = getConfig().convertOldConfig();
setDescription(DEFAULT_DESCRIPTION);
PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> specGenerator = new BatchPipelineSpecGenerator(
  getConfigurer(),
  ImmutableSet.of(BatchSource.PLUGIN_TYPE),
  ImmutableSet.of(BatchSink.PLUGIN_TYPE),
  TimePartitionedFileSet.class,
  FileSetProperties.builder()
    .setInputFormat(AvroKeyInputFormat.class)
    .setOutputFormat(AvroKeyOutputFormat.class)
    .setEnableExploreOnCreate(true)
    .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
    .setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString())
    .build(),
  config.getEngine());
BatchPipelineSpec spec = specGenerator.generateSpec(config);
int sourceCount = 0;
for (StageSpec stageSpec : spec.getStages()) {
if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
sourceCount++;
}
}
if (sourceCount != 1) {
throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
}
PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(), ImmutableSet.<String>of(), ImmutableSet.<String>of());
PipelinePlan plan = planner.plan(spec);
if (plan.getPhases().size() != 1) {
// should never happen if there is only one source
throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
}
PipelinePhase pipeline = plan.getPhases().values().iterator().next();
switch(config.getEngine()) {
case MAPREDUCE:
BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(ETLMapReduce.NAME, pipeline,
  config.getResources(), config.getDriverResources(), config.getClientResources(),
  config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
  new HashMap<String, String>(), config.getNumOfRecordsPreview(), config.getProperties());
addMapReduce(new ETLMapReduce(batchPhaseSpec));
break;
case SPARK:
batchPhaseSpec = new BatchPhaseSpec(ETLSpark.class.getSimpleName(), pipeline,
  config.getResources(), config.getDriverResources(), config.getClientResources(),
  config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
  new HashMap<String, String>(), config.getNumOfRecordsPreview(), config.getProperties());
addSpark(new ETLSpark(batchPhaseSpec));
break;
default:
throw new IllegalArgumentException(String.format("Invalid execution engine '%s'. Must be one of %s.", config.getEngine(), Joiner.on(',').join(Engine.values())));
}
addWorkflow(new ETLWorkflow(spec, config.getEngine()));
scheduleWorkflow(Schedules.builder(SCHEDULE_NAME)
  .setDescription("ETL Batch schedule")
  .createTimeSchedule(config.getSchedule()), ETLWorkflow.NAME);
}
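For reference, the switch above picks the MapReduce or Spark program based on the engine set on the config. A minimal sketch of a config that would take the SPARK branch, reusing the builder API and mock plugins shown in the tests on this page (the dataset names here are illustrative):
ETLBatchConfig sparkConfig = ETLBatchConfig.builder("* * * * *")
  .setEngine(Engine.SPARK)
  .addStage(new ETLStage("source", MockSource.getPlugin("sparkTestInput")))
  .addStage(new ETLStage("sink", MockSink.getPlugin("sparkTestOutput")))
  .addConnection("source", "sink")
  .build();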
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class DataPipelineTest method testExternalSparkProgramPipelines.
@Test
public void testExternalSparkProgramPipelines() throws Exception {
File testDir = TMP_FOLDER.newFolder("sparkProgramTest");
File input = new File(testDir, "poem.txt");
try (PrintWriter writer = new PrintWriter(input.getAbsolutePath())) {
writer.println("this");
writer.println("is");
writer.println("a");
writer.println("poem");
writer.println("it");
writer.println("is");
writer.println("a");
writer.println("bad");
writer.println("poem");
}
File wordCountOutput = new File(testDir, "poem_counts");
File filterOutput = new File(testDir, "poem_filtered");
String args = String.format("%s %s", input.getAbsolutePath(), wordCountOutput.getAbsolutePath());
Map<String, String> wordCountProperties = ImmutableMap.of("program.args", args);
Map<String, String> filterProperties = ImmutableMap.of(
  "inputPath", input.getAbsolutePath(),
  "outputPath", filterOutput.getAbsolutePath(),
  "filterStr", "bad");
ETLBatchConfig etlConfig = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("wordcount", new ETLPlugin(WORDCOUNT_PLUGIN, SPARK_TYPE, wordCountProperties, null)))
  .addStage(new ETLStage("filter", new ETLPlugin(FILTER_PLUGIN, SPARK_TYPE, filterProperties, null)))
  .addConnection("wordcount", "filter")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("sparkProgramTest");
ApplicationManager appManager = deployApplication(appId, appRequest);
WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
manager.start();
manager.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);
// check wordcount output
/*
this is a poem
it is a bad poem
*/
Map<String, Integer> expected = new HashMap<>();
expected.put("this", 1);
expected.put("is", 2);
expected.put("a", 2);
expected.put("poem", 2);
expected.put("it", 1);
expected.put("bad", 1);
Map<String, Integer> counts = new HashMap<>();
File[] files = wordCountOutput.listFiles();
Assert.assertNotNull("No output files for wordcount found.", files);
for (File file : files) {
String fileName = file.getName();
if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
continue;
}
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
String line;
while ((line = reader.readLine()) != null) {
String[] fields = line.split(" ");
counts.put(fields[0], Integer.parseInt(fields[1]));
}
}
}
Assert.assertEquals(expected, counts);
// check filter output
files = filterOutput.listFiles();
Assert.assertNotNull("No output files for filter program found.", files);
List<String> expectedLines = ImmutableList.of("this", "is", "a", "poem", "it", "is", "a", "poem");
List<String> actualLines = new ArrayList<>();
for (File file : files) {
String fileName = file.getName();
if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
continue;
}
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
String line;
while ((line = reader.readLine()) != null) {
actualLines.add(line);
}
}
}
Assert.assertEquals(expectedLines, actualLines);
}
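The expected word counts asserted above can be derived directly from the poem. This standalone sketch (not part of the test) mirrors what the external word-count program is assumed to produce:
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public class PoemCounts {
  public static void main(String[] args) {
    // The same nine lines written to poem.txt in the test above.
    List<String> poem = Arrays.asList("this", "is", "a", "poem", "it", "is", "a", "bad", "poem");
    Map<String, Long> counts = poem.stream()
      .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
    System.out.println(counts); // {a=2, bad=1, is=2, it=1, poem=2, this=1} (order may vary)
  }
}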
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class DataPipelineTest method testNoMacroMapReduce.
/**
* Tests that if no macro is provided to the dataset name property, datasets will be created at config time.
*/
@Test
public void testNoMacroMapReduce() throws Exception {
/*
* Trivial MapReduce pipeline from batch source to batch sink.
*
* source --------- sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "configTimeMockSourceDataset")))
  .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "configTimeMockSinkDataset")))
  .addConnection("source", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("MRApp");
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
// set runtime arguments for macro substitution
Map<String, String> runtimeArguments = ImmutableMap.of(
  "runtime", "mockRuntime",
  "sink", "SinkDataset",
  "source", "Source",
  "runtimeSource", "mockRuntimeSourceDataset");
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
// make sure the datasets were created at configure time
Assert.assertNotNull(getDataset("configTimeMockSourceDataset").get());
Assert.assertNotNull(getDataset("configTimeMockSinkDataset").get());
workflowManager.setRuntimeArgs(runtimeArguments);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
}
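By contrast, supplying the dataset name property as a macro defers dataset creation until the run's arguments resolve it. The following is a hypothetical sketch of that variant; the stage name, macro key, and dataset name are illustrative and not taken from the test above:
// Hypothetical: with a macro in the dataset name, nothing is created at configure time;
// the dataset is only created at runtime once "sinkDatasetName" is supplied.
ETLStage macroSink = new ETLStage("sink",
  MockRuntimeDatasetSink.getPlugin("mroutput", "${sinkDatasetName}"));
Map<String, String> macroArgs = ImmutableMap.of("sinkDatasetName", "runtimeMockSinkDataset");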
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class DataPipelineTest method testSequentialAggregators.
private void testSequentialAggregators(Engine engine) throws Exception {
String sourceName = "linearAggInput-" + engine.name();
String sinkName = "linearAggOutput-" + engine.name();
/*
* source --> filter1 --> aggregator1 --> aggregator2 --> filter2 --> sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .setEngine(engine)
  .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "bob")))
  .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "jane")))
  .addStage(new ETLStage("aggregator1", IdentityAggregator.getPlugin()))
  .addStage(new ETLStage("aggregator2", IdentityAggregator.getPlugin()))
  .addConnection("source", "filter1")
  .addConnection("filter1", "aggregator1")
  .addConnection("aggregator1", "aggregator2")
  .addConnection("aggregator2", "filter2")
  .addConnection("filter2", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("LinearAggApp-" + engine);
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
// write three records to the source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(sourceName));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check output
DataSetManager<Table> sinkManager = getDataset(sinkName);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(3, appId, "source.records.out");
validateMetric(3, appId, "filter1.records.in");
validateMetric(2, appId, "filter1.records.out");
validateMetric(2, appId, "aggregator1.records.in");
validateMetric(2, appId, "aggregator1.records.out");
validateMetric(2, appId, "aggregator2.records.in");
validateMetric(2, appId, "aggregator2.records.out");
validateMetric(2, appId, "filter2.records.in");
validateMetric(1, appId, "filter2.records.out");
validateMetric(1, appId, "sink.records.out");
}
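Because testSequentialAggregators is a private helper parameterized by engine, it is typically driven once per engine from @Test methods. A sketch with illustrative method names (not taken from the source):
@Test
public void testSequentialAggregatorsMapReduce() throws Exception {
  // Runs the linear source -> filter -> aggregator -> aggregator -> filter -> sink pipeline on MapReduce.
  testSequentialAggregators(Engine.MAPREDUCE);
}

@Test
public void testSequentialAggregatorsSpark() throws Exception {
  // Runs the same pipeline on Spark.
  testSequentialAggregators(Engine.SPARK);
}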