Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class DataStreamsTest, method testWindower.
@Test
public void testWindower() throws Exception {
  /*
   * source --> window(width=30,interval=1) --> aggregator --> filter --> sink
   */
  Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> input = ImmutableList.of(
    StructuredRecord.builder(schema).set("x", "abc").build(),
    StructuredRecord.builder(schema).set("x", "abc").build(),
    StructuredRecord.builder(schema).set("x", "abc").build());
  String sinkName = "windowOut";
  // source sleeps 1 second between outputs
  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
    .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
    .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
    .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
    .addConnection("source", "window")
    .addConnection("window", "agg")
    .addConnection("agg", "filter")
    .addConnection("filter", "sink")
    .setBatchInterval("1s")
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
  ApplicationManager appManager = deployApplication(appId, appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForStatus(true, 10, 1);
  // The sink should contain at least one record with a count of 3, and no record with a count greater than 3.
  // A window may have a count below 3 if it doesn't contain all 3 records yet, but eventually there should be
  // a window that contains all 3.
  final DataSetManager<Table> outputManager = getDataset(sinkName);
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      outputManager.flush();
      boolean sawThree = false;
      for (StructuredRecord record : MockSink.readOutput(outputManager)) {
        long count = record.get("ct");
        if (count == 3L) {
          sawThree = true;
        }
        Assert.assertTrue(count <= 3L);
      }
      return sawThree;
    }
  }, 2, TimeUnit.MINUTES);
  sparkManager.stop();
}
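Stripped of the test scaffolding, the ETLStage usage above reduces to a simple pattern: each stage pairs a stage name with an ETLPlugin, and the pipeline topology is declared as connections between stage names. A minimal sketch, assuming the same imports as the test and reusing its schema, input records, and mock plugins (which are test-only helpers):

// Minimal streaming-pipeline wiring with ETLStage; MockSource/MockSink come from the test above.
DataStreamsConfig streamsConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("windowOut")))
  .addConnection("source", "sink")
  .setBatchInterval("1s")
  .build();
// Deployment wraps the config in an AppRequest against the data-streams app artifact.
AppRequest<DataStreamsConfig> request = new AppRequest<>(APP_ARTIFACT, streamsConfig);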
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PreviewDataStreamsTest, method testDataStreamsPreviewRun.
@Test
public void testDataStreamsPreviewRun() throws Exception {
  PreviewManager previewManager = getPreviewManager();
  String sinkTableName = "singleOutput";
  Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> records = new ArrayList<>();
  StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
  StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
  StructuredRecord recordTest = StructuredRecord.builder(schema).set("name", "test").build();
  records.add(recordSamuel);
  records.add(recordBob);
  records.add(recordTest);
  /*
   * source --> transform --> sink
   */
  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
    .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTableName)))
    .addConnection("source", "transform")
    .addConnection("transform", "sink")
    .setNumOfRecordsPreview(100)
    .setBatchInterval("1s")
    .build();
  // Construct the preview config with the program name and program type.
  PreviewConfig previewConfig = new PreviewConfig(DataStreamsSparkLauncher.NAME, ProgramType.SPARK, Collections.<String, String>emptyMap(), 1);
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
  // Start the preview and get the corresponding PreviewRunner.
  ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
  final PreviewRunner previewRunner = previewManager.getRunner(previewId);
  // Wait for the preview to be running, then wait until the records have been processed by the sink.
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      Map<String, List<JsonElement>> data = previewRunner.getData("sink");
      return data != null && data.get(DATA_TRACER_PROPERTY) != null && data.get(DATA_TRACER_PROPERTY).size() == 3;
    }
  }, 1, TimeUnit.MINUTES);
  // Check the data captured for the source and transform stages.
  checkPreviewStore(previewRunner, "source", 3);
  checkPreviewStore(previewRunner, "transform", 3);
  // Wait for the pipeline to be shut down by the timer.
  TimeUnit.MINUTES.sleep(1);
  Tasks.waitFor(PreviewStatus.Status.KILLED_BY_TIMER, new Callable<PreviewStatus.Status>() {
    @Override
    public PreviewStatus.Status call() throws Exception {
      return previewRunner.getStatus().getStatus();
    }
  }, 1, TimeUnit.MINUTES);
  // Validate the metrics for the preview run.
  validateMetric(3, previewId, "source.records.out", previewRunner);
  validateMetric(3, previewId, "transform.records.in", previewRunner);
  validateMetric(3, previewId, "transform.records.out", previewRunner);
  validateMetric(3, previewId, "sink.records.in", previewRunner);
  validateMetric(3, previewId, "sink.records.out", previewRunner);
  // Check that the sink table was not created in the real (non-preview) space.
  DataSetManager<Table> sinkManager = getDataset(sinkTableName);
  Assert.assertNull(sinkManager.get());
}
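The preview-specific part of the example is small: the same DataStreamsConfig is wrapped in an AppRequest together with a PreviewConfig, and the request is started through the PreviewManager rather than being deployed. A condensed sketch, reusing etlConfig and previewManager from the test above:

// Attach a PreviewConfig (program name, program type, runtime args) to the app request.
// The final argument (1) is presumably the preview timeout in minutes, which the KILLED_BY_TIMER wait above relies on.
PreviewConfig previewConfig = new PreviewConfig(DataStreamsSparkLauncher.NAME, ProgramType.SPARK, Collections.<String, String>emptyMap(), 1);
AppRequest<DataStreamsConfig> request = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
// Starting through the PreviewManager runs the pipeline in the preview space, not the real one.
ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, request);
PreviewRunner runner = previewManager.getRunner(previewId);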
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PipelineTest, method testTextFileSinkAndDeletePostAction.
@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
  // create the pipeline config
  String inputName = "sinkTestInput";
  String outputName = "sinkTestOutput";
  String outputDirName = "users";
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
  sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
  sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
  ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
  Map<String, String> actionProperties = new HashMap<>();
  actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
  // MapReduce writes multiple files to the output directory. Along with the actual output,
  // there are various .crc files that do not contain any of the output content.
  actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
  actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
  ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addPostAction(postAction)
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write some data to the input fileset
  Schema inputSchema = Schema.recordOf("test",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
  Map<String, String> users = new HashMap<>();
  users.put("samuel", "wallet");
  users.put("dwayne", "rock");
  users.put("christopher", "cowbell");
  List<StructuredRecord> inputRecords = new ArrayList<>();
  for (Map.Entry<String, String> userEntry : users.entrySet()) {
    String name = userEntry.getKey();
    String item = userEntry.getValue();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
  }
  DataSetManager<Table> inputManager = getDataset(inputName);
  MockSource.writeInput(inputManager, inputRecords);
  // run the pipeline
  Map<String, String> runtimeArgs = new HashMap<>();
  // the ${dir} macro will be substituted with "users" for this pipeline run
  runtimeArgs.put("dir", outputDirName);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);
  // check the pipeline output
  DataSetManager<FileSet> outputManager = getDataset(outputName);
  FileSet output = outputManager.get();
  Location outputDir = output.getBaseLocation().append(outputDirName);
  Map<String, String> actual = new HashMap<>();
  for (Location outputFile : outputDir.list()) {
    if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
      Assert.fail("Post action did not delete file " + outputFile.getName());
    }
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] parts = line.split("\\|");
        actual.put(parts[0], parts[1]);
      }
    }
  }
  Assert.assertEquals(users, actual);
}
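When a stage is not backed by a convenience factory such as MockSource.getPlugin, it is built from an ETLPlugin directly, as the sink and post-action above show: the plugin name, its type constant, a map of string properties, and an artifact selector (left null in these tests). A condensed sketch of that pattern, reusing the outputName, source, and postAction values defined in the test above:

// A stage built directly from an ETLPlugin: plugin name, plugin type, string properties,
// and an artifact selector (null here, as in the tests above).
Map<String, String> sinkProperties = new HashMap<>();
sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}"); // macro resolved from runtime arguments
ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
// Post-actions are attached with addPostAction rather than addStage/addConnection.
ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(sink)
  .addPostAction(postAction)
  .addConnection(source.getName(), sink.getName())
  .build();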
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PipelineTest, method testWordCountSparkSink.
@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
  String inputName = "sparkSinkInput";
  String outputName = "sparkSinkOutput";
  // create the pipeline config
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put("field", "text");
  sinkProperties.put("tableName", outputName);
  ETLStage sink = new ETLStage("sink", new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write the input
  Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
  DataSetManager<Table> inputManager = getDataset(inputName);
  List<StructuredRecord> inputRecords = new ArrayList<>();
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
  MockSource.writeInput(inputManager, inputRecords);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);
  DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
  KeyValueTable output = outputManager.get();
  Assert.assertEquals(3L, Bytes.toLong(output.read("Hello")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("World")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("my")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("name")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("is")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("Hal")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("Sam")));
}
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PipelineTest, method testTextFileSourceAndMoveAction.
@Test
public void testTextFileSourceAndMoveAction() throws Exception {
  // create the pipeline config
  String moveFromName = "sourceTestMoveFrom";
  String inputName = "sourceTestInput";
  String outputName = "sourceTestOutput";
  Map<String, String> actionProperties = new HashMap<>();
  actionProperties.put(FilesetMoveAction.Conf.SOURCE_FILESET, moveFromName);
  actionProperties.put(FilesetMoveAction.Conf.DEST_FILESET, inputName);
  ETLStage moveAction = new ETLStage("moveInput", new ETLPlugin(FilesetMoveAction.NAME, Action.PLUGIN_TYPE, actionProperties, null));
  Map<String, String> sourceProperties = new HashMap<>();
  sourceProperties.put(TextFileSetSource.Conf.FILESET_NAME, inputName);
  sourceProperties.put(TextFileSetSource.Conf.CREATE_IF_NOT_EXISTS, "true");
  sourceProperties.put(TextFileSetSource.Conf.DELETE_INPUT_ON_SUCCESS, "true");
  sourceProperties.put(TextFileSetSource.Conf.FILES, "${file}");
  ETLStage source = new ETLStage("source", new ETLPlugin(TextFileSetSource.NAME, BatchSource.PLUGIN_TYPE, sourceProperties, null));
  ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addStage(moveAction)
    .addConnection(moveAction.getName(), source.getName())
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the "move from" fileset
  addDatasetInstance(FileSet.class.getName(), moveFromName);
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSourceTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write some files that will be moved to the input fileset
  DataSetManager<FileSet> moveFromManager = getDataset(moveFromName);
  // this file starts with '.' and should be ignored
  Location invisibleFile = moveFromManager.get().getBaseLocation().append(".hidden");
  try (OutputStream outputStream = invisibleFile.getOutputStream()) {
    outputStream.write(Bytes.toBytes("this should not be read"));
  }
  // this file should be moved
  String line1 = "Hello World!";
  String line2 = "Good to meet you";
  String line3 = "My name is Hal";
  String inputText = line1 + "\n" + line2 + "\n" + line3;
  Location inputFile = moveFromManager.get().getBaseLocation().append("inputFile");
  try (OutputStream outputStream = inputFile.getOutputStream()) {
    outputStream.write(Bytes.toBytes(inputText));
  }
  // run the pipeline
  Map<String, String> runtimeArgs = new HashMap<>();
  // the ${file} macro will be substituted with "inputFile" for this pipeline run
  runtimeArgs.put("file", "inputFile");
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);
  // check the pipeline output
  DataSetManager<Table> outputManager = getDataset(outputName);
  Set<StructuredRecord> outputRecords = new HashSet<>();
  outputRecords.addAll(MockSink.readOutput(outputManager));
  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA)
    .set("position", (long) inputText.indexOf(line1))
    .set("text", line1)
    .build());
  expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA)
    .set("position", (long) inputText.indexOf(line2))
    .set("text", line2)
    .build());
  expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA)
    .set("position", (long) inputText.indexOf(line3))
    .set("text", line3)
    .build());
  Assert.assertEquals(expected, outputRecords);
  // check that the input file no longer exists in the "move from" fileset,
  // and that it was deleted by the source from the input fileset
  Assert.assertFalse(moveFromManager.get().getBaseLocation().append("inputFile").exists());
  DataSetManager<FileSet> inputManager = getDataset(inputName);
  Assert.assertFalse(inputManager.get().getBaseLocation().append("inputFile").exists());
}
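One detail worth noting in the example above: FilesetMoveAction is wired in as a regular Action stage (Action.PLUGIN_TYPE) rather than a post-action, and the connection from the action to the source is what causes it to run before the source reads its input. A condensed sketch, reusing the moveAction, source, and sink stages defined in the test above:

// An action stage connected ahead of the source runs before the data stages.
ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(sink)
  .addStage(moveAction)
  .addConnection(moveAction.getName(), source.getName()) // moveAction runs first
  .addConnection(source.getName(), sink.getName())
  .build();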