Example use of co.cask.cdap.etl.proto.v2.ETLPlugin in the project cdap by caskdata.
From the class PipelineTest, method testTextFileSourceAndMoveAction.
/**
 * Runs a pipeline of the form {@code moveInput (action) -> source -> sink}.
 *
 * <p>Two files are written to the "move from" fileset before the run: a hidden file
 * (name starting with '.') that is expected to be ignored, and {@code inputFile}, whose
 * three lines are expected to show up in the mock sink as (position, text) records.
 * After the run the input file must be absent from both the "move from" fileset and the
 * source's input fileset.
 *
 * @throws Exception if deploying or running the pipeline, or any file access, fails
 */
@Test
public void testTextFileSourceAndMoveAction() throws Exception {
  // dataset names used by the individual stages
  String moveFromName = "sourceTestMoveFrom";
  String inputName = "sourceTestInput";
  String outputName = "sourceTestOutput";

  // action stage: moves files from the "move from" fileset into the source's input fileset
  Map<String, String> moveProps = new HashMap<>();
  moveProps.put(FilesetMoveAction.Conf.SOURCE_FILESET, "sourceTestMoveFrom");
  moveProps.put(FilesetMoveAction.Conf.DEST_FILESET, inputName);
  ETLPlugin movePlugin = new ETLPlugin(FilesetMoveAction.NAME, Action.PLUGIN_TYPE, moveProps, null);
  ETLStage moveAction = new ETLStage("moveInput", movePlugin);

  // source stage: which file to read is supplied at runtime through the ${file} macro
  Map<String, String> readerProps = new HashMap<>();
  readerProps.put(TextFileSetSource.Conf.FILESET_NAME, inputName);
  readerProps.put(TextFileSetSource.Conf.CREATE_IF_NOT_EXISTS, "true");
  readerProps.put(TextFileSetSource.Conf.DELETE_INPUT_ON_SUCCESS, "true");
  readerProps.put(TextFileSetSource.Conf.FILES, "${file}");
  ETLPlugin readerPlugin = new ETLPlugin(TextFileSetSource.NAME, BatchSource.PLUGIN_TYPE, readerProps, null);
  ETLStage source = new ETLStage("source", readerPlugin);

  ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addStage(moveAction)
    .addConnection(moveAction.getName(), source.getName())
    .addConnection(source.getName(), sink.getName())
    .build();

  // the "move from" fileset has to exist before the pipeline is deployed
  addDatasetInstance(FileSet.class.getName(), moveFromName);

  // deploy the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSourceTestPipeline");
  AppRequest<ETLBatchConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationManager appManager = deployApplication(pipelineId, deployRequest);

  // populate the "move from" fileset
  DataSetManager<FileSet> moveFromManager = getDataset(moveFromName);
  Location moveFromBase = moveFromManager.get().getBaseLocation();

  // a file whose name starts with '.' should be ignored
  Location invisibleFile = moveFromBase.append(".hidden");
  try (OutputStream hiddenOut = invisibleFile.getOutputStream()) {
    hiddenOut.write(Bytes.toBytes("this should not be read"));
  }

  // the file that should be moved and then read
  String line1 = "Hello World!";
  String line2 = "Good to meet you";
  String line3 = "My name is Hal";
  String inputText = line1 + "\n" + line2 + "\n" + line3;
  Location inputFile = moveFromBase.append("inputFile");
  try (OutputStream visibleOut = inputFile.getOutputStream()) {
    visibleOut.write(Bytes.toBytes(inputText));
  }

  // run the pipeline; ${file} resolves to "inputFile" for this run
  Map<String, String> runtimeArgs = new HashMap<>();
  runtimeArgs.put("file", "inputFile");
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);

  // collect what the sink received
  DataSetManager<Table> outputManager = getDataset(outputName);
  Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));

  // one record per input line, keyed by the offset at which the line starts
  Set<StructuredRecord> expected = new HashSet<>();
  for (String line : new String[] { line1, line2, line3 }) {
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA)
                   .set("position", (long) inputText.indexOf(line))
                   .set("text", line)
                   .build());
  }
  Assert.assertEquals(expected, outputRecords);

  // the file must be gone from the "move from" fileset ...
  Location movedAway = moveFromManager.get().getBaseLocation().append("inputFile");
  Assert.assertFalse(movedAway.exists());
  // ... and must also have been deleted from the input fileset by the source
  DataSetManager<FileSet> inputManager = getDataset(inputName);
  Location consumed = inputManager.get().getBaseLocation().append("inputFile");
  Assert.assertFalse(consumed.exists());
}
Example use of co.cask.cdap.etl.proto.v2.ETLPlugin in the project cdap by caskdata.
From the class PipelineTest, method testTextFileSinkAndDeletePostAction.
/**
 * Runs a pipeline of the form {@code source -> sink} with a post action attached.
 *
 * <p>The sink writes {@code name|item} lines into a directory of the output fileset; the
 * directory name is supplied at runtime through the {@code ${dir}} macro. The post action
 * is configured to delete {@code .crc} and {@code _SUCCESS} files from that directory, so
 * the test fails if any such file is still present after the run, and otherwise checks
 * that the remaining files contain exactly the records that were written to the source.
 *
 * @throws Exception if deploying or running the pipeline, or any file access, fails
 */
@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
  // create the pipeline config
  String inputName = "sinkTestInput";
  String outputName = "sinkTestOutput";
  String outputDirName = "users";

  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));

  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
  sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
  sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
  ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE,
                                                     sinkProperties, null));

  Map<String, String> actionProperties = new HashMap<>();
  actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
  // mapreduce writes multiple files to the output directory. Along with the actual output,
  // there are various .crc files that do not contain any of the output content.
  actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
  actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
  ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME,
                                                              PostAction.PLUGIN_TYPE,
                                                              actionProperties, null));

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addPostAction(postAction)
    .addConnection(source.getName(), sink.getName())
    .build();

  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));

  // write some data to the input fileset
  Schema inputSchema = Schema.recordOf("test",
                                       Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
                                       Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
  Map<String, String> users = new HashMap<>();
  users.put("samuel", "wallet");
  users.put("dwayne", "rock");
  users.put("christopher", "cowbell");
  List<StructuredRecord> inputRecords = new ArrayList<>();
  for (Map.Entry<String, String> userEntry : users.entrySet()) {
    inputRecords.add(StructuredRecord.builder(inputSchema)
                       .set("name", userEntry.getKey())
                       .set("item", userEntry.getValue())
                       .build());
  }
  DataSetManager<Table> inputManager = getDataset(inputName);
  MockSource.writeInput(inputManager, inputRecords);

  // run the pipeline
  Map<String, String> runtimeArgs = new HashMap<>();
  // the ${dir} macro will be substituted with "users" for our pipeline run
  runtimeArgs.put("dir", outputDirName);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);

  // check the pipeline output
  DataSetManager<FileSet> outputManager = getDataset(outputName);
  FileSet output = outputManager.get();
  Location outputDir = output.getBaseLocation().append(outputDirName);
  Map<String, String> actual = new HashMap<>();
  for (Location outputFile : outputDir.list()) {
    String fileName = outputFile.getName();
    if (fileName.endsWith(".crc") || "_SUCCESS".equals(fileName)) {
      Assert.fail("Post action did not delete file " + fileName);
    }
    // decode explicitly as UTF-8 so the result does not depend on the platform default charset
    try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(outputFile.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] parts = line.split("\\|");
        // report a malformed line as a readable assertion failure instead of an index error
        Assert.assertEquals("Unexpected output line '" + line + "' in file " + fileName, 2, parts.length);
        actual.put(parts[0], parts[1]);
      }
    }
  }
  // JUnit convention: expected value first, so a failure message labels the two sides correctly
  Assert.assertEquals(users, actual);
}
Example use of co.cask.cdap.etl.proto.v2.ETLPlugin in the project cdap by caskdata.
From the class DataPipelineTest, method testExternalSparkProgramPipelines.
/**
 * Runs a pipeline of two Spark program stages, {@code wordcount -> filter}, over a small
 * text file and checks the files each stage leaves on the local file system.
 *
 * <p>The word count stage receives the input and output paths as program arguments and is
 * expected to produce {@code "<word> <count>"} lines. The filter stage is configured with
 * {@code filterStr = "bad"} and is expected to emit every input line except "bad", in
 * input order.
 *
 * @throws Exception if deploying or running the pipeline, or any file access, fails
 */
@Test
public void testExternalSparkProgramPipelines() throws Exception {
  File testDir = TMP_FOLDER.newFolder("sparkProgramTest");

  File input = new File(testDir, "poem.txt");
  // write with an explicit charset so the input does not depend on the platform default
  try (PrintWriter writer = new PrintWriter(input.getAbsolutePath(), "UTF-8")) {
    writer.println("this");
    writer.println("is");
    writer.println("a");
    writer.println("poem");
    writer.println("it");
    writer.println("is");
    writer.println("a");
    writer.println("bad");
    writer.println("poem");
  }

  File wordCountOutput = new File(testDir, "poem_counts");
  File filterOutput = new File(testDir, "poem_filtered");

  String args = String.format("%s %s", input.getAbsolutePath(), wordCountOutput.getAbsolutePath());
  Map<String, String> wordCountProperties = ImmutableMap.of("program.args", args);
  Map<String, String> filterProperties = ImmutableMap.of(
    "inputPath", input.getAbsolutePath(),
    "outputPath", filterOutput.getAbsolutePath(),
    "filterStr", "bad");

  ETLBatchConfig etlConfig = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("wordcount", new ETLPlugin(WORDCOUNT_PLUGIN, SPARK_TYPE, wordCountProperties, null)))
    .addStage(new ETLStage("filter", new ETLPlugin(FILTER_PLUGIN, SPARK_TYPE, filterProperties, null)))
    .addConnection("wordcount", "filter")
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("sparkProgramTest");
  ApplicationManager appManager = deployApplication(appId, appRequest);

  WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  manager.start();
  manager.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);

  // check wordcount output
  /*
      this is a poem
      it is a bad poem
   */
  Map<String, Integer> expected = new HashMap<>();
  expected.put("this", 1);
  expected.put("is", 2);
  expected.put("a", 2);
  expected.put("poem", 2);
  expected.put("it", 1);
  expected.put("bad", 1);

  Map<String, Integer> counts = new HashMap<>();
  File[] files = wordCountOutput.listFiles();
  Assert.assertNotNull("No output files for wordcount found.", files);
  for (File file : files) {
    String fileName = file.getName();
    // skip hidden files (such as checksum files) and the _SUCCESS marker
    if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
      continue;
    }
    try (BufferedReader reader =
           java.nio.file.Files.newBufferedReader(file.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] fields = line.split(" ");
        counts.put(fields[0], Integer.parseInt(fields[1]));
      }
    }
  }
  Assert.assertEquals(expected, counts);

  // check filter output
  files = filterOutput.listFiles();
  Assert.assertNotNull("No output files for filter program found.", files);
  // File.listFiles() gives no ordering guarantee, but the comparison below is order-sensitive.
  // Sorting by path makes the read order deterministic when more than one output file exists.
  java.util.Arrays.sort(files);
  List<String> expectedLines = ImmutableList.of("this", "is", "a", "poem", "it", "is", "a", "poem");
  List<String> actualLines = new ArrayList<>();
  for (File file : files) {
    String fileName = file.getName();
    if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
      continue;
    }
    try (BufferedReader reader =
           java.nio.file.Files.newBufferedReader(file.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
      String line;
      while ((line = reader.readLine()) != null) {
        actualLines.add(line);
      }
    }
  }
  Assert.assertEquals(expectedLines, actualLines);
}
Example use of co.cask.cdap.etl.proto.v2.ETLPlugin in the project cdap by caskdata.
From the class DataPipelineTest, method testSinglePhaseWithSparkSink.
/**
 * Runs a single-phase pipeline in which two mock sources feed one Spark sink stage
 * ({@link NaiveBayesTrainer}).
 *
 * <p>Five messages labelled as spam are written to the first source and five labelled as
 * non-spam to the second. Four further texts are sent to the stream named by
 * {@link NaiveBayesTrainer#TEXTS_TO_CLASSIFY}. After the run, the test reads the
 * prediction stored for each of those texts and expects only "free money money" to be
 * marked as spam. It also validates the record-count metrics of all three stages.
 *
 * @throws Exception if deploying or running the pipeline, or any dataset access, fails
 */
private void testSinglePhaseWithSparkSink() throws Exception {
  /*
   * source1 ---|
   *            |--> sparksink
   * source2 ---|
   */
  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA)))
    .addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA)))
    .addStage(new ETLStage("customsink",
                           new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE,
                                         ImmutableMap.of("fileSetName", "modelFileSet",
                                                         "path", "output",
                                                         "fieldToClassify", SpamMessage.TEXT_FIELD,
                                                         "predictionField", SpamMessage.SPAM_PREDICTION_FIELD),
                                         null)))
    .addConnection("source1", "customsink")
    .addConnection("source2", "customsink")
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
  // pass the ApplicationId directly instead of converting it to the legacy Id type
  ApplicationManager appManager = deployApplication(appId, appRequest);

  // set up five spam messages and five non-spam messages to be used for classification
  List<StructuredRecord> messagesToWrite = new ArrayList<>();
  messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());

  // write records to source1
  DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
  MockSource.writeInput(inputManager, messagesToWrite);

  // reuse the list for the non-spam messages
  messagesToWrite.clear();
  messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
  messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());

  // write records to source2
  inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
  MockSource.writeInput(inputManager, messagesToWrite);

  // ingest some messages to be classified
  StreamManager textsToClassify = getStreamManager(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
  textsToClassify.send("how are you doing today");
  textsToClassify.send("free money money");
  textsToClassify.send("what are you doing today");
  textsToClassify.send("genuine report");

  // manually trigger the pipeline
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);

  Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
  // only 'free money money' should be predicted as spam
  Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
  Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
  Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);

  validateMetric(5, appId, "source1.records.out");
  validateMetric(5, appId, "source2.records.out");
  validateMetric(10, appId, "customsink.records.in");
}
Example use of co.cask.cdap.etl.proto.v2.ETLPlugin in the project cdap by caskdata.
From the class DataPipelineTest, method testSinglePhaseWithSparkCompute.
/**
 * Runs a single-phase pipeline of the form {@code source -> sparkcompute -> sink}, where
 * the compute stage is {@link NaiveBayesClassifier}.
 *
 * <p>Four unlabelled messages are written to the source. After the run, the records read
 * from the mock sink are converted back to {@link SpamMessage}s, and only
 * "free money money" is expected to carry a spam prediction of 1.0. The test also
 * validates the record-count metrics of all three stages.
 *
 * <p>NOTE(review): the classifier is pointed at fileset "modelFileSet", path "output",
 * the same values the Spark sink test uses for its trainer, so this test presumably
 * relies on that model having been written beforehand; confirm against the caller.
 *
 * @throws Exception if deploying or running the pipeline, or any dataset access, fails
 */
private void testSinglePhaseWithSparkCompute() throws Exception {
  /*
   * source --> sparkcompute --> sink
   */
  String classifiedTextsTable = "classifiedTextTable";

  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("source", MockSource.getPlugin(NaiveBayesTrainer.TEXTS_TO_CLASSIFY, SpamMessage.SCHEMA)))
    .addStage(new ETLStage("sparkcompute",
                           new ETLPlugin(NaiveBayesClassifier.PLUGIN_NAME, SparkCompute.PLUGIN_TYPE,
                                         ImmutableMap.of("fileSetName", "modelFileSet",
                                                         "path", "output",
                                                         "fieldToClassify", SpamMessage.TEXT_FIELD,
                                                         "fieldToSet", SpamMessage.SPAM_PREDICTION_FIELD),
                                         null)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(classifiedTextsTable)))
    .addConnection("source", "sparkcompute")
    .addConnection("sparkcompute", "sink")
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("SparkComputeApp");
  // pass the ApplicationId directly instead of converting it to the legacy Id type
  ApplicationManager appManager = deployApplication(appId, appRequest);

  // write some messages to be classified
  List<StructuredRecord> messagesToWrite = new ArrayList<>();
  messagesToWrite.add(new SpamMessage("how are you doing today").toStructuredRecord());
  messagesToWrite.add(new SpamMessage("free money money").toStructuredRecord());
  messagesToWrite.add(new SpamMessage("what are you doing today").toStructuredRecord());
  messagesToWrite.add(new SpamMessage("genuine report").toStructuredRecord());

  DataSetManager<Table> inputManager =
    getDataset(NamespaceId.DEFAULT.dataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY));
  MockSource.writeInput(inputManager, messagesToWrite);

  // manually trigger the pipeline
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  DataSetManager<Table> classifiedTexts = getDataset(classifiedTextsTable);
  List<StructuredRecord> structuredRecords = MockSink.readOutput(classifiedTexts);

  // compare as sets: the order in which the sink received the records is irrelevant
  Set<SpamMessage> results = new HashSet<>();
  for (StructuredRecord structuredRecord : structuredRecords) {
    results.add(SpamMessage.fromStructuredRecord(structuredRecord));
  }

  Set<SpamMessage> expected = new HashSet<>();
  expected.add(new SpamMessage("how are you doing today", 0.0));
  // only 'free money money' should be predicted as spam
  expected.add(new SpamMessage("free money money", 1.0));
  expected.add(new SpamMessage("what are you doing today", 0.0));
  expected.add(new SpamMessage("genuine report", 0.0));

  Assert.assertEquals(expected, results);

  validateMetric(4, appId, "source.records.out");
  validateMetric(4, appId, "sparkcompute.records.in");
  validateMetric(4, appId, "sink.records.in");
}
Aggregations