Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project hydrator-plugins by cdapio.
The class NormalizeTest, method deployApplication.
private ApplicationManager deployApplication(Map<String, String> sourceProperties, String inputDatasetName,
                                             String outputDatasetName, String applicationName) throws Exception {
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputDatasetName));
  ETLStage transform = new ETLStage("normalize", new ETLPlugin("Normalize", Transform.PLUGIN_TYPE, sourceProperties, null));
  ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName));
  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(transform)
    .addStage(sink)
    .addConnection(source.getName(), transform.getName())
    .addConnection(transform.getName(), sink.getName())
    .build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(BATCH_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app(applicationName);
  return deployApplication(appId, appRequest);
}
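A test would typically call this helper with the Normalize transform's configuration, write records through MockSource, and read the normalized records back from MockSink. A minimal sketch of such a call follows; the property keys (fieldMapping, fieldNormalizing, outputSchema) and their values are illustrative assumptions and should be checked against the Normalize plugin's documented configuration.

// Hypothetical caller of the helper above; property names and values are illustrative assumptions.
Schema outputSchema = Schema.recordOf(
  "output",
  Schema.Field.of("Id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("Date", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("AttributeType", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("AttributeValue", Schema.of(Schema.Type.STRING)));
Map<String, String> normalizeProperties = ImmutableMap.of(
  "fieldMapping", "CustomerId:Id,PurchaseDate:Date",
  "fieldNormalizing", "ItemId:AttributeType:AttributeValue",
  "outputSchema", outputSchema.toString());
ApplicationManager appManager =
  deployApplication(normalizeProperties, "normalizeInput", "normalizeOutput", "normalizeApp");
// From here the test would use getDataset(...), MockSource.writeInput(...) and MockSink.readOutput(...)
// in the same way as the other examples on this page.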
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project hydrator-plugins by cdapio.
The class SparkPluginTest, method testHttpStreamingSource.
@Test
public void testHttpStreamingSource() throws Exception {
  Assert.assertEquals(200, resetFeeds());
  final String content = "samuel jackson\ndwayne johnson\nchristopher walken";
  Assert.assertEquals(200, writeFeed("people", content));
  Map<String, String> properties = ImmutableMap.of(
    "referenceName", "peopleFeed",
    "url", httpBase + "/feeds/people",
    "interval", "1");
  DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", new ETLPlugin("HTTPPoller", StreamingSource.PLUGIN_TYPE, properties, null)))
    .addStage(new ETLStage("sink", MockSink.getPlugin("httpOutput")))
    .addConnection("source", "sink")
    .setBatchInterval("1s")
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("HTTPSourceApp");
  ApplicationManager appManager = deployApplication(appId, appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForStatus(true, 10, 1);
  final DataSetManager<Table> outputManager = getDataset("httpOutput");
  Tasks.waitFor(true, () -> {
    outputManager.flush();
    Set<String> contents = new HashSet<>();
    for (StructuredRecord record : MockSink.readOutput(outputManager)) {
      contents.add(record.get("body"));
    }
    return contents.size() == 1 && contents.contains(content);
  }, 4, TimeUnit.MINUTES);
  sparkManager.stop();
}
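resetFeeds() and writeFeed(...) are helpers defined elsewhere in SparkPluginTest; they drive the embedded HTTP server that the HTTPPoller source polls via httpBase. A rough sketch of what writeFeed could look like, assuming the test server accepts PUT requests at httpBase + "/feeds/<name>" and that the helper returns the HTTP status code; the endpoint, verb, and return convention are assumptions, not taken from the snippet above.

// Hypothetical helper; the real implementation lives elsewhere in SparkPluginTest.
// Requires java.net.HttpURLConnection, java.net.URL, java.io.OutputStream, java.nio.charset.StandardCharsets.
private int writeFeed(String feedName, String content) throws IOException {
  URL url = new URL(httpBase + "/feeds/" + feedName);
  HttpURLConnection conn = (HttpURLConnection) url.openConnection();
  try {
    conn.setRequestMethod("PUT");
    conn.setDoOutput(true);
    try (OutputStream os = conn.getOutputStream()) {
      os.write(content.getBytes(StandardCharsets.UTF_8));
    }
    // the test above asserts that this is 200
    return conn.getResponseCode();
  } finally {
    conn.disconnect();
  }
}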
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project hydrator-plugins by cdapio.
The class SparkPluginTest, method testFileSource.
@Test
public void testFileSource() throws Exception {
  Schema schema = Schema.recordOf(
    "user",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
  File folder = tmpFolder.newFolder("fileSourceTest");
  File input1 = new File(folder, "input1.txt");
  File input2 = new File(folder, "input2.csv");
  File ignore1 = new File(folder, "input1.txt.done");
  File ignore2 = new File(folder, "input1");
  CharStreams.write("1,samuel,jackson\n2,dwayne,johnson", Files.newWriterSupplier(input1, Charsets.UTF_8));
  CharStreams.write("3,christopher,walken", Files.newWriterSupplier(input2, Charsets.UTF_8));
  CharStreams.write("0,nicolas,cage", Files.newWriterSupplier(ignore1, Charsets.UTF_8));
  CharStreams.write("0,orlando,bloom", Files.newWriterSupplier(ignore2, Charsets.UTF_8));
  Map<String, String> properties = ImmutableMap.<String, String>builder()
    .put("path", folder.getAbsolutePath())
    .put("format", "csv")
    .put("schema", schema.toString())
    .put("referenceName", "fileSourceTestInput")
    .put("ignoreThreshold", "300")
    .put("extensions", "txt,csv")
    .build();
  DataStreamsConfig pipelineCfg = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", new ETLPlugin("File", StreamingSource.PLUGIN_TYPE, properties, null)))
    .addStage(new ETLStage("sink", MockSink.getPlugin("fileOutput")))
    .addConnection("source", "sink")
    .setBatchInterval("1s")
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineCfg);
  ApplicationId appId = NamespaceId.DEFAULT.app("FileSourceApp");
  ApplicationManager appManager = deployApplication(appId, appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForRun(ProgramRunStatus.RUNNING, 1, TimeUnit.MINUTES);
  Map<Long, String> expected = ImmutableMap.of(
    1L, "samuel jackson",
    2L, "dwayne johnson",
    3L, "christopher walken");
  final DataSetManager<Table> outputManager = getDataset("fileOutput");
  Tasks.waitFor(true, () -> {
    outputManager.flush();
    Map<Long, String> actual = new HashMap<>();
    for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
      actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
    }
    return expected.equals(actual);
  }, 4, TimeUnit.MINUTES);
  // now write a new file to make sure new files are picked up.
  File input3 = new File(folder, "input3.txt");
  CharStreams.write("4,terry,crews\n5,rocky,balboa", Files.newWriterSupplier(input3, Charsets.UTF_8));
  Map<Long, String> expected2 = ImmutableMap.of(4L, "terry crews", 5L, "rocky balboa");
  // clear out the rows written by the first batch so only the new file's output remains.
  Table outputTable = outputManager.get();
  Scanner scanner = outputTable.scan(null, null);
  Row row;
  while ((row = scanner.next()) != null) {
    outputTable.delete(row.getRow());
  }
  outputManager.flush();
  Tasks.waitFor(true, () -> {
    outputManager.flush();
    Map<Long, String> actual = new HashMap<>();
    for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
      actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
    }
    return expected2.equals(actual);
  }, 4, TimeUnit.MINUTES);
  sparkManager.stop();
}
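Of the four files written at the start of the test, only input1.txt and input2.csv should be read, because the source is configured with extensions "txt,csv"; input1.txt.done and the extension-less input1 are skipped. The expected filtering can be illustrated in isolation (this sketch only mimics the outcome; the plugin's actual matching logic may differ):

// Standalone illustration of the expected extension filtering; not the plugin's actual code.
Set<String> allowedExtensions = ImmutableSet.of("txt", "csv");
List<String> fileNames = Arrays.asList("input1.txt", "input2.csv", "input1.txt.done", "input1");
List<String> picked = new ArrayList<>();
for (String name : fileNames) {
  int dot = name.lastIndexOf('.');
  if (dot >= 0 && allowedExtensions.contains(name.substring(dot + 1))) {
    picked.add(name);
  }
}
// picked -> [input1.txt, input2.csv]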
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.
The class PipelineSpecGenerator, method configureStage.
/**
* Configures a stage and returns the spec for it.
*
* @param stage the user provided configuration for the stage
* @param validatedPipeline the validated pipeline config
* @param pluginConfigurer configurer used to configure the stage
* @return the spec for the stage
* @throws ValidationException if the plugin threw an exception during configuration
*/
protected ConfiguredStage configureStage(ETLStage stage, ValidatedPipeline validatedPipeline,
                                         DefaultPipelineConfigurer pluginConfigurer) throws ValidationException {
  String stageName = stage.getName();
  ETLPlugin stagePlugin = stage.getPlugin();
  StageSpec.Builder specBuilder = configureStage(stageName, stagePlugin, pluginConfigurer);
  DefaultStageConfigurer stageConfigurer = pluginConfigurer.getStageConfigurer();
  String pluginType = stage.getPlugin().getType();
  if (pluginType.equals(SplitterTransform.PLUGIN_TYPE)) {
    // every outgoing connection from a splitter must name the port it reads from
    Map<String, Schema> outputPortSchemas = stageConfigurer.getOutputPortSchemas();
    for (Map.Entry<String, String> outputEntry : validatedPipeline.getOutputPorts(stageName).entrySet()) {
      String outputStage = outputEntry.getKey();
      String outputPort = outputEntry.getValue();
      if (outputPort == null) {
        throw new IllegalArgumentException(
          String.format("Connection from Splitter '%s' to '%s' must specify a port.", stageName, outputStage));
      }
      specBuilder.addOutput(outputStage, outputPort, outputPortSchemas.get(outputPort));
    }
  } else {
    Schema outputSchema = stageConfigurer.getOutputSchema();
    // conditions pass records through unchanged, so all of their input schemas must be the same
    if (Condition.PLUGIN_TYPE.equals(pluginType)) {
      outputSchema = null;
      for (Schema schema : stageConfigurer.getInputSchemas().values()) {
        if (schema != null) {
          // todo: fix this cleanly and fully
          if (outputSchema != null && !Schemas.equalsIgnoringRecordName(outputSchema, schema)) {
            throw new IllegalArgumentException("Cannot have different input schemas going into stage " + stageName);
          }
          outputSchema = schema;
        }
      }
    }
    for (String outputStage : validatedPipeline.getOutputs(stageName)) {
      specBuilder.addOutput(outputStage, null, outputSchema);
    }
  }
  StageSpec stageSpec = specBuilder
    .setProcessTimingEnabled(validatedPipeline.isProcessTimingEnabled())
    .setStageLoggingEnabled(validatedPipeline.isStageLoggingEnabled())
    .setMaxPreviewRecords(validatedPipeline.getMaxPreviewRecords())
    .build();
  return new ConfiguredStage(stageSpec, pluginConfigurer.getPipelineProperties());
}
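The port check above means that any connection leaving a splitter stage must carry a port name in the pipeline config. A hedged illustration of what such a config fragment could look like, assuming a splitter plugin named "NullFieldSplitter" with ports "null" and "non-null" and an addConnection(from, to, port) overload on the ETL config builder; the plugin name, its "field" property, the port names, and the overload are all assumptions to verify against your CDAP version.

// Hypothetical pipeline fragment showing port-qualified connections out of a splitter stage.
ETLStage source = new ETLStage("source", MockSource.getPlugin("splitterInput"));
ETLStage splitter = new ETLStage("splitter",
  new ETLPlugin("NullFieldSplitter", SplitterTransform.PLUGIN_TYPE, ImmutableMap.of("field", "email"), null));
ETLStage nullSink = new ETLStage("nullSink", MockSink.getPlugin("nullOutput"));
ETLStage nonNullSink = new ETLStage("nonNullSink", MockSink.getPlugin("nonNullOutput"));
ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(splitter)
  .addStage(nullSink)
  .addStage(nonNullSink)
  .addConnection(source.getName(), splitter.getName())
  // each outgoing connection names the splitter port it consumes
  .addConnection(splitter.getName(), nullSink.getName(), "null")
  .addConnection(splitter.getName(), nonNullSink.getName(), "non-null")
  .build();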
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.
The class PipelineTest, method testTextFileSinkAndDeletePostAction.
@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
  // create the pipeline config
  String inputName = "sinkTestInput";
  String outputName = "sinkTestOutput";
  String outputDirName = "users";
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
  sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
  sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
  ETLStage sink = new ETLStage("sink",
    new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
  Map<String, String> actionProperties = new HashMap<>();
  actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
  // MapReduce writes multiple files to the output directory. Along with the actual output,
  // there are various .crc files that do not contain any of the output content.
  actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
  actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
  ETLStage postAction = new ETLStage("cleanup",
    new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addPostAction(postAction)
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write some data to the input fileset
  Schema inputSchema = Schema.recordOf(
    "test",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
  Map<String, String> users = new HashMap<>();
  users.put("samuel", "wallet");
  users.put("dwayne", "rock");
  users.put("christopher", "cowbell");
  List<StructuredRecord> inputRecords = new ArrayList<>();
  for (Map.Entry<String, String> userEntry : users.entrySet()) {
    String name = userEntry.getKey();
    String item = userEntry.getValue();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
  }
  DataSetManager<Table> inputManager = getDataset(inputName);
  MockSource.writeInput(inputManager, inputRecords);
  // run the pipeline
  Map<String, String> runtimeArgs = new HashMap<>();
  // the ${dir} macro will be substituted with "users" for our pipeline run
  runtimeArgs.put("dir", outputDirName);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 4, TimeUnit.MINUTES);
  // check the pipeline output
  DataSetManager<FileSet> outputManager = getDataset(outputName);
  FileSet output = outputManager.get();
  Location outputDir = output.getBaseLocation().append(outputDirName);
  Map<String, String> actual = new HashMap<>();
  for (Location outputFile : outputDir.list()) {
    if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
      Assert.fail("Post action did not delete file " + outputFile.getName());
    }
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] parts = line.split("\\|");
        actual.put(parts[0], parts[1]);
      }
    }
  }
  Assert.assertEquals(users, actual);
}
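The delete regex configured above (".*\\.crc|_SUCCESS") is what makes the assertion inside the output loop pass. It can be sanity-checked on its own with plain java.util.regex; the file names below are just representative Hadoop output names, not values from the test.

// Standalone check of the post action's delete regex against typical Hadoop output file names.
Pattern deletePattern = Pattern.compile(".*\\.crc|_SUCCESS");
// Side files produced by the output committer should match, and so be deleted:
Assert.assertTrue(deletePattern.matcher(".part-m-00000.crc").matches());
Assert.assertTrue(deletePattern.matcher("_SUCCESS").matches());
// Real output files should not match, and so survive for the verification loop above:
Assert.assertFalse(deletePattern.matcher("part-m-00000").matches());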