Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class DataPipelineServiceTest, method testValidateMultiInputInvalidInputField.
@Test
public void testValidateMultiInputInvalidInputField() throws Exception {
  // StringValueFilterTransform will be configured to filter records where field x has value 'y'.
  // It will be invalid because the type of field x will be an int instead of the required string.
  String stageName = "tx";
  Map<String, String> properties = new HashMap<>();
  properties.put("field", "x");
  properties.put("value", "y");
  ETLStage stage = new ETLStage(stageName,
    new ETLPlugin(StringValueFilterTransform.NAME, Transform.PLUGIN_TYPE, properties));
  Schema inputSchema = Schema.recordOf("x", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
  StageValidationRequest requestBody = new StageValidationRequest(
    stage,
    ImmutableList.of(new StageSchema("input1", inputSchema), new StageSchema("input2", inputSchema)),
    false);
  StageValidationResponse actual = sendRequest(requestBody);
  List<String> expectedInputs = ImmutableList.of("input1", "input2");
  Assert.assertNull(actual.getSpec());
  Assert.assertEquals(1, actual.getFailures().size());
  ValidationFailure failure = actual.getFailures().iterator().next();
  // the stage will add 3 causes: two related to the input field and one related to the config property
  Assert.assertEquals(3, failure.getCauses().size());
  Assert.assertEquals("field", failure.getCauses().get(0).getAttribute(CauseAttributes.STAGE_CONFIG));
  Assert.assertEquals(stageName, failure.getCauses().get(0).getAttribute(STAGE));
  Assert.assertEquals(stageName, failure.getCauses().get(1).getAttribute(STAGE));
  Assert.assertEquals(stageName, failure.getCauses().get(2).getAttribute(STAGE));
  Assert.assertTrue(expectedInputs.contains(failure.getCauses().get(1).getAttribute(CauseAttributes.INPUT_STAGE)));
  Assert.assertTrue(expectedInputs.contains(failure.getCauses().get(2).getAttribute(CauseAttributes.INPUT_STAGE)));
}
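For context, here is a minimal sketch of the plugin-side pattern that produces causes like the ones asserted above, assuming CDAP's FailureCollector/ValidationFailure API. The messages, the surrounding variables (collector, inputSchema), and the "input1" stage name are illustrative, not StringValueFilterTransform's actual code.

// sketch only: 'collector' and 'inputSchema' are assumed to be in scope
Schema.Field field = inputSchema.getField("x");
if (field != null && field.getSchema().getType() != Schema.Type.STRING) {
  collector.addFailure("Field 'x' is of type int but must be a string.",
                       "Change field 'x' to be of type string.")
    // becomes the cause carrying CauseAttributes.STAGE_CONFIG = "field"
    .withConfigProperty("field")
    // becomes a cause carrying CauseAttributes.INPUT_STAGE for a specific input stage
    .withInputSchemaField("x", "input1");
}

Validating against two input stages, as the test does, is what yields the two input-field causes alongside the single config-property cause.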
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class PipelineSpecGenerator, method configureStage.
/**
* Configures a plugin and returns the spec for it.
*
* @param stageName the unique plugin id
* @param etlPlugin user provided configuration for the plugin
* @param pipelineConfigurer default pipeline configurer to configure the plugin
* @return the spec for the plugin
* @throws IllegalArgumentException if a plugin with the same id is already deployed
* @throws ValidationException if the plugin threw an exception during configuration
*/
public StageSpec.Builder configureStage(String stageName, ETLPlugin etlPlugin,
                                        DefaultPipelineConfigurer pipelineConfigurer) throws ValidationException {
  TrackedPluginSelector pluginSelector =
    new TrackedPluginSelector(new ArtifactSelectorProvider().getPluginSelector(etlPlugin.getArtifactConfig()));
  String type = etlPlugin.getType();
  String pluginName = etlPlugin.getName();
  DefaultStageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
  FailureCollector collector = stageConfigurer.getFailureCollector();
  Object plugin = getPlugin(stageName, etlPlugin, pluginSelector, type, pluginName, collector);
  try {
    if (type.equals(BatchJoiner.PLUGIN_TYPE)) {
      MultiInputPipelineConfigurable multiPlugin = (MultiInputPipelineConfigurable) plugin;
      multiPlugin.configurePipeline(pipelineConfigurer);
      // a joiner may also implement AutoJoiner, in which case join configuration is delegated
      // to the BatchAutoJoiner while preserving backwards compatibility in the pipeline config.
      if (plugin instanceof AutoJoiner) {
        configureAutoJoiner(stageName, (AutoJoiner) plugin, stageConfigurer, collector);
      }
    } else if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
      MultiOutputPipelineConfigurable multiOutputPlugin = (MultiOutputPipelineConfigurable) plugin;
      multiOutputPlugin.configurePipeline(pipelineConfigurer);
    } else if (!type.equals(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
      PipelineConfigurable singlePlugin = (PipelineConfigurable) plugin;
      singlePlugin.configurePipeline(pipelineConfigurer);
      // evaluate macros and find out whether a connection is used
      if ((sourcePluginTypes.contains(type) || BatchSink.PLUGIN_TYPE.equals(type)) && runtimeEvaluator == null) {
        pluginConfigurer.evaluateMacros(etlPlugin.getProperties(), connectionEvaluator, options);
      }
    }
  } catch (InvalidConfigPropertyException e) {
    collector.addFailure(e.getMessage(),
                         String.format("Provide valid value for config property '%s'.", e.getProperty()))
      .withConfigProperty(e.getProperty());
  } catch (InvalidStageException e) {
    if (e.getReasons().isEmpty()) {
      collector.addFailure(e.getMessage(), null);
    }
    for (InvalidStageException reason : e.getReasons()) {
      if (reason instanceof InvalidConfigPropertyException) {
        InvalidConfigPropertyException configException = (InvalidConfigPropertyException) reason;
        collector.addFailure(configException.getMessage(),
                             String.format("Provide valid value for config property '%s'.",
                                           configException.getProperty()))
          .withConfigProperty(configException.getProperty());
      } else {
        collector.addFailure(reason.getMessage(), null);
      }
    }
  } catch (ValidationException e) {
    throw e;
  } catch (NullPointerException e) {
    // handle the case where the plugin throws a null pointer exception;
    // this avoids surfacing 'null' as the error message
    collector.addFailure(String.format("Null error occurred while configuring the stage %s.", stageName), null)
      .withStacktrace(e.getStackTrace());
  } catch (ArrayIndexOutOfBoundsException e) {
    // handle the case where the plugin throws an index out of bounds exception;
    // this avoids surfacing a bare number like '2' or '8' as the error message
    collector.addFailure(String.format("Index out of bounds error occurred while configuring the stage %s.", stageName), null)
      .withStacktrace(e.getStackTrace());
  } catch (ConnectionBadRequestException e) {
    collector.addFailure(e.getMessage(), "Provide a valid connection name.");
  } catch (Exception e) {
    collector.addFailure(String.format("Error encountered while configuring the stage: '%s'", e.getMessage()), null)
      .withStacktrace(e.getStackTrace());
  }
  // throw a validation exception if the failure collector is carrying any errors
  collector.getOrThrowException();
  PluginSpec pluginSpec = new PluginSpec(type, pluginName, etlPlugin.getProperties(),
                                         pluginSelector.getSelectedArtifact());
  StageSpec.Builder specBuilder = StageSpec.builder(stageName, pluginSpec)
    .addInputSchemas(pipelineConfigurer.getStageConfigurer().getInputSchemas())
    .setErrorSchema(stageConfigurer.getErrorSchema());
  if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
    specBuilder.setPortSchemas(stageConfigurer.getOutputPortSchemas());
  } else {
    specBuilder.setOutputSchema(stageConfigurer.getOutputSchema());
  }
  return specBuilder;
}
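A minimal sketch of how a caller might drive this method and react to the ValidationException it can throw. The specGenerator, etlPlugin, and pipelineConfigurer variables are illustrative assumptions; only the configureStage signature and ValidationException.getFailures() come from the excerpt and the CDAP validation API.

try {
  StageSpec.Builder builder = specGenerator.configureStage("tx", etlPlugin, pipelineConfigurer);
  StageSpec spec = builder.build();
  // proceed with the configured stage spec...
} catch (ValidationException e) {
  // each ValidationFailure carries a message plus attributed causes,
  // like the ones inspected in the test above
  for (ValidationFailure failure : e.getFailures()) {
    System.err.println("Stage validation failed: " + failure.getMessage());
  }
}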
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class PipelineTest, method testWordCount.
public void testWordCount(String pluginType) throws Exception {
  String inputName = "wcInput-" + pluginType;
  String outputName = "wcOutput-" + pluginType;
  // create the pipeline config
  ETLStage source = new ETLStage("wcInput", MockSource.getPlugin(inputName));
  ETLStage sink = new ETLStage("wcOutput", MockSink.getPlugin(outputName));
  Map<String, String> aggProperties = new HashMap<>();
  aggProperties.put("field", "text");
  ETLStage agg = new ETLStage("middle", new ETLPlugin("WordCount", pluginType, aggProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addStage(agg)
    .addConnection(source.getName(), agg.getName())
    .addConnection(agg.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("wcTestPipeline-" + pluginType);
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write the input
  Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
  DataSetManager<Table> inputManager = getDataset(inputName);
  List<StructuredRecord> inputRecords = new ArrayList<>();
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
  MockSource.writeInput(inputManager, inputRecords);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 4, TimeUnit.MINUTES);
  DataSetManager<Table> outputManager = getDataset(outputName);
  Set<StructuredRecord> outputRecords = new HashSet<>();
  outputRecords.addAll(MockSink.readOutput(outputManager));
  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hello").set("count", 3L).build());
  expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "World").set("count", 1L).build());
  expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "my").set("count", 2L).build());
  expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "name").set("count", 2L).build());
  expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "is").set("count", 2L).build());
  expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hal").set("count", 1L).build());
  expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Sam").set("count", 1L).build());
  Assert.assertEquals(expected, outputRecords);
}
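Note that this helper is parameterized by plugin type rather than being a @Test itself. A sketch of how the project's actual test methods would typically invoke it; the specific plugin-type constant used here is an assumption for illustration.

@Test
public void testBatchWordCount() throws Exception {
  // hypothetical driver: exercises the helper with the batch aggregator plugin type
  testWordCount(BatchAggregator.PLUGIN_TYPE);
}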
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.
From the class PipelineSpecGenerator, method configureStage.
/**
* Configures a stage and returns the spec for it.
*
* @param stage the user provided configuration for the stage
* @param validatedPipeline the validated pipeline config
* @param pluginConfigurer configurer used to configure the stage
* @return the spec for the stage
* @throws ValidationException if the plugin threw an exception during configuration
*/
protected ConfiguredStage configureStage(ETLStage stage, ValidatedPipeline validatedPipeline,
                                         DefaultPipelineConfigurer pluginConfigurer) throws ValidationException {
  String stageName = stage.getName();
  ETLPlugin stagePlugin = stage.getPlugin();
  StageSpec.Builder specBuilder = configureStage(stageName, stagePlugin, pluginConfigurer);
  DefaultStageConfigurer stageConfigurer = pluginConfigurer.getStageConfigurer();
  String pluginType = stage.getPlugin().getType();
  if (pluginType.equals(SplitterTransform.PLUGIN_TYPE)) {
    Map<String, Schema> outputPortSchemas = stageConfigurer.getOutputPortSchemas();
    for (Map.Entry<String, String> outputEntry : validatedPipeline.getOutputPorts(stageName).entrySet()) {
      String outputStage = outputEntry.getKey();
      String outputPort = outputEntry.getValue();
      if (outputPort == null) {
        throw new IllegalArgumentException(
          String.format("Connection from Splitter '%s' to '%s' must specify a port.", stageName, outputStage));
      }
      specBuilder.addOutput(outputStage, outputPort, outputPortSchemas.get(outputPort));
    }
  } else {
    Schema outputSchema = stageConfigurer.getOutputSchema();
    // a condition's output schema is its input schema, and all of its input schemas must be the same
    if (Condition.PLUGIN_TYPE.equals(pluginType)) {
      outputSchema = null;
      for (Schema schema : stageConfigurer.getInputSchemas().values()) {
        if (schema != null) {
          // todo: fix this cleanly and fully
          if (outputSchema != null && !Schemas.equalsIgnoringRecordName(outputSchema, schema)) {
            throw new IllegalArgumentException("Cannot have different input schemas going into stage " + stageName);
          }
          outputSchema = schema;
        }
      }
    }
    for (String outputStage : validatedPipeline.getOutputs(stageName)) {
      specBuilder.addOutput(outputStage, null, outputSchema);
    }
  }
  StageSpec stageSpec = specBuilder
    .setProcessTimingEnabled(validatedPipeline.isProcessTimingEnabled())
    .setStageLoggingEnabled(validatedPipeline.isStageLoggingEnabled())
    .setMaxPreviewRecords(validatedPipeline.getMaxPreviewRecords())
    .build();
  return new ConfiguredStage(stageSpec, pluginConfigurer.getPipelineProperties());
}
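Since every connection out of a splitter must name a port, a sketch of what a conforming pipeline config could look like. The stage names, the splitterPlugin variable, and the port-aware Connection constructor used here are assumptions for illustration, not code from the project.

ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("split", splitterPlugin))
  .addStage(new ETLStage("sinkA", MockSink.getPlugin("outputA")))
  .addStage(new ETLStage("sinkB", MockSink.getPlugin("outputB")))
  // each connection out of the splitter names the port it reads from;
  // omitting the port would trigger the IllegalArgumentException above
  .addConnection(new Connection("split", "sinkA", "portA"))
  .addConnection(new Connection("split", "sinkB", "portB"))
  .build();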
Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.
From the class PipelineTest, method testTextFileSinkAndDeletePostAction.
@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
  // create the pipeline config
  String inputName = "sinkTestInput";
  String outputName = "sinkTestOutput";
  String outputDirName = "users";
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
  sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
  sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
  ETLStage sink = new ETLStage("sink",
    new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
  Map<String, String> actionProperties = new HashMap<>();
  actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
  // MapReduce writes multiple files to the output directory. Along with the actual output,
  // there are various .crc files that do not contain any of the output content.
  actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
  actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
  ETLStage postAction = new ETLStage("cleanup",
    new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addPostAction(postAction)
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write some data to the input fileset
  Schema inputSchema = Schema.recordOf("test",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
  Map<String, String> users = new HashMap<>();
  users.put("samuel", "wallet");
  users.put("dwayne", "rock");
  users.put("christopher", "cowbell");
  List<StructuredRecord> inputRecords = new ArrayList<>();
  for (Map.Entry<String, String> userEntry : users.entrySet()) {
    String name = userEntry.getKey();
    String item = userEntry.getValue();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
  }
  DataSetManager<Table> inputManager = getDataset(inputName);
  MockSource.writeInput(inputManager, inputRecords);
  // run the pipeline
  Map<String, String> runtimeArgs = new HashMap<>();
  // the ${dir} macro will be substituted with "users" for our pipeline run
  runtimeArgs.put("dir", outputDirName);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 4, TimeUnit.MINUTES);
  // check the pipeline output
  DataSetManager<FileSet> outputManager = getDataset(outputName);
  FileSet output = outputManager.get();
  Location outputDir = output.getBaseLocation().append(outputDirName);
  Map<String, String> actual = new HashMap<>();
  for (Location outputFile : outputDir.list()) {
    if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
      Assert.fail("Post action did not delete file " + outputFile.getName());
    }
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] parts = line.split("\\|");
        actual.put(parts[0], parts[1]);
      }
    }
  }
  // JUnit's assertEquals takes the expected value first
  Assert.assertEquals(users, actual);
}
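As a quick check of the DELETE_REGEX value above, a standalone snippet (not from the project) demonstrating which file names the pattern matches. The file names are illustrative examples of typical MapReduce output.

import java.util.regex.Pattern;

public class DeleteRegexDemo {
  public static void main(String[] args) {
    Pattern deletePattern = Pattern.compile(".*\\.crc|_SUCCESS");
    System.out.println(deletePattern.matcher("part-m-00000.crc").matches()); // true: checksum file is deleted
    System.out.println(deletePattern.matcher("_SUCCESS").matches());        // true: marker file is deleted
    System.out.println(deletePattern.matcher("part-m-00000").matches());    // false: real output survives
  }
}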