Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project hydrator-plugins by cdapio.
The class NormalizeTest, method deployApplication.
private ApplicationManager deployApplication(Map<String, String> sourceProperties, String inputDatasetName, String outputDatasetName, String applicationName) throws Exception {
ETLStage source = new ETLStage("source", MockSource.getPlugin(inputDatasetName));
ETLStage transform = new ETLStage("normalize", new ETLPlugin("Normalize", Transform.PLUGIN_TYPE, sourceProperties, null));
ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName));
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(transform)
  .addStage(sink)
  .addConnection(source.getName(), transform.getName())
  .addConnection(transform.getName(), sink.getName())
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(BATCH_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app(applicationName);
return deployApplication(appId, appRequest);
}
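A hypothetical call to the helper above might look like the sketch below; the Normalize property names and values, the dataset names, and the application name are illustrative assumptions rather than values taken from this test class, and Guava's ImmutableMap is assumed to be on the classpath.
Map<String, String> normalizeProperties = ImmutableMap.of(
  "fieldMapping", "id:AttributeValue",                          // assumed Normalize property
  "fieldNormalizing", "purchase:AttributeType:AttributeValue"); // assumed Normalize property
ApplicationManager appManager =
  deployApplication(normalizeProperties, "normalizeInput", "normalizeOutput", "NormalizeApp");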
Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.
The class PipelineSpecGenerator, method configureStages.
/**
* Performs most of the validation and configuration needed by a pipeline.
* Handles stages, connections, resources, and stage logging settings.
*
* @param config user provided ETL config
* @param specBuilder builder for creating a pipeline spec.
* @throws ValidationException if the pipeline is invalid
*/
protected void configureStages(ETLConfig config, PipelineSpec.Builder specBuilder) throws ValidationException {
// validate the config and determine the order we should configure the stages in.
ValidatedPipeline validatedPipeline = validateConfig(config);
List<ETLStage> traversalOrder = validatedPipeline.getTraversalOrder();
Map<String, DefaultPipelineConfigurer> pluginConfigurers = new HashMap<>(traversalOrder.size());
Map<String, String> pluginTypes = new HashMap<>(traversalOrder.size());
for (ETLStage stage : traversalOrder) {
String stageName = stage.getName();
pluginTypes.put(stageName, stage.getPlugin().getType());
pluginConfigurers.put(stageName,
  new DefaultPipelineConfigurer(pluginConfigurer, datasetConfigurer, stageName, engine,
                                new DefaultStageConfigurer(stageName), featureFlagsProvider));
}
SchemaPropagator schemaPropagator = new SchemaPropagator(pluginConfigurers, validatedPipeline::getOutputs, pluginTypes::get);
// anything prefixed by 'system.[engine].' is a pipeline property.
Map<String, String> pipelineProperties = new HashMap<>();
String prefix = String.format("system.%s.", engine.name().toLowerCase());
int prefixLength = prefix.length();
for (Map.Entry<String, String> property : config.getProperties().entrySet()) {
if (property.getKey().startsWith(prefix)) {
String strippedKey = property.getKey().substring(prefixLength);
pipelineProperties.put(strippedKey, property.getValue());
}
}
// row = property name, column = property value, val = stage that set the property
// this is used so that we can error with a nice message about which stages are setting conflicting properties
Table<String, String, String> propertiesFromStages = HashBasedTable.create();
// configure the stages in order and build up the stage specs
for (ETLStage stage : traversalOrder) {
String stageName = stage.getName();
DefaultPipelineConfigurer pluginConfigurer = pluginConfigurers.get(stageName);
ConfiguredStage configuredStage = configureStage(stage, validatedPipeline, pluginConfigurer);
schemaPropagator.propagateSchema(configuredStage.getStageSpec());
specBuilder.addStage(configuredStage.getStageSpec());
for (Map.Entry<String, String> propertyEntry : configuredStage.pipelineProperties.entrySet()) {
propertiesFromStages.put(propertyEntry.getKey(), propertyEntry.getValue(), stageName);
}
}
// check that multiple stages did not set conflicting properties
for (String propertyName : propertiesFromStages.rowKeySet()) {
// go through all values set for the property name. If there is more than one, we have a conflict.
Map<String, String> propertyValues = propertiesFromStages.row(propertyName);
if (propertyValues.size() > 1) {
StringBuilder errMsg = new StringBuilder("Pipeline property '").append(propertyName).append("' is being set to different values by stages.");
for (Map.Entry<String, String> valueEntry : propertyValues.entrySet()) {
String propertyValue = valueEntry.getKey();
String fromStage = valueEntry.getValue();
errMsg.append(" stage '").append(fromStage).append("' = '").append(propertyValue).append("',");
}
errMsg.deleteCharAt(errMsg.length() - 1);
throw new IllegalArgumentException(errMsg.toString());
}
pipelineProperties.put(propertyName, propertyValues.keySet().iterator().next());
}
specBuilder.addConnections(config.getConnections())
  .setResources(config.getResources())
  .setDriverResources(config.getDriverResources())
  .setClientResources(config.getClientResources())
  .setStageLoggingEnabled(config.isStageLoggingEnabled())
  .setNumOfRecordsPreview(config.getNumOfRecordsPreview())
  .setProperties(pipelineProperties)
  .addConnectionsUsed(connectionEvaluator.getUsedConnections())
  .build();
}
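The "system.[engine]." prefix handling above can be illustrated in isolation. This standalone sketch uses made-up property keys and a hard-coded "spark" engine name; it depends only on java.util and Guava's ImmutableMap.
Map<String, String> configProperties = ImmutableMap.of(
  "system.spark.spark.executor.memory", "2g",  // matches the prefix, becomes pipeline property "spark.executor.memory"
  "some.unrelated.property", "ignored");       // no prefix, skipped
String prefix = String.format("system.%s.", "spark");
Map<String, String> pipelineProperties = new HashMap<>();
for (Map.Entry<String, String> property : configProperties.entrySet()) {
  if (property.getKey().startsWith(prefix)) {
    pipelineProperties.put(property.getKey().substring(prefix.length()), property.getValue());
  }
}
// pipelineProperties now contains {spark.executor.memory=2g}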
Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.
The class DataStreamsTest, method testTransformComputeWithMacros.
@Test
public void testTransformComputeWithMacros() throws Exception {
Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = new ArrayList<>();
StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
input.add(samuelRecord);
input.add(jacksonRecord);
input.add(dwayneRecord);
input.add(johnsonRecord);
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
  .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
  .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
  .addConnection("source", "sleep")
  .addConnection("sleep", "filter1")
  .addConnection("filter1", "filter2")
  .addConnection("filter2", "sink")
  .setBatchInterval("1s")
  .build();
ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationManager appManager = deployApplication(appId, appRequest);
Set<StructuredRecord> expected = new HashSet<>();
expected.add(samuelRecord);
expected.add(jacksonRecord);
testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
validateMetric(appId, "source.records.out", 4);
validateMetric(appId, "sleep.records.in", 4);
validateMetric(appId, "sleep.records.out", 4);
validateMetric(appId, "filter1.records.in", 4);
validateMetric(appId, "filter1.records.out", 3);
validateMetric(appId, "filter2.records.in", 3);
validateMetric(appId, "filter2.records.out", 2);
validateMetric(appId, "sink.records.in", 2);
Assert.assertTrue(getMetric(appId, "sleep." + io.cdap.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
expected.clear();
expected.add(dwayneRecord);
expected.add(johnsonRecord);
testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
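The ${output}, ${field}, ${val1}, and ${val2} macros in the config above have to be resolved at runtime. A hedged sketch of how testTransformComputeRun presumably supplies them as runtime arguments when starting the streaming program (the argument values mirror the first run in this test; the helper's actual implementation is not shown here):
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
Map<String, String> runtimeArgs = ImmutableMap.of(
  "field", "name",           // resolves ${field}
  "val1", "dwayne",          // resolves ${val1}
  "val2", "johnson",         // resolves ${val2}
  "output", "macroOutput1"); // resolves ${output}
sparkManager.start(runtimeArgs);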
Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.
The class DataStreamsTest, method testAutoJoinNullEquality.
private void testAutoJoinNullEquality(boolean nullSafe) throws Exception {
/*
* customers ----------|
* |
* |---> join ---> sink
* |
* transactions -------|
*/
Schema inputSchema1 = Schema.recordOf("customer",
  Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
Schema inputSchema2 = Schema.recordOf("transaction",
  Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)));
Schema outSchema = Schema.recordOf("customers.transactions",
  Schema.Field.of("customers_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("customers_customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("transactions_t_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("transactions_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("transactions_item_id", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord trans1 = StructuredRecord.builder(inputSchema2).set("t_id", "1").set("customer_id", "1").set("item_id", "11").build();
StructuredRecord trans2 = StructuredRecord.builder(inputSchema2).set("t_id", "2").set("customer_id", "3").set("item_id", "22").build();
StructuredRecord trans3 = StructuredRecord.builder(inputSchema2).set("t_id", "3").set("item_id", "33").build();
List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
List<StructuredRecord> input2 = ImmutableList.of(trans1, trans2, trans3);
String outputName = UUID.randomUUID().toString();
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("customers", MockSource.getPlugin(inputSchema1, input1)))
  .addStage(new ETLStage("transactions", MockSource.getPlugin(inputSchema2, input2)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("customers", "transactions"),
                                                          Collections.singletonList("customer_id"),
                                                          Collections.singletonList("transactions"),
                                                          Collections.emptyList(), Collections.emptyList(), nullSafe)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(outputName)))
  .addConnection("customers", "join")
  .addConnection("transactions", "join")
  .addConnection("join", "sink")
  .setBatchInterval("5s")
  .setCheckpointDir(checkpointDir)
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
StructuredRecord join1 = StructuredRecord.builder(outSchema)
  .set("customers_customer_id", "1").set("customers_customer_name", "samuel")
  .set("transactions_t_id", "1").set("transactions_customer_id", "1").set("transactions_item_id", "11").build();
StructuredRecord join2 = StructuredRecord.builder(outSchema)
  .set("customers_customer_id", "3").set("customers_customer_name", "jane")
  .set("transactions_t_id", "2").set("transactions_customer_id", "3").set("transactions_item_id", "22").build();
StructuredRecord join3;
if (nullSafe) {
// this transaction has a null customer id, which should match with the null id from customers
join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").set("customers_customer_name", "bob").build();
} else {
// this transaction has a null customer id, which should not match with the null id from customers
join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").build();
}
Set<StructuredRecord> expected = ImmutableSet.of(join1, join2, join3);
DataSetManager<Table> outputManager = getDataset(outputName);
Tasks.waitFor(true, () -> {
outputManager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
return expected.equals(outputRecords);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
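Because the method above is private and parameterized on nullSafe, it would typically be driven by two small test methods along the lines of the sketch below; the test method names here are assumptions, not names from the actual class.
@Test
public void testAutoJoinNullSafeEquality() throws Exception {
  testAutoJoinNullEquality(true);
}

@Test
public void testAutoJoinNullUnsafeEquality() throws Exception {
  testAutoJoinNullEquality(false);
}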
Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.
The class DataStreamsTest, method testWindower.
@Test
public void testWindower() throws Exception {
/*
* source --> window(width=30,interval=1) --> aggregator --> filter --> sink
*/
Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = ImmutableList.of(
  StructuredRecord.builder(schema).set("x", "abc").build(),
  StructuredRecord.builder(schema).set("x", "abc").build(),
  StructuredRecord.builder(schema).set("x", "abc").build());
String sinkName = "windowOut";
// source sleeps 1 second between outputs
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
  .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
  .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
  .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addConnection("source", "window")
  .addConnection("window", "agg")
  .addConnection("agg", "filter")
  .addConnection("filter", "sink")
  .setBatchInterval("1s")
  .setCheckpointDir(checkpointDir)
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
// the sink should contain at least one record with count of 3, and no records with more than 3.
// less than 3 if the window doesn't contain all 3 records yet, but there should eventually be a window
// that contains all 3.
final DataSetManager<Table> outputManager = getDataset(sinkName);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
outputManager.flush();
boolean sawThree = false;
for (StructuredRecord record : MockSink.readOutput(outputManager)) {
long count = record.get("ct");
if (count == 3L) {
sawThree = true;
}
Assert.assertTrue(count <= 3L);
}
return sawThree;
}
}, 2, TimeUnit.MINUTES);
sparkManager.stop();
}
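For intuition about why early windows may hold fewer than three records (assuming the config above: a 30-second window width, a 1-second slide interval, and the source emitting one record per second), the window contents evolve roughly as sketched in these comments.
// t = 0s: the window has seen only the first record                -> ct("abc") = 1
// t = 1s: the window has seen the first two records                -> ct("abc") = 2
// t = 2s: the window has seen all three records                    -> ct("abc") = 3
// later windows (until records age out of the 30s width) still see all three -> ct("abc") = 3
// so the test waits for a window where ct == 3 and asserts no window ever exceeds 3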