Example 46 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project hydrator-plugins by cdapio.

From the class NormalizeTest, method deployApplication.

private ApplicationManager deployApplication(Map<String, String> sourceProperties, String inputDatasetName, String outputDatasetName, String applicationName) throws Exception {
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputDatasetName));
    ETLStage transform = new ETLStage("normalize", new ETLPlugin("Normalize", Transform.PLUGIN_TYPE, sourceProperties, null));
    ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName));
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(transform)
        .addStage(sink)
        .addConnection(source.getName(), transform.getName())
        .addConnection(transform.getName(), sink.getName())
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(BATCH_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(applicationName);
    return deployApplication(appId, appRequest);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) AppRequest(io.cdap.cdap.proto.artifact.AppRequest)
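
A test case would call this helper with the Normalize plugin's configuration, write records to the input dataset, run the workflow, and verify the output dataset. A minimal sketch of such a caller; the property keys and values shown here are illustrative assumptions about the Normalize plugin's config, not copied from the original test:

    // hypothetical caller; property keys and values are assumptions for illustration only
    // OUTPUT_SCHEMA: an assumed Schema constant describing the normalized records
    Map<String, String> normalizeProperties = ImmutableMap.of(
        "fieldMapping", "CustomerId:ID,PurchaseDate:Date",
        "fieldNormalizing", "ItemId:AttributeType:AttributeValue",
        "outputSchema", OUTPUT_SCHEMA.toString());
    ApplicationManager appManager =
        deployApplication(normalizeProperties, "normalizeInput", "normalizeOutput", "NormalizeApp");
    // next: write StructuredRecords to "normalizeInput", start the pipeline's workflow,
    // and assert on the normalized records read back from "normalizeOutput".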

Example 47 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.

From the class PipelineSpecGenerator, method configureStages.

/**
 * Performs most of the validation and configuration needed by a pipeline.
 * Handles stages, connections, resources, and stage logging settings.
 *
 * @param config user provided ETL config
 * @param specBuilder builder for creating a pipeline spec.
 * @throws ValidationException if the pipeline is invalid
 */
protected void configureStages(ETLConfig config, PipelineSpec.Builder specBuilder) throws ValidationException {
    // validate the config and determine the order we should configure the stages in.
    ValidatedPipeline validatedPipeline = validateConfig(config);
    List<ETLStage> traversalOrder = validatedPipeline.getTraversalOrder();
    Map<String, DefaultPipelineConfigurer> pluginConfigurers = new HashMap<>(traversalOrder.size());
    Map<String, String> pluginTypes = new HashMap<>(traversalOrder.size());
    for (ETLStage stage : traversalOrder) {
        String stageName = stage.getName();
        pluginTypes.put(stageName, stage.getPlugin().getType());
        pluginConfigurers.put(stageName, new DefaultPipelineConfigurer(pluginConfigurer, datasetConfigurer, stageName, engine, new DefaultStageConfigurer(stageName), featureFlagsProvider));
    }
    SchemaPropagator schemaPropagator = new SchemaPropagator(pluginConfigurers, validatedPipeline::getOutputs, pluginTypes::get);
    // anything prefixed by 'system.[engine].' is a pipeline property.
    Map<String, String> pipelineProperties = new HashMap<>();
    String prefix = String.format("system.%s.", engine.name().toLowerCase());
    int prefixLength = prefix.length();
    for (Map.Entry<String, String> property : config.getProperties().entrySet()) {
        if (property.getKey().startsWith(prefix)) {
            String strippedKey = property.getKey().substring(prefixLength);
            pipelineProperties.put(strippedKey, property.getValue());
        }
    }
    // row = property name, column = property value, val = stage that set the property
    // this is used so that we can error with a nice message about which stages are setting conflicting properties
    Table<String, String, String> propertiesFromStages = HashBasedTable.create();
    // configure the stages in order and build up the stage specs
    for (ETLStage stage : traversalOrder) {
        String stageName = stage.getName();
        DefaultPipelineConfigurer pluginConfigurer = pluginConfigurers.get(stageName);
        ConfiguredStage configuredStage = configureStage(stage, validatedPipeline, pluginConfigurer);
        schemaPropagator.propagateSchema(configuredStage.getStageSpec());
        specBuilder.addStage(configuredStage.getStageSpec());
        for (Map.Entry<String, String> propertyEntry : configuredStage.pipelineProperties.entrySet()) {
            propertiesFromStages.put(propertyEntry.getKey(), propertyEntry.getValue(), stageName);
        }
    }
    // check that multiple stages did not set conflicting properties
    for (String propertyName : propertiesFromStages.rowKeySet()) {
        // go through all values set for the property name. If there is more than one, we have a conflict.
        Map<String, String> propertyValues = propertiesFromStages.row(propertyName);
        if (propertyValues.size() > 1) {
            StringBuilder errMsg = new StringBuilder("Pipeline property '").append(propertyName).append("' is being set to different values by stages.");
            for (Map.Entry<String, String> valueEntry : propertyValues.entrySet()) {
                String propertyValue = valueEntry.getKey();
                String fromStage = valueEntry.getValue();
                errMsg.append(" stage '").append(fromStage).append("' = '").append(propertyValue).append("',");
            }
            errMsg.deleteCharAt(errMsg.length() - 1);
            throw new IllegalArgumentException(errMsg.toString());
        }
        pipelineProperties.put(propertyName, propertyValues.keySet().iterator().next());
    }
    specBuilder.addConnections(config.getConnections())
        .setResources(config.getResources())
        .setDriverResources(config.getDriverResources())
        .setClientResources(config.getClientResources())
        .setStageLoggingEnabled(config.isStageLoggingEnabled())
        .setNumOfRecordsPreview(config.getNumOfRecordsPreview())
        .setProperties(pipelineProperties)
        .addConnectionsUsed(connectionEvaluator.getUsedConnections())
        .build();
}
Also used : HashMap(java.util.HashMap) DefaultStageConfigurer(io.cdap.cdap.etl.common.DefaultStageConfigurer) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) DefaultPipelineConfigurer(io.cdap.cdap.etl.common.DefaultPipelineConfigurer) Map(java.util.Map)
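
The conflict check above uses Guava's Table with row = property name, column = property value, and cell = the stage that set it, so a row with more than one column means two stages disagree on a value. A small self-contained sketch of that pattern; the stage and property names here are made up for illustration:

    import com.google.common.collect.HashBasedTable;
    import com.google.common.collect.Table;
    import java.util.Map;

    public class PropertyConflicts {
        public static void main(String[] args) {
            // row = property name, column = property value, cell value = stage that set it
            Table<String, String, String> propertiesFromStages = HashBasedTable.create();
            propertiesFromStages.put("spark.executor.memory", "2g", "aggregator");
            propertiesFromStages.put("spark.executor.memory", "4g", "joiner");
            propertiesFromStages.put("spark.speculation", "true", "sink");

            for (String propertyName : propertiesFromStages.rowKeySet()) {
                Map<String, String> values = propertiesFromStages.row(propertyName);
                if (values.size() > 1) {
                    // two stages asked for different values of the same property
                    System.out.println("Conflict on " + propertyName + ": " + values);
                }
            }
        }
    }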

Example 48 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.

From the class DataStreamsTest, method testTransformComputeWithMacros.

@Test
public void testTransformComputeWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = new ArrayList<>();
    StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
    StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
    StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
    StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
    input.add(samuelRecord);
    input.add(jacksonRecord);
    input.add(dwayneRecord);
    input.add(johnsonRecord);
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
        .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
        .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
        .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
        .addConnection("source", "sleep")
        .addConnection("sleep", "filter1")
        .addConnection("filter1", "filter2")
        .addConnection("filter2", "sink")
        .setBatchInterval("1s")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(samuelRecord);
    expected.add(jacksonRecord);
    testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
    validateMetric(appId, "source.records.out", 4);
    validateMetric(appId, "sleep.records.in", 4);
    validateMetric(appId, "sleep.records.out", 4);
    validateMetric(appId, "filter1.records.in", 4);
    validateMetric(appId, "filter1.records.out", 3);
    validateMetric(appId, "filter2.records.in", 3);
    validateMetric(appId, "filter2.records.out", 2);
    validateMetric(appId, "sink.records.in", 2);
    Assert.assertTrue(getMetric(appId, "sleep." + io.cdap.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
    expected.clear();
    expected.add(dwayneRecord);
    expected.add(johnsonRecord);
    testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) HashSet(java.util.HashSet) Test(org.junit.Test)
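
The pipeline above leaves ${output}, ${field}, ${val1} and ${val2} unresolved until run time, so the helper testTransformComputeRun presumably starts the Spark program with those macros supplied as runtime arguments and then polls the named output dataset. A hedged sketch of what such a helper could look like; the actual helper in DataStreamsTest may differ:

    private void testTransformComputeRun(ApplicationManager appManager, Set<StructuredRecord> expected,
                                         String val1, String val2, String outputName) throws Exception {
        // macro values are provided as runtime arguments when the streaming program starts;
        // binding "field" to "name" is an assumption made for this sketch
        Map<String, String> runtimeArgs = ImmutableMap.of(
            "field", "name",
            "val1", val1,
            "val2", val2,
            "output", outputName);
        SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
        sparkManager.start(runtimeArgs);
        sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);

        // poll the output dataset until the records that survive both filters show up
        DataSetManager<Table> outputManager = getDataset(outputName);
        Tasks.waitFor(true, () -> {
            outputManager.flush();
            return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
        }, 4, TimeUnit.MINUTES);

        sparkManager.stop();
        sparkManager.waitForStopped(10, TimeUnit.SECONDS);
    }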

Example 49 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.

From the class DataStreamsTest, method testAutoJoinNullEquality.

private void testAutoJoinNullEquality(boolean nullSafe) throws Exception {
    /*
     * customers ----------|
     *                     |
     *                     |---> join ---> sink
     *                     |
     * transactions -------|
     */
    Schema inputSchema1 = Schema.recordOf(
        "customer",
        Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Schema inputSchema2 = Schema.recordOf(
        "transaction",
        Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)));
    Schema outSchema = Schema.recordOf(
        "customers.transactions",
        Schema.Field.of("customers_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customers_customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("transactions_t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("transactions_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("transactions_item_id", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
    StructuredRecord trans1 = StructuredRecord.builder(inputSchema2).set("t_id", "1").set("customer_id", "1").set("item_id", "11").build();
    StructuredRecord trans2 = StructuredRecord.builder(inputSchema2).set("t_id", "2").set("customer_id", "3").set("item_id", "22").build();
    StructuredRecord trans3 = StructuredRecord.builder(inputSchema2).set("t_id", "3").set("item_id", "33").build();
    List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
    List<StructuredRecord> input2 = ImmutableList.of(trans1, trans2, trans3);
    String outputName = UUID.randomUUID().toString();
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("customers", MockSource.getPlugin(inputSchema1, input1)))
        .addStage(new ETLStage("transactions", MockSource.getPlugin(inputSchema2, input2)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("customers", "transactions"),
                                                                Collections.singletonList("customer_id"),
                                                                Collections.singletonList("transactions"),
                                                                Collections.emptyList(), Collections.emptyList(),
                                                                nullSafe)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(outputName)))
        .addConnection("customers", "join")
        .addConnection("transactions", "join")
        .addConnection("join", "sink")
        .setBatchInterval("5s")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    StructuredRecord join1 = StructuredRecord.builder(outSchema).set("customers_customer_id", "1").set("customers_customer_name", "samuel").set("transactions_t_id", "1").set("transactions_customer_id", "1").set("transactions_item_id", "11").build();
    StructuredRecord join2 = StructuredRecord.builder(outSchema).set("customers_customer_id", "3").set("customers_customer_name", "jane").set("transactions_t_id", "2").set("transactions_customer_id", "3").set("transactions_item_id", "22").build();
    StructuredRecord join3;
    if (nullSafe) {
        // this transaction has a null customer id, which should match with the null id from customers
        join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").set("customers_customer_name", "bob").build();
    } else {
        // this transaction has a null customer id, which should not match with the null id from customers
        join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").build();
    }
    Set<StructuredRecord> expected = ImmutableSet.of(join1, join2, join3);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
        return expected.equals(outputRecords);
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)
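
The nullSafe flag only changes how a null customer_id on both sides is treated: with null-safe equality the two nulls match (so join3 carries bob's name), without it they do not. A tiny illustration of that distinction, independent of the join plugin:

    import java.util.Objects;

    public class JoinKeyEquality {
        /** Null-safe: two nulls are considered equal, like SQL's IS NOT DISTINCT FROM. */
        static boolean nullSafeMatch(String left, String right) {
            return Objects.equals(left, right);
        }

        /** Not null-safe: a null key never matches anything, like a plain SQL equality predicate. */
        static boolean plainMatch(String left, String right) {
            return left != null && right != null && left.equals(right);
        }

        public static void main(String[] args) {
            System.out.println(nullSafeMatch(null, null)); // true  -> bob joins to the null-id transaction
            System.out.println(plainMatch(null, null));    // false -> the transaction keeps only its own fields
        }
    }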

Example 50 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by cdapio.

From the class DataStreamsTest, method testWindower.

@Test
public void testWindower() throws Exception {
    /*
     * source --> window(width=30,interval=1) --> aggregator --> filter --> sink
     */
    Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build());
    String sinkName = "windowOut";
    // source sleeps 1 second between outputs
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
        .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
        .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
        .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
        .addConnection("source", "window")
        .addConnection("window", "agg")
        .addConnection("agg", "filter")
        .addConnection("filter", "sink")
        .setBatchInterval("1s")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    // the sink should contain at least one record with count of 3, and no records with more than 3.
    // less than 3 if the window doesn't contain all 3 records yet, but there should eventually be a window
    // that contains all 3.
    final DataSetManager<Table> outputManager = getDataset(sinkName);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            outputManager.flush();
            boolean sawThree = false;
            for (StructuredRecord record : MockSink.readOutput(outputManager)) {
                long count = record.get("ct");
                if (count == 3L) {
                    sawThree = true;
                }
                Assert.assertTrue(count <= 3L);
            }
            return sawThree;
        }
    }, 2, TimeUnit.MINUTES);
    sparkManager.stop();
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) TopicNotFoundException(io.cdap.cdap.api.messaging.TopicNotFoundException) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) Test(org.junit.Test)
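
The anonymous Callable could also be written as a lambda, matching the style of the auto-join example above; a sketch of the same check:

    Tasks.waitFor(true, () -> {
        outputManager.flush();
        boolean sawThree = false;
        for (StructuredRecord record : MockSink.readOutput(outputManager)) {
            long count = record.get("ct");
            // no window should ever report more than the 3 records that exist
            Assert.assertTrue(count <= 3L);
            if (count == 3L) {
                sawThree = true;
            }
        }
        return sawThree;
    }, 2, TimeUnit.MINUTES);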

Aggregations

ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 220
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 207
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 152
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 152
Test (org.junit.Test): 152
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 151
Table (io.cdap.cdap.api.dataset.table.Table): 119
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 111
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 111
Schema (io.cdap.cdap.api.data.schema.Schema): 98
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 76
ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 69
HashSet (java.util.HashSet): 36
HashMap (java.util.HashMap): 32
ImmutableMap (com.google.common.collect.ImmutableMap): 29
ArrayList (java.util.ArrayList): 28
TimeoutException (java.util.concurrent.TimeoutException): 22
Map (java.util.Map): 18
SpamMessage (io.cdap.cdap.datapipeline.mock.SpamMessage): 16
Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage): 14