Example 1 with ETLTransformationPushdown

Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.

From class PipelineSpecGeneratorTest, method testSQLEngineNotEnabled.

@Test
public void testSQLEngineNotEnabled() throws ValidationException {
    ETLBatchConfig config = ETLBatchConfig.builder()
        .setTimeSchedule("* * * * *")
        .addStage(new ETLStage("action", MOCK_ACTION))
        // Pushdown is explicitly disabled, so the configured SQL engine should be ignored.
        .setPushdownEnabled(false)
        .setTransformationPushdown(new ETLTransformationPushdown(MOCK_SQL_ENGINE))
        .build();
    PipelineSpec actual = specGenerator.generateSpec(config);
    Map<String, String> emptyMap = ImmutableMap.of();
    PipelineSpec expected = BatchPipelineSpec.builder()
        .addStage(StageSpec.builder("action",
            new PluginSpec(Action.PLUGIN_TYPE, "mockaction", emptyMap, ARTIFACT_ID)).build())
        .setResources(config.getResources())
        .setDriverResources(config.getDriverResources())
        .setClientResources(config.getClientResources())
        .setStageLoggingEnabled(config.isStageLoggingEnabled())
        // No SQL engine stage spec is expected when pushdown is disabled.
        .setSqlEngineStageSpec(null)
        .build();
    Assert.assertEquals(expected, actual);
}
Also used: ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), PluginSpec (io.cdap.cdap.etl.proto.v2.spec.PluginSpec), ETLTransformationPushdown (io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec), BatchPipelineSpec (io.cdap.cdap.etl.batch.BatchPipelineSpec), Test (org.junit.Test)

Example 2 with ETLTransformationPushdown

Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.

From class PipelineSpecGeneratorTest, method testSQLEngine.

@Test
public void testSQLEngine() throws ValidationException {
    ETLBatchConfig config = ETLBatchConfig.builder()
        .setTimeSchedule("* * * * *")
        .addStage(new ETLStage("action", MOCK_ACTION))
        // Pushdown is enabled, so the generated spec should include a SQL engine stage.
        .setPushdownEnabled(true)
        .setTransformationPushdown(new ETLTransformationPushdown(MOCK_SQL_ENGINE))
        .build();
    PipelineSpec actual = specGenerator.generateSpec(config);
    Map<String, String> emptyMap = ImmutableMap.of();
    PipelineSpec expected = BatchPipelineSpec.builder()
        .addStage(StageSpec.builder("action",
            new PluginSpec(Action.PLUGIN_TYPE, "mockaction", emptyMap, ARTIFACT_ID)).build())
        .setResources(config.getResources())
        .setDriverResources(config.getDriverResources())
        .setClientResources(config.getClientResources())
        .setStageLoggingEnabled(config.isStageLoggingEnabled())
        // With pushdown enabled, the spec carries a stage for the configured SQL engine.
        .setSqlEngineStageSpec(StageSpec.builder("sqlengine_mocksqlengine",
            new PluginSpec(BatchSQLEngine.PLUGIN_TYPE, "mocksqlengine", emptyMap, ARTIFACT_ID)).build())
        .build();
    Assert.assertEquals(expected, actual);
}
Also used: ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), PluginSpec (io.cdap.cdap.etl.proto.v2.spec.PluginSpec), ETLTransformationPushdown (io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec), BatchPipelineSpec (io.cdap.cdap.etl.batch.BatchPipelineSpec), Test (org.junit.Test)
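
The two tests above reduce to a single pattern: describe the SQL engine as an ETLPlugin, wrap it in an ETLTransformationPushdown, and gate it with setPushdownEnabled. Below is a minimal sketch of that pattern; the plugin name "myengine", the empty properties map, and the exact import locations are assumptions for illustration, not taken from the tests.

import java.util.Collections;

import io.cdap.cdap.etl.api.engine.sql.BatchSQLEngine;
import io.cdap.cdap.etl.proto.v2.ETLBatchConfig;
import io.cdap.cdap.etl.proto.v2.ETLPlugin;
import io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown;

public class PushdownConfigSketch {

    // Attaches a hypothetical SQL engine plugin to a pipeline under construction.
    static ETLBatchConfig withPushdown(ETLBatchConfig.Builder pipeline, boolean enabled) {
        ETLPlugin sqlEngine = new ETLPlugin(
            "myengine",                  // hypothetical engine plugin name
            BatchSQLEngine.PLUGIN_TYPE,  // the SQL engine plugin type
            Collections.emptyMap());     // engine-specific properties would go here
        return pipeline
            // When the flag is false, the spec generator ignores the pushdown
            // configuration entirely, as testSQLEngineNotEnabled verifies.
            .setPushdownEnabled(enabled)
            .setTransformationPushdown(new ETLTransformationPushdown(sqlEngine))
            .build();
    }
}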

Example 3 with ETLTransformationPushdown

Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.

From class AutoJoinerTest, method testSimpleAutoJoinUsingSQLEngineWithCapabilities.

private void testSimpleAutoJoinUsingSQLEngineWithCapabilities(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Schema expectedSchema, Engine engine) throws Exception {
    File joinOutputDir = TMP_FOLDER.newFolder();
    /*
         users ------|
                     |--> join --> sink
         purchases --|


         joinOn: users.region = purchases.region and users.user_id = purchases.user_id
     */
    String userInput = UUID.randomUUID().toString();
    String purchaseInput = UUID.randomUUID().toString();
    String sinkOutput = UUID.randomUUID().toString();
    String sinkWithWriteCapabilitiesOutput = UUID.randomUUID().toString();
    String sqlEnginePlugin = UUID.randomUUID().toString();
    ETLBatchConfig config = ETLBatchConfig.builder()
        .setPushdownEnabled(true)
        .setTransformationPushdown(new ETLTransformationPushdown(
            MockSQLEngineWithCapabilities.getPlugin(sqlEnginePlugin, joinOutputDir.getAbsolutePath(),
                                                    expectedSchema, expected)))
        .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
        .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(
            Arrays.asList("purchases", "users"), Arrays.asList("region", "user_id"),
            required, broadcast, Collections.emptyList(), true)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkOutput)))
        .addStage(new ETLStage("sinkwithwritecapability",
            MockSinkWithWriteCapability.getPlugin(sinkWithWriteCapabilitiesOutput)))
        .addConnection("users", "join")
        .addConnection("purchases", "join")
        .addConnection("join", "sink")
        .addConnection("join", "sinkwithwritecapability")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
    DataSetManager<Table> inputManager = getDataset(userInput);
    MockSource.writeInput(inputManager, userData);
    List<StructuredRecord> purchaseData = new ArrayList<>();
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
    inputManager = getDataset(purchaseInput);
    MockSource.writeInput(inputManager, purchaseData);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    Map<String, String> args = ImmutableMap.<String, String>builder().put(MockAutoJoiner.PARTITIONS_ARGUMENT, "1").put(io.cdap.cdap.etl.common.Constants.CONSOLIDATE_STAGES, "false").build();
    workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(sinkOutput);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
    validateMetric(5, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
    if (broadcast.isEmpty()) {
        // Ensure all records were written to the SQL engine
        Assert.assertEquals(5, MockSQLEngine.countLinesInDirectory(joinOutputDir));
        validateMetric(12345, appId, "MockWithWriteCapability.records.in");
        validateMetric(12345, appId, "MockWithWriteCapability.records.out");
    } else {
        // Ensure no records are written to the SQL engine if the join contains a broadcast.
        Assert.assertEquals(0, MockSQLEngine.countLinesInDirectory(joinOutputDir));
    }
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLTransformationPushdown (io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), File (java.io.File)

Example 4 with ETLTransformationPushdown

Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.

From class AutoJoinerTest, method testSimpleAutoJoinUsingSQLEngine.

private void testSimpleAutoJoinUsingSQLEngine(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Schema expectedSchema, Engine engine) throws Exception {
    File joinInputDir = TMP_FOLDER.newFolder();
    File joinOutputDir = TMP_FOLDER.newFolder();
    // If any side of the join is broadcast, the SQL engine won't be used, so there is no need to write its input.
    if (broadcast.isEmpty()) {
        String joinFile = "join-file1.txt";
        MockSQLEngine.writeInput(new File(joinInputDir, joinFile).getAbsolutePath(), expected);
    }
    /*
         users ------|
                     |--> join --> sink
         purchases --|


         joinOn: users.region = purchases.region and users.user_id = purchases.user_id
     */
    String userInput = UUID.randomUUID().toString();
    String purchaseInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    String sqlEnginePlugin = UUID.randomUUID().toString();
    ETLBatchConfig config = ETLBatchConfig.builder()
        .setPushdownEnabled(true)
        .setTransformationPushdown(new ETLTransformationPushdown(
            MockSQLEngine.getPlugin(sqlEnginePlugin, joinInputDir.getAbsolutePath(),
                                    joinOutputDir.getAbsolutePath(), expectedSchema)))
        .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
        .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(
            Arrays.asList("purchases", "users"), Arrays.asList("region", "user_id"),
            required, broadcast, Collections.emptyList(), true)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("users", "join")
        .addConnection("purchases", "join")
        .addConnection("join", "sink")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
    DataSetManager<Table> inputManager = getDataset(userInput);
    MockSource.writeInput(inputManager, userData);
    List<StructuredRecord> purchaseData = new ArrayList<>();
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
    inputManager = getDataset(purchaseInput);
    MockSource.writeInput(inputManager, purchaseData);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    Map<String, String> args = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
    workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
    validateMetric(5, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
    if (broadcast.isEmpty()) {
        // Ensure all records were written to the SQL engine
        Assert.assertEquals(5, MockSQLEngine.countLinesInDirectory(joinOutputDir));
    } else {
        // Ensure no records are written to the SQL engine if the join contains a broadcast.
        Assert.assertEquals(0, MockSQLEngine.countLinesInDirectory(joinOutputDir));
    }
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLTransformationPushdown (io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), File (java.io.File)
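
The assertions in this and the previous example hinge on MockSQLEngine.countLinesInDirectory, a test-internal helper whose source is not shown here. Reading the assertions, its assumed behavior is a plain line count over the files the engine wrote to its output directory: 5 matches the five records that flow into the join (three users plus two purchases), and 0 confirms the engine was bypassed for broadcast joins. A minimal equivalent under that assumption, not the actual implementation, might look like:

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.stream.Stream;

final class LineCounter {

    // Counts lines across all regular files directly inside dir.
    static long countLinesInDirectory(File dir) throws IOException {
        long count = 0;
        File[] files = dir.listFiles();
        if (files == null) {
            // dir does not exist or is not a directory
            return 0;
        }
        for (File f : files) {
            if (f.isFile()) {
                try (Stream<String> lines = Files.lines(f.toPath())) {
                    count += lines.count();
                }
            }
        }
        return count;
    }
}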

Example 5 with ETLTransformationPushdown

Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.

From class AutoJoinerTest, method testSimpleAutoJoinUsingSQLEngineWithStageSettings.

private void testSimpleAutoJoinUsingSQLEngineWithStageSettings(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Schema expectedSchema, String includedStages, String excludedStages, Engine engine) throws Exception {
    File joinInputDir = TMP_FOLDER.newFolder();
    File joinOutputDir = TMP_FOLDER.newFolder();
    // If any side of the join is broadcast, the SQL engine won't be used, so there is no need to write its input.
    if (broadcast.isEmpty()) {
        String joinFile = "join-file1.txt";
        MockSQLEngine.writeInput(new File(joinInputDir, joinFile).getAbsolutePath(), expected);
    }
    /*
         users ------|
                     |--> join --> sink
         purchases --|


         joinOn: users.region = purchases.region and users.user_id = purchases.user_id
     */
    String userInput = UUID.randomUUID().toString();
    String purchaseInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    String sqlEnginePlugin = UUID.randomUUID().toString();
    ETLBatchConfig config = ETLBatchConfig.builder()
        .setPushdownEnabled(true)
        .setTransformationPushdown(new ETLTransformationPushdown(
            MockSQLEngineWithStageSettings.getPlugin(sqlEnginePlugin, joinInputDir.getAbsolutePath(),
                                                     joinOutputDir.getAbsolutePath(), expectedSchema,
                                                     includedStages, excludedStages)))
        .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
        .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(
            Arrays.asList("purchases", "users"), Arrays.asList("region", "user_id"),
            required, broadcast, Collections.emptyList(), true)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("users", "join")
        .addConnection("purchases", "join")
        .addConnection("join", "sink")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
    DataSetManager<Table> inputManager = getDataset(userInput);
    MockSource.writeInput(inputManager, userData);
    List<StructuredRecord> purchaseData = new ArrayList<>();
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
    inputManager = getDataset(purchaseInput);
    MockSource.writeInput(inputManager, purchaseData);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    Map<String, String> args = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
    workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
    validateMetric(5, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
    // If a broadcast stage is explicitly included for engine execution, the engine must execute it.
    if (!includedStages.isEmpty() && !broadcast.isEmpty()) {
        // Ensure all records were written to the SQL engine.
        Assert.assertEquals(5, MockSQLEngineWithStageSettings.countLinesInDirectory(joinOutputDir));
    } else if (!excludedStages.isEmpty()) {
        // Ensure no records were written to the SQL engine when the join stage is excluded from pushdown.
        Assert.assertEquals(0, MockSQLEngineWithStageSettings.countLinesInDirectory(joinOutputDir));
    } else {
        Assert.fail("Should never happen");
    }
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLTransformationPushdown (io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), File (java.io.File)

Aggregations

ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 5 uses
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 5 uses
ETLTransformationPushdown (io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown): 5 uses
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 3 uses
Table (io.cdap.cdap.api.dataset.table.Table): 3 uses
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 3 uses
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 3 uses
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 3 uses
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 3 uses
File (java.io.File): 3 uses
ArrayList (java.util.ArrayList): 3 uses
BatchPipelineSpec (io.cdap.cdap.etl.batch.BatchPipelineSpec): 2 uses
PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec): 2 uses
PluginSpec (io.cdap.cdap.etl.proto.v2.spec.PluginSpec): 2 uses
Test (org.junit.Test): 2 uses