Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.
The class PipelineSpecGeneratorTest, method testSQLEngineNotEnabled.
@Test
public void testSQLEngineNotEnabled() throws ValidationException {
ETLBatchConfig config = ETLBatchConfig.builder()
  .setTimeSchedule("* * * * *")
  .addStage(new ETLStage("action", MOCK_ACTION))
  .setPushdownEnabled(false)
  .setTransformationPushdown(new ETLTransformationPushdown(MOCK_SQL_ENGINE))
  .build();
PipelineSpec actual = specGenerator.generateSpec(config);
Map<String, String> emptyMap = ImmutableMap.of();
PipelineSpec expected = BatchPipelineSpec.builder()
  .addStage(StageSpec.builder("action",
      new PluginSpec(Action.PLUGIN_TYPE, "mockaction", emptyMap, ARTIFACT_ID)).build())
  .setResources(config.getResources())
  .setDriverResources(config.getDriverResources())
  .setClientResources(config.getClientResources())
  .setStageLoggingEnabled(config.isStageLoggingEnabled())
  .setSqlEngineStageSpec(null)
  .build();
Assert.assertEquals(expected, actual);
}
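For context, the same property could be asserted more directly. The sketch below is hypothetical: it assumes BatchPipelineSpec exposes a getSqlEngineStageSpec() getter matching the setSqlEngineStageSpec() builder call seen above, which is not shown in this excerpt.
// Hypothetical follow-up assertion (assumes a getSqlEngineStageSpec() getter exists):
// with pushdown disabled, the generated spec should carry no SQL engine stage.
BatchPipelineSpec batchSpec = (BatchPipelineSpec) specGenerator.generateSpec(config);
Assert.assertNull(batchSpec.getSqlEngineStageSpec());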
Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.
The class PipelineSpecGeneratorTest, method testSQLEngine.
@Test
public void testSQLEngine() throws ValidationException {
ETLBatchConfig config = ETLBatchConfig.builder()
  .setTimeSchedule("* * * * *")
  .addStage(new ETLStage("action", MOCK_ACTION))
  .setPushdownEnabled(true)
  .setTransformationPushdown(new ETLTransformationPushdown(MOCK_SQL_ENGINE))
  .build();
PipelineSpec actual = specGenerator.generateSpec(config);
Map<String, String> emptyMap = ImmutableMap.of();
PipelineSpec expected = BatchPipelineSpec.builder()
  .addStage(StageSpec.builder("action",
      new PluginSpec(Action.PLUGIN_TYPE, "mockaction", emptyMap, ARTIFACT_ID)).build())
  .setResources(config.getResources())
  .setDriverResources(config.getDriverResources())
  .setClientResources(config.getClientResources())
  .setStageLoggingEnabled(config.isStageLoggingEnabled())
  .setSqlEngineStageSpec(StageSpec.builder("sqlengine_mocksqlengine",
      new PluginSpec(BatchSQLEngine.PLUGIN_TYPE, "mocksqlengine", emptyMap, ARTIFACT_ID)).build())
  .build();
Assert.assertEquals(expected, actual);
}
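The only difference from the previous example is setPushdownEnabled(true); the spec generator then adds a dedicated SQL engine stage whose name, as seen in the expected spec above, is the engine plugin name prefixed with "sqlengine_". A minimal sketch of just the pushdown-related builder calls, reusing the identifiers from these tests:
// Minimal sketch of enabling Transformation Pushdown on an ETLBatchConfig,
// using the MOCK_SQL_ENGINE plugin from the tests above.
ETLBatchConfig pushdownConfig = ETLBatchConfig.builder()
  .setTimeSchedule("* * * * *")
  .addStage(new ETLStage("action", MOCK_ACTION))
  .setPushdownEnabled(true)
  .setTransformationPushdown(new ETLTransformationPushdown(MOCK_SQL_ENGINE))
  .build();
// The generated spec is expected to contain a stage named "sqlengine_mocksqlengine".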
Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.
The class AutoJoinerTest, method testSimpleAutoJoinUsingSQLEngineWithCapabilities.
private void testSimpleAutoJoinUsingSQLEngineWithCapabilities(
    List<String> required, List<String> broadcast, Set<StructuredRecord> expected,
    Schema expectedSchema, Engine engine) throws Exception {
File joinOutputDir = TMP_FOLDER.newFolder();
/*
    users ------|
                |--> join --> sink
    purchases --|

    joinOn: users.region = purchases.region and users.user_id = purchases.user_id
 */
String userInput = UUID.randomUUID().toString();
String purchaseInput = UUID.randomUUID().toString();
String sinkOutput = UUID.randomUUID().toString();
String sinkWithWriteCapabilitiesOutput = UUID.randomUUID().toString();
String sqlEnginePlugin = UUID.randomUUID().toString();
ETLBatchConfig config = ETLBatchConfig.builder()
  .setPushdownEnabled(true)
  .setTransformationPushdown(new ETLTransformationPushdown(
      MockSQLEngineWithCapabilities.getPlugin(sqlEnginePlugin, joinOutputDir.getAbsolutePath(),
                                              expectedSchema, expected)))
  .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
  .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("purchases", "users"),
      Arrays.asList("region", "user_id"), required, broadcast, Collections.emptyList(), true)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkOutput)))
  .addStage(new ETLStage("sinkwithwritecapability",
      MockSinkWithWriteCapability.getPlugin(sinkWithWriteCapabilitiesOutput)))
  .addConnection("users", "join")
  .addConnection("purchases", "join")
  .addConnection("join", "sink")
  .addConnection("join", "sinkwithwritecapability")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
DataSetManager<Table> inputManager = getDataset(userInput);
MockSource.writeInput(inputManager, userData);
List<StructuredRecord> purchaseData = new ArrayList<>();
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
inputManager = getDataset(purchaseInput);
MockSource.writeInput(inputManager, purchaseData);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
Map<String, String> args = ImmutableMap.<String, String>builder()
  .put(MockAutoJoiner.PARTITIONS_ARGUMENT, "1")
  .put(io.cdap.cdap.etl.common.Constants.CONSOLIDATE_STAGES, "false")
  .build();
workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(sinkOutput);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, new HashSet<>(outputRecords));
validateMetric(5, appId, "join.records.in");
validateMetric(expected.size(), appId, "join.records.out");
if (broadcast.isEmpty()) {
// Ensure all records were written to the SQL engine
Assert.assertEquals(5, MockSQLEngine.countLinesInDirectory(joinOutputDir));
validateMetric(12345, appId, "MockWithWriteCapability.records.in");
validateMetric(12345, appId, "MockWithWriteCapability.records.out");
} else {
// Ensure no records are written to the SQL engine if the join contains a broadcast.
Assert.assertEquals(0, MockSQLEngine.countLinesInDirectory(joinOutputDir));
}
}
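The assertions above hinge on MockSQLEngine.countLinesInDirectory, whose implementation is not shown here. A plausible sketch, assuming the mock engine writes each pushed-down record as one line in plain-text files under the output directory (an illustration, not the actual CDAP implementation):
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;

// Counts lines across all regular files in a directory; each line is assumed
// to represent one record written by the mock SQL engine.
static long countLinesInDirectory(File dir) throws IOException {
  long count = 0;
  File[] files = dir.listFiles();
  if (files == null) {
    return 0;
  }
  for (File file : files) {
    if (file.isFile()) {
      count += Files.readAllLines(file.toPath()).size();
    }
  }
  return count;
}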
Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.
The class AutoJoinerTest, method testSimpleAutoJoinUsingSQLEngine.
private void testSimpleAutoJoinUsingSQLEngine(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Schema expectedSchema, Engine engine) throws Exception {
File joinInputDir = TMP_FOLDER.newFolder();
File joinOutputDir = TMP_FOLDER.newFolder();
// If any side of the join is a broadcast, we don't need to write the expected output, since the SQL engine won't be used.
if (broadcast.isEmpty()) {
String joinFile = "join-file1.txt";
MockSQLEngine.writeInput(new File(joinInputDir, joinFile).getAbsolutePath(), expected);
}
/*
    users ------|
                |--> join --> sink
    purchases --|

    joinOn: users.region = purchases.region and users.user_id = purchases.user_id
 */
String userInput = UUID.randomUUID().toString();
String purchaseInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
String sqlEnginePlugin = UUID.randomUUID().toString();
ETLBatchConfig config = ETLBatchConfig.builder()
  .setPushdownEnabled(true)
  .setTransformationPushdown(new ETLTransformationPushdown(
      MockSQLEngine.getPlugin(sqlEnginePlugin, joinInputDir.getAbsolutePath(),
                              joinOutputDir.getAbsolutePath(), expectedSchema)))
  .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
  .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("purchases", "users"),
      Arrays.asList("region", "user_id"), required, broadcast, Collections.emptyList(), true)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
  .addConnection("users", "join")
  .addConnection("purchases", "join")
  .addConnection("join", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
DataSetManager<Table> inputManager = getDataset(userInput);
MockSource.writeInput(inputManager, userData);
List<StructuredRecord> purchaseData = new ArrayList<>();
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
inputManager = getDataset(purchaseInput);
MockSource.writeInput(inputManager, purchaseData);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
Map<String, String> args = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, new HashSet<>(outputRecords));
validateMetric(5, appId, "join.records.in");
validateMetric(expected.size(), appId, "join.records.out");
if (broadcast.isEmpty()) {
// Ensure all records were written to the SQL engine
Assert.assertEquals(5, MockSQLEngine.countLinesInDirectory(joinOutputDir));
} else {
// Ensure no records are written to the SQL engine if the join contains a broadcast.
Assert.assertEquals(0, MockSQLEngine.countLinesInDirectory(joinOutputDir));
}
}
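MockSQLEngine.writeInput, used above to pre-seed the join output that the mock engine will later return, is also not shown in this excerpt. A plausible sketch, assuming records are serialized as JSON lines via StructuredRecordStringConverter (an illustration under stated assumptions, not the real mock):
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collection;

import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.format.StructuredRecordStringConverter;

// Writes each expected join-output record as one JSON line, so the mock SQL
// engine can read the file back instead of actually executing the join.
static void writeInput(String path, Collection<StructuredRecord> records) throws IOException {
  try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(path), StandardCharsets.UTF_8)) {
    for (StructuredRecord record : records) {
      writer.write(StructuredRecordStringConverter.toJsonString(record));
      writer.newLine();
    }
  }
}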
Use of io.cdap.cdap.etl.proto.v2.ETLTransformationPushdown in project cdap by caskdata.
The class AutoJoinerTest, method testSimpleAutoJoinUsingSQLEngineWithStageSettings.
private void testSimpleAutoJoinUsingSQLEngineWithStageSettings(
    List<String> required, List<String> broadcast, Set<StructuredRecord> expected,
    Schema expectedSchema, String includedStages, String excludedStages, Engine engine) throws Exception {
File joinInputDir = TMP_FOLDER.newFolder();
File joinOutputDir = TMP_FOLDER.newFolder();
// If any side of the join is a broadcast, we don't need to write the expected output, since the SQL engine won't be used.
if (broadcast.isEmpty()) {
String joinFile = "join-file1.txt";
MockSQLEngine.writeInput(new File(joinInputDir, joinFile).getAbsolutePath(), expected);
}
/*
    users ------|
                |--> join --> sink
    purchases --|

    joinOn: users.region = purchases.region and users.user_id = purchases.user_id
 */
String userInput = UUID.randomUUID().toString();
String purchaseInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
String sqlEnginePlugin = UUID.randomUUID().toString();
ETLBatchConfig config = ETLBatchConfig.builder()
  .setPushdownEnabled(true)
  .setTransformationPushdown(new ETLTransformationPushdown(
      MockSQLEngineWithStageSettings.getPlugin(sqlEnginePlugin, joinInputDir.getAbsolutePath(),
                                               joinOutputDir.getAbsolutePath(), expectedSchema,
                                               includedStages, excludedStages)))
  .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
  .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("purchases", "users"),
      Arrays.asList("region", "user_id"), required, broadcast, Collections.emptyList(), true)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
  .addConnection("users", "join")
  .addConnection("purchases", "join")
  .addConnection("join", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
DataSetManager<Table> inputManager = getDataset(userInput);
MockSource.writeInput(inputManager, userData);
List<StructuredRecord> purchaseData = new ArrayList<>();
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
inputManager = getDataset(purchaseInput);
MockSource.writeInput(inputManager, purchaseData);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
Map<String, String> args = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, new HashSet<>(outputRecords));
validateMetric(5, appId, "join.records.in");
validateMetric(expected.size(), appId, "join.records.out");
// If a broadcast join stage is explicitly included for SQL engine execution, the engine must execute it
if (!includedStages.isEmpty() && !broadcast.isEmpty()) {
// Ensure all records were written to the SQL engine
Assert.assertEquals(5, MockSQLEngineWithStageSettings.countLinesInDirectory(joinOutputDir));
} else if (!excludedStages.isEmpty()) {
// Ensure no records are written to the SQL engine when the stage is excluded from pushdown.
Assert.assertEquals(0, MockSQLEngineWithStageSettings.countLinesInDirectory(joinOutputDir));
} else {
Assert.fail("Should never happen");
}
}
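The branching at the end of this test encodes the expected precedence of stage settings: an explicitly included stage runs on the SQL engine even when the join is broadcast, while an excluded stage does not. A hypothetical helper (names and precedence order are assumptions for illustration, not CDAP API) capturing that decision:
import java.util.Set;

// Hypothetical illustration of the pushdown decision verified above:
// excluded stages do not run on the SQL engine, included stages do,
// and by default broadcast joins stay on the native execution engine.
static boolean shouldRunOnSqlEngine(String stageName, Set<String> includedStages,
                                    Set<String> excludedStages, boolean isBroadcastJoin) {
  if (excludedStages.contains(stageName)) {
    return false;
  }
  if (includedStages.contains(stageName)) {
    return true;
  }
  return !isBroadcastJoin;
}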