Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class AutoJoinerTest, method testTripleAutoJoin.
private void testTripleAutoJoin(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Engine engine, List<String> tablesInOrderToJoin) throws Exception {
/*
users ------|
|
purchases --|--> join --> sink
|
interests --|
joinOn: users.region = purchases.region = interests.region and
users.user_id = purchases.user_id = interests.user_id
*/
String userInput = UUID.randomUUID().toString();
String purchaseInput = UUID.randomUUID().toString();
String interestInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
ETLBatchConfig config = ETLBatchConfig.builder()
  .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
  .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
  .addStage(new ETLStage("interests", MockSource.getPlugin(interestInput, INTEREST_SCHEMA)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(tablesInOrderToJoin, Arrays.asList("region", "user_id"),
                                                          required, broadcast, Collections.emptyList(), true)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
  .addConnection("users", "join")
  .addConnection("purchases", "join")
  .addConnection("interests", "join")
  .addConnection("join", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
DataSetManager<Table> inputManager = getDataset(userInput);
MockSource.writeInput(inputManager, userData);
List<StructuredRecord> purchaseData = new ArrayList<>();
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
inputManager = getDataset(purchaseInput);
MockSource.writeInput(inputManager, purchaseData);
List<StructuredRecord> interestData = new ArrayList<>();
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "food").build());
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "sports").build());
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 1).set("interest", "gardening").build());
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 2).set("interest", "gaming").build());
inputManager = getDataset(interestInput);
MockSource.writeInput(inputManager, interestData);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Set<StructuredRecord> actual = new HashSet<>();
Schema expectedSchema = expected.iterator().hasNext() ? expected.iterator().next().getSchema() : null;
if (expectedSchema == null || expected.iterator().next().getSchema() == outputRecords.get(0).getSchema()) {
actual = new HashSet<>(outputRecords);
} else {
// reorder the output columns of the join result (actual) to match the column order of expected
for (StructuredRecord sr : outputRecords) {
actual.add(StructuredRecord.builder(expectedSchema)
             .set("purchases_region", sr.get("purchases_region"))
             .set("purchases_purchase_id", sr.get("purchases_purchase_id"))
             .set("purchases_user_id", sr.get("purchases_user_id"))
             .set("users_region", sr.get("users_region"))
             .set("users_user_id", sr.get("users_user_id"))
             .set("users_name", sr.get("users_name"))
             .set("interests_region", sr.get("interests_region"))
             .set("interests_user_id", sr.get("interests_user_id"))
             .set("interests_interest", sr.get("interests_interest"))
             .build());
}
}
Assert.assertEquals(expected, actual);
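// 9 records flow into the join: 3 users + 2 purchases + 4 interests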
validateMetric(9, appId, "join.records.in");
validateMetric(expected.size(), appId, "join.records.out");
}
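The required list passed to MockAutoJoiner.getPlugin controls which inputs must have a matching row, so callers of this helper switch between inner and outer joins simply by varying it. A hypothetical caller might look like the following sketch; the test name and the way the expected set is filled are illustrative assumptions, not code taken from AutoJoinerTest.

@Test
public void testTripleInnerJoinSketch() throws Exception {
  // Requiring all three stages makes the join an inner join on (region, user_id).
  Set<StructuredRecord> expected = new HashSet<>();
  // Populate 'expected' with records in the prefixed output schema
  // (users_*, purchases_*, interests_*) used by the reorder branch above.
  testTripleAutoJoin(Arrays.asList("users", "purchases", "interests"),
                     Collections.emptyList(), expected, Engine.SPARK,
                     Arrays.asList("purchases", "users", "interests"));
}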
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class AutoJoinerTest, method testQuadAutoJoin.
private void testQuadAutoJoin(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Engine engine, List<String> tablesInOrderToJoin) throws Exception {
/*
users ------|
|
purchases --|--> join --> sink
|
interests --|
|
age --------|
joinOn: users.region = purchases.region = interests.region = age.region and
users.user_id = purchases.user_id = interests.user_id = age.user_id
*/
String userInput = UUID.randomUUID().toString();
String purchaseInput = UUID.randomUUID().toString();
String interestInput = UUID.randomUUID().toString();
String ageInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
ETLBatchConfig config = ETLBatchConfig.builder()
  .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
  .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
  .addStage(new ETLStage("interests", MockSource.getPlugin(interestInput, INTEREST_SCHEMA)))
  .addStage(new ETLStage("ages", MockSource.getPlugin(ageInput, AGE_SCHEMA)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(tablesInOrderToJoin, Arrays.asList("region", "user_id"),
                                                          required, broadcast, Collections.emptyList(), true)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
  .addConnection("users", "join")
  .addConnection("purchases", "join")
  .addConnection("interests", "join")
  .addConnection("ages", "join")
  .addConnection("join", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
DataSetManager<Table> inputManager = getDataset(userInput);
MockSource.writeInput(inputManager, userData);
List<StructuredRecord> purchaseData = new ArrayList<>();
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
inputManager = getDataset(purchaseInput);
MockSource.writeInput(inputManager, purchaseData);
List<StructuredRecord> interestData = new ArrayList<>();
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "food").build());
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "sports").build());
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 1).set("interest", "gardening").build());
interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 2).set("interest", "gaming").build());
inputManager = getDataset(interestInput);
MockSource.writeInput(inputManager, interestData);
List<StructuredRecord> ageData = new ArrayList<>();
ageData.add(StructuredRecord.builder(AGE_SCHEMA).set("region", "us").set("user_id", 10).set("age", 20).build());
ageData.add(StructuredRecord.builder(AGE_SCHEMA).set("region", "us").set("user_id", 1).set("age", 30).build());
inputManager = getDataset(ageInput);
MockSource.writeInput(inputManager, ageData);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Set<StructuredRecord> actual = new HashSet<>();
Schema expectedSchema = expected.iterator().hasNext() ? expected.iterator().next().getSchema() : null;
if (expectedSchema == null || expected.iterator().next().getSchema() == outputRecords.get(0).getSchema()) {
actual = new HashSet<>(outputRecords);
} else {
// reorder the output columns of the join result (actual) to match the column order of expected
for (StructuredRecord sr : outputRecords) {
actual.add(StructuredRecord.builder(expectedSchema)
             .set("ages_region", sr.get("ages_region"))
             .set("ages_age", sr.get("ages_age"))
             .set("ages_user_id", sr.get("ages_user_id"))
             .set("purchases_region", sr.get("purchases_region"))
             .set("purchases_purchase_id", sr.get("purchases_purchase_id"))
             .set("purchases_user_id", sr.get("purchases_user_id"))
             .set("users_region", sr.get("users_region"))
             .set("users_user_id", sr.get("users_user_id"))
             .set("users_name", sr.get("users_name"))
             .set("interests_region", sr.get("interests_region"))
             .set("interests_user_id", sr.get("interests_user_id"))
             .set("interests_interest", sr.get("interests_interest"))
             .build());
}
}
Assert.assertEquals(expected, actual);
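// 11 records flow into the join: 3 users + 2 purchases + 4 interests + 2 ages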
validateMetric(11, appId, "join.records.in");
validateMetric(expected.size(), appId, "join.records.out");
}
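Omitting a stage from the required list makes that side optional, turning the join into an outer join, and naming a stage in broadcast hints that its (small) input may be replicated to every task. Another hedged sketch of a caller, again with an illustrative name and the expected set left to the caller:

@Test
public void testQuadOuterJoinSketch() throws Exception {
  // Only users and purchases must match; interests and ages become outer sides,
  // and the small ages input is marked for broadcast.
  Set<StructuredRecord> expected = new HashSet<>();
  // Populate 'expected' with the outer-join results.
  testQuadAutoJoin(Arrays.asList("users", "purchases"),
                   Collections.singletonList("ages"), expected, Engine.SPARK,
                   Arrays.asList("purchases", "users", "interests", "ages"));
}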
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class AutoJoinerTest, method testCaseSensitivity.
@Test
public void testCaseSensitivity() throws Exception {
Schema weird1 = Schema.recordOf("weird1",
                                Schema.Field.of("id", Schema.of(Schema.Type.INT)),
                                Schema.Field.of("ID", Schema.of(Schema.Type.LONG)),
                                Schema.Field.of("Id", Schema.of(Schema.Type.INT)),
                                Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
Schema weird2 = Schema.recordOf("weird2",
                                Schema.Field.of("id", Schema.of(Schema.Type.INT)),
                                Schema.Field.of("ID", Schema.of(Schema.Type.LONG)),
                                Schema.Field.of("val", Schema.of(Schema.Type.STRING)));
String input1 = UUID.randomUUID().toString();
String input2 = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
ETLBatchConfig config = ETLBatchConfig.builder()
  .addStage(new ETLStage("i1", MockSource.getPlugin(input1, weird1)))
  .addStage(new ETLStage("i2", MockSource.getPlugin(input2, weird2)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("i1", "i2"), Arrays.asList("id", "ID"),
                                                          Arrays.asList("i1", "i2"), Collections.emptyList(),
                                                          Collections.emptyList(), true)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
  .addConnection("i1", "join")
  .addConnection("i2", "join")
  .addConnection("join", "sink")
  .setEngine(Engine.SPARK)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
List<StructuredRecord> input1Data = new ArrayList<>();
input1Data.add(StructuredRecord.builder(weird1).set("id", 0).set("ID", 99L).set("Id", 0).set("name", "zero").build());
input1Data.add(StructuredRecord.builder(weird1).set("id", 1).set("ID", 0L).set("Id", 0).set("name", "one").build());
DataSetManager<Table> inputManager = getDataset(input1);
MockSource.writeInput(inputManager, input1Data);
List<StructuredRecord> input2Data = new ArrayList<>();
input2Data.add(StructuredRecord.builder(weird2).set("id", 0).set("ID", 99L).set("val", "0").build());
input2Data.add(StructuredRecord.builder(weird2).set("id", 1).set("ID", 99L).set("val", "1").build());
input2Data.add(StructuredRecord.builder(weird2).set("id", 0).set("ID", 0L).set("val", "2").build());
inputManager = getDataset(input2);
MockSource.writeInput(inputManager, input2Data);
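// Joining on both "id" (int) and "ID" (long) as distinct, case-sensitive keys means only
// input1's (id=0, ID=99) row matches input2's (id=0, ID=99, val="0") row, so a single
// output record is expected.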
Schema expectedSchema = Schema.recordOf("i1.i2",
                                        Schema.Field.of("i1_id", Schema.of(Schema.Type.INT)),
                                        Schema.Field.of("i1_ID", Schema.of(Schema.Type.LONG)),
                                        Schema.Field.of("i1_Id", Schema.of(Schema.Type.INT)),
                                        Schema.Field.of("i1_name", Schema.of(Schema.Type.STRING)),
                                        Schema.Field.of("i2_id", Schema.of(Schema.Type.INT)),
                                        Schema.Field.of("i2_ID", Schema.of(Schema.Type.LONG)),
                                        Schema.Field.of("i2_val", Schema.of(Schema.Type.STRING)));
StructuredRecord expected = StructuredRecord.builder(expectedSchema)
                              .set("i1_id", 0).set("i1_ID", 99L).set("i1_Id", 0).set("i1_name", "zero")
                              .set("i2_id", 0).set("i2_ID", 99L).set("i2_val", "0")
                              .build();
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
Map<String, String> args = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> actual = MockSink.readOutput(outputManager);
Assert.assertEquals(Collections.singletonList(expected), actual);
}
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class AutoJoinerTest, method testAutoJoinWithMacros.
private void testAutoJoinWithMacros(Engine engine, List<String> required, Schema expectedSchema, Set<StructuredRecord> expectedRecords, boolean excludeUsers, boolean excludePurchases) throws Exception {
/*
users ------|
|--> join --> sink
purchases --|
joinOn: users.region = purchases.region and users.user_id = purchases.user_id
*/
String userInput = UUID.randomUUID().toString();
String purchaseInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
Map<String, String> joinerProps = new HashMap<>();
joinerProps.put(MockAutoJoiner.Conf.STAGES, "${stages}");
joinerProps.put(MockAutoJoiner.Conf.KEY, "${key}");
joinerProps.put(MockAutoJoiner.Conf.REQUIRED, "${required}");
joinerProps.put(MockAutoJoiner.Conf.SELECT, "${select}");
if (engine == Engine.SPARK || (required.size() < 2 && engine == Engine.MAPREDUCE)) {
joinerProps.put(MockAutoJoiner.Conf.SCHEMA, "${schema}");
}
ETLBatchConfig config = ETLBatchConfig.builder()
  .addStage(new ETLStage("users", MockSource.getPlugin(userInput)))
  .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput)))
  .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProps)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
  .addConnection("users", "join")
  .addConnection("purchases", "join")
  .addConnection("join", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
if (!excludeUsers) {
List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
DataSetManager<Table> inputManager = getDataset(userInput);
MockSource.writeInput(inputManager, userData);
}
if (!excludePurchases) {
List<StructuredRecord> purchaseData = new ArrayList<>();
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
DataSetManager<Table> inputManager = getDataset(purchaseInput);
MockSource.writeInput(inputManager, purchaseData);
}
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
List<JoinField> selectedFields = new ArrayList<>();
selectedFields.add(new JoinField("purchases", "region"));
selectedFields.add(new JoinField("purchases", "purchase_id"));
selectedFields.add(new JoinField("purchases", "user_id"));
selectedFields.add(new JoinField("users", "name"));
Map<String, String> joinerProperties = MockAutoJoiner.getProperties(Arrays.asList("purchases", "users"),
                                                                    Arrays.asList("region", "user_id"),
                                                                    required, Collections.emptyList(),
                                                                    selectedFields, true);
Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("stages", joinerProperties.get(MockAutoJoiner.Conf.STAGES));
runtimeArgs.put("key", joinerProperties.get(MockAutoJoiner.Conf.KEY));
runtimeArgs.put("required", joinerProperties.get(MockAutoJoiner.Conf.REQUIRED));
runtimeArgs.put("select", joinerProperties.get(MockAutoJoiner.Conf.SELECT));
runtimeArgs.put("schema", expectedSchema.toString());
workflowManager.startAndWaitForGoodRun(runtimeArgs, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Assert.assertEquals(expectedRecords, new HashSet<>(outputRecords));
}
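Since the joiner's stages, key, required, and select properties are all macros here, the join is defined entirely by the runtime arguments the helper derives from MockAutoJoiner.getProperties. A hypothetical caller might look like the sketch below; buildExpectedSchema and buildExpectedRecords are assumed helpers standing in for whatever expected values the real tests construct.

// Illustrative only: require both sides (inner join) and write data to both inputs.
Schema expectedSchema = buildExpectedSchema();                  // assumed helper
Set<StructuredRecord> expectedRecords = buildExpectedRecords(); // assumed helper
testAutoJoinWithMacros(Engine.SPARK, Arrays.asList("purchases", "users"),
                       expectedSchema, expectedRecords,
                       false /* excludeUsers */, false /* excludePurchases */);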
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class DataPipelineServiceTest, method testConnectionMacroSubstitution.
@Test
public void testConnectionMacroSubstitution() throws Exception {
String stageName = "tx";
// test using no connection macro, it should behave as a normal plugin
Map<String, String> properties = Collections.singletonMap("tableName", "test");
ETLStage stage = new ETLStage(stageName, new ETLPlugin(MockSource.NAME, BatchSource.PLUGIN_TYPE, properties));
StageValidationRequest requestBody = new StageValidationRequest(stage, Collections.emptyList(), false);
StageValidationResponse actual = sendRequest(requestBody);
Assert.assertTrue(actual.getFailures().isEmpty());
// use a connection macro, the validation should also pass
properties = Collections.singletonMap("connectionConfig", "${conn(testconn)}");
addConnection("testconn", new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null, Collections.singletonMap("tableName", "newtest"), new ArtifactSelectorConfig())));
stage = new ETLStage(stageName + "conn", new ETLPlugin(MockSource.NAME, BatchSource.PLUGIN_TYPE, properties));
requestBody = new StageValidationRequest(stage, Collections.emptyList(), false);
actual = sendRequest(requestBody);
Assert.assertTrue(actual.getFailures().isEmpty());
// test the properties can still be correctly set if the connection property get evaluated to a json object
addConnection("testconn", new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null, ImmutableMap.of("tableName", "aaa", "key1", "${badval}"), new ArtifactSelectorConfig())));
getPreferencesService().setProperties(NamespaceId.DEFAULT, Collections.singletonMap("badval", "{\"a\" : 1}"));
// test it can still pass validation
actual = sendRequest(new StageValidationRequest(stage, Collections.emptyList(), true));
Assert.assertTrue(actual.getFailures().isEmpty());
}
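sendRequest is presumably a DataPipelineServiceTest helper that posts the StageValidationRequest to the pipeline service's stage validation endpoint and deserializes the StageValidationResponse; the assertions here only check that no validation failures come back, both when the connection macro resolves to plain properties and when one of those properties itself evaluates to a JSON object.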