use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.
the class AutoJoinerTest method testLeftOuterComplexConditionBroadcast.
@Test
public void testLeftOuterComplexConditionBroadcast() throws Exception {
/*
sales ----------|
|--> join --> sink
categories -----|
joinOn:
sales.price > 1000 and sales.date > 2020-01-01 and
(sales.category <=> categories.id or (sales.category is null and sales.department = categories.department))
*/
Schema salesSchema = Schema.recordOf("sale", Schema.Field.of("id", Schema.of(Schema.Type.INT)), Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)), Schema.Field.of("date", Schema.of(Schema.LogicalType.DATETIME)), Schema.Field.of("category", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
Schema categorySchema = Schema.recordOf("category", Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
Schema expectedSchema = Schema.recordOf("sales.categories", Schema.Field.of("id", Schema.of(Schema.Type.INT)), Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
String salesInput = UUID.randomUUID().toString();
String categoriesInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
List<JoinField> select = new ArrayList<>();
select.add(new JoinField("sales", "id"));
select.add(new JoinField("categories", "flag"));
/*
sales.price > 1000 and sales.date > 2020-01-01 and
(sales.category <=> categories.id or (sales.category is null and sales.department = categories.department))
*/
JoinCondition.OnExpression condition = JoinCondition.onExpression().addDatasetAlias("sales", "S").addDatasetAlias("categories", "C").setExpression("S.price > 1000 and S.date > '2020-01-01 00:00:00' and " + "(S.category = C.id or (S.category is null and S.department = C.department))").build();
Map<String, String> joinerProperties = MockAutoJoiner.getProperties(Arrays.asList("sales", "categories"), Collections.emptyList(), Collections.singletonList("sales"), Collections.singletonList("categories"), select, false, null, condition);
ETLBatchConfig config = ETLBatchConfig.builder().addStage(new ETLStage("sales", MockSource.getPlugin(salesInput, salesSchema))).addStage(new ETLStage("categories", MockSource.getPlugin(categoriesInput, categorySchema))).addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties))).addStage(new ETLStage("sink", MockSink.getPlugin(output))).addConnection("sales", "join").addConnection("categories", "join").addConnection("join", "sink").setEngine(Engine.SPARK).build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
List<StructuredRecord> records = new ArrayList<>();
records.add(StructuredRecord.builder(salesSchema).set("id", 0).set("price", 123.45d).set("date", "2021-01-01 00:00:00").set("category", "electronics").set("department", "entertainment").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 1).set("price", 1000.01d).set("date", "2020-01-01 00:00:01").set("department", "home").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 2).set("price", 5000d).set("date", "2021-01-01 00:00:00").set("category", "furniture").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 3).set("price", 2000d).set("date", "2019-12-31 23:59:59").set("category", "furniture").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 4).set("price", 2000d).set("date", "2020-01-01 12:00:00").set("category", "tv").set("department", "entertainment").build());
DataSetManager<Table> inputManager = getDataset(salesInput);
MockSource.writeInput(inputManager, records);
records.clear();
records.add(StructuredRecord.builder(categorySchema).set("id", "electronics").set("department", "entertainment").set("flag", false).build());
records.add(StructuredRecord.builder(categorySchema).set("id", "furniture").set("department", "home").set("flag", true).build());
records.add(StructuredRecord.builder(categorySchema).set("id", "tv").set("department", "entertainment").set("flag", false).build());
inputManager = getDataset(categoriesInput);
MockSource.writeInput(inputManager, records);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Set<StructuredRecord> expected = new HashSet<>();
expected.add(StructuredRecord.builder(expectedSchema).set("id", 0).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 1).set("flag", true).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 2).set("flag", true).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 3).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 4).set("flag", false).build());
Assert.assertEquals(expected, new HashSet<>(outputRecords));
}
use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.
the class AutoJoinerTest method testNullEquality.
private void testNullEquality(Engine engine, boolean nullIsEqual, Set<StructuredRecord> expected) throws Exception {
Schema itemSchema = Schema.recordOf("item", Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
Schema attributeSchema = Schema.recordOf("attribute", Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("attr", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
/*
items -------|
|--> join --> sink
attributes --|
joinOn: items.region = attributes.region and items.id = attributes.id
*/
String itemsInput = UUID.randomUUID().toString();
String attributesInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
ETLBatchConfig config = ETLBatchConfig.builder().addStage(new ETLStage("items", MockSource.getPlugin(itemsInput, itemSchema))).addStage(new ETLStage("attributes", MockSource.getPlugin(attributesInput, attributeSchema))).addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("items", "attributes"), Arrays.asList("region", "id"), Collections.singletonList("items"), Collections.emptyList(), Collections.emptyList(), nullIsEqual))).addStage(new ETLStage("sink", MockSink.getPlugin(output))).addConnection("items", "join").addConnection("attributes", "join").addConnection("join", "sink").setEngine(engine).build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
// write input data
List<StructuredRecord> itemData = new ArrayList<>();
itemData.add(StructuredRecord.builder(itemSchema).set("region", "us").set("id", 0).set("name", "bacon").build());
itemData.add(StructuredRecord.builder(itemSchema).set("id", 1).build());
itemData.add(StructuredRecord.builder(itemSchema).set("region", "us").build());
DataSetManager<Table> inputManager = getDataset(itemsInput);
MockSource.writeInput(inputManager, itemData);
List<StructuredRecord> attributesData = new ArrayList<>();
attributesData.add(StructuredRecord.builder(attributeSchema).set("region", "us").set("id", 0).set("attr", "food").build());
attributesData.add(StructuredRecord.builder(attributeSchema).set("id", 1).set("attr", "car").build());
attributesData.add(StructuredRecord.builder(attributeSchema).set("region", "us").build());
inputManager = getDataset(attributesInput);
MockSource.writeInput(inputManager, attributesData);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, new HashSet<>(outputRecords));
}
use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.
the class ReducibleAggregatorTestBase method testFieldCountAgg.
protected void testFieldCountAgg(Engine engine, Map<String, String> arguments) throws Exception {
String runSuffix = engine.name() + "-" + UUID.randomUUID();
String source1Name = "pAggInput1-" + runSuffix;
String source2Name = "pAggInput2-" + runSuffix;
String sink1Name = "pAggOutput1-" + runSuffix;
String sink2Name = "pAggOutput2-" + runSuffix;
Schema inputSchema = Schema.recordOf("testRecord", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
/*
source1 --|--> agg1 --> sink1
|
source2 --|--> agg2 --> sink2
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder().setEngine(engine).addStage(new ETLStage("source1", MockSource.getPlugin(source1Name, inputSchema))).addStage(new ETLStage("source2", MockSource.getPlugin(source2Name, inputSchema))).addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name))).addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name))).addStage(new ETLStage("agg1", FieldCountReducibleAggregator.getPlugin("user", "string"))).addStage(new ETLStage("agg2", FieldCountReducibleAggregator.getPlugin("item", "long"))).addConnection("source1", "agg1").addConnection("source1", "agg2").addConnection("source2", "agg1").addConnection("source2", "agg2").addConnection("agg1", "sink1").addConnection("agg2", "sink2").build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp-" + runSuffix);
ApplicationManager appManager = deployApplication(appId, appRequest);
// write few records to each source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
MockSource.writeInput(inputManager, ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(), StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build()));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
MockSource.writeInput(inputManager, ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build()));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(arguments);
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
Schema outputSchema1 = Schema.recordOf("user.count", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Schema outputSchema2 = Schema.recordOf("item.count", Schema.Field.of("item", Schema.of(Schema.Type.LONG)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
// check output
DataSetManager<Table> sinkManager = getDataset(sink1Name);
Set<StructuredRecord> expected = ImmutableSet.of(StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(), StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(), StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
sinkManager = getDataset(sink2Name);
expected = ImmutableSet.of(StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(), StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(), StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(), StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(), StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(2, appId, "source1.records.out");
validateMetric(3, appId, "source2.records.out");
validateMetric(5, appId, "agg1.records.in");
// 2 users, but FieldCountReduceAggregator always emits an 'all' group
validateMetric(3, appId, "agg1.aggregator.groups");
validateMetric(3, appId, "agg1.records.out");
validateMetric(5, appId, "agg2.records.in");
// 4 items, but FieldCountReduceAggregator always emits an 'all' group
validateMetric(5, appId, "agg2.aggregator.groups");
validateMetric(5, appId, "agg2.records.out");
validateMetric(3, appId, "sink1.records.in");
validateMetric(5, appId, "sink2.records.in");
}
use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.
the class PreviewDataPipelineTest method testMultiplePhase.
private void testMultiplePhase(Engine engine) throws Exception {
/*
* source1 ----> t1 ------
* | --> innerjoin ----> t4 ------
* source2 ----> t2 ------ |
* | ---> outerjoin --> sink1
* |
* source3 -------------------- t3 ------------------------
*/
PreviewManager previewManager = getPreviewManager();
Schema inputSchema1 = Schema.recordOf("customerRecord", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("itemRecord", Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema3 = Schema.recordOf("transactionRecord", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));
Schema outSchema2 = Schema.recordOf("join.output", Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))), Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
String source1MulitJoinInput = "multiJoinSource1-" + engine;
String source2MultiJoinInput = "multiJoinSource2-" + engine;
String source3MultiJoinInput = "multiJoinSource3-" + engine;
String outputName = "multiJoinOutput-" + engine;
String sinkName = "multiJoinOutputSink-" + engine;
String outerJoinName = "multiJoinOuter-" + engine;
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin(source1MulitJoinInput, inputSchema1))).addStage(new ETLStage("source2", MockSource.getPlugin(source2MultiJoinInput, inputSchema2))).addStage(new ETLStage("source3", MockSource.getPlugin(source3MultiJoinInput, inputSchema3))).addStage(new ETLStage("t1", IdentityTransform.getPlugin())).addStage(new ETLStage("t2", IdentityTransform.getPlugin())).addStage(new ETLStage("t3", IdentityTransform.getPlugin())).addStage(new ETLStage("t4", IdentityTransform.getPlugin())).addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", ""))).addStage(new ETLStage(outerJoinName, MockJoiner.getPlugin("t4.item_id=t3.i_id", "", ""))).addStage(new ETLStage(sinkName, MockSink.getPlugin(outputName))).addConnection("source1", "t1").addConnection("source2", "t2").addConnection("source3", "t3").addConnection("t1", "innerjoin").addConnection("t2", "innerjoin").addConnection("innerjoin", "t4").addConnection("t3", outerJoinName).addConnection("t4", outerJoinName).addConnection(outerJoinName, sinkName).setEngine(engine).setNumOfRecordsPreview(100).build();
// Construct the preview config with the program name and program type
PreviewConfig previewConfig = new PreviewConfig(SmartWorkflow.NAME, ProgramType.WORKFLOW, Collections.<String, String>emptyMap(), 10);
// Create the table for the mock source
addDatasetInstance(Table.class.getName(), source1MulitJoinInput, DatasetProperties.of(ImmutableMap.of("schema", inputSchema1.toString())));
addDatasetInstance(Table.class.getName(), source2MultiJoinInput, DatasetProperties.of(ImmutableMap.of("schema", inputSchema2.toString())));
addDatasetInstance(Table.class.getName(), source3MultiJoinInput, DatasetProperties.of(ImmutableMap.of("schema", inputSchema3.toString())));
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
// Start the preview and get the corresponding PreviewRunner.
ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
ingestData(inputSchema1, inputSchema2, inputSchema3, source1MulitJoinInput, source2MultiJoinInput, source3MultiJoinInput);
// Wait for the preview status go into COMPLETED.
Tasks.waitFor(PreviewStatus.Status.COMPLETED, new Callable<PreviewStatus.Status>() {
@Override
public PreviewStatus.Status call() throws Exception {
PreviewStatus status = previewManager.getStatus(previewId);
return status == null ? null : status.getStatus();
}
}, 5, TimeUnit.MINUTES);
checkPreviewStore(previewManager, previewId, sinkName, 3);
validateMetric(3L, previewId, sinkName + ".records.in", previewManager);
}
use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.
the class DataPipelineServiceTest method setupTest.
@BeforeClass
public static void setupTest() throws Exception {
ArtifactId appArtifactId = NamespaceId.SYSTEM.artifact("cdap-data-pipeline", "6.0.0");
setupBatchArtifacts(appArtifactId, DataPipelineApp.class);
enableCapability("pipeline");
ApplicationId pipeline = NamespaceId.SYSTEM.app("pipeline");
appManager = getApplicationManager(pipeline);
waitForAppToDeploy(appManager, pipeline);
serviceManager = appManager.getServiceManager(io.cdap.cdap.etl.common.Constants.STUDIO_SERVICE_NAME);
serviceManager.startAndWaitForGoodRun(ProgramRunStatus.RUNNING, 2, TimeUnit.MINUTES);
serviceURI = serviceManager.getServiceURL(1, TimeUnit.MINUTES).toURI();
}
Aggregations