Search in sources :

Example 41 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testSimpleAutoJoin.

/**
 * Deploys and runs a two-source auto-join pipeline, then verifies the sink contents and metrics.
 *
 * <pre>
 *   users ------|
 *               |--> join --> sink
 *   purchases --|
 *
 *   join keys: region and user_id on both inputs
 * </pre>
 *
 * @param required list forwarded to the mock joiner plugin (presumably the required stages; confirm against
 *                 MockAutoJoiner.getPlugin)
 * @param broadcast list forwarded to the mock joiner plugin (presumably the broadcast stages; confirm against
 *                  MockAutoJoiner.getPlugin)
 * @param expected the exact set of records the sink must contain after the run
 * @param engine the execution engine configured on the pipeline
 * @throws Exception if deployment, the workflow run, or any verification step fails
 */
private void testSimpleAutoJoin(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Engine engine) throws Exception {
    // Every dataset gets a random name so that repeated invocations do not share state.
    String usersDataset = UUID.randomUUID().toString();
    String purchasesDataset = UUID.randomUUID().toString();
    String sinkDataset = UUID.randomUUID().toString();

    ETLStage usersStage = new ETLStage("users", MockSource.getPlugin(usersDataset, USER_SCHEMA));
    ETLStage purchasesStage = new ETLStage("purchases", MockSource.getPlugin(purchasesDataset, PURCHASE_SCHEMA));
    ETLStage joinStage = new ETLStage("join",
        MockAutoJoiner.getPlugin(Arrays.asList("purchases", "users"), Arrays.asList("region", "user_id"),
                                 required, broadcast, Collections.emptyList(), true));
    ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin(sinkDataset));

    ETLBatchConfig pipeline = ETLBatchConfig.builder()
        .addStage(usersStage)
        .addStage(purchasesStage)
        .addStage(joinStage)
        .addStage(sinkStage)
        .addConnection("users", "join")
        .addConnection("purchases", "join")
        .addConnection("join", "sink")
        .setEngine(engine)
        .build();

    AppRequest<ETLBatchConfig> request = new AppRequest<>(APP_ARTIFACT, pipeline);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager application = deployApplication(appId, request);

    // Seed the "users" source with three records.
    List<StructuredRecord> users = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
    DataSetManager<Table> usersTable = getDataset(usersDataset);
    MockSource.writeInput(usersTable, users);

    // Seed the "purchases" source with two records.
    StructuredRecord firstPurchase = StructuredRecord.builder(PURCHASE_SCHEMA)
        .set("region", "us")
        .set("user_id", 0)
        .set("purchase_id", 123)
        .build();
    StructuredRecord secondPurchase = StructuredRecord.builder(PURCHASE_SCHEMA)
        .set("region", "us")
        .set("user_id", 2)
        .set("purchase_id", 456)
        .build();
    List<StructuredRecord> purchases = new ArrayList<>();
    purchases.add(firstPurchase);
    purchases.add(secondPurchase);
    DataSetManager<Table> purchasesTable = getDataset(purchasesDataset);
    MockSource.writeInput(purchasesTable, purchases);

    // Run the workflow, passing the joiner's partitions argument as a runtime argument.
    Map<String, String> runtimeArgs = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
    WorkflowManager workflow = application.getWorkflowManager(SmartWorkflow.NAME);
    workflow.startAndWaitForGoodRun(runtimeArgs, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    // Output order is not significant, so compare as sets.
    DataSetManager<Table> sinkTable = getDataset(sinkDataset);
    Set<StructuredRecord> actual = new HashSet<>(MockSink.readOutput(sinkTable));
    Assert.assertEquals(expected, actual);

    // 3 users + 2 purchases enter the join; exactly the expected records leave it.
    validateMetric(5, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");

    boolean partitionHintHonored = engine != Engine.SPARK;
    if (partitionHintHonored) {
        // On SPARK the number-of-partitions hint is ignored and additional sinks are created,
        // so the single-initialization check only applies to the other engines.
        validateMetric(1, appId, "sink." + MockSink.INITIALIZED_COUNT_METRIC);
    }
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)

Example 42 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testTripleAutoSingleRequiredJoin.

/**
 * Exercises the three-input join helper with "purchases" as the only required input, across
 * several engine and stage-list combinations that must all yield the same three records.
 *
 * @throws Exception if any of the delegated pipeline runs or verifications fails
 */
@Test
public void testTripleAutoSingleRequiredJoin() throws Exception {
    // Field types shared by the output schema: purchases columns are non-null,
    // users and interests columns are nullable.
    Schema stringType = Schema.of(Schema.Type.STRING);
    Schema intType = Schema.of(Schema.Type.INT);
    Schema optionalString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
    Schema optionalInt = Schema.nullableOf(Schema.of(Schema.Type.INT));

    Schema joinedSchema = Schema.recordOf("purchases.users.interests",
        Schema.Field.of("purchases_region", stringType),
        Schema.Field.of("purchases_purchase_id", intType),
        Schema.Field.of("purchases_user_id", intType),
        Schema.Field.of("users_region", optionalString),
        Schema.Field.of("users_user_id", optionalInt),
        Schema.Field.of("users_name", optionalString),
        Schema.Field.of("interests_region", optionalString),
        Schema.Field.of("interests_user_id", optionalInt),
        Schema.Field.of("interests_interest", optionalString));

    // Purchase 123 (user 0) pairs with user "alice" and with each of two interests.
    StructuredRecord aliceFood = StructuredRecord.builder(joinedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 123)
        .set("purchases_user_id", 0)
        .set("users_region", "us")
        .set("users_user_id", 0)
        .set("users_name", "alice")
        .set("interests_region", "us")
        .set("interests_user_id", 0)
        .set("interests_interest", "food")
        .build();
    StructuredRecord aliceSports = StructuredRecord.builder(joinedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 123)
        .set("purchases_user_id", 0)
        .set("users_region", "us")
        .set("users_user_id", 0)
        .set("users_name", "alice")
        .set("interests_region", "us")
        .set("interests_user_id", 0)
        .set("interests_interest", "sports")
        .build();
    // Purchase 456 (user 2) carries an interest but leaves every users_* field unset.
    StructuredRecord unknownUserGaming = StructuredRecord.builder(joinedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 456)
        .set("purchases_user_id", 2)
        .set("interests_region", "us")
        .set("interests_user_id", 2)
        .set("interests_interest", "gaming")
        .build();

    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(aliceFood);
    expected.add(aliceSports);
    expected.add(unknownUserGaming);

    List<String> required = Collections.singletonList("purchases");

    // NOTE(review): the last argument looks like the order in which the input stages are handed
    // to the joiner; confirm against testTripleAutoJoin.

    // Required input first: all left joins.
    testTripleAutoJoin(required, expected, Engine.SPARK, Arrays.asList("purchases", "users", "interests"));
    testTripleAutoJoin(required, expected, Engine.MAPREDUCE, Collections.emptyList());

    // Required input in the middle: right join followed by left join.
    testTripleAutoJoin(required, expected, Engine.SPARK, Arrays.asList("users", "purchases", "interests"));
    testTripleAutoJoin(required, expected, Engine.MAPREDUCE, Arrays.asList("users", "purchases", "interests"));

    // Required input last: outer join, then left join with coalesce from the first two.
    testTripleAutoJoin(required, expected, Engine.SPARK, Arrays.asList("users", "interests", "purchases"));
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 43 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testAutoInnerJoinUsingSQLEngine.

/**
 * Runs the two SQL-engine join helpers on SPARK with both "users" and "purchases" listed in the
 * first argument, expecting a single joined record for user "alice".
 *
 * @throws Exception if either delegated pipeline run or verification fails
 */
@Test
public void testAutoInnerJoinUsingSQLEngine() throws Exception {
    Schema stringType = Schema.of(Schema.Type.STRING);
    Schema intType = Schema.of(Schema.Type.INT);

    // No field of the output schema is nullable.
    Schema joinedSchema = Schema.recordOf("purchases.users",
        Schema.Field.of("purchases_region", stringType),
        Schema.Field.of("purchases_purchase_id", intType),
        Schema.Field.of("purchases_user_id", intType),
        Schema.Field.of("users_region", stringType),
        Schema.Field.of("users_user_id", intType),
        Schema.Field.of("users_name", stringType));

    StructuredRecord alicePurchase = StructuredRecord.builder(joinedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 123)
        .set("purchases_user_id", 0)
        .set("users_region", "us")
        .set("users_user_id", 0)
        .set("users_name", "alice")
        .build();

    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(alicePurchase);

    // Both helper variants receive identical inputs.
    testSimpleAutoJoinUsingSQLEngine(Arrays.asList("users", "purchases"), Collections.emptyList(),
                                     expected, joinedSchema, Engine.SPARK);
    testSimpleAutoJoinUsingSQLEngineWithCapabilities(Arrays.asList("users", "purchases"), Collections.emptyList(),
                                                     expected, joinedSchema, Engine.SPARK);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 44 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testLeftOuterComplexConditionBroadcast.

/**
 * Runs a SPARK pipeline that joins "sales" with "categories" using an expression-based join
 * condition, and checks that every sale appears in the output with the expected (possibly
 * unset) category flag.
 *
 * <pre>
 *   sales ----------|
 *                   |--> join --> sink
 *   categories -----|
 *
 *   join expression (S = sales, C = categories):
 *     S.price &gt; 1000 and S.date &gt; '2020-01-01 00:00:00' and
 *     (S.category = C.id or (S.category is null and S.department = C.department))
 * </pre>
 *
 * @throws Exception if deployment, the workflow run, or the final comparison fails
 */
@Test
public void testLeftOuterComplexConditionBroadcast() throws Exception {
    Schema optionalString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
    Schema optionalBoolean = Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN));

    Schema salesSchema = Schema.recordOf("sale",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("date", Schema.of(Schema.LogicalType.DATETIME)),
        Schema.Field.of("category", optionalString),
        Schema.Field.of("department", optionalString));
    Schema categorySchema = Schema.recordOf("category",
        Schema.Field.of("id", optionalString),
        Schema.Field.of("department", optionalString),
        Schema.Field.of("flag", optionalBoolean));
    Schema expectedSchema = Schema.recordOf("sales.categories",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("flag", optionalBoolean));

    String salesDataset = UUID.randomUUID().toString();
    String categoriesDataset = UUID.randomUUID().toString();
    String sinkDataset = UUID.randomUUID().toString();

    // Only the sale id and the category flag are selected into the output.
    List<JoinField> selectedFields = new ArrayList<>(
        Arrays.asList(new JoinField("sales", "id"), new JoinField("categories", "flag")));

    // NOTE(review): the category match below uses plain '=' rather than a null-safe '<=>';
    // confirm that this is the intended operator.
    String joinExpression = "S.price > 1000 and S.date > '2020-01-01 00:00:00' and "
        + "(S.category = C.id or (S.category is null and S.department = C.department))";
    JoinCondition.OnExpression condition = JoinCondition.onExpression()
        .addDatasetAlias("sales", "S")
        .addDatasetAlias("categories", "C")
        .setExpression(joinExpression)
        .build();

    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
        Arrays.asList("sales", "categories"), Collections.emptyList(),
        Collections.singletonList("sales"), Collections.singletonList("categories"),
        selectedFields, false, null, condition);

    ETLStage salesStage = new ETLStage("sales", MockSource.getPlugin(salesDataset, salesSchema));
    ETLStage categoriesStage = new ETLStage("categories", MockSource.getPlugin(categoriesDataset, categorySchema));
    ETLStage joinStage = new ETLStage("join",
        new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties));
    ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin(sinkDataset));

    ETLBatchConfig pipeline = ETLBatchConfig.builder()
        .addStage(salesStage)
        .addStage(categoriesStage)
        .addStage(joinStage)
        .addStage(sinkStage)
        .addConnection("sales", "join")
        .addConnection("categories", "join")
        .addConnection("join", "sink")
        .setEngine(Engine.SPARK)
        .build();

    AppRequest<ETLBatchConfig> request = new AppRequest<>(APP_ARTIFACT, pipeline);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager application = deployApplication(appId, request);

    // Sales input: five records covering each branch of the join expression.
    List<StructuredRecord> sales = new ArrayList<>();
    // id 0: price is not above 1000.
    sales.add(StructuredRecord.builder(salesSchema)
        .set("id", 0)
        .set("price", 123.45d)
        .set("date", "2021-01-01 00:00:00")
        .set("category", "electronics")
        .set("department", "entertainment")
        .build());
    // id 1: no category set, department "home".
    sales.add(StructuredRecord.builder(salesSchema)
        .set("id", 1)
        .set("price", 1000.01d)
        .set("date", "2020-01-01 00:00:01")
        .set("department", "home")
        .build());
    // id 2: category "furniture", no department set.
    sales.add(StructuredRecord.builder(salesSchema)
        .set("id", 2)
        .set("price", 5000d)
        .set("date", "2021-01-01 00:00:00")
        .set("category", "furniture")
        .build());
    // id 3: date is before the 2020-01-01 cutoff.
    sales.add(StructuredRecord.builder(salesSchema)
        .set("id", 3)
        .set("price", 2000d)
        .set("date", "2019-12-31 23:59:59")
        .set("category", "furniture")
        .build());
    // id 4: category "tv", department "entertainment".
    sales.add(StructuredRecord.builder(salesSchema)
        .set("id", 4)
        .set("price", 2000d)
        .set("date", "2020-01-01 12:00:00")
        .set("category", "tv")
        .set("department", "entertainment")
        .build());
    DataSetManager<Table> salesTable = getDataset(salesDataset);
    MockSource.writeInput(salesTable, sales);

    // Categories input.
    List<StructuredRecord> categories = new ArrayList<>();
    categories.add(StructuredRecord.builder(categorySchema)
        .set("id", "electronics")
        .set("department", "entertainment")
        .set("flag", false)
        .build());
    categories.add(StructuredRecord.builder(categorySchema)
        .set("id", "furniture")
        .set("department", "home")
        .set("flag", true)
        .build());
    categories.add(StructuredRecord.builder(categorySchema)
        .set("id", "tv")
        .set("department", "entertainment")
        .set("flag", false)
        .build());
    DataSetManager<Table> categoriesTable = getDataset(categoriesDataset);
    MockSource.writeInput(categoriesTable, categories);

    WorkflowManager workflow = application.getWorkflowManager(SmartWorkflow.NAME);
    workflow.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    DataSetManager<Table> sinkTable = getDataset(sinkDataset);
    List<StructuredRecord> outputRecords = MockSink.readOutput(sinkTable);

    // Every sale is expected in the output; the flag is set only for ids 1, 2 and 4.
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 0).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 1).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 2).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 3).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 4).set("flag", false).build());

    Set<StructuredRecord> actual = new HashSet<>(outputRecords);
    Assert.assertEquals(expected, actual);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 45 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testNullEquality.

/**
 * Runs an "items" / "attributes" join whose inputs contain unset key fields and compares the
 * sink contents with the expected set.
 *
 * <pre>
 *   items -------|
 *                |--> join --> sink
 *   attributes --|
 *
 *   join keys: region and id on both inputs
 * </pre>
 *
 * @param engine the execution engine configured on the pipeline
 * @param nullIsEqual flag forwarded as the last argument of the mock joiner plugin
 * @param expected the exact set of records the sink must contain after the run
 * @throws Exception if deployment, the workflow run, or the final comparison fails
 */
private void testNullEquality(Engine engine, boolean nullIsEqual, Set<StructuredRecord> expected) throws Exception {
    // All fields are nullable so that records can leave join keys unset.
    Schema optionalInt = Schema.nullableOf(Schema.of(Schema.Type.INT));
    Schema optionalString = Schema.nullableOf(Schema.of(Schema.Type.STRING));

    Schema itemSchema = Schema.recordOf("item",
        Schema.Field.of("id", optionalInt),
        Schema.Field.of("region", optionalString),
        Schema.Field.of("name", optionalString));
    Schema attributeSchema = Schema.recordOf("attribute",
        Schema.Field.of("region", optionalString),
        Schema.Field.of("id", optionalInt),
        Schema.Field.of("attr", optionalString));

    String itemsDataset = UUID.randomUUID().toString();
    String attributesDataset = UUID.randomUUID().toString();
    String sinkDataset = UUID.randomUUID().toString();

    ETLStage itemsStage = new ETLStage("items", MockSource.getPlugin(itemsDataset, itemSchema));
    ETLStage attributesStage = new ETLStage("attributes", MockSource.getPlugin(attributesDataset, attributeSchema));
    ETLStage joinStage = new ETLStage("join",
        MockAutoJoiner.getPlugin(Arrays.asList("items", "attributes"), Arrays.asList("region", "id"),
                                 Collections.singletonList("items"), Collections.emptyList(),
                                 Collections.emptyList(), nullIsEqual));
    ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin(sinkDataset));

    ETLBatchConfig pipeline = ETLBatchConfig.builder()
        .addStage(itemsStage)
        .addStage(attributesStage)
        .addStage(joinStage)
        .addStage(sinkStage)
        .addConnection("items", "join")
        .addConnection("attributes", "join")
        .addConnection("join", "sink")
        .setEngine(engine)
        .build();

    AppRequest<ETLBatchConfig> request = new AppRequest<>(APP_ARTIFACT, pipeline);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager application = deployApplication(appId, request);

    // Items: one fully populated record, one with only id, one with only region.
    List<StructuredRecord> items = new ArrayList<>();
    items.add(StructuredRecord.builder(itemSchema)
        .set("region", "us")
        .set("id", 0)
        .set("name", "bacon")
        .build());
    items.add(StructuredRecord.builder(itemSchema)
        .set("id", 1)
        .build());
    items.add(StructuredRecord.builder(itemSchema)
        .set("region", "us")
        .build());
    DataSetManager<Table> itemsTable = getDataset(itemsDataset);
    MockSource.writeInput(itemsTable, items);

    // Attributes: the same pattern of set and unset key fields as the items.
    List<StructuredRecord> attributes = new ArrayList<>();
    attributes.add(StructuredRecord.builder(attributeSchema)
        .set("region", "us")
        .set("id", 0)
        .set("attr", "food")
        .build());
    attributes.add(StructuredRecord.builder(attributeSchema)
        .set("id", 1)
        .set("attr", "car")
        .build());
    attributes.add(StructuredRecord.builder(attributeSchema)
        .set("region", "us")
        .build());
    DataSetManager<Table> attributesTable = getDataset(attributesDataset);
    MockSource.writeInput(attributesTable, attributes);

    WorkflowManager workflow = application.getWorkflowManager(SmartWorkflow.NAME);
    workflow.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    // Output order is not significant, so compare as sets.
    DataSetManager<Table> sinkTable = getDataset(sinkDataset);
    Set<StructuredRecord> actual = new HashSet<>(MockSink.readOutput(sinkTable));
    Assert.assertEquals(expected, actual);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)

Aggregations

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord) 210, Schema (io.cdap.cdap.api.data.schema.Schema) 169, Test (org.junit.Test) 119, Table (io.cdap.cdap.api.dataset.table.Table) 76, ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage) 73, ApplicationId (io.cdap.cdap.proto.id.ApplicationId) 73, AppRequest (io.cdap.cdap.proto.artifact.AppRequest) 68, ApplicationManager (io.cdap.cdap.test.ApplicationManager) 68, ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig) 59, WorkflowManager (io.cdap.cdap.test.WorkflowManager) 54, HashSet (java.util.HashSet) 50, ArrayList (java.util.ArrayList) 44, KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable) 40, HashMap (java.util.HashMap) 25, File (java.io.File) 17, ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin) 16, FormatSpecification (io.cdap.cdap.api.data.format.FormatSpecification) 15, DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig) 14, SparkManager (io.cdap.cdap.test.SparkManager) 12, Map (java.util.Map) 12