Search in sources :

Example 31 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testTripleAutoTwoRequiredJoin.

@Test
public void testTripleAutoTwoRequiredJoin() throws Exception {
    Schema expectedSchema = Schema.recordOf("purchases.users.interests", Schema.Field.of("purchases_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("purchases_purchase_id", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("purchases_user_id", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("users_region", Schema.of(Schema.Type.STRING)), Schema.Field.of("users_user_id", Schema.of(Schema.Type.INT)), Schema.Field.of("users_name", Schema.of(Schema.Type.STRING)), Schema.Field.of("interests_region", Schema.of(Schema.Type.STRING)), Schema.Field.of("interests_user_id", Schema.of(Schema.Type.INT)), Schema.Field.of("interests_interest", Schema.of(Schema.Type.STRING)));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("purchases_region", "us").set("purchases_purchase_id", 123).set("purchases_user_id", 0).set("users_region", "us").set("users_user_id", 0).set("users_name", "alice").set("interests_region", "us").set("interests_user_id", 0).set("interests_interest", "food").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("purchases_region", "us").set("purchases_purchase_id", 123).set("purchases_user_id", 0).set("users_region", "us").set("users_user_id", 0).set("users_name", "alice").set("interests_region", "us").set("interests_user_id", 0).set("interests_interest", "sports").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("users_region", "us").set("users_user_id", 1).set("users_name", "bob").set("interests_region", "us").set("interests_user_id", 1).set("interests_interest", "gardening").build());
    testTripleAutoJoin(Arrays.asList("users", "interests"), expected, Engine.SPARK, Collections.emptyList());
    testTripleAutoJoin(Arrays.asList("users", "interests"), expected, Engine.MAPREDUCE, Collections.emptyList());
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 32 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testBroadcastJoin.

@Test
public void testBroadcastJoin() throws Exception {
    Schema expectedSchema = Schema.recordOf("purchases.users", Schema.Field.of("purchases_region", Schema.of(Schema.Type.STRING)), Schema.Field.of("purchases_purchase_id", Schema.of(Schema.Type.INT)), Schema.Field.of("purchases_user_id", Schema.of(Schema.Type.INT)), Schema.Field.of("users_region", Schema.of(Schema.Type.STRING)), Schema.Field.of("users_user_id", Schema.of(Schema.Type.INT)), Schema.Field.of("users_name", Schema.of(Schema.Type.STRING)));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("purchases_region", "us").set("purchases_purchase_id", 123).set("purchases_user_id", 0).set("users_region", "us").set("users_user_id", 0).set("users_name", "alice").build());
    testSimpleAutoJoin(Arrays.asList("users", "purchases"), Collections.singletonList("users"), expected, Engine.SPARK);
    testSimpleAutoJoin(Arrays.asList("users", "purchases"), Collections.singletonList("purchases"), expected, Engine.SPARK);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 33 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testAutoLeftOuterJoinSkewed.

@Test
public void testAutoLeftOuterJoinSkewed() throws Exception {
    Schema expectedSchema = Schema.recordOf("interests.users", Schema.Field.of("interests_region", Schema.of(Schema.Type.STRING)), Schema.Field.of("interests_user_id", Schema.of(Schema.Type.INT)), Schema.Field.of("interests_interest", Schema.of(Schema.Type.STRING)), Schema.Field.of("users_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("users_user_id", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("users_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("interests_region", "us").set("interests_interest", "hiking").set("interests_user_id", 0).set("users_region", "us").set("users_user_id", 0).set("users_name", "alice").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("interests_region", "us").set("interests_interest", "running").set("interests_user_id", 0).set("users_region", "us").set("users_user_id", 0).set("users_name", "alice").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("interests_region", "us").set("interests_interest", "cooking").set("interests_user_id", 0).set("users_region", "us").set("users_user_id", 0).set("users_name", "alice").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("interests_region", "us").set("interests_interest", "hiking").set("interests_user_id", 1).set("users_region", "us").set("users_user_id", 1).set("users_name", "bob").build());
    testSimpleAutoJoinSkewed(Collections.singletonList("interests"), expected, Engine.SPARK);
    testSimpleAutoJoinSkewed(Collections.singletonList("interests"), expected, Engine.MAPREDUCE);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 34 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testQuadAutoJoin.

private void testQuadAutoJoin(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Engine engine, List<String> tablesInOrderToJoin) throws Exception {
    /*
         users ------|
                     |
         purchases --|--> join --> sink
                     |
         interests --|
                     |
         age --------|

         joinOn: users.region = purchases.region = interests.region = age.region and
                 users.user_id = purchases.user_id = interests.user_id = age.user_id
     */
    String userInput = UUID.randomUUID().toString();
    String purchaseInput = UUID.randomUUID().toString();
    String interestInput = UUID.randomUUID().toString();
    String ageInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    ETLBatchConfig config = ETLBatchConfig.builder().addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA))).addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA))).addStage(new ETLStage("interests", MockSource.getPlugin(interestInput, INTEREST_SCHEMA))).addStage(new ETLStage("ages", MockSource.getPlugin(ageInput, AGE_SCHEMA))).addStage(new ETLStage("join", MockAutoJoiner.getPlugin(tablesInOrderToJoin, Arrays.asList("region", "user_id"), required, broadcast, Collections.emptyList(), true))).addStage(new ETLStage("sink", MockSink.getPlugin(output))).addConnection("users", "join").addConnection("purchases", "join").addConnection("interests", "join").addConnection("ages", "join").addConnection("join", "sink").setEngine(engine).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
    DataSetManager<Table> inputManager = getDataset(userInput);
    MockSource.writeInput(inputManager, userData);
    List<StructuredRecord> purchaseData = new ArrayList<>();
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
    inputManager = getDataset(purchaseInput);
    MockSource.writeInput(inputManager, purchaseData);
    List<StructuredRecord> interestData = new ArrayList<>();
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "food").build());
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "sports").build());
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 1).set("interest", "gardening").build());
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 2).set("interest", "gaming").build());
    inputManager = getDataset(interestInput);
    MockSource.writeInput(inputManager, interestData);
    List<StructuredRecord> ageData = new ArrayList<>();
    ageData.add(StructuredRecord.builder(AGE_SCHEMA).set("region", "us").set("user_id", 10).set("age", 20).build());
    ageData.add(StructuredRecord.builder(AGE_SCHEMA).set("region", "us").set("user_id", 1).set("age", 30).build());
    inputManager = getDataset(ageInput);
    MockSource.writeInput(inputManager, ageData);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Set<StructuredRecord> actual = new HashSet<>();
    Schema expectedSchema = expected.iterator().hasNext() ? expected.iterator().next().getSchema() : null;
    if (expectedSchema == null || expected.iterator().next().getSchema() == outputRecords.get(0).getSchema()) {
        actual = new HashSet<>(outputRecords);
    } else {
        // reorder the output columns of the join result (actual) to match the column order of expected
        for (StructuredRecord sr : outputRecords) {
            actual.add(StructuredRecord.builder(expectedSchema).set("ages_region", sr.get("ages_region")).set("ages_age", sr.get("ages_age")).set("ages_user_id", sr.get("ages_user_id")).set("purchases_region", sr.get("purchases_region")).set("purchases_purchase_id", sr.get("purchases_purchase_id")).set("purchases_user_id", sr.get("purchases_user_id")).set("users_region", sr.get("users_region")).set("users_user_id", sr.get("users_user_id")).set("users_name", sr.get("users_name")).set("interests_region", sr.get("interests_region")).set("interests_user_id", sr.get("interests_user_id")).set("interests_interest", sr.get("interests_interest")).build());
        }
    }
    Assert.assertEquals(expected, actual);
    validateMetric(11, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) WorkflowManager(io.cdap.cdap.test.WorkflowManager) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)

Example 35 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testCaseSensitivity.

@Test
public void testCaseSensitivity() throws Exception {
    Schema weird1 = Schema.recordOf("weird1", Schema.Field.of("id", Schema.of(Schema.Type.INT)), Schema.Field.of("ID", Schema.of(Schema.Type.LONG)), Schema.Field.of("Id", Schema.of(Schema.Type.INT)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    Schema weird2 = Schema.recordOf("weird2", Schema.Field.of("id", Schema.of(Schema.Type.INT)), Schema.Field.of("ID", Schema.of(Schema.Type.LONG)), Schema.Field.of("val", Schema.of(Schema.Type.STRING)));
    String input1 = UUID.randomUUID().toString();
    String input2 = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    ETLBatchConfig config = ETLBatchConfig.builder().addStage(new ETLStage("i1", MockSource.getPlugin(input1, weird1))).addStage(new ETLStage("i2", MockSource.getPlugin(input2, weird2))).addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("i1", "i2"), Arrays.asList("id", "ID"), Arrays.asList("i1", "i2"), Collections.emptyList(), Collections.emptyList(), true))).addStage(new ETLStage("sink", MockSink.getPlugin(output))).addConnection("i1", "join").addConnection("i2", "join").addConnection("join", "sink").setEngine(Engine.SPARK).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    List<StructuredRecord> input1Data = new ArrayList<>();
    input1Data.add(StructuredRecord.builder(weird1).set("id", 0).set("ID", 99L).set("Id", 0).set("name", "zero").build());
    input1Data.add(StructuredRecord.builder(weird1).set("id", 1).set("ID", 0L).set("Id", 0).set("name", "one").build());
    DataSetManager<Table> inputManager = getDataset(input1);
    MockSource.writeInput(inputManager, input1Data);
    List<StructuredRecord> input2Data = new ArrayList<>();
    input2Data.add(StructuredRecord.builder(weird2).set("id", 0).set("ID", 99L).set("val", "0").build());
    input2Data.add(StructuredRecord.builder(weird2).set("id", 1).set("ID", 99L).set("val", "1").build());
    input2Data.add(StructuredRecord.builder(weird2).set("id", 0).set("ID", 0L).set("val", "2").build());
    inputManager = getDataset(input2);
    MockSource.writeInput(inputManager, input2Data);
    Schema expectedSchema = Schema.recordOf("i1.i2", Schema.Field.of("i1_id", Schema.of(Schema.Type.INT)), Schema.Field.of("i1_ID", Schema.of(Schema.Type.LONG)), Schema.Field.of("i1_Id", Schema.of(Schema.Type.INT)), Schema.Field.of("i1_name", Schema.of(Schema.Type.STRING)), Schema.Field.of("i2_id", Schema.of(Schema.Type.INT)), Schema.Field.of("i2_ID", Schema.of(Schema.Type.LONG)), Schema.Field.of("i2_val", Schema.of(Schema.Type.STRING)));
    StructuredRecord expected = StructuredRecord.builder(expectedSchema).set("i1_id", 0).set("i1_ID", 99L).set("i1_Id", 0).set("i1_name", "zero").set("i2_id", 0).set("i2_ID", 99L).set("i2_val", "0").build();
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    Map<String, String> args = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
    workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> actual = MockSink.readOutput(outputManager);
    Assert.assertEquals(Collections.singletonList(expected), actual);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Aggregations

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)210 Schema (io.cdap.cdap.api.data.schema.Schema)169 Test (org.junit.Test)119 Table (io.cdap.cdap.api.dataset.table.Table)76 ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)73 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)73 AppRequest (io.cdap.cdap.proto.artifact.AppRequest)68 ApplicationManager (io.cdap.cdap.test.ApplicationManager)68 ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig)59 WorkflowManager (io.cdap.cdap.test.WorkflowManager)54 HashSet (java.util.HashSet)50 ArrayList (java.util.ArrayList)44 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)40 HashMap (java.util.HashMap)25 File (java.io.File)17 ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin)16 FormatSpecification (io.cdap.cdap.api.data.format.FormatSpecification)15 DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig)14 SparkManager (io.cdap.cdap.test.SparkManager)12 Map (java.util.Map)12