Search in sources :

Example 26 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class DataPipelineTest method testSimpleConditionWithMultipleOutputActions.

/**
 * Deploys a pipeline in which a condition leads to one of two actions, each followed by
 * its own source-to-sink path, then runs it once per branch. After every run the sink
 * contents, the record metrics and the value written by that branch's action are checked.
 */
@Test
public void testSimpleConditionWithMultipleOutputActions() throws Exception {
    /*
     * condition --(true)---> action1 ---> trueSource ----> trueSink
     *     |
     *     |------(false)---> action2 ---> falseSource ---> falseSink
     */
    String appName = "SimpleConditionWithMultipleOutputActions";
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

    // dataset names are suffixed with the app name
    String trueSource = "true" + appName + "Source";
    String falseSource = "false" + appName + "Source";
    String trueSink = "true" + appName + "Sink";
    String falseSink = "false" + appName + "Sink";
    String actionTable = "actionTable" + appName;

    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("trueSource", MockSource.getPlugin(trueSource, schema)))
        .addStage(new ETLStage("falseSource", MockSource.getPlugin(falseSource, schema)))
        .addStage(new ETLStage("trueSink", MockSink.getPlugin(trueSink)))
        .addStage(new ETLStage("falseSink", MockSink.getPlugin(falseSink)))
        .addStage(new ETLStage("condition", MockCondition.getPlugin("condition")))
        .addStage(new ETLStage("action1", MockAction.getPlugin(actionTable, "row1", "key1", "val1")))
        .addStage(new ETLStage("action2", MockAction.getPlugin(actionTable, "row2", "key2", "val2")))
        .addConnection("condition", "action1", true)
        .addConnection("action1", "trueSource")
        .addConnection("trueSource", "trueSink")
        .addConnection("condition", "action2", false)
        .addConnection("action2", "falseSource")
        .addConnection("falseSource", "falseSink")
        .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(appName);
    ApplicationManager appManager = deployApplication(appId, appRequest);

    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    // both branches are fed the same two records, so the expected sink content is shared
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);

    for (String branch : Arrays.asList("true", "false")) {
        boolean onTrueBranch = branch.equals("true");
        String sourceName;
        String sinkName;
        if (onTrueBranch) {
            sourceName = trueSource;
            sinkName = trueSink;
        } else {
            sourceName = falseSource;
            sinkName = falseSink;
        }

        // feed the source that belongs to this branch
        DataSetManager<Table> sourceTable = getDataset(NamespaceId.DEFAULT.dataset(sourceName));
        MockSource.writeInput(sourceTable, ImmutableList.of(recordSamuel, recordBob));

        // run the workflow, selecting the branch through a runtime argument
        WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
        workflow.start(ImmutableMap.of("condition.branch.to.execute", branch));
        if (onTrueBranch) {
            workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
        } else {
            // the "false" iteration is the second run of the same workflow
            workflow.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
        }

        // the sink of the executed branch must hold exactly the input records
        DataSetManager<Table> sinkTable = getDataset(sinkName);
        Set<StructuredRecord> written = Sets.newHashSet(MockSink.readOutput(sinkTable));
        Assert.assertEquals(expected, written);
        validateMetric(2, appId, branch + "Source.records.out");
        validateMetric(2, appId, branch + "Sink.records.in");

        // the action on the executed branch must have written its value
        DataSetManager<Table> actionTableDS = getDataset(actionTable);
        String row = onTrueBranch ? "row1" : "row2";
        String key = onTrueBranch ? "key1" : "key2";
        String value = onTrueBranch ? "val1" : "val2";
        Assert.assertEquals(value, MockAction.readOutput(actionTableDS, row, key));
    }
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 27 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testTripleAutoJoin.

/**
 * Builds, deploys and runs a pipeline that joins three mock sources (users, purchases,
 * interests) through {@code MockAutoJoiner} into a single sink, then compares the sink
 * contents and the join metrics with the expectation.
 *
 * @param required forwarded to {@code MockAutoJoiner.getPlugin} as its "required" argument
 * @param broadcast forwarded to {@code MockAutoJoiner.getPlugin} as its "broadcast" argument
 * @param expected records the sink must contain after the run; its size is also the expected
 *                 value of the {@code join.records.out} metric
 * @param engine engine set on the pipeline configuration
 * @param tablesInOrderToJoin forwarded to {@code MockAutoJoiner.getPlugin} as its first argument
 * @throws Exception if deploying, running or reading back the pipeline fails
 */
private void testTripleAutoJoin(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Engine engine, List<String> tablesInOrderToJoin) throws Exception {
    /*
         users ------|
                     |
         purchases --|--> join --> sink
                     |
         interests --|

         joinOn: users.region = purchases.region = interests.region and
                 users.user_id = purchases.user_id = interests.user_id
     */
    String userInput = UUID.randomUUID().toString();
    String purchaseInput = UUID.randomUUID().toString();
    String interestInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
        .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
        .addStage(new ETLStage("interests", MockSource.getPlugin(interestInput, INTEREST_SCHEMA)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(tablesInOrderToJoin, Arrays.asList("region", "user_id"), required, broadcast, Collections.emptyList(), true)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("users", "join")
        .addConnection("purchases", "join")
        .addConnection("interests", "join")
        .addConnection("join", "sink")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);

    // write input data: 3 users, 2 purchases, 4 interests
    List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
    DataSetManager<Table> inputManager = getDataset(userInput);
    MockSource.writeInput(inputManager, userData);

    List<StructuredRecord> purchaseData = new ArrayList<>();
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
    purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
    inputManager = getDataset(purchaseInput);
    MockSource.writeInput(inputManager, purchaseData);

    List<StructuredRecord> interestData = new ArrayList<>();
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "food").build());
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "sports").build());
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 1).set("interest", "gardening").build());
    interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 2).set("interest", "gaming").build());
    inputManager = getDataset(interestInput);
    MockSource.writeInput(inputManager, interestData);

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);

    Schema expectedSchema = expected.isEmpty() ? null : expected.iterator().next().getSchema();
    // The output can be compared as-is when there is nothing to align against (no expected
    // records, or an empty sink -- the assertion below then reports the mismatch instead of
    // an IndexOutOfBoundsException) or when the schemas are equal. Schemas are compared by
    // value, not by reference, so an equal but distinct instance is not rebuilt needlessly.
    boolean comparableAsIs = expectedSchema == null
        || outputRecords.isEmpty()
        || expectedSchema.equals(outputRecords.get(0).getSchema());

    Set<StructuredRecord> actual;
    if (comparableAsIs) {
        actual = new HashSet<>(outputRecords);
    } else {
        // reorder the output columns of the join result (actual) to match the column order of expected
        actual = new HashSet<>();
        for (StructuredRecord sr : outputRecords) {
            actual.add(StructuredRecord.builder(expectedSchema)
                .set("purchases_region", sr.get("purchases_region"))
                .set("purchases_purchase_id", sr.get("purchases_purchase_id"))
                .set("purchases_user_id", sr.get("purchases_user_id"))
                .set("users_region", sr.get("users_region"))
                .set("users_user_id", sr.get("users_user_id"))
                .set("users_name", sr.get("users_name"))
                .set("interests_region", sr.get("interests_region"))
                .set("interests_user_id", sr.get("interests_user_id"))
                .set("interests_interest", sr.get("interests_interest"))
                .build());
        }
    }
    Assert.assertEquals(expected, actual);
    // 9 = 3 users + 2 purchases + 4 interests written above
    validateMetric(9, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) WorkflowManager(io.cdap.cdap.test.WorkflowManager) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)

Example 28 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testTripleAutoNoneRequiredJoin.

/**
 * Runs the three-way join with an empty "required" list and checks that the same result
 * set is produced for three different join orders.
 */
@Test
public void testTripleAutoNoneRequiredJoin() throws Exception {
    /*
    In this case, all the JOINS will be full outer joins
    i.e.
    Purchases (outer) Users (outer) Interests
    */
    // every output column is nullable
    Schema nullableString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
    Schema nullableInt = Schema.nullableOf(Schema.of(Schema.Type.INT));
    Schema expectedSchema = Schema.recordOf("purchases.users.interests",
        Schema.Field.of("purchases_region", nullableString),
        Schema.Field.of("purchases_purchase_id", nullableInt),
        Schema.Field.of("purchases_user_id", nullableInt),
        Schema.Field.of("users_region", nullableString),
        Schema.Field.of("users_user_id", nullableInt),
        Schema.Field.of("users_name", nullableString),
        Schema.Field.of("interests_region", nullableString),
        Schema.Field.of("interests_user_id", nullableInt),
        Schema.Field.of("interests_interest", nullableString));

    Set<StructuredRecord> expected = new HashSet<>();
    // purchase 456 / user 2: only purchases and interests columns are set
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 456)
        .set("purchases_user_id", 2)
        .set("interests_region", "us")
        .set("interests_user_id", 2)
        .set("interests_interest", "gaming")
        .build());
    // alyce (eu): only users columns are set
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("users_region", "eu")
        .set("users_user_id", 0)
        .set("users_name", "alyce")
        .build());
    // alice (us): all three sides are set, one row per interest
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 123)
        .set("purchases_user_id", 0)
        .set("users_region", "us")
        .set("users_user_id", 0)
        .set("users_name", "alice")
        .set("interests_region", "us")
        .set("interests_user_id", 0)
        .set("interests_interest", "food")
        .build());
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 123)
        .set("purchases_user_id", 0)
        .set("users_region", "us")
        .set("users_user_id", 0)
        .set("users_name", "alice")
        .set("interests_region", "us")
        .set("interests_user_id", 0)
        .set("interests_interest", "sports")
        .build());
    // bob (us): users and interests columns are set
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("users_region", "us")
        .set("users_user_id", 1)
        .set("users_name", "bob")
        .set("interests_region", "us")
        .set("interests_user_id", 1)
        .set("interests_interest", "gardening")
        .build());

    /*
    The output should not be affected by order of joins
     */
    testTripleAutoJoin(Collections.emptyList(), expected, Engine.SPARK, Arrays.asList("purchases", "users", "interests"));
    testTripleAutoJoin(Collections.emptyList(), expected, Engine.SPARK, Arrays.asList("purchases", "interests", "users"));
    testTripleAutoJoin(Collections.emptyList(), expected, Engine.SPARK, Arrays.asList("users", "purchases", "interests"));
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 29 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testDoubleBroadcastJoin.

/**
 * Runs the three-way join on Spark with "purchases" as the only required input and
 * "purchases" and "interests" passed as the broadcast list, and checks the joined records.
 */
@Test
public void testDoubleBroadcastJoin() throws Exception {
    // purchases columns are non-nullable; users and interests columns are nullable
    Schema stringType = Schema.of(Schema.Type.STRING);
    Schema intType = Schema.of(Schema.Type.INT);
    Schema nullableString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
    Schema nullableInt = Schema.nullableOf(Schema.of(Schema.Type.INT));
    Schema expectedSchema = Schema.recordOf("purchases.users.interests",
        Schema.Field.of("purchases_region", stringType),
        Schema.Field.of("purchases_purchase_id", intType),
        Schema.Field.of("purchases_user_id", intType),
        Schema.Field.of("users_region", nullableString),
        Schema.Field.of("users_user_id", nullableInt),
        Schema.Field.of("users_name", nullableString),
        Schema.Field.of("interests_region", nullableString),
        Schema.Field.of("interests_user_id", nullableInt),
        Schema.Field.of("interests_interest", nullableString));

    Set<StructuredRecord> expected = new HashSet<>();
    // purchase 123 by alice, one row per interest
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 123)
        .set("purchases_user_id", 0)
        .set("users_region", "us")
        .set("users_user_id", 0)
        .set("users_name", "alice")
        .set("interests_region", "us")
        .set("interests_user_id", 0)
        .set("interests_interest", "food")
        .build());
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 123)
        .set("purchases_user_id", 0)
        .set("users_region", "us")
        .set("users_user_id", 0)
        .set("users_name", "alice")
        .set("interests_region", "us")
        .set("interests_user_id", 0)
        .set("interests_interest", "sports")
        .build());
    // purchase 456: users columns are left unset
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("purchases_region", "us")
        .set("purchases_purchase_id", 456)
        .set("purchases_user_id", 2)
        .set("interests_region", "us")
        .set("interests_user_id", 2)
        .set("interests_interest", "gaming")
        .build());

    testTripleAutoJoin(
        Collections.singletonList("purchases"),
        Arrays.asList("purchases", "interests"),
        expected,
        Engine.SPARK,
        Arrays.asList("purchases", "users", "interests"));
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 30 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testAutoJoinWithMacrosAndEmptyInput.

/**
 * Runs {@code testAutoJoinWithMacros} on both engines for three empty-input scenarios,
 * with "purchases" as the only required input in every call.
 */
@Test
public void testAutoJoinWithMacrosAndEmptyInput() throws Exception {
    Schema stringType = Schema.of(Schema.Type.STRING);
    Schema intType = Schema.of(Schema.Type.INT);
    Schema expectedSchema = Schema.recordOf("joined",
        Schema.Field.of("region", stringType),
        Schema.Field.of("purchase_id", intType),
        Schema.Field.of("user_id", intType),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

    // the nullable "name" field is left unset in both expected records
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("region", "us")
        .set("purchase_id", 123)
        .set("user_id", 0)
        .build());
    expected.add(StructuredRecord.builder(expectedSchema)
        .set("region", "us")
        .set("purchase_id", 456)
        .set("user_id", 2)
        .build());

    // right side empty
    testAutoJoinWithMacros(Engine.SPARK, Collections.singletonList("purchases"), expectedSchema, expected, true, false);
    testAutoJoinWithMacros(Engine.MAPREDUCE, Collections.singletonList("purchases"), expectedSchema, expected, true, false);

    // left side empty: no output is expected from here on
    expected.clear();
    testAutoJoinWithMacros(Engine.MAPREDUCE, Collections.singletonList("purchases"), expectedSchema, expected, false, true);
    testAutoJoinWithMacros(Engine.SPARK, Collections.singletonList("purchases"), expectedSchema, expected, false, true);

    // both sides empty
    expected.clear();
    testAutoJoinWithMacros(Engine.MAPREDUCE, Collections.singletonList("purchases"), expectedSchema, expected, true, true);
    testAutoJoinWithMacros(Engine.SPARK, Collections.singletonList("purchases"), expectedSchema, expected, true, true);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 627, Schema (io.cdap.cdap.api.data.schema.Schema): 437, Test (org.junit.Test): 398, Table (io.cdap.cdap.api.dataset.table.Table): 214, ApplicationManager (io.cdap.cdap.test.ApplicationManager): 196, ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 186, ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 170, AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 160, ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 145, WorkflowManager (io.cdap.cdap.test.WorkflowManager): 126, HashSet (java.util.HashSet): 111, ArrayList (java.util.ArrayList): 99, KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 96, ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 75, HashMap (java.util.HashMap): 74, MockEmitter (io.cdap.cdap.etl.mock.common.MockEmitter): 62, MockTransformContext (io.cdap.cdap.etl.mock.transform.MockTransformContext): 59, File (java.io.File): 58, TransformContext (io.cdap.cdap.etl.api.TransformContext): 35, FormatSpecification (io.cdap.cdap.api.data.format.FormatSpecification): 30