
Example 86 with ETLBatchConfig

Use of io.cdap.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From class AutoJoinerTest, method testNullEquality:

private void testNullEquality(Engine engine, boolean nullIsEqual, Set<StructuredRecord> expected) throws Exception {
    Schema itemSchema = Schema.recordOf(
        "item",
        Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Schema attributeSchema = Schema.recordOf(
        "attribute",
        Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("attr", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    /*
         items -------|
                      |--> join --> sink
         attributes --|

         joinOn: items.region = attributes.region and items.id = attributes.id
     */
    String itemsInput = UUID.randomUUID().toString();
    String attributesInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("items", MockSource.getPlugin(itemsInput, itemSchema)))
        .addStage(new ETLStage("attributes", MockSource.getPlugin(attributesInput, attributeSchema)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(
            Arrays.asList("items", "attributes"), Arrays.asList("region", "id"),
            Collections.singletonList("items"), Collections.emptyList(),
            Collections.emptyList(), nullIsEqual)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("items", "join")
        .addConnection("attributes", "join")
        .addConnection("join", "sink")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    List<StructuredRecord> itemData = new ArrayList<>();
    itemData.add(StructuredRecord.builder(itemSchema).set("region", "us").set("id", 0).set("name", "bacon").build());
    itemData.add(StructuredRecord.builder(itemSchema).set("id", 1).build());
    itemData.add(StructuredRecord.builder(itemSchema).set("region", "us").build());
    DataSetManager<Table> inputManager = getDataset(itemsInput);
    MockSource.writeInput(inputManager, itemData);
    List<StructuredRecord> attributesData = new ArrayList<>();
    attributesData.add(StructuredRecord.builder(attributeSchema).set("region", "us").set("id", 0).set("attr", "food").build());
    attributesData.add(StructuredRecord.builder(attributeSchema).set("id", 1).set("attr", "car").build());
    attributesData.add(StructuredRecord.builder(attributeSchema).set("region", "us").build());
    inputManager = getDataset(attributesInput);
    MockSource.writeInput(inputManager, attributesData);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId)
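The nullIsEqual flag controls whether two null join-key values match, analogous to SQL's IS NOT DISTINCT FROM versus plain equality. Below is a minimal plain-Java sketch of that matching rule; JoinKeyMatcher is a hypothetical helper for illustration, not the MockAutoJoiner implementation.

    import java.util.Objects;

    // Hypothetical helper, not part of the CDAP API.
    final class JoinKeyMatcher {
        private JoinKeyMatcher() {
        }

        // Two records join when every join-key field matches.
        static boolean keysMatch(Object leftRegion, Object rightRegion,
                                 Object leftId, Object rightId, boolean nullIsEqual) {
            return fieldMatches(leftRegion, rightRegion, nullIsEqual)
                && fieldMatches(leftId, rightId, nullIsEqual);
        }

        private static boolean fieldMatches(Object left, Object right, boolean nullIsEqual) {
            if (left == null && right == null) {
                // Null keys match each other only under null-safe equality.
                return nullIsEqual;
            }
            // A null on exactly one side never matches; otherwise compare values.
            return Objects.equals(left, right);
        }
    }

Under this rule, the item with only id=1 set joins the attribute with id=1 only when nullIsEqual is true, since their null region values must also be treated as equal; that is why the test is parameterized on the expected record set.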

Example 87 with ETLBatchConfig

Use of io.cdap.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From class ReducibleAggregatorTestBase, method testFieldCountAgg:

protected void testFieldCountAgg(Engine engine, Map<String, String> arguments) throws Exception {
    String runSuffix = engine.name() + "-" + UUID.randomUUID();
    String source1Name = "pAggInput1-" + runSuffix;
    String source2Name = "pAggInput2-" + runSuffix;
    String sink1Name = "pAggOutput1-" + runSuffix;
    String sink2Name = "pAggOutput2-" + runSuffix;
    Schema inputSchema = Schema.recordOf(
        "testRecord",
        Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
    /*
       source1 --|--> agg1 --> sink1
                 |
       source2 --|--> agg2 --> sink2
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .setEngine(engine)
        .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name, inputSchema)))
        .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name, inputSchema)))
        .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
        .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
        .addStage(new ETLStage("agg1", FieldCountReducibleAggregator.getPlugin("user", "string")))
        .addStage(new ETLStage("agg2", FieldCountReducibleAggregator.getPlugin("item", "long")))
        .addConnection("source1", "agg1")
        .addConnection("source1", "agg2")
        .addConnection("source2", "agg1")
        .addConnection("source2", "agg2")
        .addConnection("agg1", "sink1")
        .addConnection("agg2", "sink2")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp-" + runSuffix);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write a few records to each source
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
    MockSource.writeInput(inputManager, ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(), StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build()));
    inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
    MockSource.writeInput(inputManager, ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build()));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(arguments);
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    Schema outputSchema1 = Schema.recordOf(
        "user.count",
        Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    Schema outputSchema2 = Schema.recordOf(
        "item.count",
        Schema.Field.of("item", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    // check output
    DataSetManager<Table> sinkManager = getDataset(sink1Name);
    Set<StructuredRecord> expected = ImmutableSet.of(
        StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    sinkManager = getDataset(sink2Name);
    expected = ImmutableSet.of(
        StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
    actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    validateMetric(2, appId, "source1.records.out");
    validateMetric(3, appId, "source2.records.out");
    validateMetric(5, appId, "agg1.records.in");
    // 2 users, but FieldCountReducibleAggregator always emits an extra 'all' group
    validateMetric(3, appId, "agg1.aggregator.groups");
    validateMetric(3, appId, "agg1.records.out");
    validateMetric(5, appId, "agg2.records.in");
    // 4 items, but FieldCountReducibleAggregator always emits an extra 'all' group
    validateMetric(5, appId, "agg2.aggregator.groups");
    validateMetric(5, appId, "agg2.records.out");
    validateMetric(3, appId, "sink1.records.in");
    validateMetric(5, appId, "sink2.records.in");
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), WorkflowManager (io.cdap.cdap.test.WorkflowManager), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId)
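The point of a reducible aggregator is that partial aggregates computed per input split can be merged associatively before the final output. Below is a rough plain-Java sketch of the count-with-'all'-group behavior the assertions above imply; FieldCountSketch is an illustration only, not the source of the FieldCountReducibleAggregator test plugin.

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    // Hypothetical helper, not part of the CDAP API.
    final class FieldCountSketch {
        private FieldCountSketch() {
        }

        // Count occurrences per value, plus a synthetic "all" group spanning every record.
        static Map<String, Long> countByField(List<String> values) {
            Map<String, Long> counts = new HashMap<>();
            for (String value : values) {
                counts.merge(value, 1L, Long::sum);
                counts.merge("all", 1L, Long::sum);
            }
            return counts;
        }

        // Merging partial counts is associative, which is what makes the
        // aggregation "reducible": partials can be combined in any order.
        static Map<String, Long> merge(Map<String, Long> left, Map<String, Long> right) {
            Map<String, Long> merged = new HashMap<>(left);
            right.forEach((key, count) -> merged.merge(key, count, Long::sum));
            return merged;
        }
    }

Merging the per-source partials for the "user" field gives {all=5, samuel=3, john=2}, exactly the sink1 expectation, and the synthetic 'all' group accounts for the +1 in the agg1.aggregator.groups and agg2.aggregator.groups metrics.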

Example 88 with ETLBatchConfig

Use of io.cdap.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From class PreviewDataPipelineTest, method testMultiplePhase:

private void testMultiplePhase(Engine engine) throws Exception {
    /*
     * source1 ----> t1 ------
     *                        | --> innerjoin ----> t4 ------
     * source2 ----> t2 ------                                 |
     *                                                         | ---> outerjoin --> sink1
     *                                                         |
     * source3 -------------------- t3 ------------------------
     */
    PreviewManager previewManager = getPreviewManager();
    Schema inputSchema1 = Schema.recordOf(
        "customerRecord",
        Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema2 = Schema.recordOf(
        "itemRecord",
        Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema3 = Schema.recordOf(
        "transactionRecord",
        Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));
    Schema outSchema2 = Schema.recordOf(
        "join.output",
        Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
        Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    String source1MultiJoinInput = "multiJoinSource1-" + engine;
    String source2MultiJoinInput = "multiJoinSource2-" + engine;
    String source3MultiJoinInput = "multiJoinSource3-" + engine;
    String outputName = "multiJoinOutput-" + engine;
    String sinkName = "multiJoinOutputSink-" + engine;
    String outerJoinName = "multiJoinOuter-" + engine;
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("source1", MockSource.getPlugin(source1MultiJoinInput, inputSchema1)))
        .addStage(new ETLStage("source2", MockSource.getPlugin(source2MultiJoinInput, inputSchema2)))
        .addStage(new ETLStage("source3", MockSource.getPlugin(source3MultiJoinInput, inputSchema3)))
        .addStage(new ETLStage("t1", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("t2", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("t3", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("t4", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", "")))
        .addStage(new ETLStage(outerJoinName, MockJoiner.getPlugin("t4.item_id=t3.i_id", "", "")))
        .addStage(new ETLStage(sinkName, MockSink.getPlugin(outputName)))
        .addConnection("source1", "t1")
        .addConnection("source2", "t2")
        .addConnection("source3", "t3")
        .addConnection("t1", "innerjoin")
        .addConnection("t2", "innerjoin")
        .addConnection("innerjoin", "t4")
        .addConnection("t3", outerJoinName)
        .addConnection("t4", outerJoinName)
        .addConnection(outerJoinName, sinkName)
        .setEngine(engine)
        .setNumOfRecordsPreview(100)
        .build();
    // Construct the preview config with the program name and program type
    PreviewConfig previewConfig = new PreviewConfig(SmartWorkflow.NAME, ProgramType.WORKFLOW, Collections.<String, String>emptyMap(), 10);
    // Create the tables for the mock sources
    addDatasetInstance(Table.class.getName(), source1MultiJoinInput, DatasetProperties.of(ImmutableMap.of("schema", inputSchema1.toString())));
    addDatasetInstance(Table.class.getName(), source2MultiJoinInput, DatasetProperties.of(ImmutableMap.of("schema", inputSchema2.toString())));
    addDatasetInstance(Table.class.getName(), source3MultiJoinInput, DatasetProperties.of(ImmutableMap.of("schema", inputSchema3.toString())));
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
    // Start the preview and capture its preview id.
    ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
    ingestData(inputSchema1, inputSchema2, inputSchema3, source1MultiJoinInput, source2MultiJoinInput, source3MultiJoinInput);
    // Wait for the preview status to reach COMPLETED.
    Tasks.waitFor(PreviewStatus.Status.COMPLETED, () -> {
        PreviewStatus status = previewManager.getStatus(previewId);
        return status == null ? null : status.getStatus();
    }, 5, TimeUnit.MINUTES);
    checkPreviewStore(previewManager, previewId, sinkName, 3);
    validateMetric(3L, previewId, sinkName + ".records.in", previewManager);
}
Also used: PreviewStatus (io.cdap.cdap.app.preview.PreviewStatus), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), TimeoutException (java.util.concurrent.TimeoutException), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), PreviewManager (io.cdap.cdap.app.preview.PreviewManager), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), PreviewConfig (io.cdap.cdap.proto.artifact.preview.PreviewConfig)
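Tasks.waitFor is CDAP's polling utility; the same wait can be expressed as a plain polling loop. A minimal sketch of that pattern follows; PollUntil and its 500 ms back-off are assumptions for illustration, not part of the test API.

    import java.util.concurrent.Callable;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    // Hypothetical helper, not part of the CDAP API.
    final class PollUntil {
        private PollUntil() {
        }

        // Poll until the callable returns the expected value or the timeout elapses.
        static <T> void awaitValue(T expected, Callable<T> poll, long timeout, TimeUnit unit)
            throws Exception {
            long deadline = System.nanoTime() + unit.toNanos(timeout);
            while (System.nanoTime() < deadline) {
                if (expected.equals(poll.call())) {
                    return; // reached the desired state
                }
                Thread.sleep(500); // brief back-off between polls
            }
            throw new TimeoutException("did not reach " + expected + " within " + timeout + " " + unit);
        }
    }

Passing the same status-fetching callable used above would make awaitValue behave like the Tasks.waitFor call, returning once the preview run reports COMPLETED.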

Example 89 with ETLBatchConfig

Use of io.cdap.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From class DataPipelineServiceTest, method testValidatePipelineChecksNamespaceExistence:

@Test
public void testValidatePipelineChecksNamespaceExistence() throws IOException {
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("src", MockSource.getPlugin("dummy1")))
        .addStage(new ETLStage("sink", MockSink.getPlugin("dummy2")))
        .addConnection("src", "sink")
        .build();
    ArtifactSummary badArtifact = new ArtifactSummary("ghost", "1.0.0");
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(badArtifact, config);
    URL validatePipelineURL = serviceURI.resolve("v1/contexts/ghost/validations/pipeline").toURL();
    HttpRequest request = HttpRequest.builder(HttpMethod.POST, validatePipelineURL).withBody(GSON.toJson(appRequest)).build();
    HttpResponse response = HttpRequests.execute(request, new DefaultHttpRequestConfig(false));
    Assert.assertEquals(404, response.getResponseCode());
}
Also used: ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), HttpRequest (io.cdap.common.http.HttpRequest), ArtifactSummary (io.cdap.cdap.api.artifact.ArtifactSummary), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), DefaultHttpRequestConfig (io.cdap.cdap.common.http.DefaultHttpRequestConfig), HttpResponse (io.cdap.common.http.HttpResponse), URL (java.net.URL), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), Test (org.junit.Test)
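For readers without io.cdap.common.http on the classpath, the same POST can be issued with only the JDK. A sketch under that assumption follows; ValidateCall is a hypothetical helper, and serviceURI plus the serialized AppRequest JSON are taken from the surrounding test.

    import java.io.OutputStream;
    import java.net.HttpURLConnection;
    import java.net.URI;
    import java.nio.charset.StandardCharsets;

    // Hypothetical helper, not part of the CDAP test suite.
    final class ValidateCall {
        private ValidateCall() {
        }

        // POST the serialized AppRequest to the validation endpoint and return the status code.
        static int postValidation(URI serviceURI, String appRequestJson) throws Exception {
            HttpURLConnection conn = (HttpURLConnection) serviceURI
                .resolve("v1/contexts/ghost/validations/pipeline").toURL().openConnection();
            conn.setRequestMethod("POST");
            conn.setRequestProperty("Content-Type", "application/json");
            conn.setDoOutput(true);
            try (OutputStream out = conn.getOutputStream()) {
                out.write(appRequestJson.getBytes(StandardCharsets.UTF_8));
            }
            // The test expects 404 because the "ghost" namespace does not exist.
            return conn.getResponseCode();
        }
    }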

Example 90 with ETLBatchConfig

Use of io.cdap.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From class DraftServiceTest, method createBatchPipelineDraft:

private Draft createBatchPipelineDraft(DraftId draftId, String name, String description) throws IOException {
    ArtifactSummary artifact = new ArtifactSummary("cdap-data-pipeline", "1.0.0");
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("src", MockSource.getPlugin("dummy1")))
        .addStage(new ETLStage("sink", MockSink.getPlugin("dummy2")))
        .addConnection("src", "sink")
        .setEngine(Engine.SPARK)
        .build();
    DraftStoreRequest<ETLBatchConfig> batchDraftStoreRequest = new DraftStoreRequest<>(config, "", name, description, 0, artifact);
    long now = System.currentTimeMillis();
    Draft expectedDraft = new Draft(config, name, description, artifact, draftId.getId(), now, now);
    createPipelineDraft(draftId, batchDraftStoreRequest);
    return expectedDraft;
}
Also used: ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), DraftStoreRequest (io.cdap.cdap.datapipeline.draft.DraftStoreRequest), Draft (io.cdap.cdap.datapipeline.draft.Draft), ArtifactSummary (io.cdap.cdap.api.artifact.ArtifactSummary), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)
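One subtlety in createBatchPipelineDraft: both timestamps of the expected draft are stamped with a single locally captured now. If the draft store assigns its own server-side timestamps, an exact equals() against expectedDraft could be flaky; a tolerant comparison is one common hedge. The helper below is hypothetical and not part of the CDAP test API.

    // Hypothetical helper: true when two millisecond timestamps agree within the tolerance.
    static boolean timestampsRoughlyEqual(long expectedMillis, long actualMillis, long toleranceMillis) {
        return Math.abs(expectedMillis - actualMillis) <= toleranceMillis;
    }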

Aggregations

Types co-occurring with ETLBatchConfig across the indexed examples, with example counts:

ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 121
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 121
Test (org.junit.Test): 117
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 79
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 76
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 74
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 72
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 72
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 70
Table (io.cdap.cdap.api.dataset.table.Table): 68
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 64
Schema (io.cdap.cdap.api.data.schema.Schema): 58
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 46
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 43
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 43
ApplicationManager (co.cask.cdap.test.ApplicationManager): 40
WorkflowManager (co.cask.cdap.test.WorkflowManager): 39
Table (co.cask.cdap.api.dataset.table.Table): 36
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 33
ArrayList (java.util.ArrayList): 33