
Example 1 with SparkManager

Use of io.cdap.cdap.test.SparkManager in project cdap by caskdata.

From the class ReportGenerationAppTest, method testGenerateReport.

@Test
public void testGenerateReport() throws Exception {
    Map<String, String> runTimeArguments = new HashMap<>();
    // Disable the TMS subscriber thread, since the RunMetaFileSet Avro files are written directly
    // by this test case. If the thread were enabled, it would read the latest RunMetaFileSet Avro
    // file's content to find the latest message id to start fetching from, but the derived
    // message id would be invalid in TMS because these runs were never published to TMS.
    // Disabling the thread for this test case avoids that exception.
    runTimeArguments.put(Constants.DISABLE_TMS_SUBSCRIBER_THREAD, "true");
    Long currentTimeMillis = System.currentTimeMillis();
    DatasetId metaFileset = createAndInitializeDataset(NamespaceId.DEFAULT, currentTimeMillis);
    SparkManager sparkManager = deployAndStartReportingApplication(NamespaceId.DEFAULT, runTimeArguments);
    URL url = sparkManager.getServiceURL(1, TimeUnit.MINUTES);
    Assert.assertNotNull(url);
    URL reportURL = url.toURI().resolve("reports/").toURL();
    List<Filter> filters = ImmutableList.of(
        // white list filter
        new ValueFilter<>(Constants.NAMESPACE, ImmutableSet.of("ns1", "ns2"), null),
        new RangeFilter<>(Constants.DURATION, new RangeFilter.Range<>(null, 500L)),
        // black list filter
        new ValueFilter<>(Constants.ARTIFACT_NAME, null, ImmutableSet.of("cdap-data-streams", "cdap-data-pipeline")));
    long startSecs = TimeUnit.MILLISECONDS.toSeconds(currentTimeMillis);
    ReportGenerationRequest request = new ReportGenerationRequest(
        "ns1_ns2_report", startSecs, startSecs + 30,
        new ArrayList<>(ReportField.FIELD_NAME_MAP.keySet()),
        ImmutableList.of(new Sort(Constants.DURATION, Sort.Order.DESCENDING)), filters);
    HttpURLConnection urlConn = (HttpURLConnection) reportURL.openConnection();
    urlConn.setDoOutput(true);
    urlConn.setRequestMethod("POST");
    urlConn.getOutputStream().write(GSON.toJson(request).getBytes(StandardCharsets.UTF_8));
    if (urlConn.getErrorStream() != null) {
        Assert.fail(Bytes.toString(ByteStreams.toByteArray(urlConn.getErrorStream())));
    }
    Assert.assertEquals(200, urlConn.getResponseCode());
    Map<String, String> reportIdMap = getResponseObject(urlConn, STRING_STRING_MAP);
    String reportId = reportIdMap.get("id");
    Assert.assertNotNull(reportId);
    URL reportIdURL = reportURL.toURI().resolve("info?report-id=" + reportId).toURL();
    validateReportSummary(reportIdURL, startSecs);
    // share the report to get the share id
    URL shareReportURL = reportURL.toURI().resolve(reportId + "/share").toURL();
    HttpURLConnection shareURLConnection = (HttpURLConnection) shareReportURL.openConnection();
    shareURLConnection.setRequestMethod("POST");
    ShareId shareId = getResponseObject(shareURLConnection, ShareId.class);
    // test that the summary can be fetched and read using the share id
    URL shareIdSummaryURL = reportURL.toURI().resolve("info?share-id=" + shareId.getShareId()).toURL();
    validateReportSummary(shareIdSummaryURL, startSecs);
    // assert the number of report details is correct
    URL reportRunsURL = reportURL.toURI().resolve("download?report-id=" + reportId).toURL();
    validateReportContent(reportRunsURL);
    // test that the report can be downloaded and read using the share id
    URL shareIdRunsURL = reportURL.toURI().resolve("download?share-id=" + shareId.getShareId()).toURL();
    validateReportContent(shareIdRunsURL);
    // save the report with a new name and description
    URL reportSaveURL = reportURL.toURI().resolve(reportId + "/save").toURL();
    urlConn = (HttpURLConnection) reportSaveURL.openConnection();
    urlConn.setDoOutput(true);
    urlConn.setRequestMethod("POST");
    urlConn.getOutputStream().write(GSON.toJson(new ReportSaveRequest("newName", "newDescription")).getBytes(StandardCharsets.UTF_8));
    if (urlConn.getErrorStream() != null) {
        Assert.fail(Bytes.toString(ByteStreams.toByteArray(urlConn.getErrorStream())));
    }
    Assert.assertEquals(200, urlConn.getResponseCode());
    // verify that the name and description of the report have been updated, and the expiry time is null
    ReportGenerationInfo reportGenerationInfo = getResponseObject(reportIdURL.openConnection(), REPORT_GEN_INFO_TYPE);
    Assert.assertEquals("newName", reportGenerationInfo.getName());
    Assert.assertEquals("newDescription", reportGenerationInfo.getDescription());
    Assert.assertNull(reportGenerationInfo.getExpiry());
    // saving the report again should fail
    urlConn = (HttpURLConnection) reportSaveURL.openConnection();
    urlConn.setDoOutput(true);
    urlConn.setRequestMethod("POST");
    urlConn.getOutputStream().write(GSON.toJson(new ReportSaveRequest("anotherNewName", "anotherNewDescription")).getBytes(StandardCharsets.UTF_8));
    if (urlConn.getErrorStream() != null) {
        Assert.fail(Bytes.toString(ByteStreams.toByteArray(urlConn.getErrorStream())));
    }
    Assert.assertEquals(403, urlConn.getResponseCode());
    // delete the report
    URL reportDeleteURL = reportURL.toURI().resolve(reportId).toURL();
    urlConn = (HttpURLConnection) reportDeleteURL.openConnection();
    urlConn.setRequestMethod("DELETE");
    Assert.assertEquals(200, urlConn.getResponseCode());
    // getting the status of a deleted report returns 404
    Assert.assertEquals(404, ((HttpURLConnection) reportIdURL.openConnection()).getResponseCode());
    // deleting an already-deleted report returns 404
    urlConn = (HttpURLConnection) reportDeleteURL.openConnection();
    urlConn.setRequestMethod("DELETE");
    Assert.assertEquals(404, urlConn.getResponseCode());
    // test querying for a time range before the start secs, to verify empty result contents
    validateEmptyReports(reportURL, startSecs - TimeUnit.HOURS.toSeconds(2), startSecs - 30, filters);
    // test querying for a time range after the start secs, but with a filter that doesn't match
    // any records, to verify empty result contents
    List<Filter> filters2 = ImmutableList.of(
        new ValueFilter<>(Constants.NAMESPACE, ImmutableSet.of("ns1", "ns2"), null),
        // all the programs are run by user alice; user bob will match no records
        new ValueFilter<>(Constants.USER, ImmutableSet.of(USER_BOB), null));
    validateEmptyReports(reportURL, startSecs, startSecs + 30, filters2);
    List<Filter> filters3 = ImmutableList.of(
        new ValueFilter<>(Constants.NAMESPACE, ImmutableSet.of("ns1", "ns2"), null),
        // all the programs share the same test artifact name; blacklisting it yields empty results
        new ValueFilter<>(Constants.ARTIFACT_NAME, null, ImmutableSet.of(TEST_ARTIFACT_NAME)));
    validateEmptyReports(reportURL, startSecs, startSecs + 30, filters3);
    sparkManager.stop();
    sparkManager.waitForStopped(60, TimeUnit.SECONDS);
    deleteDatasetInstance(metaFileset);
}
Also used : SparkManager(io.cdap.cdap.test.SparkManager) HashMap(java.util.HashMap) ReportGenerationRequest(io.cdap.cdap.report.proto.ReportGenerationRequest) ReportSaveRequest(io.cdap.cdap.report.proto.ReportSaveRequest) URL(java.net.URL) DatasetId(io.cdap.cdap.proto.id.DatasetId) HttpURLConnection(java.net.HttpURLConnection) ShareId(io.cdap.cdap.report.proto.ShareId) ValueFilter(io.cdap.cdap.report.proto.ValueFilter) RangeFilter(io.cdap.cdap.report.proto.RangeFilter) Filter(io.cdap.cdap.report.proto.Filter) ReportGenerationInfo(io.cdap.cdap.report.proto.ReportGenerationInfo) Sort(io.cdap.cdap.report.proto.Sort) Test(org.junit.Test)
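
The test relies on a getResponseObject helper that is not shown on this page. Below is a minimal sketch of how such a helper might look, assuming a Gson-based test class with a shared GSON instance; the signature is inferred from the call sites above and is an assumption, not the project's actual code:

// Hypothetical helper inside the test class: deserialize a JSON response body into the requested type.
// Assumes GSON is a shared com.google.gson.Gson instance and typeOfT is a java.lang.reflect.Type.
private static <T> T getResponseObject(URLConnection urlConnection, Type typeOfT) throws IOException {
    try (Reader reader = new InputStreamReader(urlConnection.getInputStream(), StandardCharsets.UTF_8)) {
        return GSON.fromJson(reader, typeOfT);
    } finally {
        // Every connection used in this test is an HttpURLConnection, so disconnect it when done.
        ((HttpURLConnection) urlConnection).disconnect();
    }
}

The call sites pass either a concrete class (ShareId.class) or a Type constant such as STRING_STRING_MAP or REPORT_GEN_INFO_TYPE, which would typically be built with Gson's TypeToken.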

Example 2 with SparkManager

Use of io.cdap.cdap.test.SparkManager in project cdap by caskdata.

From the class DataStreamsTest, method testAutoJoinNullEquality.

private void testAutoJoinNullEquality(boolean nullSafe) throws Exception {
    /*
     * customers ----------|
     *                     |
     *                     |---> join ---> sink
     *                     |
     * transactions -------|
     */
    Schema inputSchema1 = Schema.recordOf(
        "customer",
        Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Schema inputSchema2 = Schema.recordOf(
        "transaction",
        Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)));
    Schema outSchema = Schema.recordOf(
        "customers.transactions",
        Schema.Field.of("customers_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customers_customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("transactions_t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("transactions_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("transactions_item_id", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
    StructuredRecord trans1 = StructuredRecord.builder(inputSchema2).set("t_id", "1").set("customer_id", "1").set("item_id", "11").build();
    StructuredRecord trans2 = StructuredRecord.builder(inputSchema2).set("t_id", "2").set("customer_id", "3").set("item_id", "22").build();
    StructuredRecord trans3 = StructuredRecord.builder(inputSchema2).set("t_id", "3").set("item_id", "33").build();
    List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
    List<StructuredRecord> input2 = ImmutableList.of(trans1, trans2, trans3);
    String outputName = UUID.randomUUID().toString();
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("customers", MockSource.getPlugin(inputSchema1, input1)))
        .addStage(new ETLStage("transactions", MockSource.getPlugin(inputSchema2, input2)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(
            Arrays.asList("customers", "transactions"), Collections.singletonList("customer_id"),
            Collections.singletonList("transactions"), Collections.emptyList(), Collections.emptyList(), nullSafe)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(outputName)))
        .addConnection("customers", "join")
        .addConnection("transactions", "join")
        .addConnection("join", "sink")
        .setBatchInterval("5s")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    StructuredRecord join1 = StructuredRecord.builder(outSchema).set("customers_customer_id", "1").set("customers_customer_name", "samuel").set("transactions_t_id", "1").set("transactions_customer_id", "1").set("transactions_item_id", "11").build();
    StructuredRecord join2 = StructuredRecord.builder(outSchema).set("customers_customer_id", "3").set("customers_customer_name", "jane").set("transactions_t_id", "2").set("transactions_customer_id", "3").set("transactions_item_id", "22").build();
    StructuredRecord join3;
    if (nullSafe) {
        // this transaction has a null customer id, which should match with the null id from customers
        join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").set("customers_customer_name", "bob").build();
    } else {
        // this transaction has a null customer id, which should not match with the null id from customers
        join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").build();
    }
    Set<StructuredRecord> expected = ImmutableSet.of(join1, join2, join3);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
        return expected.equals(outputRecords);
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)
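
All of the DataStreamsTest examples on this page wrap their pipeline-specific setup in the same SparkManager lifecycle. Here is a condensed outline of that pattern, with the variable names and timeouts taken from the example above; it is a summary sketch, not standalone code:

// Deploy the pipeline, start the run, poll the sink until it matches, then shut down cleanly.
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
// Wait until the streaming run is actually live before checking output.
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
DataSetManager<Table> outputManager = getDataset(outputName);
Tasks.waitFor(true, () -> {
    outputManager.flush();
    return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
}, 4, TimeUnit.MINUTES);
// Stop the run and block until it is fully stopped so later tests start from a clean state.
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);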

Example 3 with SparkManager

Use of io.cdap.cdap.test.SparkManager in project cdap by caskdata.

From the class DataStreamsTest, method testTransformComputeRun.

private void testTransformComputeRun(ApplicationManager appManager, final Set<StructuredRecord> expected, String val1, String val2, final String outputName) throws Exception {
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start(ImmutableMap.of("field", "name", "val1", val1, "val2", val2, "output", outputName));
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    // since the dataset name is a macro, the dataset isn't created until it is needed; wait for it to exist
    Tasks.waitFor(true, () -> getDataset(outputName).get() != null, 1, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
        return expected.equals(outputRecords);
    }, 1, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
Also used : SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet)
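
A hypothetical caller of this helper might run it twice against the same deployed application to check that different runtime arguments produce different outputs. The schema, expected records, values, and dataset names below are illustrative only, not taken from the actual test:

// Illustrative expected records; the real test derives these from its own source data.
Schema schema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
Set<StructuredRecord> expectedFirst = ImmutableSet.of(
    StructuredRecord.builder(schema).set("name", "samuel").build());
Set<StructuredRecord> expectedSecond = ImmutableSet.of(
    StructuredRecord.builder(schema).set("name", "jane").build());

// Each call starts the Spark program with its own macro values and output dataset,
// waits for the expected records to appear, and stops the run before returning.
testTransformComputeRun(appManager, expectedFirst, "samuel", "jane", "output1");
testTransformComputeRun(appManager, expectedSecond, "jane", "samuel", "output2");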

Example 4 with SparkManager

Use of io.cdap.cdap.test.SparkManager in project cdap by caskdata.

From the class DataStreamsTest, method testAutoJoin.

@Test
public void testAutoJoin() throws Exception {
    /*
     * customers ----------|
     *                     |
     *                     |---> join ---> sink
     *                     |
     * transactions -------|
     */
    Schema inputSchema1 = Schema.recordOf(
        "customer",
        Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema2 = Schema.recordOf(
        "transaction",
        Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)));
    Schema outSchema = Schema.recordOf(
        "customers.transactions",
        Schema.Field.of("customers_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customers_customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("transactions_t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("transactions_customer_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("transactions_item_id", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
    StructuredRecord tx1 = StructuredRecord.builder(inputSchema2).set("t_id", "1").set("customer_id", "1").set("item_id", "11").build();
    StructuredRecord tx2 = StructuredRecord.builder(inputSchema2).set("t_id", "2").set("customer_id", "3").set("item_id", "22").build();
    StructuredRecord tx3 = StructuredRecord.builder(inputSchema2).set("t_id", "3").set("customer_id", "4").set("item_id", "33").build();
    List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
    List<StructuredRecord> input2 = ImmutableList.of(tx1, tx2, tx3);
    String outputName = UUID.randomUUID().toString();
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("customers", MockSource.getPlugin(inputSchema1, input1)))
        .addStage(new ETLStage("transactions", MockSource.getPlugin(inputSchema2, input2)))
        .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(
            Arrays.asList("customers", "transactions"), Collections.singletonList("customer_id"),
            Collections.singletonList("transactions"), Collections.emptyList(), Collections.emptyList(), true)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(outputName)))
        .addConnection("customers", "join")
        .addConnection("transactions", "join")
        .addConnection("join", "sink")
        .setBatchInterval("5s")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("AutoJoinerApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    StructuredRecord join1 = StructuredRecord.builder(outSchema).set("customers_customer_id", "1").set("customers_customer_name", "samuel").set("transactions_t_id", "1").set("transactions_customer_id", "1").set("transactions_item_id", "11").build();
    StructuredRecord join2 = StructuredRecord.builder(outSchema).set("customers_customer_id", "3").set("customers_customer_name", "jane").set("transactions_t_id", "2").set("transactions_customer_id", "3").set("transactions_item_id", "22").build();
    StructuredRecord join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_customer_id", "4").set("transactions_item_id", "33").build();
    Set<StructuredRecord> expected = ImmutableSet.of(join1, join2, join3);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
        return expected.equals(outputRecords);
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)
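
The positional arguments to MockAutoJoiner.getPlugin are hard to read inline. The sketch below restates the join stage from this example with named locals; the meaning attached to each argument is an assumption inferred from the argument values and the resulting output schema, not from documented plugin behavior:

// Assumed meaning of the positional arguments, inferred from the example above.
List<String> stagesToJoin = Arrays.asList("customers", "transactions");   // input stages participating in the join
List<String> joinKeys = Collections.singletonList("customer_id");         // key field shared by both inputs
List<String> requiredStages = Collections.singletonList("transactions");  // fields from non-required stages come out nullable
List<String> extraList1 = Collections.emptyList();                        // left empty in this example
List<String> extraList2 = Collections.emptyList();                        // left empty in this example
boolean nullSafe = true;                                                   // whether null join keys match each other (see Example 2)

ETLStage joinStage = new ETLStage("join",
    MockAutoJoiner.getPlugin(stagesToJoin, joinKeys, requiredStages, extraList1, extraList2, nullSafe));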

Example 5 with SparkManager

Use of io.cdap.cdap.test.SparkManager in project cdap by caskdata.

From the class DataStreamsTest, method testAggregatorJoinerMacrosWithCheckpoints.

private void testAggregatorJoinerMacrosWithCheckpoints(boolean isReducibleAggregator) throws Exception {
    /*
                 |--> aggregator --> sink1
        users1 --|
                 |----|
                      |--> dupeFlagger --> sink2
        users2 -------|
     */
    Schema userSchema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> users1 = ImmutableList.of(StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(), StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(), StructuredRecord.builder(userSchema).set("id", 3L).set("name", "Terry").build());
    List<StructuredRecord> users2 = ImmutableList.of(StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(), StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(), StructuredRecord.builder(userSchema).set("id", 4L).set("name", "Terry").build(), StructuredRecord.builder(userSchema).set("id", 5L).set("name", "Christopher").build());
    DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
        .setBatchInterval("5s")
        .addStage(new ETLStage("users1", MockSource.getPlugin(userSchema, users1)))
        .addStage(new ETLStage("users2", MockSource.getPlugin(userSchema, users2)))
        .addStage(new ETLStage("sink1", MockSink.getPlugin("sink1")))
        .addStage(new ETLStage("sink2", MockSink.getPlugin("sink2")))
        .addStage(new ETLStage("aggregator", isReducibleAggregator
            ? FieldCountReducibleAggregator.getPlugin("${aggfield}", "${aggType}")
            : FieldCountAggregator.getPlugin("${aggfield}", "${aggType}")))
        .addStage(new ETLStage("dupeFlagger", DupeFlagger.getPlugin("users1", "${flagField}")))
        .addConnection("users1", "aggregator")
        .addConnection("aggregator", "sink1")
        .addConnection("users1", "dupeFlagger")
        .addConnection("users2", "dupeFlagger")
        .addConnection("dupeFlagger", "sink2")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggJoinApp" + isReducibleAggregator);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // run it once with this set of macros
    Map<String, String> arguments = new HashMap<>();
    arguments.put("aggfield", "id");
    arguments.put("aggType", "long");
    arguments.put("flagField", "isDupe");
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start(arguments);
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    DataSetManager<Table> sink1 = getDataset("sink1");
    DataSetManager<Table> sink2 = getDataset("sink2");
    Schema aggSchema = Schema.recordOf("id.count", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    Set<StructuredRecord> expectedAggregates = ImmutableSet.of(StructuredRecord.builder(aggSchema).set("id", 0L).set("ct", 3L).build(), StructuredRecord.builder(aggSchema).set("id", 1L).set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("id", 2L).set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("id", 3L).set("ct", 1L).build());
    Schema outputSchema = Schema.recordOf("user.flagged", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("isDupe", Schema.of(Schema.Type.BOOLEAN)));
    Set<StructuredRecord> expectedJoined = ImmutableSet.of(StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("isDupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("isDupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("isDupe", false).build());
    Tasks.waitFor(true, () -> {
        sink1.flush();
        sink2.flush();
        Set<StructuredRecord> actualAggs = new HashSet<>(MockSink.readOutput(sink1));
        Set<StructuredRecord> actualJoined = new HashSet<>(MockSink.readOutput(sink2));
        return expectedAggregates.equals(actualAggs) && expectedJoined.equals(actualJoined);
    }, 1, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStopped(30, TimeUnit.SECONDS);
    MockSink.clear(sink1);
    MockSink.clear(sink2);
    // run it again with different macros to make sure they are re-evaluated and not stored in the checkpoint
    arguments = new HashMap<>();
    arguments.put("aggfield", "name");
    arguments.put("aggType", "string");
    arguments.put("flagField", "dupe");
    sparkManager.start(arguments);
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    aggSchema = Schema.recordOf("name.count", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    Set<StructuredRecord> expectedAggregates2 = ImmutableSet.of(StructuredRecord.builder(aggSchema).set("name", "all").set("ct", 3L).build(), StructuredRecord.builder(aggSchema).set("name", "Samuel").set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("name", "Dwayne").set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("name", "Terry").set("ct", 1L).build());
    outputSchema = Schema.recordOf("user.flagged", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("dupe", Schema.of(Schema.Type.BOOLEAN)));
    Set<StructuredRecord> expectedJoined2 = ImmutableSet.of(StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("dupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("dupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("dupe", false).build());
    Tasks.waitFor(true, () -> {
        sink1.flush();
        sink2.flush();
        Set<StructuredRecord> actualAggs = new HashSet<>(MockSink.readOutput(sink1));
        Set<StructuredRecord> actualJoined = new HashSet<>(MockSink.readOutput(sink2));
        return expectedAggregates2.equals(actualAggs) && expectedJoined2.equals(actualJoined);
    }, 1, TimeUnit.MINUTES);
    sparkManager.stop();
    MockSink.clear(sink1);
    MockSink.clear(sink2);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)
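
The wait-and-verify block appears twice in this test with only the expected sets changing. Below is a small sketch of how it could be factored into a helper; this is a refactoring suggestion built from the code above, not code from the project:

// Hypothetical helper: poll both sinks until their contents match the expected record sets.
private void waitForSinkContents(DataSetManager<Table> sink1, DataSetManager<Table> sink2,
                                 Set<StructuredRecord> expectedAggregates,
                                 Set<StructuredRecord> expectedJoined) throws Exception {
    Tasks.waitFor(true, () -> {
        sink1.flush();
        sink2.flush();
        Set<StructuredRecord> actualAggs = new HashSet<>(MockSink.readOutput(sink1));
        Set<StructuredRecord> actualJoined = new HashSet<>(MockSink.readOutput(sink2));
        return expectedAggregates.equals(actualAggs) && expectedJoined.equals(actualJoined);
    }, 1, TimeUnit.MINUTES);
}

Each run of the test would then reduce to starting the program with its macro arguments, calling waitForSinkContents with that run's expected sets, and stopping the program.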

Aggregations

SparkManager (io.cdap.cdap.test.SparkManager): 41
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 32
Test (org.junit.Test): 29
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 17
HashMap (java.util.HashMap): 15
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 12
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 12
HashSet (java.util.HashSet): 12
Table (io.cdap.cdap.api.dataset.table.Table): 11
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 11
Schema (io.cdap.cdap.api.data.schema.Schema): 10
DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig): 10
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 10
FileSet (io.cdap.cdap.api.dataset.lib.FileSet): 9
Location (org.apache.twill.filesystem.Location): 9
File (java.io.File): 8
URL (java.net.URL): 7
PrintStream (java.io.PrintStream): 6
HttpURLConnection (java.net.HttpURLConnection): 6
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 5