
Example 26 with SparkManager

use of co.cask.cdap.test.SparkManager in project cdap by caskdata.

From class DataStreamsTest, method testJoin.

@Test
public void testJoin() throws Exception {
    /*
     * source1 ----> t1 ------
     *                        | --> innerjoin ----> t4 ------
     * source2 ----> t2 ------                                 |
     *                                                         | ---> outerjoin --> sink1
     *                                                         |
     * source3 -------------------- t3 ------------------------
     */
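    // Schemas for the customer, item, and transaction sources, plus the fully nullable outer-join output schema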
    Schema inputSchema1 = Schema.recordOf(
        "customerRecord",
        Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema2 = Schema.recordOf(
        "itemRecord",
        Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema3 = Schema.recordOf(
        "transactionRecord",
        Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));
    Schema outSchema2 = Schema.recordOf(
        "join.output",
        Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
        Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
    StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
    StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
    StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
    StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
    StructuredRecord recordTrasPlane = StructuredRecord.builder(inputSchema3).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
    List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
    List<StructuredRecord> input2 = ImmutableList.of(recordCar, recordBike);
    List<StructuredRecord> input3 = ImmutableList.of(recordTrasCar, recordTrasBike, recordTrasPlane);
    String outputName = "multiJoinOutputSink";
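    // Build the streaming pipeline shown in the diagram above, with a 5s batch interval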
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema1, input1)))
        .addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema2, input2)))
        .addStage(new ETLStage("source3", MockSource.getPlugin(inputSchema3, input3)))
        .addStage(new ETLStage("t1", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("t2", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("t3", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("t4", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", "")))
        .addStage(new ETLStage("outerjoin", MockJoiner.getPlugin("t4.item_id=t3.i_id", "", "")))
        .addStage(new ETLStage("multijoinSink", MockSink.getPlugin(outputName)))
        .addConnection("source1", "t1")
        .addConnection("source2", "t2")
        .addConnection("source3", "t3")
        .addConnection("t1", "innerjoin")
        .addConnection("t2", "innerjoin")
        .addConnection("innerjoin", "t4")
        .addConnection("t3", "outerjoin")
        .addConnection("t4", "outerjoin")
        .addConnection("outerjoin", "multijoinSink")
        .setBatchInterval("5s")
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("JoinerApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);
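    // Expected output: the two inner-join matches (samuel, jane) plus the unmatched transaction kept by the outer join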
    StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema2).set("customer_id", "1").set("customer_name", "samuel").set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
    StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema2).set("customer_id", "3").set("customer_name", "jane").set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
    StructuredRecord joinRecordPlane = StructuredRecord.builder(outSchema2).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
    final Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane, joinRecordPlane);
    final DataSetManager<Table> outputManager = getDataset(outputName);
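    // Poll the sink dataset until it contains exactly the expected joined records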
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            outputManager.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(outputManager));
            return expected.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStatus(false, 10, 1);
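    // Verify the per-stage record counts emitted as metrics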
    validateMetric(appId, "source1.records.out", 3);
    validateMetric(appId, "source2.records.out", 2);
    validateMetric(appId, "source3.records.out", 3);
    validateMetric(appId, "t1.records.in", 3);
    validateMetric(appId, "t1.records.out", 3);
    validateMetric(appId, "t2.records.in", 2);
    validateMetric(appId, "t2.records.out", 2);
    validateMetric(appId, "t3.records.in", 3);
    validateMetric(appId, "t3.records.out", 3);
    validateMetric(appId, "t4.records.in", 2);
    validateMetric(appId, "t4.records.out", 2);
    validateMetric(appId, "innerjoin.records.in", 5);
    validateMetric(appId, "innerjoin.records.out", 2);
    validateMetric(appId, "outerjoin.records.in", 5);
    validateMetric(appId, "outerjoin.records.out", 3);
    validateMetric(appId, "multijoinSink.records.in", 3);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) Table(co.cask.cdap.api.dataset.table.Table) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) TopicNotFoundException(co.cask.cdap.api.messaging.TopicNotFoundException) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 27 with SparkManager

use of co.cask.cdap.test.SparkManager in project cdap by caskdata.

From class DataStreamsTest, method testSplitterTransform.

@Test
public void testSplitterTransform() throws Exception {
    Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    StructuredRecord user0 = StructuredRecord.builder(schema).set("id", 0L).build();
    StructuredRecord user1 = StructuredRecord.builder(schema).set("id", 1L).set("email", "one@example.com").build();
    StructuredRecord user2 = StructuredRecord.builder(schema).set("id", 2L).set("name", "two").build();
    StructuredRecord user3 = StructuredRecord.builder(schema).set("id", 3L).set("name", "three").set("email", "three@example.com").build();
    String sink1Name = "splitSink1";
    String sink2Name = "splitSink2";
    /*
     *
     *                                            |null --> sink1
     *                       |null--> splitter2 --|
     * source --> splitter1--|                    |non-null --|
     *                       |                                |--> sink2
     *                       |non-null------------------------|
     */
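    // Build the streaming pipeline shown in the diagram above: splitter1 splits on null name, splitter2 on null email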
    DataStreamsConfig config = DataStreamsConfig.builder()
        .setBatchInterval("5s")
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, ImmutableList.of(user0, user1, user2, user3))))
        .addStage(new ETLStage("splitter1", NullFieldSplitterTransform.getPlugin("name")))
        .addStage(new ETLStage("splitter2", NullFieldSplitterTransform.getPlugin("email")))
        .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
        .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
        .addConnection("source", "splitter1")
        .addConnection("splitter1", "splitter2", "null")
        .addConnection("splitter1", "sink2", "non-null")
        .addConnection("splitter2", "sink1", "null")
        .addConnection("splitter2", "sink2", "non-null")
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app("SplitterTest");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // run pipeline
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);
    // check output
    // sink1 should only have records where both name and email are null (user0)
    final DataSetManager<Table> sink1Manager = getDataset(sink1Name);
    final Set<StructuredRecord> expected1 = ImmutableSet.of(user0);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sink1Manager.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(sink1Manager));
            return expected1.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);
    // sink2 should have anything with a non-null name or non-null email
    final DataSetManager<Table> sink2Manager = getDataset(sink2Name);
    final Set<StructuredRecord> expected2 = ImmutableSet.of(user1, user2, user3);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sink2Manager.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(sink2Manager));
            return expected2.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStatus(false, 10, 1);
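    // Verify per-stage record counts, including the per-port (null / non-null) counts for the splitters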
    validateMetric(appId, "source.records.out", 4);
    validateMetric(appId, "splitter1.records.in", 4);
    validateMetric(appId, "splitter1.records.out.non-null", 2);
    validateMetric(appId, "splitter1.records.out.null", 2);
    validateMetric(appId, "splitter2.records.in", 2);
    validateMetric(appId, "splitter2.records.out.non-null", 1);
    validateMetric(appId, "splitter2.records.out.null", 1);
    validateMetric(appId, "sink1.records.in", 1);
    validateMetric(appId, "sink2.records.in", 3);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) Table(co.cask.cdap.api.dataset.table.Table) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) TopicNotFoundException(co.cask.cdap.api.messaging.TopicNotFoundException) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 28 with SparkManager

use of co.cask.cdap.test.SparkManager in project cdap by caskdata.

From class SparkPageRankAppTest, method test.

@Test
public void test() throws Exception {
    // Deploy the SparkPageRankApp
    ApplicationManager appManager = deployApplication(SparkPageRankApp.class);
    // Send stream events to the Stream
    StreamManager streamManager = getStreamManager(SparkPageRankApp.BACKLINK_URL_STREAM);
    streamManager.send(Joiner.on(" ").join(URL_1, URL_2));
    streamManager.send(Joiner.on(" ").join(URL_1, URL_3));
    streamManager.send(Joiner.on(" ").join(URL_2, URL_1));
    streamManager.send(Joiner.on(" ").join(URL_3, URL_1));
    // Start service
    ServiceManager serviceManager = appManager.getServiceManager(SparkPageRankApp.SERVICE_HANDLERS).start();
    // Wait for service to start since the Spark program needs it
    serviceManager.waitForStatus(true);
    // Start the SparkPageRankProgram
    SparkManager sparkManager = appManager.getSparkManager(SparkPageRankApp.PageRankSpark.class.getSimpleName()).start();
    sparkManager.waitForFinish(60, TimeUnit.SECONDS);
    // Run RanksCounter, which will count the number of pages for each page rank
    MapReduceManager mapReduceManager = appManager.getMapReduceManager(SparkPageRankApp.RanksCounter.class.getSimpleName()).start();
    mapReduceManager.waitForFinish(3, TimeUnit.MINUTES);
    // Query for rank
    URL url = new URL(serviceManager.getServiceURL(15, TimeUnit.SECONDS), SparkPageRankApp.SparkPageRankServiceHandler.RANKS_PATH);
    HttpRequest request = HttpRequest.post(url).withBody(("{\"" + SparkPageRankApp.SparkPageRankServiceHandler.URL_KEY + "\":\"" + URL_1 + "\"}")).build();
    HttpResponse response = HttpRequests.execute(request);
    Assert.assertEquals(HttpURLConnection.HTTP_OK, response.getResponseCode());
    Assert.assertEquals(RANK, response.getResponseBodyAsString());
    // Request total pages for a page rank and verify it
    url = new URL(serviceManager.getServiceURL(15, TimeUnit.SECONDS), SparkPageRankApp.SparkPageRankServiceHandler.TOTAL_PAGES_PATH + "/" + RANK);
    response = HttpRequests.execute(HttpRequest.get(url).build());
    Assert.assertEquals(TOTAL_PAGES, response.getResponseBodyAsString());
}
Also used : HttpRequest(co.cask.common.http.HttpRequest) ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) MapReduceManager(co.cask.cdap.test.MapReduceManager) StreamManager(co.cask.cdap.test.StreamManager) ServiceManager(co.cask.cdap.test.ServiceManager) HttpResponse(co.cask.common.http.HttpResponse) URL(java.net.URL) Test(org.junit.Test)

Example 29 with SparkManager

use of co.cask.cdap.test.SparkManager in project cdap by caskdata.

From class SparkTestRun, method testTransaction.

@Test
public void testTransaction() throws Exception {
    ApplicationManager applicationManager = deploy(TestSparkApp.class);
    StreamManager streamManager = getStreamManager("SparkStream");
    // Write some sentences to the stream
    streamManager.send("red fox");
    streamManager.send("brown fox");
    streamManager.send("grey fox");
    streamManager.send("brown bear");
    streamManager.send("black bear");
    // Run the spark program
    SparkManager sparkManager = applicationManager.getSparkManager(TransactionSpark.class.getSimpleName());
    sparkManager.start(ImmutableMap.of(
        "source.stream", "SparkStream",
        "keyvalue.table", "KeyValueTable",
        "result.all.dataset", "SparkResult",
        "result.threshold", "2",
        "result.threshold.dataset", "SparkThresholdResult"));
    // Verify result from dataset before the Spark program terminates
    final DataSetManager<KeyValueTable> resultManager = getDataset("SparkThresholdResult");
    final KeyValueTable resultTable = resultManager.get();
    // Expect the threshold result dataset (threshold >= 2) to contain [brown, fox, bear]
    Tasks.waitFor(ImmutableSet.of("brown", "fox", "bear"), new Callable<Set<String>>() {

        @Override
        public Set<String> call() throws Exception {
            // This is to start a new TX
            resultManager.flush();
            LOG.info("Reading from threshold result");
            try (CloseableIterator<KeyValue<byte[], byte[]>> itor = resultTable.scan(null, null)) {
                return ImmutableSet.copyOf(Iterators.transform(itor, new Function<KeyValue<byte[], byte[]>, String>() {

                    @Override
                    public String apply(KeyValue<byte[], byte[]> input) {
                        String word = Bytes.toString(input.getKey());
                        LOG.info("{}, {}", word, Bytes.toInt(input.getValue()));
                        return word;
                    }
                }));
            }
        }
    }, 3, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
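    // Stop the long-running streaming program; its run is expected to end in the KILLED state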
    sparkManager.stop();
    sparkManager.waitForRun(ProgramRunStatus.KILLED, 60, TimeUnit.SECONDS);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) CloseableIterator(co.cask.cdap.api.dataset.lib.CloseableIterator) SparkManager(co.cask.cdap.test.SparkManager) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) FileSet(co.cask.cdap.api.dataset.lib.FileSet) KeyValue(co.cask.cdap.api.dataset.lib.KeyValue) IOException(java.io.IOException) TransactionSpark(co.cask.cdap.spark.app.TransactionSpark) StreamManager(co.cask.cdap.test.StreamManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Test(org.junit.Test)

Example 30 with SparkManager

use of co.cask.cdap.test.SparkManager in project cdap by caskdata.

From class SparkTestRun, method testSparkWithObjectStore.

@Test
public void testSparkWithObjectStore() throws Exception {
    ApplicationManager applicationManager = deploy(SparkAppUsingObjectStore.class);
    DataSetManager<ObjectStore<String>> keysManager = getDataset("keys");
    prepareInputData(keysManager);
    SparkManager sparkManager = applicationManager.getSparkManager(CharCountProgram.class.getSimpleName()).start();
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
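    // Verify the character counts written by the Spark program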
    DataSetManager<KeyValueTable> countManager = getDataset("count");
    checkOutputData(countManager);
    // validate that the table emitted metrics
    // one read + one write in beforeSubmit(), increment (= read + write) in main -> 4
    Tasks.waitFor(4L, new Callable<Long>() {

        @Override
        public Long call() throws Exception {
            Collection<MetricTimeSeries> metrics = getMetricsManager().query(
                new MetricDataQuery(
                    0, System.currentTimeMillis() / 1000L, Integer.MAX_VALUE,
                    "system." + Constants.Metrics.Name.Dataset.OP_COUNT,
                    AggregationFunction.SUM,
                    ImmutableMap.of(
                        Constants.Metrics.Tag.NAMESPACE, DefaultId.NAMESPACE.getNamespace(),
                        Constants.Metrics.Tag.APP, SparkAppUsingObjectStore.class.getSimpleName(),
                        Constants.Metrics.Tag.SPARK, CharCountProgram.class.getSimpleName(),
                        Constants.Metrics.Tag.DATASET, "totals"),
                    Collections.<String>emptyList()));
            if (metrics.isEmpty()) {
                return 0L;
            }
            Assert.assertEquals(1, metrics.size());
            MetricTimeSeries ts = metrics.iterator().next();
            Assert.assertEquals(1, ts.getTimeValues().size());
            return ts.getTimeValues().get(0).getValue();
        }
    }, 10L, TimeUnit.SECONDS, 50L, TimeUnit.MILLISECONDS);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) ObjectStore(co.cask.cdap.api.dataset.lib.ObjectStore) SparkAppUsingObjectStore(co.cask.cdap.spark.app.SparkAppUsingObjectStore) SparkManager(co.cask.cdap.test.SparkManager) MetricTimeSeries(co.cask.cdap.api.metrics.MetricTimeSeries) IOException(java.io.IOException) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Collection(java.util.Collection) MetricDataQuery(co.cask.cdap.api.metrics.MetricDataQuery) Test(org.junit.Test)

Aggregations

SparkManager (co.cask.cdap.test.SparkManager): 58
ApplicationManager (co.cask.cdap.test.ApplicationManager): 52
Test (org.junit.Test): 48
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 29
StreamManager (co.cask.cdap.test.StreamManager): 21
HashMap (java.util.HashMap): 14
ImmutableSet (com.google.common.collect.ImmutableSet): 13
Set (java.util.Set): 13
FileSet (co.cask.cdap.api.dataset.lib.FileSet): 12
TimeoutException (java.util.concurrent.TimeoutException): 12
Schema (co.cask.cdap.api.data.schema.Schema): 10
Table (co.cask.cdap.api.dataset.table.Table): 10
ServiceManager (co.cask.cdap.test.ServiceManager): 10
IOException (java.io.IOException): 10
URL (java.net.URL): 10
Location (org.apache.twill.filesystem.Location): 10
TopicNotFoundException (co.cask.cdap.api.messaging.TopicNotFoundException): 9
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 9
HashSet (java.util.HashSet): 9
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 8
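
The examples above share a common shape: deploy the application, obtain a SparkManager from the ApplicationManager, start the program, wait for it to run or finish, verify the output, and check metrics. The sketch below condenses that pattern; MyApp and "MySparkProgram" are hypothetical placeholders, while the manager calls (deployApplication, getSparkManager, start, waitForRun, waitForStatus, stop) are the ones used in the examples above.

// Minimal sketch of the SparkManager test pattern, assuming a hypothetical
// application class MyApp that contains a Spark program named "MySparkProgram".
@Test
public void testMySparkProgram() throws Exception {
    // Deploy the application and obtain a manager for its Spark program
    ApplicationManager appManager = deployApplication(MyApp.class);
    SparkManager sparkManager = appManager.getSparkManager("MySparkProgram").start();
    // For a batch-style program, wait for it to complete and then verify its output
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // For a streaming program, wait until it is running, verify output, then stop it:
    // sparkManager.waitForStatus(true, 10, 1);
    // ... verify datasets with Tasks.waitFor(...) ...
    // sparkManager.stop();
    // sparkManager.waitForStatus(false, 10, 1);
}

Wrapping dataset assertions in Tasks.waitFor, as the examples do, avoids flaky checks while streaming micro-batches catch up.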