use of co.cask.cdap.test.SparkManager in project cdap by caskdata.
the class DataStreamsTest method testJoin.
@Test
public void testJoin() throws Exception {
/*
* source1 ----> t1 ------
* | --> innerjoin ----> t4 ------
* source2 ----> t2 ------ |
* | ---> outerjoin --> sink1
* |
* source3 -------------------- t3 ------------------------
*/
Schema inputSchema1 = Schema.recordOf("customerRecord", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("itemRecord", Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema3 = Schema.recordOf("transactionRecord", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));
Schema outSchema2 = Schema.recordOf("join.output", Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))), Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
StructuredRecord recordTrasPlane = StructuredRecord.builder(inputSchema3).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
List<StructuredRecord> input2 = ImmutableList.of(recordCar, recordBike);
List<StructuredRecord> input3 = ImmutableList.of(recordTrasCar, recordTrasBike, recordTrasPlane);
String outputName = "multiJoinOutputSink";
DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema1, input1))).addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema2, input2))).addStage(new ETLStage("source3", MockSource.getPlugin(inputSchema3, input3))).addStage(new ETLStage("t1", IdentityTransform.getPlugin())).addStage(new ETLStage("t2", IdentityTransform.getPlugin())).addStage(new ETLStage("t3", IdentityTransform.getPlugin())).addStage(new ETLStage("t4", IdentityTransform.getPlugin())).addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", ""))).addStage(new ETLStage("outerjoin", MockJoiner.getPlugin("t4.item_id=t3.i_id", "", ""))).addStage(new ETLStage("multijoinSink", MockSink.getPlugin(outputName))).addConnection("source1", "t1").addConnection("source2", "t2").addConnection("source3", "t3").addConnection("t1", "innerjoin").addConnection("t2", "innerjoin").addConnection("innerjoin", "t4").addConnection("t3", "outerjoin").addConnection("t4", "outerjoin").addConnection("outerjoin", "multijoinSink").setBatchInterval("5s").build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("JoinerApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForStatus(true, 10, 1);
StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema2).set("customer_id", "1").set("customer_name", "samuel").set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema2).set("customer_id", "3").set("customer_name", "jane").set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
StructuredRecord joinRecordPlane = StructuredRecord.builder(outSchema2).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
final Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane, joinRecordPlane);
final DataSetManager<Table> outputManager = getDataset(outputName);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
outputManager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(outputManager));
return expected.equals(outputRecords);
}
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStatus(false, 10, 1);
validateMetric(appId, "source1.records.out", 3);
validateMetric(appId, "source2.records.out", 2);
validateMetric(appId, "source3.records.out", 3);
validateMetric(appId, "t1.records.in", 3);
validateMetric(appId, "t1.records.out", 3);
validateMetric(appId, "t2.records.in", 2);
validateMetric(appId, "t2.records.out", 2);
validateMetric(appId, "t3.records.in", 3);
validateMetric(appId, "t3.records.out", 3);
validateMetric(appId, "t4.records.in", 2);
validateMetric(appId, "t4.records.out", 2);
validateMetric(appId, "innerjoin.records.in", 5);
validateMetric(appId, "innerjoin.records.out", 2);
validateMetric(appId, "outerjoin.records.in", 5);
validateMetric(appId, "outerjoin.records.out", 3);
validateMetric(appId, "multijoinSink.records.in", 3);
}
use of co.cask.cdap.test.SparkManager in project cdap by caskdata.
the class DataStreamsTest method testSplitterTransform.
@Test
public void testSplitterTransform() throws Exception {
Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord user0 = StructuredRecord.builder(schema).set("id", 0L).build();
StructuredRecord user1 = StructuredRecord.builder(schema).set("id", 1L).set("email", "one@example.com").build();
StructuredRecord user2 = StructuredRecord.builder(schema).set("id", 2L).set("name", "two").build();
StructuredRecord user3 = StructuredRecord.builder(schema).set("id", 3L).set("name", "three").set("email", "three@example.com").build();
String sink1Name = "splitSink1";
String sink2Name = "splitSink2";
/*
*
* |null --> sink1
* |null--> splitter2 --|
* source --> splitter1--| |non-null --|
* | |--> sink2
* |non-null------------------------|
*/
DataStreamsConfig config = DataStreamsConfig.builder().setBatchInterval("5s").addStage(new ETLStage("source", MockSource.getPlugin(schema, ImmutableList.of(user0, user1, user2, user3)))).addStage(new ETLStage("splitter1", NullFieldSplitterTransform.getPlugin("name"))).addStage(new ETLStage("splitter2", NullFieldSplitterTransform.getPlugin("email"))).addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name))).addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name))).addConnection("source", "splitter1").addConnection("splitter1", "splitter2", "null").addConnection("splitter1", "sink2", "non-null").addConnection("splitter2", "sink1", "null").addConnection("splitter2", "sink2", "non-null").build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("SplitterTest");
ApplicationManager appManager = deployApplication(appId, appRequest);
// run pipeline
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForStatus(true, 10, 1);
// check output
// sink1 should only have records where both name and email are null (user0)
final DataSetManager<Table> sink1Manager = getDataset(sink1Name);
final Set<StructuredRecord> expected1 = ImmutableSet.of(user0);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sink1Manager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sink1Manager));
return expected1.equals(outputRecords);
}
}, 4, TimeUnit.MINUTES);
// sink2 should have anything with a non-null name or non-null email
final DataSetManager<Table> sink2Manager = getDataset(sink2Name);
final Set<StructuredRecord> expected2 = ImmutableSet.of(user1, user2, user3);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sink2Manager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sink2Manager));
return expected2.equals(outputRecords);
}
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStatus(false, 10, 1);
validateMetric(appId, "source.records.out", 4);
validateMetric(appId, "splitter1.records.in", 4);
validateMetric(appId, "splitter1.records.out.non-null", 2);
validateMetric(appId, "splitter1.records.out.null", 2);
validateMetric(appId, "splitter2.records.in", 2);
validateMetric(appId, "splitter2.records.out.non-null", 1);
validateMetric(appId, "splitter2.records.out.null", 1);
validateMetric(appId, "sink1.records.in", 1);
validateMetric(appId, "sink2.records.in", 3);
}
use of co.cask.cdap.test.SparkManager in project cdap by caskdata.
the class SparkPageRankAppTest method test.
@Test
public void test() throws Exception {
// Deploy the SparkPageRankApp
ApplicationManager appManager = deployApplication(SparkPageRankApp.class);
// Send a stream events to the Stream
StreamManager streamManager = getStreamManager(SparkPageRankApp.BACKLINK_URL_STREAM);
streamManager.send(Joiner.on(" ").join(URL_1, URL_2));
streamManager.send(Joiner.on(" ").join(URL_1, URL_3));
streamManager.send(Joiner.on(" ").join(URL_2, URL_1));
streamManager.send(Joiner.on(" ").join(URL_3, URL_1));
// Start service
ServiceManager serviceManager = appManager.getServiceManager(SparkPageRankApp.SERVICE_HANDLERS).start();
// Wait for service to start since the Spark program needs it
serviceManager.waitForStatus(true);
// Start the SparkPageRankProgram
SparkManager sparkManager = appManager.getSparkManager(SparkPageRankApp.PageRankSpark.class.getSimpleName()).start();
sparkManager.waitForFinish(60, TimeUnit.SECONDS);
// Run RanksCounter which will count the number of pages for a pr
MapReduceManager mapReduceManager = appManager.getMapReduceManager(SparkPageRankApp.RanksCounter.class.getSimpleName()).start();
mapReduceManager.waitForFinish(3, TimeUnit.MINUTES);
// Query for rank
URL url = new URL(serviceManager.getServiceURL(15, TimeUnit.SECONDS), SparkPageRankApp.SparkPageRankServiceHandler.RANKS_PATH);
HttpRequest request = HttpRequest.post(url).withBody(("{\"" + SparkPageRankApp.SparkPageRankServiceHandler.URL_KEY + "\":\"" + URL_1 + "\"}")).build();
HttpResponse response = HttpRequests.execute(request);
Assert.assertEquals(HttpURLConnection.HTTP_OK, response.getResponseCode());
Assert.assertEquals(RANK, response.getResponseBodyAsString());
// Request total pages for a page rank and verify it
url = new URL(serviceManager.getServiceURL(15, TimeUnit.SECONDS), SparkPageRankApp.SparkPageRankServiceHandler.TOTAL_PAGES_PATH + "/" + RANK);
response = HttpRequests.execute(HttpRequest.get(url).build());
Assert.assertEquals(TOTAL_PAGES, response.getResponseBodyAsString());
}
use of co.cask.cdap.test.SparkManager in project cdap by caskdata.
the class SparkTestRun method testTransaction.
@Test
public void testTransaction() throws Exception {
ApplicationManager applicationManager = deploy(TestSparkApp.class);
StreamManager streamManager = getStreamManager("SparkStream");
// Write some sentences to the stream
streamManager.send("red fox");
streamManager.send("brown fox");
streamManager.send("grey fox");
streamManager.send("brown bear");
streamManager.send("black bear");
// Run the spark program
SparkManager sparkManager = applicationManager.getSparkManager(TransactionSpark.class.getSimpleName());
sparkManager.start(ImmutableMap.of("source.stream", "SparkStream", "keyvalue.table", "KeyValueTable", "result.all.dataset", "SparkResult", "result.threshold", "2", "result.threshold.dataset", "SparkThresholdResult"));
// Verify result from dataset before the Spark program terminates
final DataSetManager<KeyValueTable> resultManager = getDataset("SparkThresholdResult");
final KeyValueTable resultTable = resultManager.get();
// Expect the threshold result dataset, with threshold >=2, contains [brown, fox, bear]
Tasks.waitFor(ImmutableSet.of("brown", "fox", "bear"), new Callable<Set<String>>() {
@Override
public Set<String> call() throws Exception {
// This is to start a new TX
resultManager.flush();
LOG.info("Reading from threshold result");
try (CloseableIterator<KeyValue<byte[], byte[]>> itor = resultTable.scan(null, null)) {
return ImmutableSet.copyOf(Iterators.transform(itor, new Function<KeyValue<byte[], byte[]>, String>() {
@Override
public String apply(KeyValue<byte[], byte[]> input) {
String word = Bytes.toString(input.getKey());
LOG.info("{}, {}", word, Bytes.toInt(input.getValue()));
return word;
}
}));
}
}
}, 3, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
sparkManager.stop();
sparkManager.waitForRun(ProgramRunStatus.KILLED, 60, TimeUnit.SECONDS);
}
use of co.cask.cdap.test.SparkManager in project cdap by caskdata.
the class SparkTestRun method testSparkWithObjectStore.
@Test
public void testSparkWithObjectStore() throws Exception {
ApplicationManager applicationManager = deploy(SparkAppUsingObjectStore.class);
DataSetManager<ObjectStore<String>> keysManager = getDataset("keys");
prepareInputData(keysManager);
SparkManager sparkManager = applicationManager.getSparkManager(CharCountProgram.class.getSimpleName()).start();
sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
DataSetManager<KeyValueTable> countManager = getDataset("count");
checkOutputData(countManager);
// validate that the table emitted metrics
// one read + one write in beforeSubmit(), increment (= read + write) in main -> 4
Tasks.waitFor(4L, new Callable<Long>() {
@Override
public Long call() throws Exception {
Collection<MetricTimeSeries> metrics = getMetricsManager().query(new MetricDataQuery(0, System.currentTimeMillis() / 1000L, Integer.MAX_VALUE, "system." + Constants.Metrics.Name.Dataset.OP_COUNT, AggregationFunction.SUM, ImmutableMap.of(Constants.Metrics.Tag.NAMESPACE, DefaultId.NAMESPACE.getNamespace(), Constants.Metrics.Tag.APP, SparkAppUsingObjectStore.class.getSimpleName(), Constants.Metrics.Tag.SPARK, CharCountProgram.class.getSimpleName(), Constants.Metrics.Tag.DATASET, "totals"), Collections.<String>emptyList()));
if (metrics.isEmpty()) {
return 0L;
}
Assert.assertEquals(1, metrics.size());
MetricTimeSeries ts = metrics.iterator().next();
Assert.assertEquals(1, ts.getTimeValues().size());
return ts.getTimeValues().get(0).getValue();
}
}, 10L, TimeUnit.SECONDS, 50L, TimeUnit.MILLISECONDS);
}
Aggregations