Use of co.cask.cdap.proto.artifact.AppRequest in the cdap project by caskdata.
From the class DataStreamsTest, method testAggregatorJoinerMacrosWithCheckpoints.
@Test
public void testAggregatorJoinerMacrosWithCheckpoints() throws Exception {
  /*
   * Pipeline under test:
   *
   *              |--> aggregator --> sink1
   *     users1 --|
   *              |----|
   *                   |--> dupeFlagger --> sink2
   *     users2 -------|
   *
   * The aggregator and dupeFlagger plugins are configured through the macros ${aggfield},
   * ${aggType} and ${flagField}. The pipeline is run twice with different runtime arguments
   * and the sink contents are checked after each run.
   */
  Schema userSchema = Schema.recordOf(
    "user",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> users1 = ImmutableList.of(
    StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(),
    StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(),
    StructuredRecord.builder(userSchema).set("id", 3L).set("name", "Terry").build());
  List<StructuredRecord> users2 = ImmutableList.of(
    StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(),
    StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(),
    StructuredRecord.builder(userSchema).set("id", 4L).set("name", "Terry").build(),
    StructuredRecord.builder(userSchema).set("id", 5L).set("name", "Christopher").build());

  DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
    .setBatchInterval("5s")
    .addStage(new ETLStage("users1", MockSource.getPlugin(userSchema, users1)))
    .addStage(new ETLStage("users2", MockSource.getPlugin(userSchema, users2)))
    .addStage(new ETLStage("sink1", MockSink.getPlugin("sink1")))
    .addStage(new ETLStage("sink2", MockSink.getPlugin("sink2")))
    .addStage(new ETLStage("aggregator", FieldCountAggregator.getPlugin("${aggfield}", "${aggType}")))
    .addStage(new ETLStage("dupeFlagger", DupeFlagger.getPlugin("users1", "${flagField}")))
    .addConnection("users1", "aggregator")
    .addConnection("aggregator", "sink1")
    .addConnection("users1", "dupeFlagger")
    .addConnection("users2", "dupeFlagger")
    .addConnection("dupeFlagger", "sink2")
    .build();

  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp");
  ApplicationManager appManager = deployApplication(appId.toId(), appRequest);

  // run it once with this set of macros
  Map<String, String> arguments = new HashMap<>();
  arguments.put("aggfield", "id");
  arguments.put("aggType", "long");
  arguments.put("flagField", "isDupe");
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start(arguments);
  sparkManager.waitForStatus(true, 10, 1);

  final DataSetManager<Table> sink1 = getDataset("sink1");
  final DataSetManager<Table> sink2 = getDataset("sink2");

  // expected output of the first run: counts keyed on "id", flag written to "isDupe"
  Schema aggSchema = Schema.recordOf(
    "user.count",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
  final Set<StructuredRecord> expectedAggregates = ImmutableSet.of(
    StructuredRecord.builder(aggSchema).set("id", 0L).set("ct", 3L).build(),
    StructuredRecord.builder(aggSchema).set("id", 1L).set("ct", 1L).build(),
    StructuredRecord.builder(aggSchema).set("id", 2L).set("ct", 1L).build(),
    StructuredRecord.builder(aggSchema).set("id", 3L).set("ct", 1L).build());
  Schema outputSchema = Schema.recordOf(
    "user.flagged",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("isDupe", Schema.of(Schema.Type.BOOLEAN)));
  final Set<StructuredRecord> expectedJoined = ImmutableSet.of(
    StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("isDupe", true).build(),
    StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("isDupe", true).build(),
    StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("isDupe", false).build());

  // poll both sinks until they hold exactly the expected records (or time out after a minute)
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      sink1.flush();
      sink2.flush();
      Set<StructuredRecord> actualAggs = new HashSet<>();
      Set<StructuredRecord> actualJoined = new HashSet<>();
      actualAggs.addAll(MockSink.readOutput(sink1));
      actualJoined.addAll(MockSink.readOutput(sink2));
      return expectedAggregates.equals(actualAggs) && expectedJoined.equals(actualJoined);
    }
  }, 1, TimeUnit.MINUTES);

  sparkManager.stop();
  sparkManager.waitForStatus(false, 30, 1);

  // empty the sinks so the second run is verified against fresh output only
  MockSink.clear(sink1);
  MockSink.clear(sink2);

  // run it again with different macros to make sure they are re-evaluated and not stored in the checkpoint
  arguments = new HashMap<>();
  arguments.put("aggfield", "name");
  arguments.put("aggType", "string");
  arguments.put("flagField", "dupe");
  sparkManager.start(arguments);
  sparkManager.waitForStatus(true, 10, 1);

  // expected output of the second run: counts keyed on "name", flag written to "dupe"
  aggSchema = Schema.recordOf(
    "user.count",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
  final Set<StructuredRecord> expectedAggregates2 = ImmutableSet.of(
    StructuredRecord.builder(aggSchema).set("name", "all").set("ct", 3L).build(),
    StructuredRecord.builder(aggSchema).set("name", "Samuel").set("ct", 1L).build(),
    StructuredRecord.builder(aggSchema).set("name", "Dwayne").set("ct", 1L).build(),
    StructuredRecord.builder(aggSchema).set("name", "Terry").set("ct", 1L).build());
  outputSchema = Schema.recordOf(
    "user.flagged",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("dupe", Schema.of(Schema.Type.BOOLEAN)));
  final Set<StructuredRecord> expectedJoined2 = ImmutableSet.of(
    StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("dupe", true).build(),
    StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("dupe", true).build(),
    StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("dupe", false).build());

  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      sink1.flush();
      sink2.flush();
      Set<StructuredRecord> actualAggs = new HashSet<>();
      Set<StructuredRecord> actualJoined = new HashSet<>();
      actualAggs.addAll(MockSink.readOutput(sink1));
      actualJoined.addAll(MockSink.readOutput(sink2));
      return expectedAggregates2.equals(actualAggs) && expectedJoined2.equals(actualJoined);
    }
  }, 1, TimeUnit.MINUTES);

  sparkManager.stop();
  // Wait for the program to actually reach the stopped state, exactly as after the first run,
  // so that a still-running Spark program does not leak into whatever runs after this test.
  sparkManager.waitForStatus(false, 30, 1);
}
Use of co.cask.cdap.proto.artifact.AppRequest in the cdap project by caskdata.
From the class DataStreamsTest, method testJoin.
@Test
public void testJoin() throws Exception {
  /*
   * Pipeline under test (the final sink stage is named "multijoinSink"):
   *
   * source1 ----> t1 ------
   *                        | --> innerjoin ----> t4 ------
   * source2 ----> t2 ------                                 |
   *                                                         | ---> outerjoin --> sink
   *                                                         |
   * source3 -------------------- t3 ------------------------
   */

  // schemas of the three sources
  Schema customerSchema = Schema.recordOf(
    "customerRecord",
    Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
  Schema itemSchema = Schema.recordOf(
    "itemRecord",
    Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
  Schema transactionSchema = Schema.recordOf(
    "transactionRecord",
    Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));

  // schema of the records expected in the sink: every input field, all nullable
  Schema joinedSchema = Schema.recordOf(
    "join.output",
    Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
    Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

  // customers
  StructuredRecord samuel = StructuredRecord.builder(customerSchema)
    .set("customer_id", "1").set("customer_name", "samuel").build();
  StructuredRecord bob = StructuredRecord.builder(customerSchema)
    .set("customer_id", "2").set("customer_name", "bob").build();
  StructuredRecord jane = StructuredRecord.builder(customerSchema)
    .set("customer_id", "3").set("customer_name", "jane").build();

  // items
  StructuredRecord car = StructuredRecord.builder(itemSchema)
    .set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
  StructuredRecord bike = StructuredRecord.builder(itemSchema)
    .set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();

  // transactions
  StructuredRecord carTransaction = StructuredRecord.builder(transactionSchema)
    .set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
  StructuredRecord bikeTransaction = StructuredRecord.builder(transactionSchema)
    .set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
  StructuredRecord planeTransaction = StructuredRecord.builder(transactionSchema)
    .set("t_id", "3").set("c_id", "4").set("i_id", "33").build();

  List<StructuredRecord> customers = ImmutableList.of(samuel, bob, jane);
  List<StructuredRecord> items = ImmutableList.of(car, bike);
  List<StructuredRecord> transactions = ImmutableList.of(carTransaction, bikeTransaction, planeTransaction);

  String sinkDatasetName = "multiJoinOutputSink";
  DataStreamsConfig joinPipeline = DataStreamsConfig.builder()
    .addStage(new ETLStage("source1", MockSource.getPlugin(customerSchema, customers)))
    .addStage(new ETLStage("source2", MockSource.getPlugin(itemSchema, items)))
    .addStage(new ETLStage("source3", MockSource.getPlugin(transactionSchema, transactions)))
    .addStage(new ETLStage("t1", IdentityTransform.getPlugin()))
    .addStage(new ETLStage("t2", IdentityTransform.getPlugin()))
    .addStage(new ETLStage("t3", IdentityTransform.getPlugin()))
    .addStage(new ETLStage("t4", IdentityTransform.getPlugin()))
    .addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", "")))
    .addStage(new ETLStage("outerjoin", MockJoiner.getPlugin("t4.item_id=t3.i_id", "", "")))
    .addStage(new ETLStage("multijoinSink", MockSink.getPlugin(sinkDatasetName)))
    .addConnection("source1", "t1")
    .addConnection("source2", "t2")
    .addConnection("source3", "t3")
    .addConnection("t1", "innerjoin")
    .addConnection("t2", "innerjoin")
    .addConnection("innerjoin", "t4")
    .addConnection("t3", "outerjoin")
    .addConnection("t4", "outerjoin")
    .addConnection("outerjoin", "multijoinSink")
    .setBatchInterval("5s")
    .build();

  // deploy the pipeline and start the streaming program
  AppRequest<DataStreamsConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, joinPipeline);
  ApplicationId appId = NamespaceId.DEFAULT.app("JoinerApp");
  ApplicationManager application = deployApplication(appId.toId(), deployRequest);
  SparkManager sparkProgram = application.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkProgram.start();
  sparkProgram.waitForStatus(true, 10, 1);

  // records expected in the sink; the plane transaction only has its transaction fields set
  StructuredRecord joinedSamuel = StructuredRecord.builder(joinedSchema)
    .set("customer_id", "1").set("customer_name", "samuel")
    .set("item_id", "11").set("item_price", 10000L)
    .set("cust_id", "1").set("cust_name", "samuel")
    .set("t_id", "1").set("c_id", "1").set("i_id", "11")
    .build();
  StructuredRecord joinedJane = StructuredRecord.builder(joinedSchema)
    .set("customer_id", "3").set("customer_name", "jane")
    .set("item_id", "22").set("item_price", 100L)
    .set("cust_id", "3").set("cust_name", "jane")
    .set("t_id", "2").set("c_id", "3").set("i_id", "22")
    .build();
  StructuredRecord joinedPlane = StructuredRecord.builder(joinedSchema)
    .set("t_id", "3").set("c_id", "4").set("i_id", "33")
    .build();
  final Set<StructuredRecord> expectedRecords = ImmutableSet.of(joinedSamuel, joinedJane, joinedPlane);

  // poll the sink until it holds exactly the expected records (or time out after four minutes)
  final DataSetManager<Table> sinkTable = getDataset(sinkDatasetName);
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      sinkTable.flush();
      Set<StructuredRecord> actualRecords = new HashSet<StructuredRecord>(MockSink.readOutput(sinkTable));
      return expectedRecords.equals(actualRecords);
    }
  }, 4, TimeUnit.MINUTES);

  sparkProgram.stop();
  sparkProgram.waitForStatus(false, 10, 1);

  // per-stage record counts
  validateMetric(appId, "source1.records.out", 3);
  validateMetric(appId, "source2.records.out", 2);
  validateMetric(appId, "source3.records.out", 3);
  validateMetric(appId, "t1.records.in", 3);
  validateMetric(appId, "t1.records.out", 3);
  validateMetric(appId, "t2.records.in", 2);
  validateMetric(appId, "t2.records.out", 2);
  validateMetric(appId, "t3.records.in", 3);
  validateMetric(appId, "t3.records.out", 3);
  validateMetric(appId, "t4.records.in", 2);
  validateMetric(appId, "t4.records.out", 2);
  validateMetric(appId, "innerjoin.records.in", 5);
  validateMetric(appId, "innerjoin.records.out", 2);
  validateMetric(appId, "outerjoin.records.in", 5);
  validateMetric(appId, "outerjoin.records.out", 3);
  validateMetric(appId, "multijoinSink.records.in", 3);
}
Use of co.cask.cdap.proto.artifact.AppRequest in the cdap project by caskdata.
From the class ApplicationClientTestRun, method testAppUpdate.
@Test
public void testAppUpdate() throws Exception {
  String sharedArtifactName = "cfg-programs";
  ArtifactId v1Artifact = NamespaceId.DEFAULT.artifact(sharedArtifactName, "1.0.0");
  ArtifactId v2Artifact = NamespaceId.DEFAULT.artifact(sharedArtifactName, "2.0.0");
  ApplicationId programsApp = NamespaceId.DEFAULT.app("ProgramsApp");

  // register two versions of the artifact, each built from a different application class
  artifactClient.add(NamespaceId.DEFAULT, sharedArtifactName,
                     Files.newInputStreamSupplier(createAppJarFile(ConfigurableProgramsApp.class)), "1.0.0");
  artifactClient.add(NamespaceId.DEFAULT, sharedArtifactName,
                     Files.newInputStreamSupplier(createAppJarFile(ConfigurableProgramsApp2.class)), "2.0.0");

  try {
    ArtifactSummary v1Summary = new ArtifactSummary(v1Artifact.getArtifact(), v1Artifact.getVersion());

    // initial deployment: worker only
    ConfigurableProgramsApp.Programs workerOnly =
      new ConfigurableProgramsApp.Programs(null, "worker1", "stream1", "dataset1");
    AppRequest<ConfigurableProgramsApp.Programs> workerRequest = new AppRequest<>(v1Summary, workerOnly);
    appClient.deploy(programsApp, workerRequest);

    // exactly one worker and no flows are expected
    Assert.assertTrue(appClient.listPrograms(programsApp, ProgramType.FLOW).isEmpty());
    Assert.assertEquals(1, appClient.listPrograms(programsApp, ProgramType.WORKER).size());

    // same artifact, new config: flow only
    ConfigurableProgramsApp.Programs flowOnly =
      new ConfigurableProgramsApp.Programs("flow1", null, "stream1", "dataset1");
    AppRequest<ConfigurableProgramsApp.Programs> flowRequest = new AppRequest<>(v1Summary, flowOnly);
    appClient.update(programsApp, flowRequest);

    // exactly one flow and no workers are expected
    Assert.assertTrue(appClient.listPrograms(programsApp, ProgramType.WORKER).isEmpty());
    Assert.assertEquals(1, appClient.listPrograms(programsApp, ProgramType.FLOW).size());

    // updating an application that was never deployed must be reported as not found
    try {
      appClient.update(NamespaceId.DEFAULT.app("ghost"), flowRequest);
      Assert.fail();
    } catch (NotFoundException e) {
      // expected
    }

    // updating to an artifact with a different name must be rejected as a bad request
    AppRequest<ConfigurableProgramsApp.Programs> renamedArtifactRequest =
      new AppRequest<>(new ArtifactSummary("ghost", v1Artifact.getVersion()), flowOnly);
    try {
      appClient.update(programsApp, renamedArtifactRequest);
      Assert.fail();
    } catch (BadRequestException e) {
      // expected
    }

    // updating to an artifact version that does not exist must be reported as not found
    AppRequest<ConfigurableProgramsApp.Programs> missingVersionRequest =
      new AppRequest<>(new ArtifactSummary(v1Artifact.getArtifact(), "0.0.1"), flowOnly);
    try {
      appClient.update(programsApp, missingVersionRequest);
      Assert.fail();
    } catch (NotFoundException e) {
      // expected
    }

    // switch to version 2.0.0 of the artifact, whose application class can also add a service
    ConfigurableProgramsApp2.Programs serviceOnly =
      new ConfigurableProgramsApp2.Programs(null, null, "stream1", "dataset1", "service2");
    ArtifactSummary v2Summary = new ArtifactSummary(v2Artifact.getArtifact(), v2Artifact.getVersion());
    AppRequest<ConfigurableProgramsApp2.Programs> serviceRequest = new AppRequest<>(v2Summary, serviceOnly);
    appClient.update(programsApp, serviceRequest);

    // exactly one service and nothing else is expected
    Assert.assertTrue(appClient.listPrograms(programsApp, ProgramType.WORKER).isEmpty());
    Assert.assertTrue(appClient.listPrograms(programsApp, ProgramType.FLOW).isEmpty());
    Assert.assertEquals(1, appClient.listPrograms(programsApp, ProgramType.SERVICE).size());
  } finally {
    // clean up the application first, then both artifact versions
    appClient.delete(programsApp);
    appClient.waitForDeleted(programsApp, 30, TimeUnit.SECONDS);
    artifactClient.delete(v1Artifact);
    artifactClient.delete(v2Artifact);
  }
}
Use of co.cask.cdap.proto.artifact.AppRequest in the cdap project by caskdata.
From the class ETLWorkerTest, method testDAG.
@Test
public void testDAG() throws Exception {
  // three single-field input records: x = 1, 2, 3
  Schema intSchema = Schema.recordOf("testRecord", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
  StructuredRecord one = StructuredRecord.builder(intSchema).set("x", 1).build();
  StructuredRecord two = StructuredRecord.builder(intSchema).set("x", 2).build();
  StructuredRecord three = StructuredRecord.builder(intSchema).set("x", 3).build();
  List<StructuredRecord> sourceRecords = ImmutableList.of(one, two, three);

  /*
   * Pipeline under test:
   *
   *            ----- value filter ------- sink1
   *           |
   * source --------- double --------
   *           |                     |---- sink2
   *            ----- identity ------
   */
  File sink1Dir = TMP_FOLDER.newFolder();
  File sink2Dir = TMP_FOLDER.newFolder();
  ETLRealtimeConfig dagConfig = ETLRealtimeConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(sourceRecords)))
    .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Dir)))
    .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Dir)))
    .addStage(new ETLStage("valueFilter", IntValueFilterTransform.getPlugin("x", 2)))
    .addStage(new ETLStage("double", DoubleTransform.getPlugin()))
    .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
    .addConnection("source", "valueFilter")
    .addConnection("source", "double")
    .addConnection("source", "identity")
    .addConnection("valueFilter", "sink1")
    .addConnection("double", "sink2")
    .addConnection("identity", "sink2")
    .build();

  ApplicationId dagApp = NamespaceId.DEFAULT.app("dagTest");
  AppRequest<ETLRealtimeConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, dagConfig);
  ApplicationManager application = deployApplication(dagApp, deployRequest);
  Assert.assertNotNull(application);

  WorkerManager etlWorker = application.getWorkerManager(ETLWorker.NAME);
  etlWorker.start();
  etlWorker.waitForStatus(true, 10, 1);
  try {
    // sink1 must hold the records with x = 1 and x = 3, in that order
    List<StructuredRecord> sink1Records = MockSink.getRecords(sink1Dir, 0, 10, TimeUnit.SECONDS);
    Assert.assertEquals(ImmutableList.of(one, three), sink1Records);

    // sink2 must hold nine records in total
    List<StructuredRecord> sink2Records = MockSink.getRecords(sink2Dir, 0, 10, TimeUnit.SECONDS);
    Assert.assertEquals(9, sink2Records.size());
  } finally {
    // always stop the worker, even when an assertion above fails
    stopWorker(etlWorker);
  }

  // per-stage record counts
  validateMetric(3, dagApp, "source.records.out");
  validateMetric(3, dagApp, "valueFilter.records.in");
  validateMetric(2, dagApp, "valueFilter.records.out");
  validateMetric(3, dagApp, "double.records.in");
  validateMetric(6, dagApp, "double.records.out");
  validateMetric(3, dagApp, "identity.records.in");
  validateMetric(3, dagApp, "identity.records.out");
  validateMetric(2, dagApp, "sink1.records.in");
  validateMetric(9, dagApp, "sink2.records.in");
}
Use of co.cask.cdap.proto.artifact.AppRequest in the cdap project by caskdata.
From the class MetadataHttpHandlerTestRun, method before.
@Before
public void before() throws Exception {
  // register the artifact built from AppWithDataset and deploy the application from it (no config)
  addAppArtifact(artifactId, AppWithDataset.class);
  ArtifactSummary appArtifact = new ArtifactSummary(artifactId.getArtifact(), artifactId.getVersion());
  AppRequest<Config> deployRequest = new AppRequest<>(appArtifact);
  appClient.deploy(application, deployRequest);

  // create (or update) the stream view "myview" with a csv format specification
  FormatSpecification csvFormat = new FormatSpecification("csv", null, null);
  streamViewClient.createOrUpdate(myview, new ViewSpecification(csvFormat, null));
}
Aggregations