use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class MockSource method getStream.
@Override
public JavaDStream<StructuredRecord> getStream(StreamingContext context) throws Exception {
Schema schema = Schema.parseJson(conf.schema);
List<String> recordsAsStrings = new Gson().fromJson(conf.records, STRING_LIST_TYPE);
final List<StructuredRecord> inputRecords = new ArrayList<>();
for (String recordStr : recordsAsStrings) {
inputRecords.add(StructuredRecordStringConverter.fromJsonString(recordStr, schema));
}
JavaStreamingContext jsc = context.getSparkStreamingContext();
return jsc.receiverStream(new Receiver<StructuredRecord>(StorageLevel.MEMORY_ONLY()) {
@Override
public StorageLevel storageLevel() {
return StorageLevel.MEMORY_ONLY();
}
@Override
public void onStart() {
new Thread() {
@Override
public void run() {
for (StructuredRecord record : inputRecords) {
if (isStarted()) {
store(record);
try {
TimeUnit.MILLISECONDS.sleep(conf.intervalMillis);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
}
@Override
public void interrupt() {
super.interrupt();
}
}.start();
}
@Override
public void onStop() {
}
});
}
use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class FieldsPrefixTransform method transform.
@Override
public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
Schema outSchema = config.getOutputSchema(input.getSchema());
StructuredRecord.Builder outputBuilder = StructuredRecord.builder(outSchema);
for (Schema.Field inField : input.getSchema().getFields()) {
outputBuilder.set(config.prefix + inField.getName(), input.get(inField.getName()));
}
emitter.emit(outputBuilder.build());
}
use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method testMultipleJoiner.
private void testMultipleJoiner(Engine engine) throws Exception {
/*
* source1 ----> t1 ------
* | --> innerjoin ----> t4 ------
* source2 ----> t2 ------ |
* | ---> outerjoin --> sink1
* |
* source3 -------------------- t3 ------------------------
*/
Schema inputSchema1 = Schema.recordOf("customerRecord", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("itemRecord", Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema3 = Schema.recordOf("transactionRecord", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));
Schema outSchema2 = Schema.recordOf("join.output", Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))), Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
String source1MulitJoinInput = "multiJoinSource1-" + engine;
String source2MultiJoinInput = "multiJoinSource2-" + engine;
String source3MultiJoinInput = "multiJoinSource3-" + engine;
String outputName = "multiJoinOutput-" + engine;
String sinkName = "multiJoinOutputSink-" + engine;
String outerJoinName = "multiJoinOuter-" + engine;
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("source1", MockSource.getPlugin(source1MulitJoinInput, inputSchema1))).addStage(new ETLStage("source2", MockSource.getPlugin(source2MultiJoinInput, inputSchema2))).addStage(new ETLStage("source3", MockSource.getPlugin(source3MultiJoinInput, inputSchema3))).addStage(new ETLStage("t1", IdentityTransform.getPlugin())).addStage(new ETLStage("t2", IdentityTransform.getPlugin())).addStage(new ETLStage("t3", IdentityTransform.getPlugin())).addStage(new ETLStage("t4", IdentityTransform.getPlugin())).addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", ""))).addStage(new ETLStage(outerJoinName, MockJoiner.getPlugin("t4.item_id=t3.i_id", "", ""))).addStage(new ETLStage(sinkName, MockSink.getPlugin(outputName))).addConnection("source1", "t1").addConnection("source2", "t2").addConnection("source3", "t3").addConnection("t1", "innerjoin").addConnection("t2", "innerjoin").addConnection("innerjoin", "t4").addConnection("t3", outerJoinName).addConnection("t4", outerJoinName).addConnection(outerJoinName, sinkName).setEngine(engine).build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("JoinerApp-" + engine);
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
StructuredRecord recordTrasPlane = StructuredRecord.builder(inputSchema3).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
// write one record to each source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1MulitJoinInput));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2MultiJoinInput));
MockSource.writeInput(inputManager, ImmutableList.of(recordCar, recordBike));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(source3MultiJoinInput));
MockSource.writeInput(inputManager, ImmutableList.of(recordTrasCar, recordTrasBike, recordTrasPlane));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema2).set("customer_id", "1").set("customer_name", "samuel").set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema2).set("customer_id", "3").set("customer_name", "jane").set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
StructuredRecord joinRecordPlane = StructuredRecord.builder(outSchema2).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
DataSetManager<Table> sinkManager = getDataset(outputName);
Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane, joinRecordPlane);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(3, appId, outerJoinName + ".records.out");
validateMetric(3, appId, sinkName + ".records.in");
}
use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method testInnerJoinWithMultiOutput.
public void testInnerJoinWithMultiOutput(Engine engine) throws Exception {
Schema inputSchema1 = Schema.recordOf("customerRecord", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("itemRecord", Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema3 = Schema.recordOf("transactionRecord", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_name", Schema.of(Schema.Type.STRING)));
String input1Name = "source1InnerJoinInput-" + engine;
String input2Name = "source2InnerJoinInput-" + engine;
String input3Name = "source3InnerJoinInput-" + engine;
String outputName = "innerJoinOutput-" + engine;
String outputName2 = "innerJoinOutput2-" + engine;
String joinerName = "innerJoiner-" + engine;
String sinkName = "innerJoinSink-" + engine;
String sinkName2 = "innerJoinSink-2" + engine;
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("source1", MockSource.getPlugin(input1Name, inputSchema1))).addStage(new ETLStage("source2", MockSource.getPlugin(input2Name, inputSchema2))).addStage(new ETLStage("source3", MockSource.getPlugin(input3Name, inputSchema3))).addStage(new ETLStage("t1", IdentityTransform.getPlugin())).addStage(new ETLStage("t2", IdentityTransform.getPlugin())).addStage(new ETLStage("t3", IdentityTransform.getPlugin())).addStage(new ETLStage(joinerName, MockJoiner.getPlugin("t1.customer_id=t2.cust_id=t3.c_id&" + "t1.customer_name=t2.cust_name=t3.c_name", "t1,t2,t3", ""))).addStage(new ETLStage(sinkName, MockSink.getPlugin(outputName))).addStage(new ETLStage(sinkName2, MockSink.getPlugin(outputName2))).addConnection("source1", "t1").addConnection("source2", "t2").addConnection("source3", "t3").addConnection("t1", joinerName).addConnection("t2", joinerName).addConnection("t3", joinerName).addConnection(joinerName, sinkName).addConnection(joinerName, sinkName2).setEngine(engine).build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("InnerJoinApp-" + engine);
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
Schema outSchema = Schema.recordOf("join.output", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)), Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("c_name", "samuel").build();
StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "3").set("c_name", "jane").build();
// write one record to each source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(input1Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(input2Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordCar, recordBike));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(input3Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordTrasCar, recordTrasBike));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema).set("customer_id", "1").set("customer_name", "samuel").set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").set("t_id", "1").set("c_id", "1").set("c_name", "samuel").build();
StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema).set("customer_id", "3").set("customer_name", "jane").set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").set("t_id", "2").set("c_id", "3").set("c_name", "jane").build();
DataSetManager<Table> sinkManager = getDataset(outputName);
Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
sinkManager = getDataset(outputName2);
actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(2, appId, joinerName + ".records.out");
validateMetric(2, appId, sinkName + ".records.in");
validateMetric(2, appId, sinkName2 + ".records.in");
}
use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method testParallelAggregators.
private void testParallelAggregators(Engine engine) throws Exception {
String source1Name = "pAggInput1-" + engine.name();
String source2Name = "pAggInput2-" + engine.name();
String sink1Name = "pAggOutput1-" + engine.name();
String sink2Name = "pAggOutput2-" + engine.name();
Schema inputSchema = Schema.recordOf("testRecord", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
/*
source1 --|--> agg1 --> sink1
|
source2 --|--> agg2 --> sink2
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").setEngine(engine).addStage(new ETLStage("source1", MockSource.getPlugin(source1Name, inputSchema))).addStage(new ETLStage("source2", MockSource.getPlugin(source2Name, inputSchema))).addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name))).addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name))).addStage(new ETLStage("agg1", FieldCountAggregator.getPlugin("user", "string"))).addStage(new ETLStage("agg2", FieldCountAggregator.getPlugin("item", "long"))).addConnection("source1", "agg1").addConnection("source1", "agg2").addConnection("source2", "agg1").addConnection("source2", "agg2").addConnection("agg1", "sink1").addConnection("agg2", "sink2").build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp-" + engine);
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
// write few records to each source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
MockSource.writeInput(inputManager, ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(), StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build()));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
MockSource.writeInput(inputManager, ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build()));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
Schema outputSchema1 = Schema.recordOf("user.count", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Schema outputSchema2 = Schema.recordOf("item.count", Schema.Field.of("item", Schema.of(Schema.Type.LONG)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
// check output
DataSetManager<Table> sinkManager = getDataset(sink1Name);
Set<StructuredRecord> expected = ImmutableSet.of(StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(), StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(), StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
sinkManager = getDataset(sink2Name);
expected = ImmutableSet.of(StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(), StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(), StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(), StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(), StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(2, appId, "source1.records.out");
validateMetric(3, appId, "source2.records.out");
validateMetric(5, appId, "agg1.records.in");
// 2 users, but FieldCountAggregator always emits an 'all' group
validateMetric(3, appId, "agg1.aggregator.groups");
validateMetric(3, appId, "agg1.records.out");
validateMetric(5, appId, "agg2.records.in");
// 4 items, but FieldCountAggregator always emits an 'all' group
validateMetric(5, appId, "agg2.aggregator.groups");
validateMetric(5, appId, "agg2.records.out");
validateMetric(3, appId, "sink1.records.in");
validateMetric(5, appId, "sink2.records.in");
}
Aggregations