Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class DataStreamsTest, method testErrorTransform:
@Test
public void testErrorTransform() throws Exception {
String sink1TableName = "errTestOut1";
String sink2TableName = "errTestOut2";
Schema inputSchema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = ImmutableList.of(
  StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
  StructuredRecord.builder(inputSchema).set("name", "Ralph").build(),
  StructuredRecord.builder(inputSchema).set("name", "Don").build(),
  StructuredRecord.builder(inputSchema).set("name", "Mike").build(),
  StructuredRecord.builder(inputSchema).set("name", "April").build());
/*
 * source --> filter1 --> filter2 --> agg1 --> agg2
 *               |           |          |        |
 *               |-----------|----------|--------|--> flatten errors --> sink1
 *                                                |
 *                                                |--> filter errors --> sink2
 *
 * arrows coming out the right represent output records
 * arrows coming out the bottom represent error records
 * this will test multiple stages from multiple phases emitting errors to the same stage,
 * as well as errors from one stage going to multiple stages
 */
DataStreamsConfig config = DataStreamsConfig.builder()
  .setBatchInterval("5s")
  .addStage(new ETLStage("source", MockSource.getPlugin(inputSchema, input)))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "Leo")))
  .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "Ralph")))
  .addStage(new ETLStage("agg1", GroupFilterAggregator.getPlugin("name", "Don")))
  .addStage(new ETLStage("agg2", GroupFilterAggregator.getPlugin("name", "Mike")))
  .addStage(new ETLStage("errorflatten", FlattenErrorTransform.getPlugin()))
  .addStage(new ETLStage("errorfilter", FilterErrorTransform.getPlugin(3)))
  .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1TableName)))
  .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2TableName)))
  .addConnection("source", "filter1")
  .addConnection("filter1", "filter2")
  .addConnection("filter2", "agg1")
  .addConnection("agg1", "agg2")
  .addConnection("filter1", "errorflatten")
  .addConnection("filter1", "errorfilter")
  .addConnection("filter2", "errorflatten")
  .addConnection("filter2", "errorfilter")
  .addConnection("agg1", "errorflatten")
  .addConnection("agg1", "errorfilter")
  .addConnection("agg2", "errorflatten")
  .addConnection("agg2", "errorfilter")
  .addConnection("errorflatten", "sink1")
  .addConnection("errorfilter", "sink2")
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("ErrTransformTest");
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForStatus(true, 10, 1);
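// errorflatten turns each error record into a flat record carrying the original 'name' field
// plus the error message, code, and stage that rejected it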
Schema flattenSchema = Schema.recordOf(
  "erroruser",
  Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("errMsg", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("errCode", Schema.nullableOf(Schema.of(Schema.Type.INT))),
  Schema.Field.of("errStage", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
final Set<StructuredRecord> expected = ImmutableSet.of(
  StructuredRecord.builder(flattenSchema)
    .set("name", "Leo").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter1").build(),
  StructuredRecord.builder(flattenSchema)
    .set("name", "Ralph").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter2").build(),
  StructuredRecord.builder(flattenSchema)
    .set("name", "Don").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg1").build(),
  StructuredRecord.builder(flattenSchema)
    .set("name", "Mike").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg2").build());
final DataSetManager<Table> sink1Table = getDataset(sink1TableName);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sink1Table.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sink1Table));
return expected.equals(outputRecords);
}
}, 4, TimeUnit.MINUTES);
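// errorfilter is configured with error code 3, so Don and Mike (rejected by the aggregators
// with code 3) are dropped and only Leo and Ralph reach sink2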
final Set<StructuredRecord> expected2 = ImmutableSet.of(
  StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
  StructuredRecord.builder(inputSchema).set("name", "Ralph").build());
final DataSetManager<Table> sink2Table = getDataset(sink2TableName);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sink2Table.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sink2Table));
return expected2.equals(outputRecords);
}
}, 4, TimeUnit.MINUTES);
}
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class DataStreamsTest, method testParallelAggregators:
@Test
public void testParallelAggregators() throws Exception {
String sink1Name = "pAggOutput1";
String sink2Name = "pAggOutput2";
Schema inputSchema = Schema.recordOf("testRecord", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
List<StructuredRecord> input1 = ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(), StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build());
List<StructuredRecord> input2 = ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build());
/*
 * source1 --|--> agg1 --> sink1
 *           |
 * source2 --|--> agg2 --> sink2
 */
DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
  .setBatchInterval("5s")
  .addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema, input1)))
  .addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema, input2)))
  .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
  .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
  .addStage(new ETLStage("agg1", FieldCountAggregator.getPlugin("user", "string")))
  .addStage(new ETLStage("agg2", FieldCountAggregator.getPlugin("item", "long")))
  .addConnection("source1", "agg1")
  .addConnection("source1", "agg2")
  .addConnection("source2", "agg1")
  .addConnection("source2", "agg2")
  .addConnection("agg1", "sink1")
  .addConnection("agg2", "sink2")
  .disableCheckpoints()
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp");
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForStatus(true, 10, 1);
Schema outputSchema1 = Schema.recordOf(
  "user.count",
  Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Schema outputSchema2 = Schema.recordOf(
  "item.count",
  Schema.Field.of("item", Schema.of(Schema.Type.LONG)),
  Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
// check output
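// each aggregator emits one count per distinct value plus an overall total row
// ('all' for the string field, 0 for the long field)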
final DataSetManager<Table> sinkManager1 = getDataset(sink1Name);
final Set<StructuredRecord> expected1 = ImmutableSet.of(
  StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
  StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
  StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sinkManager1.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sinkManager1));
return expected1.equals(outputRecords);
}
}, 1, TimeUnit.MINUTES);
final DataSetManager<Table> sinkManager2 = getDataset(sink2Name);
final Set<StructuredRecord> expected2 = ImmutableSet.of(
  StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
  StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
  StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
  StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
  StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sinkManager2.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sinkManager2));
return expected2.equals(outputRecords);
}
}, 1, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStatus(false, 10, 1);
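// source1 emits 2 records and source2 emits 3; both aggregators see all 5,
// agg1 groups them into 3 user counts and agg2 into 5 item counts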
validateMetric(appId, "source1.records.out", 2);
validateMetric(appId, "source2.records.out", 3);
validateMetric(appId, "agg1.records.in", 5);
validateMetric(appId, "agg1.records.out", 3);
validateMetric(appId, "agg2.records.in", 5);
validateMetric(appId, "agg2.records.out", 5);
validateMetric(appId, "sink1.records.in", 3);
validateMetric(appId, "sink2.records.in", 5);
}
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class StructuredRecordWritableTest, method testNonAsciiString:
@Test
public void testNonAsciiString() throws IOException {
Schema schema = Schema.recordOf("rec", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
StructuredRecord record = StructuredRecord.builder(schema).set("x", "идыло").build();
StructuredRecordWritable writableOut = new StructuredRecordWritable(record);
ByteArrayOutputStream os = new ByteArrayOutputStream();
DataOutput output = new DataOutputStream(os);
writableOut.write(output);
os.flush();
StructuredRecordWritable writableIn = new StructuredRecordWritable();
ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray());
DataInput input = new DataInputStream(is);
writableIn.readFields(input);
Assert.assertEquals(record, writableIn.get());
}
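The same write/readFields sequence could be factored into a small round-trip helper; this is only a sketch that reuses the calls from the test above:
private static StructuredRecord roundTrip(StructuredRecord record) throws IOException {
  // serialize through the Writable interface into an in-memory buffer
  ByteArrayOutputStream os = new ByteArrayOutputStream();
  new StructuredRecordWritable(record).write(new DataOutputStream(os));
  // read the bytes back into a fresh Writable and return the reconstructed record
  StructuredRecordWritable writableIn = new StructuredRecordWritable();
  writableIn.readFields(new DataInputStream(new ByteArrayInputStream(os.toByteArray())));
  return writableIn.get();
}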
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class NaiveBayesClassifier, method cloneRecord:
// creates a builder based off the given record
private StructuredRecord.Builder cloneRecord(StructuredRecord record) {
Schema schema = record.getSchema();
StructuredRecord.Builder builder = StructuredRecord.builder(schema);
for (Schema.Field field : schema.getFields()) {
builder.set(field.getName(), record.get(field.getName()));
}
return builder;
}
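A hypothetical use of this helper inside the same class: copy a record while overwriting one field that already exists in its schema (the method and field names here are illustrative, not taken from NaiveBayesClassifier):
private StructuredRecord withUpdatedField(StructuredRecord record, String fieldName, Object value) {
  // cloneRecord copies every existing field; set(...) then overwrites the chosen one
  return cloneRecord(record).set(fieldName, value).build();
}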
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class DataPipelineTest, method testSequentialAggregators:
private void testSequentialAggregators(Engine engine) throws Exception {
String sourceName = "linearAggInput-" + engine.name();
String sinkName = "linearAggOutput-" + engine.name();
/*
* source --> filter1 --> aggregator1 --> aggregator2 --> filter2 --> sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .setEngine(engine)
  .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "bob")))
  .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "jane")))
  .addStage(new ETLStage("aggregator1", IdentityAggregator.getPlugin()))
  .addStage(new ETLStage("aggregator2", IdentityAggregator.getPlugin()))
  .addConnection("source", "filter1")
  .addConnection("filter1", "aggregator1")
  .addConnection("aggregator1", "aggregator2")
  .addConnection("aggregator2", "filter2")
  .addConnection("filter2", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("LinearAggApp-" + engine);
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
// write the input records to the source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(sourceName));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check output
DataSetManager<Table> sinkManager = getDataset(sinkName);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
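// the source emits 3 records; filter1 removes bob and filter2 removes jane,
// leaving 2 records after filter1 and 1 record at the sink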
validateMetric(3, appId, "source.records.out");
validateMetric(3, appId, "filter1.records.in");
validateMetric(2, appId, "filter1.records.out");
validateMetric(2, appId, "aggregator1.records.in");
validateMetric(2, appId, "aggregator1.records.out");
validateMetric(2, appId, "aggregator2.records.in");
validateMetric(2, appId, "aggregator2.records.out");
validateMetric(2, appId, "filter2.records.in");
validateMetric(1, appId, "filter2.records.out");
validateMetric(1, appId, "sink.records.out");
}