Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class AutoJoinerTest, method testBroadcastJoinUsingSQLEngineWithIncludedStages.
@Test
public void testBroadcastJoinUsingSQLEngineWithIncludedStages() throws Exception {
  Schema expectedSchema = Schema.recordOf(
    "purchases.users",
    Schema.Field.of("purchases_region", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("purchases_purchase_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("purchases_user_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("users_region", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("users_user_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("users_name", Schema.of(Schema.Type.STRING)));
  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(expectedSchema)
                 .set("purchases_region", "us").set("purchases_purchase_id", 123).set("purchases_user_id", 0)
                 .set("users_region", "us").set("users_user_id", 0).set("users_name", "alice")
                 .build());
  testSimpleAutoJoinUsingSQLEngineWithStageSettings(
    Arrays.asList("users", "purchases"), Collections.singletonList("users"),
    expected, expectedSchema, "", "join", Engine.SPARK);
}
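All of these join tests build their expected output the same way: a StructuredRecord is constructed against a Schema and populated field by field. A minimal standalone sketch of that pattern, using an illustrative schema and field names that are not taken from the test:

// Illustrative record schema with one string field and one int field.
Schema schema = Schema.recordOf(
  "example",
  Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("id", Schema.of(Schema.Type.INT)));
// set() is keyed by field name; build() produces the finished record.
StructuredRecord record = StructuredRecord.builder(schema)
  .set("name", "alice")
  .set("id", 0)
  .build();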
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class AutoJoinerTest, method testNullIsEqual.
@Test
public void testNullIsEqual() throws Exception {
  Schema expectedSchema = Schema.recordOf(
    "items.attributes",
    Schema.Field.of("items_id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
    Schema.Field.of("items_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("items_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("attributes_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("attributes_id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
    Schema.Field.of("attributes_attr", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(expectedSchema)
                 .set("items_id", 0).set("items_region", "us").set("items_name", "bacon")
                 .set("attributes_region", "us").set("attributes_id", 0).set("attributes_attr", "food")
                 .build());
  expected.add(StructuredRecord.builder(expectedSchema)
                 .set("items_id", 1).set("attributes_id", 1).set("attributes_attr", "car")
                 .build());
  expected.add(StructuredRecord.builder(expectedSchema)
                 .set("items_region", "us").set("attributes_region", "us")
                 .build());
  testNullEquality(Engine.SPARK, true, expected);
  testNullEquality(Engine.MAPREDUCE, true, expected);
}
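Note how the last two expected records leave several fields unset: every field in expectedSchema is declared with Schema.nullableOf, and a nullable field that is never set on the builder reads back as null, which is exactly how the null-equality join output is expressed. A short sketch of that behavior, with hypothetical field names:

// Only "id" is set; "region" is left unset on the builder.
Schema sparseSchema = Schema.recordOf(
  "sparse",
  Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
  Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord sparse = StructuredRecord.builder(sparseSchema).set("id", 1).build();
// sparse.get("region") should return null here, matching the unset fields above.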
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class ReducibleAggregatorTestBase, method testFieldCountAgg.
protected void testFieldCountAgg(Engine engine, Map<String, String> arguments) throws Exception {
  String runSuffix = engine.name() + "-" + UUID.randomUUID();
  String source1Name = "pAggInput1-" + runSuffix;
  String source2Name = "pAggInput2-" + runSuffix;
  String sink1Name = "pAggOutput1-" + runSuffix;
  String sink2Name = "pAggOutput2-" + runSuffix;
  Schema inputSchema = Schema.recordOf(
    "testRecord",
    Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
  /*
       source1 --|--> agg1 --> sink1
                 |
       source2 --|--> agg2 --> sink2
   */
  ETLBatchConfig etlConfig = ETLBatchConfig.builder()
    .setEngine(engine)
    .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name, inputSchema)))
    .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name, inputSchema)))
    .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
    .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
    .addStage(new ETLStage("agg1", FieldCountReducibleAggregator.getPlugin("user", "string")))
    .addStage(new ETLStage("agg2", FieldCountReducibleAggregator.getPlugin("item", "long")))
    .addConnection("source1", "agg1")
    .addConnection("source1", "agg2")
    .addConnection("source2", "agg1")
    .addConnection("source2", "agg2")
    .addConnection("agg1", "sink1")
    .addConnection("agg2", "sink2")
    .build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp-" + runSuffix);
  ApplicationManager appManager = deployApplication(appId, appRequest);
  // write a few records to each source
  DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
  MockSource.writeInput(inputManager, ImmutableList.of(
    StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(),
    StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build()));
  inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
  MockSource.writeInput(inputManager, ImmutableList.of(
    StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(),
    StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(),
    StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build()));
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(arguments);
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  Schema outputSchema1 = Schema.recordOf(
    "user.count",
    Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
  Schema outputSchema2 = Schema.recordOf(
    "item.count",
    Schema.Field.of("item", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
  // check output
  DataSetManager<Table> sinkManager = getDataset(sink1Name);
  Set<StructuredRecord> expected = ImmutableSet.of(
    StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
    StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
    StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
  Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
  Assert.assertEquals(expected, actual);
  sinkManager = getDataset(sink2Name);
  expected = ImmutableSet.of(
    StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
  actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
  Assert.assertEquals(expected, actual);
  validateMetric(2, appId, "source1.records.out");
  validateMetric(3, appId, "source2.records.out");
  validateMetric(5, appId, "agg1.records.in");
  // 2 users, but FieldCountReducibleAggregator always emits an extra 'all' group
  validateMetric(3, appId, "agg1.aggregator.groups");
  validateMetric(3, appId, "agg1.records.out");
  validateMetric(5, appId, "agg2.records.in");
  // 4 items, but FieldCountReducibleAggregator always emits an extra 'all' group
  validateMetric(5, appId, "agg2.aggregator.groups");
  validateMetric(5, appId, "agg2.records.out");
  validateMetric(3, appId, "sink1.records.in");
  validateMetric(5, appId, "sink2.records.in");
}
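The test above wires a fan-out DAG by declaring stages and then connecting them by name. A minimal sketch of that ETLBatchConfig wiring pattern, reduced to one source and one sink; the stage and table names here are illustrative, not from the test:

// Declare stages, then connect them by name to form the pipeline DAG.
Schema schema = Schema.recordOf(
  "in",
  Schema.Field.of("user", Schema.of(Schema.Type.STRING)));
ETLBatchConfig config = ETLBatchConfig.builder()
  .setEngine(Engine.SPARK)
  .addStage(new ETLStage("in", MockSource.getPlugin("inTable", schema)))
  .addStage(new ETLStage("out", MockSink.getPlugin("outTable")))
  .addConnection("in", "out")
  .build();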
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class ConnectionUtils, method getSampleResponse.
/**
 * Returns a {@link SampleResponse} for the given connector.
 *
 * @throws IOException if sampling from the connector fails
 */
public static SampleResponse getSampleResponse(Connector connector, ConnectorContext connectorContext,
                                               SampleRequest sampleRequest, ConnectorDetail detail,
                                               ServicePluginConfigurer pluginConfigurer) throws IOException {
  if (connector instanceof DirectConnector) {
    DirectConnector directConnector = (DirectConnector) connector;
    List<StructuredRecord> sample = directConnector.sample(connectorContext, sampleRequest);
    return new SampleResponse(detail, sample.isEmpty() ? null : sample.get(0).getSchema(), sample);
  }
  if (connector instanceof BatchConnector) {
    LimitingConnector limitingConnector = new LimitingConnector((BatchConnector) connector, pluginConfigurer);
    List<StructuredRecord> sample = limitingConnector.sample(connectorContext, sampleRequest);
    return new SampleResponse(detail, sample.isEmpty() ? null : sample.get(0).getSchema(), sample);
  }
  throw new ConnectionBadRequestException(
    "Connector is not supported. The supported connector should be DirectConnector or BatchConnector.");
}
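Both branches repeat the same null-safe schema derivation from the first sampled record. A small hypothetical helper, not part of the CDAP source, that makes the shared pattern explicit:

// Hypothetical helper: the schema of a sample is taken from its first
// record, or null when the sample is empty.
private static Schema schemaOf(List<StructuredRecord> sample) {
  return sample.isEmpty() ? null : sample.get(0).getSchema();
}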
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From class NaiveBayesClassifier, method cloneRecord.
// Creates a builder pre-populated with every field of the given record.
private StructuredRecord.Builder cloneRecord(StructuredRecord record) {
  Schema schema = record.getSchema();
  StructuredRecord.Builder builder = StructuredRecord.builder(schema);
  for (Schema.Field field : schema.getFields()) {
    builder.set(field.getName(), record.get(field.getName()));
  }
  return builder;
}
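Because cloneRecord returns the builder rather than a finished record, a caller can copy a record and overwrite individual fields before building. A hypothetical usage sketch; the "prediction" field name is illustrative, not taken from the class:

// Copy all fields from the input record, then overwrite one before building.
StructuredRecord classified = cloneRecord(input)
  .set("prediction", 1.0d)
  .build();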