Example 46 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testBroadcastJoinUsingSQLEngineWithIncludedStages.

@Test
public void testBroadcastJoinUsingSQLEngineWithIncludedStages() throws Exception {
    Schema expectedSchema = Schema.recordOf("purchases.users", Schema.Field.of("purchases_region", Schema.of(Schema.Type.STRING)), Schema.Field.of("purchases_purchase_id", Schema.of(Schema.Type.INT)), Schema.Field.of("purchases_user_id", Schema.of(Schema.Type.INT)), Schema.Field.of("users_region", Schema.of(Schema.Type.STRING)), Schema.Field.of("users_user_id", Schema.of(Schema.Type.INT)), Schema.Field.of("users_name", Schema.of(Schema.Type.STRING)));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("purchases_region", "us").set("purchases_purchase_id", 123).set("purchases_user_id", 0).set("users_region", "us").set("users_user_id", 0).set("users_name", "alice").build());
    testSimpleAutoJoinUsingSQLEngineWithStageSettings(Arrays.asList("users", "purchases"), Collections.singletonList("users"), expected, expectedSchema, "", "join", Engine.SPARK);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)
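
The pattern in this and the following examples is always the same: Schema.recordOf defines the record shape, and StructuredRecord.builder produces an immutable record validated against that schema. A minimal, self-contained sketch of just that API (field names here are hypothetical, not taken from the test):

    // Uses io.cdap.cdap.api.data.schema.Schema and
    // io.cdap.cdap.api.data.format.StructuredRecord.
    Schema schema = Schema.recordOf("user",
        Schema.Field.of("user_id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

    StructuredRecord record = StructuredRecord.builder(schema)
        .set("user_id", 0)
        .set("name", "alice")
        .build();

    // get() is generic and infers its return type from the assignment target.
    String name = record.get("name");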

Example 47 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class AutoJoinerTest method testNullIsEqual.

@Test
public void testNullIsEqual() throws Exception {
    Schema expectedSchema = Schema.recordOf("items.attributes", Schema.Field.of("items_id", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("items_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("items_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("attributes_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("attributes_id", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("attributes_attr", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("items_id", 0).set("items_region", "us").set("items_name", "bacon").set("attributes_region", "us").set("attributes_id", 0).set("attributes_attr", "food").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("items_id", 1).set("attributes_id", 1).set("attributes_attr", "car").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("items_region", "us").set("attributes_region", "us").build());
    testNullEquality(Engine.SPARK, true, expected);
    testNullEquality(Engine.MAPREDUCE, true, expected);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashSet(java.util.HashSet) Test(org.junit.Test)
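
Every field in the expected schema above is declared with Schema.nullableOf, so builder fields that are never set (for example, items_region in the second expected record) simply remain null; that is what lets the test assert that null join keys compare as equal. A short sketch of the same behavior (schema and field names hypothetical):

    Schema schema = Schema.recordOf("item",
        Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

    // "region" is never set, so it stays null in the built record.
    StructuredRecord record = StructuredRecord.builder(schema)
        .set("id", 1)
        .build();

    Integer id = record.get("id");        // 1
    String region = record.get("region"); // null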

Example 48 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class ReducibleAggregatorTestBase method testFieldCountAgg.

protected void testFieldCountAgg(Engine engine, Map<String, String> arguments) throws Exception {
    String runSuffix = engine.name() + "-" + UUID.randomUUID();
    String source1Name = "pAggInput1-" + runSuffix;
    String source2Name = "pAggInput2-" + runSuffix;
    String sink1Name = "pAggOutput1-" + runSuffix;
    String sink2Name = "pAggOutput2-" + runSuffix;
    Schema inputSchema = Schema.recordOf("testRecord", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
    /*
       source1 --|--> agg1 --> sink1
                 |
       source2 --|--> agg2 --> sink2
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .setEngine(engine)
        .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name, inputSchema)))
        .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name, inputSchema)))
        .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
        .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
        .addStage(new ETLStage("agg1", FieldCountReducibleAggregator.getPlugin("user", "string")))
        .addStage(new ETLStage("agg2", FieldCountReducibleAggregator.getPlugin("item", "long")))
        .addConnection("source1", "agg1")
        .addConnection("source1", "agg2")
        .addConnection("source2", "agg1")
        .addConnection("source2", "agg2")
        .addConnection("agg1", "sink1")
        .addConnection("agg2", "sink2")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp-" + runSuffix);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write a few records to each source
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
    MockSource.writeInput(inputManager, ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(),
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build()));
    inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
    MockSource.writeInput(inputManager, ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(),
        StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(),
        StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build()));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(arguments);
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    Schema outputSchema1 = Schema.recordOf("user.count",
        Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    Schema outputSchema2 = Schema.recordOf("item.count",
        Schema.Field.of("item", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    // check output
    DataSetManager<Table> sinkManager = getDataset(sink1Name);
    Set<StructuredRecord> expected = ImmutableSet.of(
        StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    sinkManager = getDataset(sink2Name);
    expected = ImmutableSet.of(
        StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
    actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    validateMetric(2, appId, "source1.records.out");
    validateMetric(3, appId, "source2.records.out");
    validateMetric(5, appId, "agg1.records.in");
    // 2 users, but FieldCountReducibleAggregator always emits an 'all' group
    validateMetric(3, appId, "agg1.aggregator.groups");
    validateMetric(3, appId, "agg1.records.out");
    validateMetric(5, appId, "agg2.records.in");
    // 4 items, but FieldCountReducibleAggregator always emits an 'all' group
    validateMetric(5, appId, "agg2.aggregator.groups");
    validateMetric(5, appId, "agg2.records.out");
    validateMetric(3, appId, "sink1.records.in");
    validateMetric(5, appId, "sink2.records.in");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)
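
The metric assertions follow directly from the input data; a worked breakdown mirroring the validateMetric calls above:

    // source1 emits 2 records and source2 emits 3, and both feed both aggregators,
    // so agg1 and agg2 each read 2 + 3 = 5 records.
    // agg1 groups by "user": {samuel, john} plus the synthetic "all" group = 3 groups,
    // hence 3 records out of agg1 and 3 records into sink1.
    // agg2 groups by "item": {1L, 2L, 3L, 4L} plus the "all" group (keyed as 0L in the
    // expected output) = 5 groups, hence 5 records out of agg2 and 5 into sink2.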

Example 49 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class ConnectionUtils method getSampleResponse.

/**
 * Returns a {@link SampleResponse} for the given connector.
 *
 * @throws IOException if an error occurs while sampling the connector
 */
public static SampleResponse getSampleResponse(Connector connector, ConnectorContext connectorContext,
                                               SampleRequest sampleRequest, ConnectorDetail detail,
                                               ServicePluginConfigurer pluginConfigurer) throws IOException {
    if (connector instanceof DirectConnector) {
        DirectConnector directConnector = (DirectConnector) connector;
        List<StructuredRecord> sample = directConnector.sample(connectorContext, sampleRequest);
        return new SampleResponse(detail, sample.isEmpty() ? null : sample.get(0).getSchema(), sample);
    }
    if (connector instanceof BatchConnector) {
        LimitingConnector limitingConnector = new LimitingConnector((BatchConnector) connector, pluginConfigurer);
        List<StructuredRecord> sample = limitingConnector.sample(connectorContext, sampleRequest);
        return new SampleResponse(detail, sample.isEmpty() ? null : sample.get(0).getSchema(), sample);
    }
    throw new ConnectionBadRequestException("Connector is not supported. " + "The supported connector should be DirectConnector or BatchConnector.");
}
Also used : ConnectionBadRequestException(io.cdap.cdap.etl.proto.connection.ConnectionBadRequestException) BatchConnector(io.cdap.cdap.etl.api.batch.BatchConnector) DirectConnector(io.cdap.cdap.etl.api.connector.DirectConnector) SampleResponse(io.cdap.cdap.etl.proto.connection.SampleResponse) LimitingConnector(io.cdap.cdap.datapipeline.connection.LimitingConnector) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord)
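
Both branches repeat the same null-safe schema extraction from the first sampled record. A hypothetical helper (not part of ConnectionUtils) capturing that pattern:

    // Hypothetical helper: derive the sample schema from the first record,
    // or return null when the sample is empty.
    private static Schema schemaOf(List<StructuredRecord> sample) {
        return sample.isEmpty() ? null : sample.get(0).getSchema();
    }

With such a helper, both return statements would reduce to new SampleResponse(detail, schemaOf(sample), sample).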

Example 50 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class NaiveBayesClassifier method cloneRecord.

// creates a builder pre-populated with every field of the given record
private StructuredRecord.Builder cloneRecord(StructuredRecord record) {
    Schema schema = record.getSchema();
    StructuredRecord.Builder builder = StructuredRecord.builder(schema);
    for (Schema.Field field : schema.getFields()) {
        builder.set(field.getName(), record.get(field.getName()));
    }
    return builder;
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord)
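
A typical use of such a clone helper is to copy an incoming record and then overwrite one of its fields before building. A brief sketch (the field name is hypothetical, and it must already exist in the record's schema, since the clone reuses that same schema):

    StructuredRecord.Builder builder = cloneRecord(record);
    // Overwrite a single field; all other fields keep their copied values.
    builder.set("prediction", 0.75d);
    StructuredRecord scored = builder.build();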

Aggregations

Each entry lists the class, its fully qualified name, and its usage count across the indexed examples:

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 210
Schema (io.cdap.cdap.api.data.schema.Schema): 169
Test (org.junit.Test): 119
Table (io.cdap.cdap.api.dataset.table.Table): 76
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 73
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 73
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 68
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 68
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 59
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 54
HashSet (java.util.HashSet): 50
ArrayList (java.util.ArrayList): 44
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 40
HashMap (java.util.HashMap): 25
File (java.io.File): 17
ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 16
FormatSpecification (io.cdap.cdap.api.data.format.FormatSpecification): 15
DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig): 14
SparkManager (io.cdap.cdap.test.SparkManager): 12
Map (java.util.Map): 12