Example 16 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata: class StringCaseTransform, method configurePipeline.

// configurePipeline is called only once, when the pipeline is deployed. Static validation should be done here.
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
    // the output schema is always the same as the input schema
    Schema inputSchema = stageConfigurer.getInputSchema();
    // if schema is null, that means it is either not known until runtime, or it is variable
    if (inputSchema != null) {
        // if the input schema is constant and known at configure time, check that all configured fields are strings
        for (String fieldName : config.getUpperFields()) {
            validateFieldIsString(inputSchema, fieldName);
        }
        for (String fieldName : config.getLowerFields()) {
            validateFieldIsString(inputSchema, fieldName);
        }
    }
    stageConfigurer.setOutputSchema(inputSchema);
}
Also used: StageConfigurer(co.cask.cdap.etl.api.StageConfigurer) Schema(co.cask.cdap.api.data.schema.Schema)
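
The validateFieldIsString helper invoked above is not part of this excerpt. A minimal sketch of what it might look like, assuming it follows the same nullable-unwrapping pattern as WordCount.validateSchema in Example 17 below (the body here is an assumption, not the CDAP source):

// Hypothetical helper; mirrors the validation pattern shown in Example 17.
private void validateFieldIsString(Schema schema, String fieldName) {
    Schema.Field field = schema.getField(fieldName);
    if (field == null) {
        throw new IllegalArgumentException(
            String.format("Field '%s' does not exist in input schema %s.", fieldName, schema));
    }
    Schema fieldSchema = field.getSchema();
    // unwrap a nullable (union) schema before inspecting the type
    Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
    if (fieldType != Schema.Type.STRING) {
        throw new IllegalArgumentException(
            String.format("Field '%s' is of illegal type %s. Must be of type %s.", fieldName, fieldType, Schema.Type.STRING));
    }
}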

Example 17 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata: class WordCount, method validateSchema.

public void validateSchema(Schema inputSchema) {
    // a null input schema means it is unknown until runtime, or it is not constant
    if (inputSchema != null) {
        // if the input schema is constant and known at configure time, check that the input field exists and is a string.
        Schema.Field inputField = inputSchema.getField(field);
        if (inputField == null) {
            throw new IllegalArgumentException(String.format("Field '%s' does not exist in input schema %s.", field, inputSchema));
        }
        Schema fieldSchema = inputField.getSchema();
        Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
        if (fieldType != Schema.Type.STRING) {
            throw new IllegalArgumentException(String.format("Field '%s' is of illegal type %s. Must be of type %s.", field, fieldType, Schema.Type.STRING));
        }
    }
}
Also used: Schema(co.cask.cdap.api.data.schema.Schema)
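
A short usage sketch (the record schema here is hypothetical, built with the same Schema factory methods that appear in Example 20): validation accepts a nullable string field, because the nullable wrapper is unwrapped before the type check, and rejects any non-string field.

// Hypothetical schema and calls, for illustration only.
Schema inputSchema = Schema.recordOf("document",
    Schema.Field.of("text", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("count", Schema.of(Schema.Type.LONG)));
new WordCount("text").validateSchema(inputSchema);   // passes: a nullable string is accepted
new WordCount("count").validateSchema(inputSchema);  // throws IllegalArgumentException: LONG is not STRING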

Example 18 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata: class WordCountAggregator, method configurePipeline.

@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    // any static configuration validation should happen here.
    // We will check that the field is in the input schema and is of type string.
    Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();
    // a null input schema means it is unknown until runtime, or it is not constant
    if (inputSchema != null) {
        // if the input schema is constant and known at configure time, check that the input field exists and is a string.
        Schema.Field inputField = inputSchema.getField(config.field);
        if (inputField == null) {
            throw new IllegalArgumentException(String.format("Field '%s' does not exist in input schema %s.", config.field, inputSchema));
        }
        Schema fieldSchema = inputField.getSchema();
        Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
        if (fieldType != Schema.Type.STRING) {
            throw new IllegalArgumentException(String.format("Field '%s' is of illegal type %s. Must be of type %s.", config.field, fieldType, Schema.Type.STRING));
        }
    }
    // set the output schema so downstream stages will know their input schema.
    pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);
}
Also used: Schema(co.cask.cdap.api.data.schema.Schema)
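
OUTPUT_SCHEMA is referenced but not defined in this excerpt. For a word-count aggregator it would plausibly be a record of word and count fields, along these lines (the field names are assumptions, not taken from the source):

// Hypothetical definition of the constant set as the output schema above.
private static final Schema OUTPUT_SCHEMA = Schema.recordOf(
    "wordCount",
    Schema.Field.of("word", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("count", Schema.of(Schema.Type.LONG)));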

Example 19 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata: class WordCountCompute, method configurePipeline.

@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    // any static configuration validation should happen here.
    // We will check that the field is in the input schema and is of type string.
    Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();
    if (inputSchema != null) {
        WordCount wordCount = new WordCount(config.field);
        wordCount.validateSchema(inputSchema);
    }
    // set the output schema so downstream stages will know their input schema.
    pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);
}
Also used: Schema(co.cask.cdap.api.data.schema.Schema)
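
Note the design choice here: instead of duplicating the field checks from Example 18, WordCountCompute delegates to the reusable WordCount class from Example 17, so both plugins share a single validation path. The constructor it relies on presumably just stores the field name, e.g. (an assumption; the constructor is not shown in the source):

// Hypothetical constructor matching the new WordCount(config.field) call above.
public WordCount(String field) {
    this.field = field;
}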

Example 20 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata: class DataPipelineTest, method testSimpleMultiSource.

private void testSimpleMultiSource(Engine engine) throws Exception {
    /*
     * source1 --|
     *           |--> sleep --> sink
     * source2 --|
     */
    String source1Name = String.format("simpleMSInput1-%s", engine);
    String source2Name = String.format("simpleMSInput2-%s", engine);
    String sinkName = String.format("simpleMSOutput-%s", engine);
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name)))
        .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name)))
        .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
        .addConnection("source1", "sleep")
        .addConnection("source2", "sleep")
        .addConnection("sleep", "sink")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SimpleMultiSourceApp-" + engine);
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    // there should be only two programs - one workflow and one mapreduce/spark
    Assert.assertEquals(2, appManager.getInfo().getPrograms().size());
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordVincent = StructuredRecord.builder(schema).set("name", "vincent").build();
    // write two records to source1 and one record to source2
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordVincent));
    inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
    MockSource.writeInput(inputManager, ImmutableList.of(recordBob));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // check sink
    DataSetManager<Table> sinkManager = getDataset(sinkName);
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob, recordVincent);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    validateMetric(2, appId, "source1.records.out");
    validateMetric(1, appId, "source2.records.out");
    validateMetric(3, appId, "sleep.records.in");
    validateMetric(3, appId, "sleep.records.out");
    validateMetric(3, appId, "sink.records.in");
    Assert.assertTrue(getMetric(appId, "sleep." + co.cask.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId)
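
The metric assertions follow directly from the data written: two records go to source1 and one to source2, so the sleep stage sees all three records in and out, and the sink receives three records. The final assertion checks that the sleep stage recorded a non-zero total processing time.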

Aggregations

Schema (co.cask.cdap.api.data.schema.Schema): 210 usages
Test (org.junit.Test): 92 usages
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 69 usages
Table (co.cask.cdap.api.dataset.table.Table): 38 usages
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 35 usages
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 34 usages
FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 32 usages
ApplicationManager (co.cask.cdap.test.ApplicationManager): 30 usages
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 29 usages
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 24 usages
IOException (java.io.IOException): 23 usages
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 22 usages
ReflectionSchemaGenerator (co.cask.cdap.internal.io.ReflectionSchemaGenerator): 22 usages
ArrayList (java.util.ArrayList): 22 usages
WorkflowManager (co.cask.cdap.test.WorkflowManager): 20 usages
Map (java.util.Map): 18 usages
Set (java.util.Set): 14 usages
UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException): 12 usages
HashMap (java.util.HashMap): 12 usages
HashSet (java.util.HashSet): 11 usages