Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.
The class StringCaseTransform, method configurePipeline:
// configurePipeline is called only once, when the pipeline is deployed. Static validation should be done here.
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
  // the output schema is always the same as the input schema
  Schema inputSchema = stageConfigurer.getInputSchema();
  // if the schema is null, it is either not known until runtime, or it is variable
  if (inputSchema != null) {
    // if the input schema is constant and known at configure time, check that all configured fields are strings
    for (String fieldName : config.getUpperFields()) {
      validateFieldIsString(inputSchema, fieldName);
    }
    for (String fieldName : config.getLowerFields()) {
      validateFieldIsString(inputSchema, fieldName);
    }
  }
  stageConfigurer.setOutputSchema(inputSchema);
}
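The validateFieldIsString helper is defined elsewhere in StringCaseTransform and not shown here. A minimal sketch of what it might look like, assuming it performs the same existence and type checks as WordCount.validateSchema below:

// Hypothetical sketch of the validateFieldIsString helper referenced above;
// the actual implementation in StringCaseTransform may differ.
private void validateFieldIsString(Schema schema, String fieldName) {
  Schema.Field field = schema.getField(fieldName);
  if (field == null) {
    throw new IllegalArgumentException(
      String.format("Field '%s' does not exist in input schema %s.", fieldName, schema));
  }
  Schema fieldSchema = field.getSchema();
  // unwrap a nullable (union) schema before checking the underlying type
  Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
  if (fieldType != Schema.Type.STRING) {
    throw new IllegalArgumentException(
      String.format("Field '%s' is of illegal type %s. Must be of type %s.", fieldName, fieldType, Schema.Type.STRING));
  }
}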
Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.
The class WordCount, method validateSchema:
public void validateSchema(Schema inputSchema) {
  // a null input schema means it is unknown until runtime, or it is not constant
  if (inputSchema != null) {
    // if the input schema is constant and known at configure time, check that the input field exists and is a string
    Schema.Field inputField = inputSchema.getField(field);
    if (inputField == null) {
      throw new IllegalArgumentException(
        String.format("Field '%s' does not exist in input schema %s.", field, inputSchema));
    }
    Schema fieldSchema = inputField.getSchema();
    // unwrap a nullable (union) schema before checking the underlying type
    Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
    if (fieldType != Schema.Type.STRING) {
      throw new IllegalArgumentException(
        String.format("Field '%s' is of illegal type %s. Must be of type %s.", field, fieldType, Schema.Type.STRING));
    }
  }
}
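Note that nullable fields pass this check because the nullable union is unwrapped with getNonNullable() before the type comparison. A short usage sketch (the record and field names here are illustrative, not taken from the project):

// A record with one non-nullable and one nullable string field; both pass
// validateSchema because nullable schemas are unwrapped via getNonNullable().
Schema schema = Schema.recordOf(
  "example",
  Schema.Field.of("word", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("comment", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
new WordCount("word").validateSchema(schema);    // ok: string
new WordCount("comment").validateSchema(schema); // ok: nullable string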
Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.
The class WordCountAggregator, method configurePipeline:
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  // any static configuration validation should happen here.
  // We will check that the field is in the input schema and is of type string.
  Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();
  // a null input schema means it is unknown until runtime, or it is not constant
  if (inputSchema != null) {
    // if the input schema is constant and known at configure time, check that the input field exists and is a string
    Schema.Field inputField = inputSchema.getField(config.field);
    if (inputField == null) {
      throw new IllegalArgumentException(
        String.format("Field '%s' does not exist in input schema %s.", config.field, inputSchema));
    }
    Schema fieldSchema = inputField.getSchema();
    // unwrap a nullable (union) schema before checking the underlying type
    Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
    if (fieldType != Schema.Type.STRING) {
      throw new IllegalArgumentException(
        String.format("Field '%s' is of illegal type %s. Must be of type %s.", config.field, fieldType, Schema.Type.STRING));
    }
  }
  // set the output schema so downstream stages will know their input schema.
  pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);
}
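OUTPUT_SCHEMA is a constant defined elsewhere in WordCountAggregator. A plausible definition for a word-count output, assuming "word" and "count" field names (the actual names in the project may differ):

// Hypothetical OUTPUT_SCHEMA for the aggregator: one word per record plus its
// count, emitted as the fixed schema of every downstream record.
private static final Schema OUTPUT_SCHEMA = Schema.recordOf(
  "wordCount",
  Schema.Field.of("word", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("count", Schema.of(Schema.Type.LONG)));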
Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.
The class WordCountCompute, method configurePipeline:
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  // any static configuration validation should happen here.
  // We will check that the field is in the input schema and is of type string.
  Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();
  if (inputSchema != null) {
    WordCount wordCount = new WordCount(config.field);
    wordCount.validateSchema(inputSchema);
  }
  // set the output schema so downstream stages will know their input schema.
  pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);
}
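Unlike WordCountAggregator, which inlines the field checks, WordCountCompute delegates configure-time validation to WordCount.validateSchema (shown above). This keeps the word-count logic and its schema validation in a single reusable class, so the same checks apply whichever plugin type wraps them.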
Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.
The class DataPipelineTest, method testSimpleMultiSource:
private void testSimpleMultiSource(Engine engine) throws Exception {
  /*
   * source1 --|
   *           |--> sleep --> sink
   * source2 --|
   */
  String source1Name = String.format("simpleMSInput1-%s", engine);
  String source2Name = String.format("simpleMSInput2-%s", engine);
  String sinkName = String.format("simpleMSOutput-%s", engine);
  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name)))
    .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name)))
    .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
    .addConnection("source1", "sleep")
    .addConnection("source2", "sleep")
    .addConnection("sleep", "sink")
    .setEngine(engine)
    .build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("SimpleMultiSourceApp-" + engine);
  ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
  // there should be only two programs - one workflow and one mapreduce/spark
  Assert.assertEquals(2, appManager.getInfo().getPrograms().size());
  Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
  StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
  StructuredRecord recordVincent = StructuredRecord.builder(schema).set("name", "vincent").build();
  // write two records to the first source and one to the second
  DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
  MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordVincent));
  inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
  MockSource.writeInput(inputManager, ImmutableList.of(recordBob));
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  // check that the sink received all three records from both sources
  DataSetManager<Table> sinkManager = getDataset(sinkName);
  Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob, recordVincent);
  Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
  Assert.assertEquals(expected, actual);
  validateMetric(2, appId, "source1.records.out");
  validateMetric(1, appId, "source2.records.out");
  validateMetric(3, appId, "sleep.records.in");
  validateMetric(3, appId, "sleep.records.out");
  validateMetric(3, appId, "sink.records.in");
  Assert.assertTrue(getMetric(appId, "sleep." + co.cask.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
}
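The validateMetric helper is defined elsewhere in DataPipelineTest. A plausible sketch in terms of the getMetric accessor seen in the final assertion; the real helper may also wait for the metric to be emitted rather than asserting immediately:

// Hypothetical sketch of the validateMetric helper used above, assuming a
// getMetric(appId, metricName) accessor that returns the metric's total count.
private void validateMetric(long expected, ApplicationId appId, String metric) throws Exception {
  Assert.assertEquals(expected, getMetric(appId, metric));
}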