Use of co.cask.cdap.api.data.schema.Schema in the project cdap by caskdata.
From the class DataStreamsTest, the method testTransformComputeWithMacros:
// Verifies that macro-valued stage properties are re-evaluated on every run:
// two runs with swapped macro values must filter complementary record pairs
// into two different output datasets.
@Test
public void testTransformComputeWithMacros() throws Exception {
  Schema schema = Schema.recordOf(
      "test",
      Schema.Field.of("id", Schema.of(Schema.Type.STRING)),
      Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

  StructuredRecord recSamuel = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
  StructuredRecord recJackson = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
  StructuredRecord recDwayne = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
  StructuredRecord recJohnson = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();

  List<StructuredRecord> input = new ArrayList<>();
  input.add(recSamuel);
  input.add(recJackson);
  input.add(recDwayne);
  input.add(recJohnson);

  // Pipeline: source -> sleep -> filter1 (transform) -> filter2 (compute) -> sink.
  // The filter field, both filter values, and the sink dataset name are macros.
  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
      .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
      .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
      .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
      .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
      .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
      .addConnection("source", "sleep")
      .addConnection("sleep", "filter1")
      .addConnection("filter1", "filter2")
      .addConnection("filter2", "sink")
      .setBatchInterval("1s")
      .build();

  ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationManager appManager = deployApplication(appId.toId(), appRequest);

  // First run filters out 'dwayne' and 'johnson'; samuel and jackson survive.
  final Set<StructuredRecord> expected = new HashSet<>();
  expected.add(recSamuel);
  expected.add(recJackson);
  testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");

  // 4 records in; filter1 drops one, filter2 drops another -> 2 reach the sink.
  validateMetric(appId, "source.records.out", 4);
  validateMetric(appId, "sleep.records.in", 4);
  validateMetric(appId, "sleep.records.out", 4);
  validateMetric(appId, "filter1.records.in", 4);
  validateMetric(appId, "filter1.records.out", 3);
  validateMetric(appId, "filter2.records.in", 3);
  validateMetric(appId, "filter2.records.out", 2);
  validateMetric(appId, "sink.records.in", 2);
  Assert.assertTrue(getMetric(appId, "sleep." + co.cask.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);

  // Second run swaps the macro values, so the complementary pair survives.
  expected.clear();
  expected.add(recDwayne);
  expected.add(recJohnson);
  testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
Use of co.cask.cdap.api.data.schema.Schema in the project cdap by caskdata.
From the class StringCaseTransform, the method configurePipeline:
// configurePipeline is called only once, when the pipeline is deployed. Static validation should be done here.
// configurePipeline runs exactly once, at deployment time, so it is the place
// for static validation of the plugin configuration.
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  StageConfigurer configurer = pipelineConfigurer.getStageConfigurer();
  // The transform emits records unchanged in shape: output schema == input schema.
  Schema inputSchema = configurer.getInputSchema();
  // A null schema is either unknown until runtime or variable, so the per-field
  // string check must be deferred.
  if (inputSchema != null) {
    // With a constant, deploy-time schema, every configured field must be a string.
    for (String field : config.getUpperFields()) {
      validateFieldIsString(inputSchema, field);
    }
    for (String field : config.getLowerFields()) {
      validateFieldIsString(inputSchema, field);
    }
  }
  configurer.setOutputSchema(inputSchema);
}
Use of co.cask.cdap.api.data.schema.Schema in the project cdap by caskdata.
From the class WordCount, the method validateSchema:
/**
 * Checks that the configured field exists in the given input schema and is of
 * type string (or nullable string).
 *
 * A null schema is accepted without checks, since it means the schema is not
 * known until runtime or is not constant.
 *
 * @param inputSchema the input schema to validate against, or null
 * @throws IllegalArgumentException if the field is missing or not a string
 */
public void validateSchema(Schema inputSchema) {
  if (inputSchema == null) {
    return;
  }
  // Schema is constant and known at configure time: the field must exist...
  Schema.Field inputField = inputSchema.getField(field);
  if (inputField == null) {
    throw new IllegalArgumentException(String.format("Field '%s' does not exist in input schema %s.", field, inputSchema));
  }
  // ...and must be a string, unwrapping a nullable wrapper if present.
  Schema fieldSchema = inputField.getSchema();
  Schema.Type fieldType;
  if (fieldSchema.isNullable()) {
    fieldType = fieldSchema.getNonNullable().getType();
  } else {
    fieldType = fieldSchema.getType();
  }
  if (fieldType != Schema.Type.STRING) {
    throw new IllegalArgumentException(String.format("Field '%s' is of illegal type %s. Must be of type %s.", field, fieldType, Schema.Type.STRING));
  }
}
Use of co.cask.cdap.api.data.schema.Schema in the project cdap by caskdata.
From the class WordCountAggregator, the method configurePipeline:
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  // Static validation: the configured field must exist in the input schema and
  // be a string. Delegate to WordCount.validateSchema so the check stays in one
  // place and consistent with WordCountCompute, rather than duplicating the
  // field-lookup and type-check logic inline.
  Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();
  // A null input schema means it is unknown until runtime, or not constant,
  // so validation must be deferred.
  if (inputSchema != null) {
    new WordCount(config.field).validateSchema(inputSchema);
  }
  // Set the output schema so downstream stages will know their input schema.
  pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);
}
Use of co.cask.cdap.api.data.schema.Schema in the project cdap by caskdata.
From the class WordCountCompute, the method configurePipeline:
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  // Deploy-time validation: the configured field must exist in the input schema
  // and be of type string (delegated to WordCount.validateSchema).
  Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();
  // A null schema is unknown until runtime or not constant; skip validation then.
  if (inputSchema != null) {
    new WordCount(config.field).validateSchema(inputSchema);
  }
  // Advertise the fixed output schema so downstream stages know their input schema.
  pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);
}
Aggregations