
Example 11 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class XMLParserTest, method testXMLParserWithSimpleXPath:

@Test
public void testXMLParserWithSimpleXPath() throws Exception {
    Schema schema = Schema.recordOf("record",
        Schema.Field.of("title", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("author", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("year", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    XMLParser.Config config = new XMLParser.Config("body", "UTF-8",
        "title:/book/title,author:/book/author,year:/book/year",
        "title:string,author:string,year:string",
        "Write to error dataset");
    Transform<StructuredRecord, StructuredRecord> transform = new XMLParser(config);
    transform.initialize(new MockTransformContext());
    MockEmitter<StructuredRecord> emitter = new MockEmitter<>();
    StructuredRecord inputRecord = StructuredRecord.builder(INPUT)
        .set("offset", 1)
        .set("body", "<book category=\"COOKING\"><title lang=\"en\">Everyday Italian</title>"
            + "<author>Giada De Laurentiis</author><year>2005</year><price>30.00</price></book>")
        .build();
    transform.transform(inputRecord, emitter);
    List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(schema)
        .set("title", "Everyday Italian")
        .set("author", "Giada De Laurentiis")
        .set("year", "2005")
        .build());
    Assert.assertEquals(expected, emitter.getEmitted());
}
Also used : MockTransformContext(io.cdap.cdap.etl.mock.transform.MockTransformContext) MockEmitter(io.cdap.cdap.etl.mock.common.MockEmitter) Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Test(org.junit.Test)
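
The Config above maps each output field to an XPath expression ("field:xpath" pairs) and declares each field's type ("field:type" pairs). As a minimal sketch of the Schema and StructuredRecord APIs the test relies on (field names mirror the test; imports are the same as listed above):

// Minimal sketch: build a record schema with nullable string fields, populate a
// record against it, and read a value back. Names mirror the test above.
Schema bookSchema = Schema.recordOf("book",
    Schema.Field.of("title", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("year", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord book = StructuredRecord.builder(bookSchema)
    .set("title", "Everyday Italian")
    .set("year", "2005")
    .build();
String title = book.get("title"); // "Everyday Italian"; an unset nullable field returns null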

Example 12 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class XMLToJSONConverterTest, method testInvalidInputFieldType:

@Test
public void testInvalidInputFieldType() throws Exception {
    Schema schema = Schema.recordOf("input1", Schema.Field.of("body", Schema.of(Schema.Type.INT)));
    XMLToJSON.Config config = new XMLToJSON.Config("body", "jsonevent", OUTPUT.toString());
    PipelineConfigurer configurer = new MockPipelineConfigurer(schema);
    FailureCollector collector = configurer.getStageConfigurer().getFailureCollector();
    XMLToJSON converter = new XMLToJSON(config);
    converter.configurePipeline(configurer);
    Assert.assertEquals(1, collector.getValidationFailures().size());
    Assert.assertEquals(2, collector.getValidationFailures().get(0).getCauses().size());
}
Also used : MockPipelineConfigurer(io.cdap.cdap.etl.mock.common.MockPipelineConfigurer) Schema(io.cdap.cdap.api.data.schema.Schema) PipelineConfigurer(io.cdap.cdap.etl.api.PipelineConfigurer) MockPipelineConfigurer(io.cdap.cdap.etl.mock.common.MockPipelineConfigurer) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) Test(org.junit.Test)
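
Note that configurePipeline does not throw on the bad input type; it records failures on the stage's FailureCollector, which is why the test asserts on getValidationFailures(). A hedged sketch of the reporting API itself, using MockFailureCollector (io.cdap.cdap.etl.mock.validation.MockFailureCollector); the message strings and field name are illustrative, not XMLToJSON's actual output:

// Sketch: how a plugin typically reports a schema problem during validation.
FailureCollector collector = new MockFailureCollector();
collector.addFailure("Field 'body' must be of type string.", "Change the type of field 'body' to string.")
    .withInputSchemaField("body");
Assert.assertEquals(1, collector.getValidationFailures().size());
collector.getOrThrowException(); // throws ValidationException once failures have been collected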

Example 13 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class NormalizeTest, method testInvalidOutputSchemaFieldType:

@Test
public void testInvalidOutputSchemaFieldType() throws Exception {
    // output schema that declares the ID field as LONG, which should fail type validation
    Schema outputSchema = Schema.recordOf("outputSchema",
        Schema.Field.of(ID, Schema.of(Schema.Type.LONG)),
        Schema.Field.of(DATE, Schema.of(Schema.Type.STRING)),
        Schema.Field.of(ATTRIBUTE_TYPE, Schema.of(Schema.Type.STRING)),
        Schema.Field.of(ATTRIBUTE_VALUE, Schema.of(Schema.Type.STRING)));
    Normalize.NormalizeConfig config = new Normalize.NormalizeConfig(validFieldMapping, validFieldNormalizing, outputSchema.toString());
    MockPipelineConfigurer configurer = new MockPipelineConfigurer(INPUT_SCHEMA);
    new Normalize(config).configurePipeline(configurer);
    FailureCollector collector = configurer.getStageConfigurer().getFailureCollector();
    Assert.assertEquals(1, collector.getValidationFailures().size());
    Assert.assertEquals(1, collector.getValidationFailures().get(0).getCauses().size());
    ValidationFailure.Cause expectedCause = new ValidationFailure.Cause();
    expectedCause.addAttribute(CauseAttributes.OUTPUT_SCHEMA_FIELD, ID);
    Assert.assertEquals(expectedCause, collector.getValidationFailures().get(0).getCauses().get(0));
}
Also used : MockPipelineConfigurer(io.cdap.cdap.etl.mock.common.MockPipelineConfigurer) Schema(io.cdap.cdap.api.data.schema.Schema) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) Test(org.junit.Test)
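
NormalizeConfig receives the output schema as a JSON string (outputSchema.toString()). A small sketch of that round trip, assuming the standard CDAP Schema JSON support (the field name here is illustrative):

// Sketch: Schema serializes to Avro-style JSON and parses back, which is why a
// schema can travel through a plugin config as a plain string.
Schema schema = Schema.recordOf("outputSchema",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)));
String json = schema.toString();        // JSON form, suitable for config properties
Schema parsed = Schema.parseJson(json); // throws IOException on malformed input
Assert.assertEquals(schema, parsed);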

Example 14 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class UnionSplitterTest, method testInvalidSchemas:

@Test
public void testInvalidSchemas() {
    Schema inputSchema = Schema.recordOf("union",
        Schema.Field.of("a", Schema.unionOf(Schema.of(Schema.Type.NULL),
            Schema.arrayOf(Schema.of(Schema.Type.STRING)))),
        Schema.Field.of("b", Schema.unionOf(Schema.of(Schema.Type.NULL),
            Schema.enumWith("something"))),
        Schema.Field.of("c", Schema.unionOf(Schema.of(Schema.Type.NULL),
            Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING)))),
        Schema.Field.of("d", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    FailureCollector collector = new MockFailureCollector();
    UnionSplitter.getOutputSchemas(inputSchema, "a", true, collector);
    Assert.assertEquals(1, collector.getValidationFailures().size());
    Assert.assertEquals(2, collector.getValidationFailures().get(0).getCauses().size());
    collector = new MockFailureCollector();
    UnionSplitter.getOutputSchemas(inputSchema, "b", true, collector);
    Assert.assertEquals(1, collector.getValidationFailures().size());
    Assert.assertEquals(2, collector.getValidationFailures().get(0).getCauses().size());
    collector = new MockFailureCollector();
    UnionSplitter.getOutputSchemas(inputSchema, "c", true, collector);
    Assert.assertEquals(1, collector.getValidationFailures().size());
    Assert.assertEquals(2, collector.getValidationFailures().get(0).getCauses().size());
    collector = new MockFailureCollector();
    UnionSplitter.getOutputSchemas(inputSchema, "d", true, collector);
    Assert.assertEquals(0, collector.getValidationFailures().size());
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) MockFailureCollector(io.cdap.cdap.etl.mock.validation.MockFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) MockFailureCollector(io.cdap.cdap.etl.mock.validation.MockFailureCollector) Test(org.junit.Test)
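
The pattern here: fields a, b, and c are unions containing an array, an enum, and a map, and each produces a validation failure; field d is a plain nullable string and passes, because a nullable is just a two-branch union with NULL. A sketch of the Schema calls behind that distinction (the types shown are illustrative):

// Sketch: a nullable field is represented as a union of the type and NULL, so it
// is treated as a simple field rather than a union to split on.
Schema nullableString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
Assert.assertTrue(nullableString.isNullable());
Assert.assertEquals(Schema.Type.STRING, nullableString.getNonNullable().getType());

Schema union = Schema.unionOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.INT));
Assert.assertEquals(2, union.getUnionSchemas().size()); // a genuine two-branch union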

Example 15 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class AbstractFileSource, method prepareRun:

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    FailureCollector collector = context.getFailureCollector();
    config.validate(collector);
    String fileFormat = config.getFormatName();
    ValidatingInputFormat validatingInputFormat;
    try {
        validatingInputFormat = context.newPluginInstance(fileFormat);
    } catch (InvalidPluginConfigException e) {
        Set<String> properties = new HashSet<>(e.getMissingProperties());
        for (InvalidPluginProperty invalidProperty : e.getInvalidProperties()) {
            properties.add(invalidProperty.getName());
        }
        String errorMessage = String.format(
            "Format '%s' cannot be used because properties %s were not provided or "
                + "were invalid when the pipeline was deployed. Set the format to a "
                + "different value, or re-create the pipeline with all required properties.",
            fileFormat, properties);
        throw new IllegalArgumentException(errorMessage, e);
    }
    FormatContext formatContext = new FormatContext(collector, context.getInputSchema());
    validateInputFormatProvider(formatContext, fileFormat, validatingInputFormat);
    validatePathField(collector, validatingInputFormat.getSchema(formatContext));
    collector.getOrThrowException();
    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();
    Pattern pattern = config.getFilePattern();
    if (pattern != null) {
        RegexPathFilter.configure(conf, pattern);
        FileInputFormat.setInputPathFilter(job, RegexPathFilter.class);
    }
    FileInputFormat.setInputDirRecursive(job, config.shouldReadRecursively());
    Schema schema = config.getSchema();
    LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
    lineageRecorder.createExternalDataset(schema);
    if (schema != null && schema.getFields() != null) {
        recordLineage(lineageRecorder, schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
    }
    // set entries here, before FileSystem is used
    for (Map.Entry<String, String> entry : getFileSystemProperties(context).entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    Path path = new Path(config.getPath());
    FileSystem pathFileSystem = FileSystem.get(path.toUri(), conf);
    FileStatus[] fileStatus = pathFileSystem.globStatus(path);
    String inputFormatClass;
    if (fileStatus == null) {
        if (config.shouldAllowEmptyInput()) {
            inputFormatClass = EmptyInputFormat.class.getName();
        } else {
            throw new IOException(String.format("Input path %s does not exist", path));
        }
    } else {
        FileInputFormat.addInputPath(job, path);
        FileInputFormat.setMaxInputSplitSize(job, config.getMaxSplitSize());
        inputFormatClass = validatingInputFormat.getInputFormatClassName();
        Configuration hConf = job.getConfiguration();
        Map<String, String> inputFormatConfiguration = validatingInputFormat.getInputFormatConfiguration();
        for (Map.Entry<String, String> propertyEntry : inputFormatConfiguration.entrySet()) {
            hConf.set(propertyEntry.getKey(), propertyEntry.getValue());
        }
    }
    // set entries here again, in case anything set by PathTrackingInputFormat should be overridden
    for (Map.Entry<String, String> entry : getFileSystemProperties(context).entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    context.setInput(Input.of(config.getReferenceName(), new SourceInputFormatProvider(inputFormatClass, conf)));
}
Also used : ValidatingInputFormat(io.cdap.cdap.etl.api.validation.ValidatingInputFormat) HashSet(java.util.HashSet) Set(java.util.Set) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) FormatContext(io.cdap.cdap.etl.api.validation.FormatContext) Schema(io.cdap.cdap.api.data.schema.Schema) InvalidPluginConfigException(io.cdap.cdap.api.plugin.InvalidPluginConfigException) FileSystem(org.apache.hadoop.fs.FileSystem) Job(org.apache.hadoop.mapreduce.Job) LineageRecorder(io.cdap.plugin.common.LineageRecorder) Path(org.apache.hadoop.fs.Path) Pattern(java.util.regex.Pattern) EmptyInputFormat(io.cdap.plugin.format.input.EmptyInputFormat) IOException(java.io.IOException) SourceInputFormatProvider(io.cdap.plugin.common.SourceInputFormatProvider) InvalidPluginProperty(io.cdap.cdap.api.plugin.InvalidPluginProperty) Map(java.util.Map) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)
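
The null check on globStatus drives the empty-input fallback: Hadoop's FileSystem.globStatus returns null when the given path does not exist, which prepareRun turns into either EmptyInputFormat or an IOException. A standalone sketch of that check (the path below is illustrative):

// Sketch: the existence check used in prepareRun, in isolation. A null result means
// the path does not exist: fall back to EmptyInputFormat if empty input is allowed,
// otherwise fail the run.
Configuration conf = new Configuration();
Path path = new Path("/data/input");
FileSystem fs = FileSystem.get(path.toUri(), conf);
FileStatus[] matches = fs.globStatus(path);
if (matches == null) {
    throw new IOException(String.format("Input path %s does not exist", path));
}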

Aggregations

Schema (io.cdap.cdap.api.data.schema.Schema): 1135
Test (org.junit.Test): 664
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 432
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 177
Table (io.cdap.cdap.api.dataset.table.Table): 169
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 148
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 141
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 133
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 130
ArrayList (java.util.ArrayList): 114
HashSet (java.util.HashSet): 113
HashMap (java.util.HashMap): 101
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 96
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 81
IOException (java.io.IOException): 69
FailureCollector (io.cdap.cdap.etl.api.FailureCollector): 67
MockPipelineConfigurer (io.cdap.cdap.etl.mock.common.MockPipelineConfigurer): 56
Map (java.util.Map): 56
ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 47
ReflectionSchemaGenerator (io.cdap.cdap.internal.io.ReflectionSchemaGenerator): 46