Example use of io.cdap.cdap.api.data.schema.Schema in the hydrator-plugins project by cdapio: the XMLParserTest class, method testXMLParserWithSimpleXPath.
@Test
public void testXMLParserWithSimpleXPath() throws Exception {
  // Output schema: every extracted field is a nullable string.
  Schema outputSchema = Schema.recordOf(
      "record",
      Schema.Field.of("title", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
      Schema.Field.of("author", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
      Schema.Field.of("year", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

  // Map each output field to an absolute XPath into the <book> element.
  XMLParser.Config config = new XMLParser.Config(
      "body",
      "UTF-8",
      "title:/book/title,author:/book/author,year:/book/year",
      "title:string,author:string,year:string",
      "Write to error dataset");

  Transform<StructuredRecord, StructuredRecord> parser = new XMLParser(config);
  parser.initialize(new MockTransformContext());

  String xml = "<book category=\"COOKING\"><title lang=\"en\">Everyday Italian</title>"
      + "<author>Giada De Laurentiis</author><year>2005</year><price>30.00</price></book>";
  StructuredRecord input = StructuredRecord.builder(INPUT)
      .set("offset", 1)
      .set("body", xml)
      .build();

  MockEmitter<StructuredRecord> emitter = new MockEmitter<>();
  parser.transform(input, emitter);

  // <price> has no mapping, so only title/author/year appear in the output record.
  StructuredRecord expectedRecord = StructuredRecord.builder(outputSchema)
      .set("title", "Everyday Italian")
      .set("author", "Giada De Laurentiis")
      .set("year", "2005")
      .build();
  Assert.assertEquals(ImmutableList.of(expectedRecord), emitter.getEmitted());
}
Example use of io.cdap.cdap.api.data.schema.Schema in the hydrator-plugins project by cdapio: the XMLToJSONConverterTest class, method testInvalidInputFieldType.
@Test
public void testInvalidInputFieldType() throws Exception {
  // The converter expects a string input field; an INT "body" must fail validation.
  Schema inputSchema = Schema.recordOf("input1", Schema.Field.of("body", Schema.of(Schema.Type.INT)));
  XMLToJSON.Config config = new XMLToJSON.Config("body", "jsonevent", OUTPUT.toString());

  PipelineConfigurer configurer = new MockPipelineConfigurer(inputSchema);
  FailureCollector collector = configurer.getStageConfigurer().getFailureCollector();

  new XMLToJSON(config).configurePipeline(configurer);

  // Exactly one validation failure is expected, carrying two causes.
  Assert.assertEquals(1, collector.getValidationFailures().size());
  Assert.assertEquals(2, collector.getValidationFailures().get(0).getCauses().size());
}
Example use of io.cdap.cdap.api.data.schema.Schema in the hydrator-plugins project by cdapio: the NormalizeTest class, method testInvalidOutputSchemaFieldType.
@Test
public void testInvalidOutputSchemaFieldType() throws Exception {
  // Declare the ID field as LONG instead of STRING; validation should flag it.
  Schema badOutputSchema = Schema.recordOf(
      "outputSchema",
      Schema.Field.of(ID, Schema.of(Schema.Type.LONG)),
      Schema.Field.of(DATE, Schema.of(Schema.Type.STRING)),
      Schema.Field.of(ATTRIBUTE_TYPE, Schema.of(Schema.Type.STRING)),
      Schema.Field.of(ATTRIBUTE_VALUE, Schema.of(Schema.Type.STRING)));
  Normalize.NormalizeConfig config =
      new Normalize.NormalizeConfig(validFieldMapping, validFieldNormalizing, badOutputSchema.toString());

  MockPipelineConfigurer configurer = new MockPipelineConfigurer(INPUT_SCHEMA);
  new Normalize(config).configurePipeline(configurer);

  FailureCollector collector = configurer.getStageConfigurer().getFailureCollector();
  Assert.assertEquals(1, collector.getValidationFailures().size());
  Assert.assertEquals(1, collector.getValidationFailures().get(0).getCauses().size());

  // The single cause must point at the offending output-schema field (ID).
  ValidationFailure.Cause expectedCause = new ValidationFailure.Cause();
  expectedCause.addAttribute(CauseAttributes.OUTPUT_SCHEMA_FIELD, ID);
  Assert.assertEquals(expectedCause, collector.getValidationFailures().get(0).getCauses().get(0));
}
Example use of io.cdap.cdap.api.data.schema.Schema in the hydrator-plugins project by cdapio: the UnionSplitterTest class, method testInvalidSchemas.
@Test
public void testInvalidSchemas() {
  // Fields "a", "b", and "c" are nullable unions of unsupported types
  // (array, enum, and map respectively); "d" is a plain nullable string.
  Schema inputSchema = Schema.recordOf(
      "union",
      Schema.Field.of("a", Schema.unionOf(Schema.of(Schema.Type.NULL),
                                          Schema.arrayOf(Schema.of(Schema.Type.STRING)))),
      Schema.Field.of("b", Schema.unionOf(Schema.of(Schema.Type.NULL),
                                          Schema.enumWith("something"))),
      Schema.Field.of("c", Schema.unionOf(Schema.of(Schema.Type.NULL),
                                          Schema.mapOf(Schema.of(Schema.Type.STRING),
                                                       Schema.of(Schema.Type.STRING)))),
      Schema.Field.of("d", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

  // Each unsupported field must yield exactly one failure with two causes.
  // The assertion message identifies which field failed, so a regression is
  // attributable without re-running under a debugger.
  for (String field : new String[] {"a", "b", "c"}) {
    FailureCollector collector = new MockFailureCollector();
    UnionSplitter.getOutputSchemas(inputSchema, field, true, collector);
    Assert.assertEquals("failures for field " + field,
                        1, collector.getValidationFailures().size());
    Assert.assertEquals("causes for field " + field,
                        2, collector.getValidationFailures().get(0).getCauses().size());
  }

  // A nullable string union is supported and must produce no failures.
  FailureCollector collector = new MockFailureCollector();
  UnionSplitter.getOutputSchemas(inputSchema, "d", true, collector);
  Assert.assertEquals(0, collector.getValidationFailures().size());
}
Example use of io.cdap.cdap.api.data.schema.Schema in the hydrator-plugins project by cdapio: the AbstractFileSource class, method prepareRun.
/**
 * Prepares the batch run: validates the config and the chosen input format, records
 * field-level lineage for the configured schema, and configures the Hadoop {@link Job}
 * input (path filter, recursion, split size, input format class and its properties).
 *
 * @param context the batch source context supplied by the pipeline framework
 * @throws IOException if the input path does not exist and empty input is not allowed
 * @throws IllegalArgumentException if the format plugin cannot be instantiated with the
 *         deployed properties
 */
@Override
public void prepareRun(BatchSourceContext context) throws Exception {
FailureCollector collector = context.getFailureCollector();
config.validate(collector);
String fileFormat = config.getFormatName();
ValidatingInputFormat validatingInputFormat;
try {
validatingInputFormat = context.newPluginInstance(fileFormat);
} catch (InvalidPluginConfigException e) {
// Combine missing and invalid property names into a single actionable message.
Set<String> properties = new HashSet<>(e.getMissingProperties());
for (InvalidPluginProperty invalidProperty : e.getInvalidProperties()) {
properties.add(invalidProperty.getName());
}
String errorMessage = String.format("Format '%s' cannot be used because properties %s were not provided or " + "were invalid when the pipeline was deployed. Set the format to a " + "different value, or re-create the pipeline with all required properties.", fileFormat, properties);
throw new IllegalArgumentException(errorMessage, e);
}
FormatContext formatContext = new FormatContext(collector, context.getInputSchema());
validateInputFormatProvider(formatContext, fileFormat, validatingInputFormat);
validatePathField(collector, validatingInputFormat.getSchema(formatContext));
// Abort the run now if any of the validations above recorded failures.
collector.getOrThrowException();
Job job = JobUtils.createInstance();
Configuration conf = job.getConfiguration();
// Optional regex filter: only paths matching the pattern are read.
Pattern pattern = config.getFilePattern();
if (pattern != null) {
RegexPathFilter.configure(conf, pattern);
FileInputFormat.setInputPathFilter(job, RegexPathFilter.class);
}
FileInputFormat.setInputDirRecursive(job, config.shouldReadRecursively());
// Record lineage for the configured schema's fields (schema may be null).
Schema schema = config.getSchema();
LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
lineageRecorder.createExternalDataset(schema);
if (schema != null && schema.getFields() != null) {
recordLineage(lineageRecorder, schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
}
// set entries here, before FileSystem is used
for (Map.Entry<String, String> entry : getFileSystemProperties(context).entrySet()) {
conf.set(entry.getKey(), entry.getValue());
}
Path path = new Path(config.getPath());
FileSystem pathFileSystem = FileSystem.get(path.toUri(), conf);
// globStatus returns null when nothing matches the (possibly glob) path.
FileStatus[] fileStatus = pathFileSystem.globStatus(path);
String inputFormatClass;
if (fileStatus == null) {
if (config.shouldAllowEmptyInput()) {
// No matching input and empty input allowed: emit zero records.
inputFormatClass = EmptyInputFormat.class.getName();
} else {
throw new IOException(String.format("Input path %s does not exist", path));
}
} else {
FileInputFormat.addInputPath(job, path);
FileInputFormat.setMaxInputSplitSize(job, config.getMaxSplitSize());
inputFormatClass = validatingInputFormat.getInputFormatClassName();
// Copy the format plugin's configuration into the job configuration.
Configuration hConf = job.getConfiguration();
Map<String, String> inputFormatConfiguration = validatingInputFormat.getInputFormatConfiguration();
for (Map.Entry<String, String> propertyEntry : inputFormatConfiguration.entrySet()) {
hConf.set(propertyEntry.getKey(), propertyEntry.getValue());
}
}
// set entries here again, in case anything set by PathTrackingInputFormat should be overridden
for (Map.Entry<String, String> entry : getFileSystemProperties(context).entrySet()) {
conf.set(entry.getKey(), entry.getValue());
}
context.setInput(Input.of(config.getReferenceName(), new SourceInputFormatProvider(inputFormatClass, conf)));
}
Aggregations