Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.
In the class SparkUtils, the method validateConfigParameters.
/**
 * Validates the config parameters for the Spark sink and Spark compute classes.
 *
 * @param inputSchema schema of the received record.
 * @param featuresToInclude features to be used for training/prediction.
 * @param featuresToExclude features to be excluded when training/predicting.
 * @param predictionField field containing the prediction values.
 * @param cardinalityMapping mapping of categorical feature fields to their cardinality.
 */
public static void validateConfigParameters(Schema inputSchema, @Nullable String featuresToInclude,
                                            @Nullable String featuresToExclude, String predictionField,
                                            @Nullable String cardinalityMapping) {
  if (!Strings.isNullOrEmpty(featuresToExclude) && !Strings.isNullOrEmpty(featuresToInclude)) {
    throw new IllegalArgumentException("Cannot specify values for both featuresToInclude and featuresToExclude. " +
                                         "Please specify fields for only one.");
  }
  Map<String, Integer> fields = getFeatureList(inputSchema, featuresToInclude, featuresToExclude, predictionField);
  for (String field : fields.keySet()) {
    Schema.Field inputField = inputSchema.getField(field);
    Schema schema = inputField.getSchema();
    Schema.Type features = schema.isNullableSimple() ? schema.getNonNullable().getType() : schema.getType();
    if (!(features.equals(Schema.Type.INT) || features.equals(Schema.Type.LONG) ||
          features.equals(Schema.Type.FLOAT) || features.equals(Schema.Type.DOUBLE))) {
      throw new IllegalArgumentException(String.format("Features must be of type int, long, float or double " +
                                                         "but was of type %s for field %s.", features, field));
    }
  }
  getCategoricalFeatureInfo(inputSchema, featuresToInclude, featuresToExclude, predictionField, cardinalityMapping);
}
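For context, a minimal sketch of how a plugin might call this validator from its configure stage; the schema, field names, and comma-separated feature list below are hypothetical, not taken from the project.

// Hypothetical input schema: two numeric feature fields and a prediction field.
Schema inputSchema = Schema.recordOf("features",
                                     Schema.Field.of("age", Schema.of(Schema.Type.INT)),
                                     Schema.Field.of("income", Schema.of(Schema.Type.DOUBLE)),
                                     Schema.Field.of("label", Schema.of(Schema.Type.DOUBLE)));
// Include only the numeric feature fields; "label" holds the prediction values.
SparkUtils.validateConfigParameters(inputSchema, "age,income", null, "label", null);
// Declaring a STRING field as a feature, or setting both the include and
// exclude lists, would throw IllegalArgumentException.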
Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.
In the class RowRecordTransformer, the method setField.
private void setField(StructuredRecord.Builder builder, Schema.Field field, byte[] fieldBytes) {
  String fieldName = field.getName();
  Schema fieldSchema = field.getSchema();
  if (fieldBytes == null) {
    if (!fieldSchema.isNullable()) {
      throw new IllegalArgumentException("null value found for non-nullable field " + fieldName);
    }
    return;
  }
  Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
  switch (fieldType) {
    case BOOLEAN:
      builder.set(fieldName, Bytes.toBoolean(fieldBytes));
      break;
    case INT:
      builder.set(fieldName, Bytes.toInt(fieldBytes));
      break;
    case LONG:
      builder.set(fieldName, Bytes.toLong(fieldBytes));
      break;
    case FLOAT:
      builder.set(fieldName, Bytes.toFloat(fieldBytes));
      break;
    case DOUBLE:
      builder.set(fieldName, Bytes.toDouble(fieldBytes));
      break;
    case BYTES:
      builder.set(fieldName, fieldBytes);
      break;
    case STRING:
      builder.set(fieldName, Bytes.toString(fieldBytes));
      break;
    default:
      // shouldn't ever happen
      throw new IllegalArgumentException("Unsupported type " + fieldType + " for field " + fieldName);
  }
}
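As a sketch of the byte-level contract the switch above relies on, each supported type round-trips through CDAP's Bytes helpers; the values here are hypothetical.

byte[] encodedInt = Bytes.toBytes(42);           // bytes for an INT field
byte[] encodedDouble = Bytes.toBytes(3.14d);     // bytes for a DOUBLE field
byte[] encodedString = Bytes.toBytes("hello");   // bytes for a STRING field
Assert.assertEquals(42, Bytes.toInt(encodedInt));
Assert.assertEquals(3.14d, Bytes.toDouble(encodedDouble), 0.0d);
Assert.assertEquals("hello", Bytes.toString(encodedString));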
Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.
In the class SchemaValidator, the method validateOutputSchemaIsSubsetOfInputSchema.
/**
 * Checks that all the fields in the output schema are part of the input schema and that the field schema types match.
 *
 * @param inputSchema input schema
 * @param outputSchema output schema
 * @param collector collects validation failures
 */
public static void validateOutputSchemaIsSubsetOfInputSchema(Schema inputSchema, Schema outputSchema,
                                                             FailureCollector collector) {
  // check if the input schema contains all the fields expected in the output schema
  for (Schema.Field field : outputSchema.getFields()) {
    String fieldName = field.getName();
    if (inputSchema.getField(fieldName) == null) {
      collector.addFailure(String.format("Field '%s' is present in output schema but not present in input schema.",
                                         fieldName), null).withOutputSchemaField(fieldName);
      continue;
    }
    Schema inFieldSchema = inputSchema.getField(fieldName).getSchema();
    inFieldSchema = inFieldSchema.isNullable() ? inFieldSchema.getNonNullable() : inFieldSchema;
    Schema fieldSchema = field.getSchema().isNullable() ? field.getSchema().getNonNullable() : field.getSchema();
    if (!inFieldSchema.equals(fieldSchema)) {
      collector.addFailure(String.format("Field '%s' has type mismatch with input schema type '%s'.", fieldName,
                                         inFieldSchema.getDisplayName()),
                           "Change type to match input schema type.")
        .withOutputSchemaField(fieldName).withInputSchemaField(fieldName);
    }
  }
}
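A minimal sketch of the subset rule this enforces, using hypothetical schemas: the output may drop input fields, but any field it keeps must have the same non-nullable type.

Schema in = Schema.recordOf("in",
                            Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
                            Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
Schema validOut = Schema.recordOf("out",
                                  Schema.Field.of("id", Schema.of(Schema.Type.LONG)));   // subset of input: valid
Schema invalidOut = Schema.recordOf("out",
                                    Schema.Field.of("id", Schema.of(Schema.Type.STRING))); // retyped field
// validateOutputSchemaIsSubsetOfInputSchema(in, validOut, collector) collects nothing;
// with invalidOut it adds a type-mismatch failure for 'id' to the collector.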
Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.
In the class SparkUtilsTest, the method testGetArrayInputField.
@Test
public void testGetArrayInputField() throws Exception {
  Schema input = Schema.recordOf("input",
                                 Schema.Field.of("offset", Schema.of(Schema.Type.INT)),
                                 Schema.Field.of("body", Schema.arrayOf(Schema.of(Schema.Type.STRING))));
  StructuredRecord record = StructuredRecord.builder(input)
    .set("offset", 1)
    .set("body", new String[] { "Hi", "heard", "about", "Spark" })
    .build();
  Splitter splitter = Splitter.on(Pattern.compile(" "));
  List<String> expected = new ArrayList<>();
  expected.add("Hi");
  expected.add("heard");
  expected.add("about");
  expected.add("Spark");
  Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
  record = StructuredRecord.builder(input)
    .set("offset", 2)
    .set("body", new String[] { "Classes", "in", "Java" })
    .build();
  expected = new ArrayList<>();
  expected.add("Classes");
  expected.add("in");
  expected.add("Java");
  Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
}
Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.
In the class SparkUtilsTest, the method testGetStringInputField.
@Test
public void testGetStringInputField() throws Exception {
  Schema input = Schema.recordOf("input",
                                 Schema.Field.of("offset", Schema.of(Schema.Type.INT)),
                                 Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
  StructuredRecord record = StructuredRecord.builder(input)
    .set("offset", 1)
    .set("body", "Hi heard about Spark")
    .build();
  Splitter splitter = Splitter.on(Pattern.compile(" "));
  List<String> expected = new ArrayList<>();
  expected.add("Hi");
  expected.add("heard");
  expected.add("about");
  expected.add("Spark");
  Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
  record = StructuredRecord.builder(input)
    .set("offset", 2)
    .set("body", "Classes in Java")
    .build();
  expected = new ArrayList<>();
  expected.add("Classes");
  expected.add("in");
  expected.add("Java");
  Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
}
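The same helper should work with any delimiter the Splitter is built on; a minimal sketch with a hypothetical comma-delimited record, not a test from the project:

// Hypothetical record whose body field holds comma-separated tokens.
Schema csvInput = Schema.recordOf("input",
                                  Schema.Field.of("offset", Schema.of(Schema.Type.INT)),
                                  Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
StructuredRecord csvRecord = StructuredRecord.builder(csvInput)
  .set("offset", 1)
  .set("body", "Classes,in,Java")
  .build();
Splitter commaSplitter = Splitter.on(Pattern.compile(","));
// Expected tokens: ["Classes", "in", "Java"]
List<String> tokens = SparkUtils.getInputFieldValue(csvRecord, "body", commaSplitter);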