
Example 26 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class SparkUtils, method validateConfigParameters:

/**
 * Validates the config parameters for the Spark sink and Spark compute classes.
 *
 * @param inputSchema schema of the received record.
 * @param featuresToInclude features to be used for training/prediction.
 * @param featuresToExclude features to be excluded when training/predicting.
 * @param predictionField field containing the prediction values.
 * @param cardinalityMapping mapping of categorical features to their cardinality.
 */
public static void validateConfigParameters(Schema inputSchema, @Nullable String featuresToInclude,
                                            @Nullable String featuresToExclude, String predictionField,
                                            @Nullable String cardinalityMapping) {
    if (!Strings.isNullOrEmpty(featuresToExclude) && !Strings.isNullOrEmpty(featuresToInclude)) {
        throw new IllegalArgumentException(
            "Cannot specify values for both featuresToInclude and featuresToExclude. Please specify fields for one.");
    }
    Map<String, Integer> fields = getFeatureList(inputSchema, featuresToInclude, featuresToExclude, predictionField);
    // Every feature used for training/prediction must be a numeric type.
    for (String field : fields.keySet()) {
        Schema.Field inputField = inputSchema.getField(field);
        Schema schema = inputField.getSchema();
        Schema.Type features = schema.isNullableSimple() ? schema.getNonNullable().getType() : schema.getType();
        if (!(features.equals(Schema.Type.INT) || features.equals(Schema.Type.LONG)
            || features.equals(Schema.Type.FLOAT) || features.equals(Schema.Type.DOUBLE))) {
            throw new IllegalArgumentException(String.format(
                "Features must be of type : int, double, float, long but was of type %s for field %s.", features, field));
        }
    }
    getCategoricalFeatureInfo(inputSchema, featuresToInclude, featuresToExclude, predictionField, cardinalityMapping);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema)
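
A minimal usage sketch (not from the project: the schema and field names below are illustrative, and it assumes the feature lists are the comma-separated field-name strings that getFeatureList parses):

Schema inputSchema = Schema.recordOf("features",
    Schema.Field.of("age", Schema.of(Schema.Type.INT)),
    Schema.Field.of("income", Schema.of(Schema.Type.DOUBLE)),
    Schema.Field.of("label", Schema.of(Schema.Type.DOUBLE)));
// Validates "age" and "income" as numeric features, with "label" as the prediction field.
// Throws IllegalArgumentException if a feature field is non-numeric.
SparkUtils.validateConfigParameters(inputSchema, "age,income", null, "label", null);

Passing non-null values for both featuresToInclude and featuresToExclude would fail immediately, since the method treats the two lists as mutually exclusive.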

Example 27 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class RowRecordTransformer, method setField:

private void setField(StructuredRecord.Builder builder, Schema.Field field, byte[] fieldBytes) {
    String fieldName = field.getName();
    Schema fieldSchema = field.getSchema();
    if (fieldBytes == null) {
        if (!fieldSchema.isNullable()) {
            throw new IllegalArgumentException("null value found for non-nullable field " + fieldName);
        }
        return;
    }
    Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
    switch (fieldType) {
        case BOOLEAN:
            builder.set(fieldName, Bytes.toBoolean(fieldBytes));
            break;
        case INT:
            builder.set(fieldName, Bytes.toInt(fieldBytes));
            break;
        case LONG:
            builder.set(fieldName, Bytes.toLong(fieldBytes));
            break;
        case FLOAT:
            builder.set(fieldName, Bytes.toFloat(fieldBytes));
            break;
        case DOUBLE:
            builder.set(fieldName, Bytes.toDouble(fieldBytes));
            break;
        case BYTES:
            builder.set(fieldName, fieldBytes);
            break;
        case STRING:
            builder.set(fieldName, Bytes.toString(fieldBytes));
            break;
        default:
            // shouldn't ever happen
            throw new IllegalArgumentException("Unsupported type " + fieldType + " for field " + fieldName);
    }
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema)
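
setField is private, but the per-type decoding it performs can be reproduced with the same Bytes helpers (io.cdap.cdap.api.common.Bytes). A minimal sketch of the equivalent conversion, with an illustrative schema and values:

Schema rowSchema = Schema.recordOf("row",
    Schema.Field.of("count", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
byte[] countBytes = Bytes.toBytes(42L);
byte[] nameBytes = Bytes.toBytes("alice");
// Decode each column's raw bytes according to its schema type, as setField does case by case.
StructuredRecord record = StructuredRecord.builder(rowSchema)
    .set("count", Bytes.toLong(countBytes))
    .set("name", Bytes.toString(nameBytes))
    .build();

A null byte array would be skipped for the nullable name field but rejected for the non-nullable count field, mirroring the guard at the top of setField.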

Example 28 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class SchemaValidator, method validateOutputSchemaIsSubsetOfInputSchema:

/**
 * Checks that every field in the output schema is present in the input schema and that the field types match.
 *
 * @param inputSchema input schema
 * @param outputSchema output schema
 * @param collector collects validation failures
 */
public static void validateOutputSchemaIsSubsetOfInputSchema(Schema inputSchema, Schema outputSchema,
                                                             FailureCollector collector) {
    // check if input schema contains all the fields expected in the output schema
    for (Schema.Field field : outputSchema.getFields()) {
        String fieldName = field.getName();
        if (inputSchema.getField(fieldName) == null) {
            collector.addFailure(String.format("Field '%s' is present in output schema but not present in input schema.", fieldName), null).withOutputSchemaField(fieldName);
            continue;
        }
        Schema inFieldSchema = inputSchema.getField(fieldName).getSchema();
        inFieldSchema = inFieldSchema.isNullable() ? inFieldSchema.getNonNullable() : inFieldSchema;
        Schema fieldSchema = field.getSchema().isNullable() ? field.getSchema().getNonNullable() : field.getSchema();
        if (!inFieldSchema.equals(fieldSchema)) {
            collector.addFailure(String.format("Field '%s' has type mismatch with input schema type '%s'.", fieldName, inFieldSchema.getDisplayName()), "Change type to match input schema type.").withOutputSchemaField(fieldName).withInputSchemaField(fieldName);
        }
    }
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema)
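
A minimal usage sketch (assuming MockFailureCollector from the cdap-etl mock test library; the schemas are illustrative):

Schema inputSchema = Schema.recordOf("in",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
// "id" is declared as a string here, which mismatches the input's long type.
Schema outputSchema = Schema.recordOf("out",
    Schema.Field.of("id", Schema.of(Schema.Type.STRING)));
FailureCollector collector = new MockFailureCollector();
SchemaValidator.validateOutputSchemaIsSubsetOfInputSchema(inputSchema, outputSchema, collector);
// The collector now holds one failure for the 'id' type mismatch instead of an exception being thrown.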

Example 29 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class SparkUtilsTest, method testGetArrayInputField:

@Test
public void testGetArrayInputField() throws Exception {
    Schema input = Schema.recordOf("input",
        Schema.Field.of("offset", Schema.of(Schema.Type.INT)),
        Schema.Field.of("body", Schema.arrayOf(Schema.of(Schema.Type.STRING))));
    StructuredRecord record = StructuredRecord.builder(input)
        .set("offset", 1)
        .set("body", new String[] { "Hi", "heard", "about", "Spark" })
        .build();
    Splitter splitter = Splitter.on(Pattern.compile(" "));
    List<String> expected = new ArrayList<>();
    expected.add("Hi");
    expected.add("heard");
    expected.add("about");
    expected.add("Spark");
    Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
    record = StructuredRecord.builder(input)
        .set("offset", 2)
        .set("body", new String[] { "Classes", "in", "Java" })
        .build();
    expected = new ArrayList<>();
    expected.add("Classes");
    expected.add("in");
    expected.add("Java");
    Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
}
Also used : Splitter(com.google.common.base.Splitter) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Test(org.junit.Test)

Example 30 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project hydrator-plugins by cdapio.

The class SparkUtilsTest, method testGetStringInputField:

@Test
public void testGetStringInputField() throws Exception {
    Schema input = Schema.recordOf("input",
        Schema.Field.of("offset", Schema.of(Schema.Type.INT)),
        Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
    StructuredRecord record = StructuredRecord.builder(input)
        .set("offset", 1)
        .set("body", "Hi heard about Spark")
        .build();
    Splitter splitter = Splitter.on(Pattern.compile(" "));
    List<String> expected = new ArrayList<>();
    expected.add("Hi");
    expected.add("heard");
    expected.add("about");
    expected.add("Spark");
    Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
    record = StructuredRecord.builder(input)
        .set("offset", 2)
        .set("body", "Classes in Java")
        .build();
    expected = new ArrayList<>();
    expected.add("Classes");
    expected.add("in");
    expected.add("Java");
    Assert.assertEquals(expected, SparkUtils.getInputFieldValue(record, "body", splitter));
}
Also used : Splitter(com.google.common.base.Splitter) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Test(org.junit.Test)

Aggregations

Schema (io.cdap.cdap.api.data.schema.Schema): 1135
Test (org.junit.Test): 664
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 432
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 177
Table (io.cdap.cdap.api.dataset.table.Table): 169
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 148
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 141
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 133
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 130
ArrayList (java.util.ArrayList): 114
HashSet (java.util.HashSet): 113
HashMap (java.util.HashMap): 101
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 96
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 81
IOException (java.io.IOException): 69
FailureCollector (io.cdap.cdap.etl.api.FailureCollector): 67
MockPipelineConfigurer (io.cdap.cdap.etl.mock.common.MockPipelineConfigurer): 56
Map (java.util.Map): 56
ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 47
ReflectionSchemaGenerator (io.cdap.cdap.internal.io.ReflectionSchemaGenerator): 46