Search in sources :

Example 36 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project hive by apache.

the class PigHCatUtil method getResourceSchema.

/**
 * Builds a Pig {@link ResourceSchema} from an HCatalog schema by converting each
 * HCatalog field, in order, with {@code getResourceSchemaFromFieldSchema}.
 *
 * @param hcatSchema the HCatalog schema whose fields are converted
 * @return a new ResourceSchema holding one field per HCatalog field
 * @throws IOException if a field conversion or the schema assembly fails
 */
public static ResourceSchema getResourceSchema(HCatSchema hcatSchema) throws IOException {
    List<ResourceFieldSchema> converted = new ArrayList<ResourceFieldSchema>();
    for (HCatFieldSchema column : hcatSchema.getFields()) {
        converted.add(getResourceSchemaFromFieldSchema(column));
    }
    ResourceFieldSchema[] fields = converted.toArray(new ResourceFieldSchema[converted.size()]);
    return new ResourceSchema().setFields(fields);
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) ArrayList(java.util.ArrayList) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema)

Example 37 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project hive by apache.

the class PigHCatUtil method getBagSubSchema.

/**
 * Builds the Pig sub-schema of a bag that represents an HCatalog array field.
 *
 * <p>There are two main cases - {@code array<Type>} and {@code array<struct<...>>}.
 * In either case the element type of the array is represented by a tuple field
 * schema inside the bag's schema. A struct translates naturally to that tuple;
 * for a plain element type the tuple is simulated by wrapping the single field
 * in a tuple. A nested array element is converted with
 * {@code getResourceSchemaFromFieldSchema} and placed in the tuple as its only field.
 *
 * <p>The tuple name and the inner field name default to the HCatConstants
 * defaults and can be overridden through the client system properties; every
 * occurrence of the token {@code FIELDNAME} in an override is replaced with the
 * name of {@code hfs}.
 *
 * @param hfs the HCatalog array field whose element schema is converted
 * @return a ResourceSchema with exactly one TUPLE field describing the bag's elements
 * @throws IOException if building any of the nested schemas fails
 */
protected static ResourceSchema getBagSubSchema(HCatFieldSchema hfs) throws IOException {
    Properties props = UDFContext.getUDFContext().getClientSystemProps();
    String innerTupleName = resolveInnerName(props, HCatConstants.HCAT_PIG_INNER_TUPLE_NAME,
        HCatConstants.HCAT_PIG_INNER_TUPLE_NAME_DEFAULT, hfs.getName());
    String innerFieldName = resolveInnerName(props, HCatConstants.HCAT_PIG_INNER_FIELD_NAME,
        HCatConstants.HCAT_PIG_INNER_FIELD_NAME_DEFAULT, hfs.getName());

    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
    bagSubFieldSchemas[0] = new ResourceFieldSchema().setName(innerTupleName).setDescription("The tuple in the bag").setType(DataType.TUPLE);

    HCatFieldSchema arrayElementFieldSchema = hfs.getArrayElementSchema().get(0);
    if (arrayElementFieldSchema.getType() == Type.STRUCT) {
        bagSubFieldSchemas[0].setSchema(getTupleSubSchema(arrayElementFieldSchema));
    } else if (arrayElementFieldSchema.getType() == Type.ARRAY) {
        // Nested array: the converted element becomes the single field of the tuple.
        ResourceFieldSchema[] nestedFields = new ResourceFieldSchema[] { getResourceSchemaFromFieldSchema(arrayElementFieldSchema) };
        ResourceSchema s = new ResourceSchema();
        s.setFields(nestedFields);
        bagSubFieldSchemas[0].setSchema(s);
    } else {
        ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
        // The element type is not a tuple - so it carries no sub-schema.
        innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName(innerFieldName).setDescription("The inner field in the tuple in the bag").setType(getPigType(arrayElementFieldSchema)).setSchema(null);
        bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
    }
    return new ResourceSchema().setFields(bagSubFieldSchemas);
}

/**
 * Returns the configured name for {@code key} with the {@code FIELDNAME} token
 * substituted, or {@code defaultName} when no properties or no such key exist.
 *
 * <p>Uses the literal {@link String#replace(CharSequence, CharSequence)} so that
 * field names containing {@code $} or {@code \} are inserted verbatim instead of
 * being interpreted as a regex replacement template.
 */
private static String resolveInnerName(Properties props, String key, String defaultName, String fieldName) {
    if (props != null && props.containsKey(key)) {
        return props.getProperty(key).replace("FIELDNAME", fieldName);
    }
    return defaultName;
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) ArrayList(java.util.ArrayList) List(java.util.List) Properties(java.util.Properties) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema)

Example 38 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project hive by apache.

the class TestPigHCatUtil method testGetBagSubSchemaConfigured.

/**
 * Verifies that {@code PigHCatUtil.getBagSubSchema} honours the configured inner
 * tuple name and inner field name, including substitution of the
 * {@code FIELDNAME} token with the array field's name.
 */
@Test
public void testGetBagSubSchemaConfigured() throws Exception {
    try {
        // NOTE: pig-0.8 sets client system properties by actually getting the client
        // system properties. Starting in pig-0.9 you must pass the properties in.
        // When updating our pig dependency this will need to be updated.
        System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
        System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
        UDFContext.getUDFContext().setClientSystemProps(System.getProperties());
        // Define the expected schema.
        ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
        bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("t").setDescription("The tuple in the bag").setType(DataType.TUPLE);
        ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
        innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);
        bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
        ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);
        // Get the actual converted schema.
        HCatSchema actualHCatSchema = new HCatSchema(Lists.newArrayList(new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
        HCatFieldSchema actualHCatFieldSchema = new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
        ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);
        Assert.assertEquals(expected.toString(), actual.toString());
    } finally {
        // Always clean up the System properties set by this test, even when an
        // assertion fails, so they cannot leak into other tests in the same JVM.
        System.clearProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME);
        System.clearProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME);
    }
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) Test(org.junit.Test)

Example 39 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.

the class SchemaUtils method checkUnionSchema.

/**
 * Tries to match a Pig field schema against the member schemas of an Avro union.
 *
 * @param avroSchema the Avro schema to test; its members are only examined when its type is UNION
 * @param pigFieldSchema the Pig field schema to look for among the union members
 * @return {@code true} if some union member passes {@code checkEqualSchema};
 *         {@code false} if {@code avroSchema} is not a UNION
 * @throws IOException with a non-null cause if {@code avroSchema} is a UNION but no member
 *         matches the Pig field schema, or if checking a member raised an IOException that
 *         itself carries a cause (which is rethrown unchanged)
 */
private static boolean checkUnionSchema(Schema avroSchema, ResourceFieldSchema pigFieldSchema) throws IOException {
    boolean isUnion = avroSchema.getType().equals(Type.UNION);
    if (!isUnion) {
        return false;
    }
    LOG.trace("    checking against UNION");
    for (Schema member : avroSchema.getTypes()) {
        try {
            LOG.trace("    union component {}", member.getType().getName());
            checkEqualSchema(pigFieldSchema, member);
            return true;
        } catch (IOException mismatch) {
            // An IOException with a cause marks a failure from an inner union: propagate it.
            // One without a cause only means this member did not match, so try the next.
            if (mismatch.getCause() != null) {
                throw mismatch;
            }
        }
    }
    // The attached cause marks this as a nested-union failure for callers up the stack.
    String unionName = avroSchema.getName();
    String pigTypeName = DataType.genTypeToNameMap().get(pigFieldSchema.getType());
    String message = "Expected some field defined in '" + unionName + "' for pig schema type '" + pigTypeName + "'";
    throw new IOException(message, new Exception("Union not satisfied"));
}
Also used : Schema(org.apache.avro.Schema) ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) IOException(java.io.IOException)

Example 40 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.

the class SchemaUtils method avro2ResouceFieldSchema.

/**
 * Converts an Avro schema into the corresponding Pig {@link ResourceFieldSchema}.
 *
 * <p>Mapping applied by this method:
 * <ul>
 *   <li>NULL, BOOLEAN, FLOAT, DOUBLE, LONG map to the Pig type of the same name;</li>
 *   <li>INT and ENUM map to INTEGER, BYTES to BYTEARRAY, STRING to CHARARRAY;</li>
 *   <li>UNION maps to the conversion of its first non-null member;</li>
 *   <li>RECORD maps to a TUPLE with one field per record field;</li>
 *   <li>ARRAY maps to a BAG containing a tuple named "t" with the element as its only field;</li>
 *   <li>MAP maps to a MAP whose schema holds the converted value type.</li>
 * </ul>
 *
 * @param schema the Avro schema to convert
 * @return the equivalent Pig field schema (unnamed, except for fields nested in a record)
 * @throws IOException if building a nested Pig schema fails
 * @throws RuntimeException if the schema is a union containing only null, is of type FIXED
 *         (not implemented), or is of an unexpected type
 */
private static ResourceFieldSchema avro2ResouceFieldSchema(Schema schema) throws IOException {
    Type schemaType = schema.getType();
    switch(schemaType) {
        case NULL:
            return new ResourceFieldSchema().setType(DataType.NULL);
        case BOOLEAN:
            return new ResourceFieldSchema().setType(DataType.BOOLEAN);
        case ENUM:
            return new ResourceFieldSchema().setType(DataType.INTEGER);
        case BYTES:
            return new ResourceFieldSchema().setType(DataType.BYTEARRAY);
        case STRING:
            return new ResourceFieldSchema().setType(DataType.CHARARRAY);
        case FLOAT:
            return new ResourceFieldSchema().setType(DataType.FLOAT);
        case DOUBLE:
            return new ResourceFieldSchema().setType(DataType.DOUBLE);
        case INT:
            return new ResourceFieldSchema().setType(DataType.INTEGER);
        case LONG:
            return new ResourceFieldSchema().setType(DataType.LONG);
        case UNION:
            // Returns the first not-null type
            if (schema.getTypes().size() != 2) {
                // Pass the schema so the {} placeholder is actually filled in.
                LOG.warn("Field UNION {} must be ['null','othertype']. Maybe wrong definition?", schema);
            }
            for (Schema s : schema.getTypes()) {
                if (s.getType() != Type.NULL) {
                    return avro2ResouceFieldSchema(s);
                }
            }
            LOG.error("Union with only ['null']?");
            throw new RuntimeException("Union with only ['null']?");
        case RECORD:
            // A record in Gora is a Tuple in Pig
            if (recursiveRecordSchema.incSchema(schema.getName()) > 1) {
                // Recursion detected: this record name has already been entered.
                // Undo the increment so sibling branches can still expand the schema.
                recursiveRecordSchema.decSchema(schema.getName());
                // Return a tuple schema with no fields
                return new ResourceFieldSchema().setType(DataType.TUPLE);
            }
            // NOTE(review): the counter incremented above is not decremented on this
            // path within this method - presumably it is reset elsewhere; confirm.
            ResourceFieldSchema returnRecordResourceFieldSchema = new ResourceFieldSchema().setType(DataType.TUPLE);
            ResourceFieldSchema[] recordFieldSchemas = new ResourceFieldSchema[schema.getFields().size()];
            int fieldIndex = 0;
            for (Field schemaField : schema.getFields()) {
                recordFieldSchemas[fieldIndex++] = avro2ResouceFieldSchema(schemaField.schema()).setName(schemaField.name());
            }
            returnRecordResourceFieldSchema.setSchema(new ResourceSchema().setFields(recordFieldSchemas));
            return returnRecordResourceFieldSchema;
        case ARRAY:
            // An array in Gora is a Bag in Pig
            // Maybe should be a Map with string(numeric) index to ensure order, but Avro and Pig data model are different :\
            ResourceFieldSchema returnArrayResourceFieldSchema = new ResourceFieldSchema().setType(DataType.BAG);
            Schema arrayElementType = schema.getElementType();
            returnArrayResourceFieldSchema.setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { new ResourceFieldSchema().setType(DataType.TUPLE).setName("t").setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { avro2ResouceFieldSchema(arrayElementType) })) }));
            return returnArrayResourceFieldSchema;
        case MAP:
            // A map in Gora is a Map in Pig, but in pig is only chararray=>something
            ResourceFieldSchema returnMapResourceFieldSchema = new ResourceFieldSchema().setType(DataType.MAP);
            Schema mapValueType = schema.getValueType();
            returnMapResourceFieldSchema.setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { avro2ResouceFieldSchema(mapValueType) }));
            return returnMapResourceFieldSchema;
        case FIXED:
            // TODO Implement FIXED data type
            throw new RuntimeException("Fixed type not implemented");
        default:
            throw new RuntimeException("Unexpected schema type " + schemaType);
    }
}
Also used : Field(org.apache.avro.Schema.Field) DataType(org.apache.pig.data.DataType) Type(org.apache.avro.Schema.Type) ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) Schema(org.apache.avro.Schema)

Aggregations

ResourceFieldSchema (org.apache.pig.ResourceSchema.ResourceFieldSchema)42 ResourceSchema (org.apache.pig.ResourceSchema)22 IOException (java.io.IOException)16 ArrayList (java.util.ArrayList)7 Map (java.util.Map)7 Tuple (org.apache.pig.data.Tuple)7 HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema)5 BasicDBObjectBuilder (com.mongodb.BasicDBObjectBuilder)4 SQLException (java.sql.SQLException)4 HashMap (java.util.HashMap)4 Field (org.apache.avro.Schema.Field)4 Schema (org.apache.pig.impl.logicalLayer.schema.Schema)4 BasicBSONObject (org.bson.BasicBSONObject)4 Test (org.junit.Test)4 List (java.util.List)3 DataBag (org.apache.pig.data.DataBag)3 FieldSchema (org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema)3 BSONObject (org.bson.BSONObject)3 DateTime (org.joda.time.DateTime)3 LinkedHashMap (java.util.LinkedHashMap)2