Search in sources :

Example 21 with ResourceSchema

use of org.apache.pig.ResourceSchema in project hive by apache.

the class PigHCatUtil method getBagSubSchema.

/**
 * Builds the Pig schema describing the contents of a bag that represents an HCatalog array field.
 *
 * <p>The returned schema always holds exactly one TUPLE field. What goes inside that tuple depends
 * on the array's element type:
 * <ul>
 *   <li>STRUCT elements: the tuple schema produced by {@code getTupleSubSchema};</li>
 *   <li>ARRAY elements: a single field produced by {@code getResourceSchemaFromFieldSchema};</li>
 *   <li>any other element type: a single field of the matching Pig type, i.e. the tuple is
 *       simulated by wrapping the lone value.</li>
 * </ul>
 *
 * <p>The tuple name and the wrapped-field name default to the {@code HCatConstants} defaults and
 * can be overridden through the client system properties; the token {@code FIELDNAME} inside an
 * override is replaced with the name of {@code hfs}.
 *
 * @param hfs the HCatalog array field whose first element schema entry is converted
 * @return a schema containing the single tuple field described above
 * @throws IOException propagated from the conversion helpers called here
 */
protected static ResourceSchema getBagSubSchema(HCatFieldSchema hfs) throws IOException {
    Properties props = UDFContext.getUDFContext().getClientSystemProps();
    String innerTupleName = resolveInnerName(props, HCatConstants.HCAT_PIG_INNER_TUPLE_NAME,
            HCatConstants.HCAT_PIG_INNER_TUPLE_NAME_DEFAULT, hfs.getName());
    String innerFieldName = resolveInnerName(props, HCatConstants.HCAT_PIG_INNER_FIELD_NAME,
            HCatConstants.HCAT_PIG_INNER_FIELD_NAME_DEFAULT, hfs.getName());

    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
    bagSubFieldSchemas[0] = new ResourceFieldSchema()
            .setName(innerTupleName)
            .setDescription("The tuple in the bag")
            .setType(DataType.TUPLE);

    HCatFieldSchema arrayElementFieldSchema = hfs.getArrayElementSchema().get(0);
    if (arrayElementFieldSchema.getType() == Type.STRUCT) {
        // array<struct<...>>: the struct maps naturally onto the tuple.
        bagSubFieldSchemas[0].setSchema(getTupleSubSchema(arrayElementFieldSchema));
    } else if (arrayElementFieldSchema.getType() == Type.ARRAY) {
        // array<array<...>>: the nested array becomes the only field of the tuple.
        ResourceSchema s = new ResourceSchema();
        List<ResourceFieldSchema> lrfs = Arrays.asList(getResourceSchemaFromFieldSchema(arrayElementFieldSchema));
        s.setFields(lrfs.toArray(new ResourceFieldSchema[lrfs.size()]));
        bagSubFieldSchemas[0].setSchema(s);
    } else {
        // array<Type>: simulate the tuple by wrapping the single value in a one-field tuple.
        ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
        innerTupleFieldSchemas[0] = new ResourceFieldSchema()
                .setName(innerFieldName)
                .setDescription("The inner field in the tuple in the bag")
                .setType(getPigType(arrayElementFieldSchema))
                // the element type is not a tuple - so no subschema
                .setSchema(null);
        bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
    }
    return new ResourceSchema().setFields(bagSubFieldSchemas);
}

/**
 * Returns the configured name for {@code key} with every {@code FIELDNAME} token replaced by
 * {@code fieldName}, or {@code defaultName} when no properties or no such key are available.
 */
private static String resolveInnerName(Properties props, String key, String defaultName, String fieldName) {
    if (props == null || !props.containsKey(key)) {
        return defaultName;
    }
    // Literal replace on purpose: replaceAll would interpret '$' and '\' in the field name as
    // group references / escapes and either throw or corrupt the resulting name.
    return props.getProperty(key).replace("FIELDNAME", fieldName);
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) ArrayList(java.util.ArrayList) List(java.util.List) Properties(java.util.Properties) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema)

Example 22 with ResourceSchema

use of org.apache.pig.ResourceSchema in project hive by apache.

the class TestPigHCatUtil method testGetBagSubSchemaConfigured.

/**
 * Verifies that {@code PigHCatUtil.getBagSubSchema} honours the inner tuple / inner field name
 * overrides supplied through the client system properties, including substitution of the
 * {@code FIELDNAME} token with the array field's name ("llama").
 */
@Test
public void testGetBagSubSchemaConfigured() throws Exception {
    // NOTE: pig-0.8 sets client system properties by actually getting the client
    // system properties. Starting in pig-0.9 you must pass the properties in.
    // When updating our pig dependency this will need to be updated.
    System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
    System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
    try {
        UDFContext.getUDFContext().setClientSystemProps(System.getProperties());

        // Define the expected schema.
        ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
        bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("t").setDescription("The tuple in the bag").setType(DataType.TUPLE);
        ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
        innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);
        bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
        ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

        // Get the actual converted schema.
        HCatSchema actualHCatSchema = new HCatSchema(Lists.newArrayList(new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
        HCatFieldSchema actualHCatFieldSchema = new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
        ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);

        Assert.assertEquals(expected.toString(), actual.toString());
    } finally {
        // Clean up the system properties set by this test even when the assertion fails,
        // so the overrides cannot leak into tests that run afterwards.
        System.clearProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME);
        System.clearProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME);
    }
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) Test(org.junit.Test)

Example 23 with ResourceSchema

use of org.apache.pig.ResourceSchema in project gora by apache.

the class SchemaUtils method avro2ResouceFieldSchema.

/**
 * Converts an Avro schema into the equivalent Pig {@code ResourceFieldSchema}.
 *
 * <p>Mapping implemented below: primitives map to the matching Pig type (ENUM and INT both become
 * INTEGER, BYTES becomes BYTEARRAY, STRING becomes CHARARRAY); a UNION resolves to its first
 * non-null branch; a RECORD becomes a TUPLE with one field per record field; an ARRAY becomes a
 * BAG holding a tuple named "t" that wraps the element type; a MAP becomes a MAP whose schema
 * holds the value type.
 *
 * @param schema the Avro schema to convert
 * @return the Pig field schema (without a name; callers set it where needed)
 * @throws IOException declared by the signature; not thrown directly by the code in this method
 * @throws RuntimeException for a union with only null branches, for FIXED (not implemented) and
 *         for any unexpected schema type
 */
private static ResourceFieldSchema avro2ResouceFieldSchema(Schema schema) throws IOException {
    Type schemaType = schema.getType();
    switch(schemaType) {
        case NULL:
            return new ResourceFieldSchema().setType(DataType.NULL);
        case BOOLEAN:
            return new ResourceFieldSchema().setType(DataType.BOOLEAN);
        case ENUM:
            return new ResourceFieldSchema().setType(DataType.INTEGER);
        case BYTES:
            return new ResourceFieldSchema().setType(DataType.BYTEARRAY);
        case STRING:
            return new ResourceFieldSchema().setType(DataType.CHARARRAY);
        case FLOAT:
            return new ResourceFieldSchema().setType(DataType.FLOAT);
        case DOUBLE:
            return new ResourceFieldSchema().setType(DataType.DOUBLE);
        case INT:
            return new ResourceFieldSchema().setType(DataType.INTEGER);
        case LONG:
            return new ResourceFieldSchema().setType(DataType.LONG);
        case UNION:
            // Returns the first not-null type
            if (schema.getTypes().size() != 2) {
                // Pass the schema so the placeholder is actually filled with the offending union.
                LOG.warn("Field UNION {} must be ['null','othertype']. Maybe wrong definition?", schema);
            }
            for (Schema s : schema.getTypes()) {
                if (s.getType() != Type.NULL) {
                    return avro2ResouceFieldSchema(s);
                }
            }
            LOG.error("Union with only ['null']?");
            throw new RuntimeException("Union with only ['null']?");
        case RECORD:
            // A record in Gora is a Tuple in Pig
            if (recursiveRecordSchema.incSchema(schema.getName()) > 1) {
                // Recursion detected: this record name has already been entered.
                // Undo the increment made by the check above and stop descending.
                // NOTE(review): the original comment says this lets the schema be emitted for
                // both leaves - confirm the intended semantics of recursiveRecordSchema.
                recursiveRecordSchema.decSchema(schema.getName());
                // Return a tuple schema with no fields
                return new ResourceFieldSchema().setType(DataType.TUPLE);
            }
            // NOTE(review): the counter is not decremented on this (non-recursive) path inside
            // this method; verify that the caller resets recursiveRecordSchema between runs.
            ResourceFieldSchema returnRecordResourceFieldSchema = new ResourceFieldSchema().setType(DataType.TUPLE);
            ResourceFieldSchema[] recordFieldSchemas = new ResourceFieldSchema[schema.getFields().size()];
            int fieldIndex = 0;
            for (Field schemaField : schema.getFields()) {
                recordFieldSchemas[fieldIndex++] = avro2ResouceFieldSchema(schemaField.schema()).setName(schemaField.name());
            }
            returnRecordResourceFieldSchema.setSchema(new ResourceSchema().setFields(recordFieldSchemas));
            return returnRecordResourceFieldSchema;
        case ARRAY:
            // An array in Gora is a Bag in Pig
            // Maybe should be a Map with string(numeric) index to ensure order, but Avro and Pig data model are different :\
            ResourceFieldSchema returnArrayResourceFieldSchema = new ResourceFieldSchema().setType(DataType.BAG);
            Schema arrayElementType = schema.getElementType();
            // The bag holds one tuple named "t" whose only field is the converted element type.
            ResourceFieldSchema elementTuple = new ResourceFieldSchema()
                    .setType(DataType.TUPLE)
                    .setName("t")
                    .setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { avro2ResouceFieldSchema(arrayElementType) }));
            returnArrayResourceFieldSchema.setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { elementTuple }));
            return returnArrayResourceFieldSchema;
        case MAP:
            // A map in Gora is a Map in Pig, but in pig is only chararray=>something
            ResourceFieldSchema returnMapResourceFieldSchema = new ResourceFieldSchema().setType(DataType.MAP);
            Schema mapValueType = schema.getValueType();
            returnMapResourceFieldSchema.setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { avro2ResouceFieldSchema(mapValueType) }));
            return returnMapResourceFieldSchema;
        case FIXED:
            // TODO Implement FIXED data type
            throw new RuntimeException("Fixed type not implemented");
        default:
            throw new RuntimeException("Unexpected schema type " + schemaType);
    }
}
Also used : Field(org.apache.avro.Schema.Field) DataType(org.apache.pig.data.DataType) Type(org.apache.avro.Schema.Type) ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) Schema(org.apache.avro.Schema) ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema)

Example 24 with ResourceSchema

use of org.apache.pig.ResourceSchema in project mongo-hadoop by mongodb.

the class JSONPigReplace method substitute.

/**
 * Returns the result of substituting the Pig objects in tuple {@code t} into the stored
 * template documents ({@code initBSONs}).
 *
 * @param t  Pig tuple containing the Pig objects
 * @param s  schema describing {@code t}; accepted as a schema {@code String}, a Pig
 *           {@code Schema} or a {@code ResourceSchema}
 * @param un string used to represent un-named schema fields
 * @return array of BasicBSONObjects that contain all replacements for "marked" strings
 * @throws IllegalArgumentException if {@code s} is of an unsupported type (including
 *         {@code null}), or if the schema cannot be parsed/converted ("Invalid Schema Format",
 *         with the underlying failure as cause)
 * @throws Exception propagated from reading the tuple or writing its fields
 */
public BasicBSONObject[] substitute(final Tuple t, final Object s, final String un) throws Exception {
    unnamedStr = un;
    // Reject unsupported schema representations up front so the descriptive message reaches the
    // caller directly instead of being re-wrapped as a generic "Invalid Schema Format" below.
    if (!(s instanceof String || s instanceof Schema || s instanceof ResourceSchema)) {
        throw new IllegalArgumentException("Schema must be represented either by a string or a Schema " + "object, not " + s);
    }
    final ResourceFieldSchema[] fields;
    try {
        final ResourceSchema schema;
        if (s instanceof String) {
            schema = new ResourceSchema(Utils.getSchemaFromString((String) s));
        } else if (s instanceof Schema) {
            schema = new ResourceSchema((Schema) s);
        } else {
            schema = (ResourceSchema) s;
        }
        fields = schema.getFields();
    } catch (Exception e) {
        throw new IllegalArgumentException("Invalid Schema Format", e);
    }
    // Make Tuple t into BSONObject using schema provided and store result in pObj
    final BasicDBObjectBuilder builder = BasicDBObjectBuilder.start();
    for (int i = 0; i < fields.length; i++) {
        writeField(builder, fields[i], t.get(i));
    }
    // BSONObject that represents Pig Tuple input using Pig Schema
    BasicBSONObject pObj = (BasicBSONObject) builder.get();
    // fill map of replacement strings to corresponding objects to replace these strings with
    fillReplacementMap(pObj);
    // Now, replace replacement strings (of form $elem) with corresponding objects in pObj
    return replaceAll(initBSONs, reps);
}
Also used : BasicBSONObject(org.bson.BasicBSONObject) ResourceSchema(org.apache.pig.ResourceSchema) BasicDBObjectBuilder(com.mongodb.BasicDBObjectBuilder) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) Schema(org.apache.pig.impl.logicalLayer.schema.Schema) IOException(java.io.IOException)

Example 25 with ResourceSchema

use of org.apache.pig.ResourceSchema in project mongo-hadoop by mongodb.

the class MongoInsertStorage method prepareToWrite.

/**
 * Stores the record writer and restores the Pig schema saved in the UDF context.
 *
 * <p>The schema string is read from the UDF properties under {@code SCHEMA_SIGNATURE}. When it is
 * missing, a warning is logged and the {@code schema} field is left as it was. When it cannot be
 * parsed, {@code schema} is set to {@code null} and the failure is logged.
 *
 * @param writer the record writer to use for output
 * @throws IOException if {@code writer} is {@code null}
 */
public void prepareToWrite(final RecordWriter writer) throws IOException {
    out = writer;
    if (out == null) {
        throw new IOException("Invalid Record Writer");
    }
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(getClass(), new String[] { udfcSignature });
    String strSchema = p.getProperty(SCHEMA_SIGNATURE);
    if (strSchema == null) {
        LOG.warn("Could not find schema in UDF context. Interpreting each tuple as containing a single map.");
    } else {
        try {
            // Parse the schema from the string stored in the properties object.
            schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
        } catch (Exception e) {
            // Best effort: continue without a schema, but keep the exception in the log so the
            // cause and stack trace of the parse failure are not lost.
            schema = null;
            LOG.warn(e.getMessage(), e);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("GOT A SCHEMA " + schema + " " + strSchema);
        }
    }
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) UDFContext(org.apache.pig.impl.util.UDFContext) IOException(java.io.IOException) Properties(java.util.Properties) IOException(java.io.IOException)

Aggregations

ResourceSchema (org.apache.pig.ResourceSchema)35 ResourceFieldSchema (org.apache.pig.ResourceSchema.ResourceFieldSchema)20 Schema (org.apache.pig.impl.logicalLayer.schema.Schema)11 Test (org.junit.Test)10 IOException (java.io.IOException)9 ArrayList (java.util.ArrayList)6 HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema)6 Properties (java.util.Properties)5 FieldSchema (org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema)4 UDFContext (org.apache.pig.impl.util.UDFContext)4 HashMap (java.util.HashMap)3 List (java.util.List)3 Map (java.util.Map)3 HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema)3 Tuple (org.apache.pig.data.Tuple)3 Descriptors (com.google.protobuf.Descriptors)2 BasicDBObjectBuilder (com.mongodb.BasicDBObjectBuilder)2 SQLException (java.sql.SQLException)2 LinkedHashMap (java.util.LinkedHashMap)2 Field (org.apache.avro.Schema.Field)2