
Example 21 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.

the class GoraStorage method putNext.

@SuppressWarnings("unchecked")
@Override
public void putNext(Tuple pigTuple) throws IOException {
    PersistentBase persistentObj = this.dataStore.newPersistent();
    if (LOG.isTraceEnabled())
        LOG.trace("key: {}", pigTuple.get(pigFieldKeyIndex));
    for (String fieldName : this.loadQueryFields) {
        // Look up the field's Pig schema and tuple index; skip fields absent from the store schema
        ResourceFieldSchemaWithIndex writeResourceFieldSchemaWithIndex = this.writeResourceFieldSchemaMap.get(fieldName);
        if (writeResourceFieldSchemaWithIndex == null) {
            if (LOG.isTraceEnabled())
                LOG.trace("Field {} declared in the constructor was not found in the tuple to persist, skipping field", fieldName);
            continue;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("  Put fieldname: {}", fieldName);
            LOG.trace("      resourcefield schema: {}", writeResourceFieldSchemaWithIndex.getResourceFieldSchema());
            LOG.trace("      value: {} - {}", writeResourceFieldSchemaWithIndex.getIndex(), pigTuple.get(writeResourceFieldSchemaWithIndex.getIndex()));
        }
        Field persistentField = persistentSchema.getField(fieldName);
        if (persistentField == null) {
            throw new IOException("Field " + fieldName + " does not exist in the Gora's Avro schema.");
        }
        ResourceFieldSchema pigFieldSchema = writeResourceFieldSchemaWithIndex.getResourceFieldSchema();
        if (pigFieldSchema == null) {
            throw new IOException("The field " + fieldName + " does not have a Pig schema when writing.");
        }
        // TODO Move this put to PersistentUtils
        // TODO Here is used the resourceFieldSchema and the index. Think about optimize if possible
        // TODO Find a better name to this.writeField, like 'tupleToPersistent'
        int persistentFieldIndex = persistentObj.getSchema().getField(fieldName).pos();
        persistentObj.put(persistentFieldIndex, this.writeField(persistentField.schema(), pigFieldSchema, pigTuple.get(writeResourceFieldSchemaWithIndex.getIndex())));
        persistentObj.setDirty(persistentFieldIndex);
    }
    try {
        ((GoraRecordWriter<Object, PersistentBase>) this.writer).write(pigTuple.get(pigFieldKeyIndex), (PersistentBase) persistentObj);
    } catch (InterruptedException e) {
        throw new IOException("Error writing the tuple.", e);
    }
}
Also used : Field(org.apache.avro.Schema.Field) PersistentBase(org.apache.gora.persistency.impl.PersistentBase) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) IOException(java.io.IOException) GoraRecordWriter(org.apache.gora.mapreduce.GoraRecordWriter)
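
To make the tuple layout putNext expects concrete, here is a minimal, hypothetical sketch (the schema "key:chararray, name:chararray, age:int" and its field order are made up, not taken from the example above): the key sits at pigFieldKeyIndex and each query field at the index recorded in writeResourceFieldSchemaMap.

import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class PutNextInputSketch {
    public static void main(String[] args) throws Exception {
        // Tuple matching a hypothetical store schema "key:chararray, name:chararray, age:int".
        // putNext() reads the key from pigFieldKeyIndex (0 here) and each declared
        // query field from the index kept in writeResourceFieldSchemaMap.
        Tuple tuple = TupleFactory.getInstance().newTuple(3);
        tuple.set(0, "row-1");   // "key"
        tuple.set(1, "Alice");   // "name"
        tuple.set(2, 42);        // "age"
        System.out.println(tuple.toDelimitedString(","));  // row-1,Alice,42
    }
}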

Example 22 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.

the class GoraStorage method writeField.

/**
 * Converts one Pig field's data to PersistentBase data.
 *
 * @param avroSchema PersistentBase (Avro) schema, used to create new nested records
 * @param pigField Pig schema of the field being converted
 * @param pigData Pig data conforming to that schema
 * @return the converted PersistentBase data
 * @throws IOException
 */
private Object writeField(Schema avroSchema, ResourceFieldSchema pigField, Object pigData) throws IOException {
    // If the data is null, return null (after checking that the Avro schema allows it)
    if (pigData == null) {
        if (avroSchema.getType() != Type.UNION && avroSchema.getType() != Type.NULL) {
            throw new IOException("Tuple field " + pigField.getName() + " is null, but the Avro schema is neither a union nor null.");
        } else {
            return null;
        }
    }
    // ONLY 2-ELEMENT UNIONS ARE SUPPORTED!
    if (avroSchema.getType() == Type.UNION) {
        if (avroSchema.getTypes().get(0).getType() == Schema.Type.NULL) {
            avroSchema = avroSchema.getTypes().get(1);
        } else {
            avroSchema = avroSchema.getTypes().get(0);
        }
    }
    switch(pigField.getType()) {
        case DataType.DOUBLE:
        case DataType.FLOAT:
        case DataType.LONG:
        case DataType.BOOLEAN:
        case DataType.NULL:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing double, float, long, boolean or null.");
            return (Object) pigData;
        case DataType.CHARARRAY:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing chararray.");
            return pigData.toString();
        case DataType.INTEGER:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing integer/enum.");
            if (avroSchema.getType() == Type.ENUM) {
                return AvroUtils.getEnumValue(avroSchema, ((Number) pigData).intValue());
            } else {
                return ((Number) pigData).intValue();
            }
        case DataType.BYTEARRAY:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing bytearray.");
            return ByteBuffer.wrap(((DataByteArray) pigData).get());
        case DataType.MAP: // Pig Map -> Avro Map
            if (LOG.isTraceEnabled())
                LOG.trace("   Writing map.");
            @SuppressWarnings("unchecked") Map<String, Object> pigMap = (Map<String, Object>) pigData;
            Map<String, Object> goraMap = new HashMap<String, Object>(pigMap.size());
            if (pigField.getSchema() == null) {
                throw new IOException("The map being written does not have schema.");
            }
            for (Entry<String, Object> pigEntry : pigMap.entrySet()) {
                goraMap.put(pigEntry.getKey(), this.writeField(avroSchema.getValueType(), pigField.getSchema().getFields()[0], pigEntry.getValue()));
            }
            return goraMap;
        case DataType.BAG: // Pig Bag -> Avro Array
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing bag.");
            Array<Object> persistentArray = new Array<Object>((int) ((DataBag) pigData).size(), avroSchema);
            for (Object pigArrayElement : (DataBag) pigData) {
                if (avroSchema.getElementType().getType() == Type.RECORD) {
                    // If element type is record, the mapping Persistent->PigType deletes one nested tuple:
                    // We want the map as: map((a1,a2,a3), (b1,b2,b3),...) instead of map(((a1,a2,a3)), ((b1,b2,b3)), ...)
                    persistentArray.add(this.writeField(avroSchema.getElementType(), pigField.getSchema().getFields()[0], pigArrayElement));
                } else {
                    // Every bag has a tuple as element type. Since this is not a record, that "tuple" container must be ignored
                    persistentArray.add(this.writeField(avroSchema.getElementType(), pigField.getSchema().getFields()[0].getSchema().getFields()[0], ((Tuple) pigArrayElement).get(0)));
                }
            }
            return persistentArray;
        case DataType.TUPLE: // Pig Tuple -> Avro Record
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing tuple.");
            try {
                PersistentBase persistentRecord = (PersistentBase) ClassLoadingUtils.loadClass(avroSchema.getFullName()).newInstance();
                ResourceFieldSchema[] tupleFieldSchemas = pigField.getSchema().getFields();
                for (int i = 0; i < tupleFieldSchemas.length; i++) {
                    persistentRecord.put(tupleFieldSchemas[i].getName(), this.writeField(avroSchema.getField(tupleFieldSchemas[i].getName()).schema(), tupleFieldSchemas[i], ((Tuple) pigData).get(i)));
                }
                return persistentRecord;
            } catch (InstantiationException e) {
                throw new IOException(e);
            } catch (IllegalAccessException e) {
                throw new IOException(e);
            } catch (ClassNotFoundException e) {
                throw new IOException(e);
            }
        default:
            throw new IOException("Unexpected field " + pigField.getName() + " with Pig type " + DataType.genTypeToNameMap().get(pigField.getType()));
    }
}
Also used : PersistentBase(org.apache.gora.persistency.impl.PersistentBase) DataBag(org.apache.pig.data.DataBag) HashMap(java.util.HashMap) IOException(java.io.IOException) Array(org.apache.avro.generic.GenericData.Array) DataByteArray(org.apache.pig.data.DataByteArray) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) Map(java.util.Map) HashMap(java.util.HashMap) Tuple(org.apache.pig.data.Tuple)
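
As a side note on the 2-element union restriction above, here is a minimal standalone sketch (the union ["null", "string"] is an assumed example) of the branch selection that writeField performs before converting the Pig value:

import org.apache.avro.Schema;

public class UnionResolutionSketch {
    public static void main(String[] args) {
        // A nullable field in Avro is a 2-element union; writeField() keeps the non-null branch.
        Schema union = new Schema.Parser().parse("[\"null\", \"string\"]");
        Schema resolved = union.getTypes().get(0).getType() == Schema.Type.NULL
                ? union.getTypes().get(1)
                : union.getTypes().get(0);
        System.out.println(resolved.getType());  // STRING
    }
}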

Example 23 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.

the class GoraStorage method prepareToWrite.

@Override
@SuppressWarnings({ "rawtypes", "unchecked" })
public void prepareToWrite(RecordWriter writer) throws IOException {
    // Get the schema of data to write from UDFContext (coming from frontend checkSchema())
    String strSchema = this.getUDFProperties().getProperty(GoraStorage.GORA_STORE_PIG_SCHEMA);
    this.writer = (GoraRecordWriter<?, ? extends PersistentBase>) writer;
    // Parse the schema from the string stored in the properties object
    this.writeResourceSchema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
    this.writeResourceFieldSchemaMap = new HashMap<String, ResourceFieldSchemaWithIndex>();
    int index = 0;
    for (ResourceFieldSchema fieldSchema : this.writeResourceSchema.getFields()) {
        this.writeResourceFieldSchemaMap.put(fieldSchema.getName(), new ResourceFieldSchemaWithIndex(fieldSchema, index++));
    }
    this.pigFieldKeyIndex = this.writeResourceFieldSchemaMap.get("key").getIndex();
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema)
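
For reference, a minimal sketch (assuming Pig on the classpath; the schema string "key:chararray, name:chararray, age:int" is made up) of the same parsing step prepareToWrite uses to build the field-name-to-index map that putNext later consults:

import java.util.HashMap;
import java.util.Map;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.impl.util.Utils;

public class PrepareToWriteSketch {
    public static void main(String[] args) throws Exception {
        // Same step as prepareToWrite(): schema string -> logical schema -> ResourceSchema
        ResourceSchema schema =
                new ResourceSchema(Utils.getSchemaFromString("key:chararray, name:chararray, age:int"));
        // Index every field by name, as writeResourceFieldSchemaMap does
        Map<String, Integer> indexByName = new HashMap<>();
        int index = 0;
        for (ResourceFieldSchema fieldSchema : schema.getFields()) {
            indexByName.put(fieldSchema.getName(), index++);
        }
        System.out.println(indexByName.get("key"));  // 0
    }
}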

Example 24 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.

the class GoraDeleteStorage method checkSchema.

/**
 * Checks that the Pig schema has at least a field "key" with schema chararray.
 *
 * Sets the UDFContext property GORA_STORE_PIG_SCHEMA with the schema to send it to the backend.
 */
@Override
public void checkSchema(ResourceSchema pigSchema) throws IOException {
    List<String> pigFieldSchemasNames = new ArrayList<String>(Arrays.asList(pigSchema.fieldNames()));
    if (!pigFieldSchemasNames.contains("key")) {
        throw new IOException("Expected a field called \"key\" but not found.");
    }
    for (ResourceFieldSchema fieldSchema : pigSchema.getFields()) {
        if (fieldSchema.getName().equals("key") && fieldSchema.getType() != DataType.CHARARRAY) {
            throw new IOException("Expected field \"key\" with schema chararray, but found schema " + DataType.findTypeName(fieldSchema.getType()) + ".");
        }
    }
    // Save the schema to UDFContext to use it on backend when writing data
    this.getUDFProperties().setProperty(GoraStorage.GORA_STORE_PIG_SCHEMA, pigSchema.toString());
}
Also used : ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) ArrayList(java.util.ArrayList) IOException(java.io.IOException)
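
The same validation can be reproduced standalone; a minimal sketch (field names and schemas are hypothetical) showing a schema that passes and one that checkSchema would reject because "key" is not a chararray:

import java.util.Arrays;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.util.Utils;

public class CheckSchemaSketch {
    public static void main(String[] args) throws Exception {
        ResourceSchema ok = new ResourceSchema(Utils.getSchemaFromString("key:chararray, name:chararray"));
        ResourceSchema bad = new ResourceSchema(Utils.getSchemaFromString("key:int, name:chararray"));
        System.out.println(hasChararrayKey(ok));   // true
        System.out.println(hasChararrayKey(bad));  // false
    }

    // Mirrors the two checks in checkSchema(): a "key" field must exist and be a chararray
    static boolean hasChararrayKey(ResourceSchema pigSchema) {
        if (!Arrays.asList(pigSchema.fieldNames()).contains("key")) {
            return false;
        }
        for (ResourceFieldSchema fieldSchema : pigSchema.getFields()) {
            if (fieldSchema.getName().equals("key") && fieldSchema.getType() != DataType.CHARARRAY) {
                return false;
            }
        }
        return true;
    }
}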

Example 25 with ResourceFieldSchema

use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.

the class SchemaUtils method generatePigSchema.

/**
 * Generates a Pig schema from a Persistent's Avro schema.
 *
 * @param persistentSchema the Persistent's Avro schema from which to generate the Pig schema
 * @param queryFields the query fields declared in the Storage, WITHOUT the 'key' field
 * @param keyClass key class of the Persistent
 * @return the generated Pig ResourceSchema, with 'key' as the first field
 * @throws IOException if a declared query field does not exist in the Avro schema
 */
public static ResourceSchema generatePigSchema(Schema persistentSchema, List<String> queryFields, Class<?> keyClass) throws IOException {
    ResourceFieldSchema[] resourceFieldSchemas = null;
    // We count 'key' field here
    int numFields = queryFields.size() + 1;
    resourceFieldSchemas = new ResourceFieldSchema[numFields];
    resourceFieldSchemas[0] = new ResourceFieldSchema().setType(DataType.findType(keyClass)).setName("key");
    int fieldIndex = 1;
    for (String fieldName : queryFields) {
        // Initialize the recursive schema checker in each field
        recursiveRecordSchema.clear();
        Field field = persistentSchema.getField(fieldName);
        if (field == null) {
            throw new IOException("Field \"" + fieldName + "\" not found in the entity " + persistentSchema.getFullName());
        }
        resourceFieldSchemas[fieldIndex++] = avro2ResouceFieldSchema(field.schema()).setName(field.name());
    }
    ResourceSchema resourceSchema = new ResourceSchema().setFields(resourceFieldSchemas);
    return resourceSchema;
}
Also used : Field(org.apache.avro.Schema.Field) ResourceSchema(org.apache.pig.ResourceSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) IOException(java.io.IOException)
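
For a concrete picture of the result, a minimal sketch (the single query field "name" is hypothetical, and the recursive avro2ResouceFieldSchema conversion is replaced by a hard-coded chararray field) of the schema shape generatePigSchema produces for a String key:

import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.DataType;

public class GeneratePigSchemaSketch {
    public static void main(String[] args) throws Exception {
        // "key" always comes first, followed by one entry per declared query field
        ResourceFieldSchema[] fields = new ResourceFieldSchema[2];
        fields[0] = new ResourceFieldSchema().setType(DataType.CHARARRAY).setName("key");
        fields[1] = new ResourceFieldSchema().setType(DataType.CHARARRAY).setName("name");
        ResourceSchema schema = new ResourceSchema().setFields(fields);
        System.out.println(schema);
    }
}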

Aggregations

ResourceFieldSchema (org.apache.pig.ResourceSchema.ResourceFieldSchema) 42
ResourceSchema (org.apache.pig.ResourceSchema) 22
IOException (java.io.IOException) 16
ArrayList (java.util.ArrayList) 7
Map (java.util.Map) 7
Tuple (org.apache.pig.data.Tuple) 7
HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema) 5
BasicDBObjectBuilder (com.mongodb.BasicDBObjectBuilder) 4
SQLException (java.sql.SQLException) 4
HashMap (java.util.HashMap) 4
Field (org.apache.avro.Schema.Field) 4
Schema (org.apache.pig.impl.logicalLayer.schema.Schema) 4
BasicBSONObject (org.bson.BasicBSONObject) 4
Test (org.junit.Test) 4
List (java.util.List) 3
DataBag (org.apache.pig.data.DataBag) 3
FieldSchema (org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema) 3
BSONObject (org.bson.BSONObject) 3
DateTime (org.joda.time.DateTime) 3
LinkedHashMap (java.util.LinkedHashMap) 2