Use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project hive by apache.
The class PigHCatUtil, method getResourceSchema.
/**
 * Translates an {@link HCatSchema} into the equivalent Pig {@link ResourceSchema}
 * by converting each HCat field via {@code getResourceSchemaFromFieldSchema}.
 *
 * @param hcatSchema the HCatalog schema to convert
 * @return a ResourceSchema whose fields mirror {@code hcatSchema}'s fields, in order
 * @throws IOException if any field cannot be converted
 */
public static ResourceSchema getResourceSchema(HCatSchema hcatSchema) throws IOException {
    List<HCatFieldSchema> hcatFields = hcatSchema.getFields();
    ResourceFieldSchema[] fields = new ResourceFieldSchema[hcatFields.size()];
    for (int i = 0; i < fields.length; i++) {
        fields[i] = getResourceSchemaFromFieldSchema(hcatFields.get(i));
    }
    return new ResourceSchema().setFields(fields);
}
Use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project hive by apache.
The class PigHCatUtil, method getBagSubSchema.
/**
 * Builds the Pig schema for the contents of a bag derived from an HCat array field.
 *
 * Two shapes are possible: {@code array<Type>} and {@code array<struct<...>>}. In
 * either case the array's element type is represented as a single tuple field inside
 * the bag's schema — the struct case maps naturally onto the tuple, while for the
 * plain {@code array<Type>} case the lone element field is wrapped in a synthetic tuple.
 *
 * The inner tuple/field names default to the HCat constants but may be overridden via
 * client system properties, with the literal token {@code FIELDNAME} substituted by
 * the array field's own name.
 *
 * @param hfs the HCat field of array type
 * @return the bag's ResourceSchema (a single tuple field)
 * @throws IOException if the element schema cannot be converted
 */
protected static ResourceSchema getBagSubSchema(HCatFieldSchema hfs) throws IOException {
    Properties clientProps = UDFContext.getUDFContext().getClientSystemProps();
    String tupleName = HCatConstants.HCAT_PIG_INNER_TUPLE_NAME_DEFAULT;
    String fieldName = HCatConstants.HCAT_PIG_INNER_FIELD_NAME_DEFAULT;
    if (clientProps != null) {
        if (clientProps.containsKey(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME)) {
            tupleName = clientProps.getProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME)
                .replaceAll("FIELDNAME", hfs.getName());
        }
        if (clientProps.containsKey(HCatConstants.HCAT_PIG_INNER_FIELD_NAME)) {
            fieldName = clientProps.getProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME)
                .replaceAll("FIELDNAME", hfs.getName());
        }
    }
    ResourceFieldSchema tupleField = new ResourceFieldSchema()
        .setName(tupleName)
        .setDescription("The tuple in the bag")
        .setType(DataType.TUPLE);
    HCatFieldSchema elementSchema = hfs.getArrayElementSchema().get(0);
    if (elementSchema.getType() == Type.STRUCT) {
        // struct element: its fields become the tuple's fields directly
        tupleField.setSchema(getTupleSubSchema(elementSchema));
    } else if (elementSchema.getType() == Type.ARRAY) {
        // nested array element: convert it and make it the tuple's single field
        ResourceFieldSchema[] nested =
            new ResourceFieldSchema[] { getResourceSchemaFromFieldSchema(elementSchema) };
        tupleField.setSchema(new ResourceSchema().setFields(nested));
    } else {
        // scalar element: simulate the tuple with a single typed field
        ResourceFieldSchema innerField = new ResourceFieldSchema()
            .setName(fieldName)
            .setDescription("The inner field in the tuple in the bag")
            .setType(getPigType(elementSchema))
            // the element type is not a tuple - so no subschema
            .setSchema(null);
        tupleField.setSchema(
            new ResourceSchema().setFields(new ResourceFieldSchema[] { innerField }));
    }
    return new ResourceSchema().setFields(new ResourceFieldSchema[] { tupleField });
}
Use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project hive by apache.
The class TestPigHCatUtil, method testGetBagSubSchemaConfigured.
@Test
public void testGetBagSubSchemaConfigured() throws Exception {
    // NOTE: pig-0.8 sets client system properties by actually getting the client
    // system properties. Starting in pig-0.9 you must pass the properties in.
    // When updating our pig dependency this will need updated.
    System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
    System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
    UDFContext.getUDFContext().setClientSystemProps(System.getProperties());
    try {
        // Define the expected schema: a bag holding tuple "t" whose single field
        // is the chararray "llama_tuple" ("FIELDNAME" replaced by the field name).
        ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
        bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("t")
            .setDescription("The tuple in the bag").setType(DataType.TUPLE);
        ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
        innerTupleFieldSchemas[0] =
            new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);
        bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
        ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);
        // Get the actual converted schema.
        HCatSchema actualHCatSchema = new HCatSchema(Lists.newArrayList(
            new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
        HCatFieldSchema actualHCatFieldSchema =
            new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
        ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);
        Assert.assertEquals(expected.toString(), actual.toString());
    } finally {
        // FIX: clear the properties in a finally block — previously a failing
        // assertion would leak these global properties into subsequent tests.
        System.clearProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME);
        System.clearProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME);
    }
}
Use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.
The class SchemaUtils, method checkUnionSchema.
/**
 * Checks whether a pig field schema matches any branch of an avro UNION schema.
 *
 * @param avroSchema the avro schema, possibly a UNION
 * @param pigFieldSchema the pig field schema to match against the union branches
 * @return true if some union branch matches; false if {@code avroSchema} is not a UNION
 * @throws IOException with a non-null cause if the UNION has no branch matching the
 *         pig field schema, or rethrown from a nested union check
 */
private static boolean checkUnionSchema(Schema avroSchema, ResourceFieldSchema pigFieldSchema) throws IOException {
    if (avroSchema.getType() != Type.UNION) {
        return false;
    }
    LOG.trace(" checking against UNION");
    for (Schema branch : avroSchema.getTypes()) {
        try {
            LOG.trace(" union component {}", branch.getType().getName());
            checkEqualSchema(pigFieldSchema, branch);
            // First branch that checks out satisfies the union.
            return true;
        } catch (IOException e) {
            if (e.getCause() != null) {
                // Exception originating from a nested union: propagate as-is.
                throw e;
            }
            // Plain mismatch on this branch: keep trying the remaining branches.
        }
    }
    // Attach a cause so callers can distinguish this nested-union failure
    // (see the getCause() check above).
    throw new IOException("Expected some field defined in '" + avroSchema.getName() + "' for pig schema type '" + DataType.genTypeToNameMap().get(pigFieldSchema.getType()) + "'", new Exception("Union not satisfied"));
}
Use of org.apache.pig.ResourceSchema.ResourceFieldSchema in project gora by apache.
The class SchemaUtils, method avro2ResouceFieldSchema.
/**
 * Recursively converts an avro {@link Schema} into a Pig {@link ResourceFieldSchema}.
 *
 * Mapping: NULL/BOOLEAN/BYTES/STRING/FLOAT/DOUBLE/INT/LONG map to the corresponding
 * Pig scalar types (ENUM maps to INTEGER); a UNION collapses to its first non-null
 * branch; a RECORD becomes a TUPLE of its converted fields; an ARRAY becomes a BAG
 * whose tuple "t" wraps the converted element type; a MAP becomes a Pig MAP whose
 * schema holds the converted value type.
 *
 * @param schema the avro schema to convert
 * @return the equivalent Pig field schema
 * @throws IOException if a nested conversion fails
 * @throws RuntimeException for FIXED (unimplemented), a union of only ['null'],
 *         or an unexpected schema type
 */
private static ResourceFieldSchema avro2ResouceFieldSchema(Schema schema) throws IOException {
    Type schemaType = schema.getType();
    switch (schemaType) {
        case NULL:
            return new ResourceFieldSchema().setType(DataType.NULL);
        case BOOLEAN:
            return new ResourceFieldSchema().setType(DataType.BOOLEAN);
        case ENUM:
            return new ResourceFieldSchema().setType(DataType.INTEGER);
        case BYTES:
            return new ResourceFieldSchema().setType(DataType.BYTEARRAY);
        case STRING:
            return new ResourceFieldSchema().setType(DataType.CHARARRAY);
        case FLOAT:
            return new ResourceFieldSchema().setType(DataType.FLOAT);
        case DOUBLE:
            return new ResourceFieldSchema().setType(DataType.DOUBLE);
        case INT:
            return new ResourceFieldSchema().setType(DataType.INTEGER);
        case LONG:
            return new ResourceFieldSchema().setType(DataType.LONG);
        case UNION:
            // Returns the first not-null type
            if (schema.getTypes().size() != 2) {
                // FIX: the "{}" placeholder previously had no argument, so the
                // warning never identified the offending schema.
                LOG.warn("Field UNION {} must be ['null','othertype']. Maybe wrong definition?", schema);
            }
            for (Schema s : schema.getTypes()) {
                if (s.getType() != Type.NULL)
                    return avro2ResouceFieldSchema(s);
            }
            LOG.error("Union with only ['null']?");
            throw new RuntimeException("Union with only ['null']?");
        case RECORD:
            // A record in Gora is a Tuple in Pig
            if (recursiveRecordSchema.incSchema(schema.getName()) > 1) {
                // Recursion detected (and we are 2 levels below the desired depth),
                // so stop descending and emit an empty tuple schema for this leaf.
                recursiveRecordSchema.decSchema(schema.getName());
                // Return a tuple schema with no fields
                return new ResourceFieldSchema().setType(DataType.TUPLE);
            }
            int numRecordFields = schema.getFields().size();
            Iterator<Field> recordFields = schema.getFields().iterator();
            ResourceFieldSchema returnRecordResourceFieldSchema = new ResourceFieldSchema().setType(DataType.TUPLE);
            ResourceFieldSchema[] recordFieldSchemas = new ResourceFieldSchema[numRecordFields];
            for (int fieldIndex = 0; recordFields.hasNext(); fieldIndex++) {
                Field schemaField = recordFields.next();
                recordFieldSchemas[fieldIndex] = avro2ResouceFieldSchema(schemaField.schema()).setName(schemaField.name());
            }
            returnRecordResourceFieldSchema.setSchema(new ResourceSchema().setFields(recordFieldSchemas));
            return returnRecordResourceFieldSchema;
        case ARRAY:
            // An array in Gora is a Bag in Pig
            // Maybe should be a Map with string(numeric) index to ensure order, but Avro and Pig data model are different :\
            ResourceFieldSchema returnArrayResourceFieldSchema = new ResourceFieldSchema().setType(DataType.BAG);
            Schema arrayElementType = schema.getElementType();
            // Pig bags hold tuples, so the element schema is wrapped in a tuple named "t".
            returnArrayResourceFieldSchema.setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { new ResourceFieldSchema().setType(DataType.TUPLE).setName("t").setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { avro2ResouceFieldSchema(arrayElementType) })) }));
            return returnArrayResourceFieldSchema;
        case MAP:
            // A map in Gora is a Map in Pig, but in pig is only chararray=>something
            ResourceFieldSchema returnMapResourceFieldSchema = new ResourceFieldSchema().setType(DataType.MAP);
            Schema mapValueType = schema.getValueType();
            returnMapResourceFieldSchema.setSchema(new ResourceSchema().setFields(new ResourceFieldSchema[] { avro2ResouceFieldSchema(mapValueType) }));
            return returnMapResourceFieldSchema;
        case FIXED:
            // TODO Implement FIXED data type
            throw new RuntimeException("Fixed type not implemented");
        default:
            throw new RuntimeException("Unexpected schema type " + schemaType);
    }
}
Aggregations