use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.
the class TestHCatUtil method testGetTableSchemaWithPtnColsSerDeReportedFields.
/**
 * Checks that fields reported by the SerDe are included in the table schema.
 *
 * <p>Hive represents tables in two ways:
 * <ul>
 * <li>org.apache.hadoop.hive.metastore.api.Table - exactly what is stored in the metastore</li>
 * <li>org.apache.hadoop.hive.ql.metadata.Table - adds business logic over api.Table</li>
 * </ul>
 * The storage descriptor built here declares no columns of its own, so every
 * expected column has to come from the SerDe.
 */
@Test
public void testGetTableSchemaWithPtnColsSerDeReportedFields() throws IOException {
  // SerDe configuration: a Thrift deserializer reading the IntString test class.
  Map<String, String> serDeParams = Maps.newHashMap();
  serDeParams.put(serdeConstants.SERIALIZATION_CLASS,
      "org.apache.hadoop.hive.serde2.thrift.test.IntString");
  serDeParams.put(serdeConstants.SERIALIZATION_FORMAT,
      "org.apache.thrift.protocol.TBinaryProtocol");
  SerDeInfo thriftSerDe = new SerDeInfo(
      null, "org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer", serDeParams);

  // The column list is intentionally empty - the SerDe will report the fields.
  List<FieldSchema> declaredColumns = new ArrayList<FieldSchema>();
  StorageDescriptor storage = new StorageDescriptor(
      declaredColumns,
      "location",
      "org.apache.hadoop.mapred.TextInputFormat",
      "org.apache.hadoop.mapred.TextOutputFormat",
      false,
      -1,
      thriftSerDe,
      new ArrayList<String>(),
      new ArrayList<Order>(),
      new HashMap<String, String>());

  List<FieldSchema> partitionKeys = new ArrayList<FieldSchema>();
  org.apache.hadoop.hive.metastore.api.Table metastoreTable =
      new org.apache.hadoop.hive.metastore.api.Table(
          "test_tblname",
          "test_dbname",
          "test_owner",
          0,
          0,
          0,
          storage,
          partitionKeys,
          new HashMap<String, String>(),
          "viewOriginalText",
          "viewExpandedText",
          TableType.EXTERNAL_TABLE.name());
  Table qlTable = new Table(metastoreTable);

  // Columns the test expects the SerDe to report, in order.
  List<HCatFieldSchema> expectedFields = new ArrayList<HCatFieldSchema>();
  expectedFields.add(new HCatFieldSchema("myint", HCatFieldSchema.Type.INT, null));
  expectedFields.add(new HCatFieldSchema("mystring", HCatFieldSchema.Type.STRING, null));
  expectedFields.add(new HCatFieldSchema("underscore_int", HCatFieldSchema.Type.INT, null));

  Assert.assertEquals(new HCatSchema(expectedFields), HCatUtil.getTableSchemaWithPtnCols(qlTable));
}
use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.
the class TestHCatPartitioned method tableSchemaTest.
/**
 * Exercises how the table schema reacts to partition writes:
 * <ol>
 * <li>a partition published with an extra data column (c3) adds that column to the table schema;</li>
 * <li>a partition that changes the type of an existing column is rejected with
 * {@code ERROR_SCHEMA_TYPE_MISMATCH};</li>
 * <li>a partition whose data also carries a partition-key column (part1) is written and can be
 * read back as 5-column records.</li>
 * </ol>
 *
 * @throws Exception if any of the underlying write/read jobs fails unexpectedly
 */
private void tableSchemaTest() throws Exception {
  HCatSchema tableSchema = getTableSchema();
  assertEquals(4, tableSchema.getFields().size());

  // Update partition schema to have 3 fields
  partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c3", serdeConstants.STRING_TYPE_NAME, "")));
  writeRecords = new ArrayList<HCatRecord>();
  for (int i = 0; i < 20; i++) {
    List<Object> objList = new ArrayList<Object>();
    objList.add(i);
    objList.add("strvalue" + i);
    objList.add("str2value" + i);
    writeRecords.add(new DefaultHCatRecord(objList));
  }
  Map<String, String> partitionMap = new HashMap<String, String>();
  partitionMap.put("part1", "p1value5");
  partitionMap.put("part0", "505");
  runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);

  tableSchema = getTableSchema();
  // assert that c3 has got added to table schema
  assertEquals(5, tableSchema.getFields().size());
  assertEquals("c1", tableSchema.getFields().get(0).getName());
  assertEquals("c2", tableSchema.getFields().get(1).getName());
  assertEquals("c3", tableSchema.getFields().get(2).getName());
  assertEquals("part1", tableSchema.getFields().get(3).getName());
  assertEquals("part0", tableSchema.getFields().get(4).getName());

  // Test that changing column data type fails
  partitionMap.clear();
  partitionMap.put("part1", "p1value6");
  partitionMap.put("part0", "506");
  partitionColumns = new ArrayList<HCatFieldSchema>();
  partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
  partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.INT_TYPE_NAME, "")));
  IOException exc = null;
  try {
    runMRCreate(partitionMap, partitionColumns, writeRecords, 20, true);
  } catch (IOException e) {
    exc = e;
  }
  assertTrue(exc != null);
  assertTrue(exc instanceof HCatException);
  assertEquals(ErrorType.ERROR_SCHEMA_TYPE_MISMATCH, ((HCatException) exc).getErrorType());

  // Test that a partition key (part1) appearing in the data is accepted.
  partitionColumns = new ArrayList<HCatFieldSchema>();
  partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
  partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
  partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c3", serdeConstants.STRING_TYPE_NAME, "")));
  partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("part1", serdeConstants.STRING_TYPE_NAME, "")));
  List<HCatRecord> recordsContainingPartitionCols = new ArrayList<HCatRecord>(20);
  for (int i = 0; i < 20; i++) {
    List<Object> objList = new ArrayList<Object>();
    objList.add(i);
    objList.add("c2value" + i);
    objList.add("c3value" + i);
    objList.add("p1value6");
    recordsContainingPartitionCols.add(new DefaultHCatRecord(objList));
  }
  // The reads below assert on exactly the rows written here, so this write must succeed.
  // Let any failure propagate instead of swallowing it: otherwise the root cause is lost
  // and the test fails later with an unrelated-looking record-count mismatch.
  runMRCreate(partitionMap, partitionColumns, recordsContainingPartitionCols, 20, true);

  List<HCatRecord> records = runMRRead(20, "part1 = \"p1value6\"");
  assertEquals(20, records.size());
  records = runMRRead(20, "part0 = \"506\"");
  assertEquals(20, records.size());

  // Records are expected back in write order: c1, c2, c3, part1, part0.
  int expectedC1 = 0;
  for (HCatRecord rec : records) {
    assertEquals(5, rec.size());
    assertEquals(Integer.valueOf(expectedC1), rec.get(0));
    assertEquals("c2value" + expectedC1, rec.get(1));
    assertEquals("c3value" + expectedC1, rec.get(2));
    assertEquals("p1value6", rec.get(3));
    assertEquals(Integer.valueOf(506), rec.get(4));
    expectedC1++;
  }
}
use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.
the class HCatBaseStorer method getJavaObj.
/**
 * Turns a Pig value into the Java object used on the Hive side for the given HCat field.
 * Containers (struct, array, map) are converted recursively, element by element.
 * This method assumes that {@link #validateSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema, org.apache.hive.hcatalog.data.schema.HCatFieldSchema, org.apache.pig.impl.logicalLayer.schema.Schema, org.apache.hive.hcatalog.data.schema.HCatSchema, int)},
 * which checks that the types in the Pig schema are compatible with the target Hive table,
 * has already been called.
 *
 * @param pigObj value produced by Pig; {@code null} is passed through as {@code null}
 * @param hcatFS schema of the target field, which selects the conversion
 * @return the converted value, or {@code null} when the input is {@code null} or the value was
 *         handed to {@code handleOutOfRangeValue}
 * @throws BackendException for an unexpected type; the message is prefixed with the field name
 */
private Object getJavaObj(Object pigObj, HCatFieldSchema hcatFS) throws HCatException, BackendException {
  try {
    if (pigObj == null) {
      return null;
    }
    // This is the hot path of the storer: if HCatStorer needs to get faster,
    // this is the method worth tuning.
    Type hcatType = hcatFS.getType();
    switch (hcatType) {
      case BINARY:
        return ((DataByteArray) pigObj).get();

      case STRUCT: {
        HCatSchema subSchema = hcatFS.getStructSubSchema();
        // A Pig tuple becomes a plain list of converted members.
        List<Object> tupleFields = ((Tuple) pigObj).getAll();
        ArrayList<Object> structValues = new ArrayList<Object>(tupleFields.size());
        int pos = 0;
        for (Object tupleField : tupleFields) {
          structValues.add(getJavaObj(tupleField, subSchema.get(pos)));
          pos++;
        }
        return structValues;
      }

      case ARRAY: {
        // A Pig bag becomes a list of converted elements.
        DataBag bag = (DataBag) pigObj;
        HCatFieldSchema elementFS = hcatFS.getArrayElementSchema().get(0);
        boolean keepTuple = elementFS.getType() == Type.STRUCT;
        List<Object> elements = new ArrayList<Object>((int) bag.size());
        for (Iterator<Tuple> it = bag.iterator(); it.hasNext();) {
          Tuple wrapper = it.next();
          // Unless the element type is a struct, the wrapping tuple is dropped
          // and only its first field is converted.
          elements.add(getJavaObj(keepTuple ? wrapper : wrapper.get(0), elementFS));
        }
        return elements;
      }

      case MAP: {
        Map<?, ?> sourceMap = (Map<?, ?>) pigObj;
        Map<Object, Object> convertedMap = new HashMap<Object, Object>();
        for (Entry<?, ?> kv : sourceMap.entrySet()) {
          // Schema validation enforces that the key is a String.
          String key = (String) kv.getKey();
          // The map value is described by a schema, not by a single FieldSchema.
          Object value = getJavaObj(kv.getValue(), hcatFS.getMapValueSchema().get(0));
          convertedMap.put(key, value);
        }
        return convertedMap;
      }

      case STRING:
      case INT:
      case BIGINT:
      case FLOAT:
      case DOUBLE:
        // Passed through unchanged.
        return pigObj;

      case SMALLINT: {
        Integer asInt = (Integer) pigObj;
        if (asInt < Short.MIN_VALUE || asInt > Short.MAX_VALUE) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return asInt.shortValue();
      }

      case TINYINT: {
        Integer asInt = (Integer) pigObj;
        if (asInt < Byte.MIN_VALUE || asInt > Byte.MAX_VALUE) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return asInt.byteValue();
      }

      case BOOLEAN: {
        if (!(pigObj instanceof String)) {
          return Boolean.parseBoolean(pigObj.toString());
        }
        // For String input only "0" and "1" (ignoring surrounding whitespace) are accepted.
        String flag = ((String) pigObj).trim();
        if (flag.equals("0")) {
          return Boolean.FALSE;
        }
        if (flag.equals("1")) {
          return Boolean.TRUE;
        }
        throw new BackendException("Unexpected type " + hcatType + " for value " + pigObj + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
      }

      case DECIMAL: {
        BigDecimal decimal = (BigDecimal) pigObj;
        DecimalTypeInfo decimalInfo = (DecimalTypeInfo) hcatFS.getTypeInfo();
        boolean fits = decimal.precision() <= decimalInfo.precision()
            && decimal.scale() <= decimalInfo.scale();
        if (!fits) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return HiveDecimal.create(decimal);
      }

      case CHAR: {
        String text = (String) pigObj;
        CharTypeInfo charInfo = (CharTypeInfo) hcatFS.getTypeInfo();
        if (text.length() > charInfo.getLength()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return new HiveChar(text, charInfo.getLength());
      }

      case VARCHAR: {
        String text = (String) pigObj;
        VarcharTypeInfo varcharInfo = (VarcharTypeInfo) hcatFS.getTypeInfo();
        if (text.length() > varcharInfo.getLength()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return new HiveVarchar(text, varcharInfo.getLength());
      }

      case TIMESTAMP:
        // getMillis() returns UTC time regardless of TZ
        return new Timestamp(((DateTime) pigObj).getMillis());

      case DATE: {
        /*
         * Any TZ setting on the Pig value is ignored since java.sql.Date doesn't have it (in any
         * meaningful way). A Pig value with a 0 time component (midnight) is taken to
         * reasonably 'fit' into a Hive DATE; a non-zero time part is treated as out of range
         * for the target type.
         */
        DateTime pigDate = (DateTime) pigObj;
        if (pigDate.getMillisOfDay() != 0) {
          handleOutOfRangeValue(pigObj, hcatFS, "Time component must be 0 (midnight) in local timezone; Local TZ val='" + pigObj + "'");
          return null;
        }
        /*
         * java.sql.Date is a poorly defined API. Some (all?) SerDes call toString() on it
         * [e.g. LazySimpleSerDe, uses LazyUtils.writePrimitiveUTF8()], which automatically adjusts
         * for local timezone. Date.valueOf() also uses local timezone (as does Date(int,int,int)).
         * Also see PigHCatUtil#extractPigObject() for the corresponding read op. This way a DATETIME
         * from Pig, when stored into Hive and read back, comes back with the same value.
         */
        int yearSince1900 = pigDate.getYear() - 1900;
        int zeroBasedMonth = pigDate.getMonthOfYear() - 1;
        return new Date(yearSince1900, zeroBasedMonth, pigDate.getDayOfMonth());
      }

      default:
        throw new BackendException("Unexpected HCat type " + hcatType + " for value " + pigObj + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
    }
  } catch (BackendException e) {
    // Prefix the message with this field's name so nested failures spell out the field path.
    String fieldName = hcatFS.getName();
    String pathPrefix = fieldName == null ? " " : fieldName + ".";
    throw new BackendException(pathPrefix + e.getMessage(), e);
  }
}
use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.
the class HCatBaseStorer method convertPigSchemaToHCatSchema.
/**
 * Constructs an HCatSchema from pigSchema, one HCat field per Pig field and in the same order.
 *
 * @param pigSchema schema of the Pig data being stored
 * @param tableSchema existing schema of the table in the metastore; each Pig field is looked up
 *        in it by alias
 * @return the HCat schema computed for {@code pigSchema}
 * @throws FrontendException if a field cannot be converted; an {@link HCatException} raised
 *         during conversion is wrapped with its message and kept as the cause
 */
protected HCatSchema convertPigSchemaToHCatSchema(Schema pigSchema, HCatSchema tableSchema) throws FrontendException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("convertPigSchemaToHCatSchema(pigSchema,tblSchema)=(" + pigSchema + "," + tableSchema + ")");
  }
  List<HCatFieldSchema> fieldSchemas = new ArrayList<HCatFieldSchema>(pigSchema.size());
  for (FieldSchema fSchema : pigSchema.getFields()) {
    try {
      HCatFieldSchema hcatFieldSchema = getColFromSchema(fSchema.alias, tableSchema);
      // If writing to a partitioned table, pigSchema will have more columns than tableSchema:
      // partition columns are not part of tableSchema (e.g. TestHCatStorer#testPartColsInData()).
      // The lookup result is therefore deliberately not asserted on before being passed along.
      fieldSchemas.add(getHCatFSFromPigFS(fSchema, hcatFieldSchema, pigSchema, tableSchema));
    } catch (HCatException he) {
      throw new FrontendException(he.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, he);
    }
  }
  HCatSchema s = new HCatSchema(fieldSchemas);
  // Guarded like the entry log above so the schema is only stringified when debug is on.
  if (LOG.isDebugEnabled()) {
    LOG.debug("convertPigSchemaToHCatSchema(computed)=(" + s + ")");
  }
  return s;
}
use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.
the class HCatBaseStorer method getHCatFSFromPigFS.
/**
 * Builds the HCat field schema for a single Pig field, recursing into bags and tuples.
 *
 * <p>The HCat table schema processed here is derived from the metastore, so it should describe
 * all fields/sub-fields, but not partition columns. A {@code null} {@code hcatFieldSchema} is
 * tolerated: it is only logged at debug level and a default type is chosen from the Pig type.
 *
 * @param fSchema Pig field to convert
 * @param hcatFieldSchema matching field of the target table, or {@code null} if there is none
 * @param pigSchema full Pig schema, passed through to recursive calls
 * @param tableSchema full table schema, passed through to recursive calls
 * @return the HCat field schema, named after the Pig field's alias
 * @throws FrontendException if the Pig type is unsupported, or if an integer is stored into a
 *         target type outside {@code SUPPORTED_INTEGER_CONVERSIONS}
 */
private HCatFieldSchema getHCatFSFromPigFS(FieldSchema fSchema, HCatFieldSchema hcatFieldSchema, Schema pigSchema, HCatSchema tableSchema) throws FrontendException, HCatException {
  // No matching target column: log only, do not fail.
  if (hcatFieldSchema == null && LOG.isDebugEnabled()) {
    LOG.debug("hcatFieldSchema is null for fSchema '" + fSchema.alias + "'");
  }

  final String alias = fSchema.alias;
  final byte pigType = fSchema.type;
  final boolean hasTarget = hcatFieldSchema != null;

  switch (pigType) {
    case DataType.CHARARRAY:
    case DataType.BIGCHARARRAY:
      // Reuse the target column's type info when it has one; otherwise plain string.
      return (hasTarget && hcatFieldSchema.getTypeInfo() != null)
          ? new HCatFieldSchema(alias, hcatFieldSchema.getTypeInfo(), null)
          : new HCatFieldSchema(alias, TypeInfoFactory.stringTypeInfo, null);

    case DataType.INTEGER:
      if (!hasTarget) {
        return new HCatFieldSchema(alias, TypeInfoFactory.intTypeInfo, null);
      }
      if (SUPPORTED_INTEGER_CONVERSIONS.contains(hcatFieldSchema.getType())) {
        return new HCatFieldSchema(alias, hcatFieldSchema.getTypeInfo(), null);
      }
      throw new FrontendException("Unsupported type: " + pigType + " in Pig's schema", PigHCatUtil.PIG_EXCEPTION_CODE);

    case DataType.LONG:
      return new HCatFieldSchema(alias, TypeInfoFactory.longTypeInfo, null);

    case DataType.FLOAT:
      return new HCatFieldSchema(alias, TypeInfoFactory.floatTypeInfo, null);

    case DataType.DOUBLE:
      return new HCatFieldSchema(alias, TypeInfoFactory.doubleTypeInfo, null);

    case DataType.BYTEARRAY:
      return new HCatFieldSchema(alias, TypeInfoFactory.binaryTypeInfo, null);

    case DataType.BOOLEAN:
      return new HCatFieldSchema(alias, TypeInfoFactory.booleanTypeInfo, null);

    case DataType.DATETIME:
      // The concrete type is controlled by the Hive target table; timestamp is the fallback.
      return (hasTarget && hcatFieldSchema.getTypeInfo() != null)
          ? new HCatFieldSchema(alias, hcatFieldSchema.getTypeInfo(), null)
          : new HCatFieldSchema(alias, TypeInfoFactory.timestampTypeInfo, null);

    case DataType.BIGDECIMAL:
      return (hasTarget && hcatFieldSchema.getTypeInfo() != null)
          ? new HCatFieldSchema(alias, hcatFieldSchema.getTypeInfo(), null)
          : new HCatFieldSchema(alias, TypeInfoFactory.decimalTypeInfo, null);

    case DataType.BAG: {
      Schema bagSchema = fSchema.schema;
      // Decide whether the tuple wrapping the bag's content is thrown away or kept.
      FieldSchema elementField = removeTupleFromBag(hcatFieldSchema, fSchema)
          ? bagSchema.getField(0).schema.getField(0)
          : bagSchema.getField(0);
      HCatFieldSchema targetElement = hasTarget ? hcatFieldSchema.getArrayElementSchema().get(0) : null;
      List<HCatFieldSchema> elementSchemas = new ArrayList<HCatFieldSchema>(1);
      elementSchemas.add(getHCatFSFromPigFS(elementField, targetElement, pigSchema, tableSchema));
      return new HCatFieldSchema(alias, Type.ARRAY, new HCatSchema(elementSchemas), "");
    }

    case DataType.TUPLE: {
      HCatSchema targetStruct = hasTarget ? hcatFieldSchema.getStructSubSchema() : null;
      List<HCatFieldSchema> members = new ArrayList<HCatFieldSchema>();
      int pos = 0;
      // Members are matched to the target struct by position.
      for (FieldSchema member : fSchema.schema.getFields()) {
        HCatFieldSchema targetMember = targetStruct == null ? null : targetStruct.get(pos);
        members.add(getHCatFSFromPigFS(member, targetMember, pigSchema, tableSchema));
        pos++;
      }
      return new HCatFieldSchema(alias, Type.STRUCT, new HCatSchema(members), "");
    }

    case DataType.MAP: {
      // Pig's schema contains no type information about a map's keys and values.
      // An existing column keeps whatever key/value types it already has.
      if (hasTarget) {
        return HCatFieldSchema.createMapTypeFieldSchema(alias, hcatFieldSchema.getMapKeyTypeInfo(), hcatFieldSchema.getMapValueSchema(), "");
      }
      // Column not found in the target table, so it is new: assume map<string,string>.
      List<HCatFieldSchema> valueSchemas = new ArrayList<HCatFieldSchema>(1);
      valueSchemas.add(new HCatFieldSchema(alias, TypeInfoFactory.stringTypeInfo, ""));
      return HCatFieldSchema.createMapTypeFieldSchema(alias, TypeInfoFactory.stringTypeInfo, new HCatSchema(valueSchemas), "");
    }

    case DataType.BIGINTEGER:
      // fall through; doesn't map to a Hive/HCat type; listed for completeness
    default:
      throw new FrontendException("Unsupported type: " + pigType + " in Pig's schema", PigHCatUtil.PIG_EXCEPTION_CODE);
  }
}
Aggregations