use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.
the class TestHCatPartitioned method columnOrderChangeTest.
// check behavior when the order of columns is changed
private void columnOrderChangeTest() throws Exception {
HCatSchema tableSchema = getTableSchema();
assertEquals(5, tableSchema.getFields().size());
partitionColumns = new ArrayList<HCatFieldSchema>();
partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c3", serdeConstants.STRING_TYPE_NAME, "")));
partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
writeRecords = new ArrayList<HCatRecord>();
for (int i = 0; i < 10; i++) {
List<Object> objList = new ArrayList<Object>();
objList.add(i);
objList.add("co strvalue" + i);
objList.add("co str2value" + i);
writeRecords.add(new DefaultHCatRecord(objList));
}
Map<String, String> partitionMap = new HashMap<String, String>();
partitionMap.put("part1", "p1value8");
partitionMap.put("part0", "508");
Exception exc = null;
try {
runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
} catch (IOException e) {
exc = e;
}
assertTrue(exc != null);
assertTrue(exc instanceof HCatException);
assertEquals(ErrorType.ERROR_SCHEMA_COLUMN_MISMATCH, ((HCatException) exc).getErrorType());
partitionColumns = new ArrayList<HCatFieldSchema>();
partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
writeRecords = new ArrayList<HCatRecord>();
for (int i = 0; i < 10; i++) {
List<Object> objList = new ArrayList<Object>();
objList.add(i);
objList.add("co strvalue" + i);
writeRecords.add(new DefaultHCatRecord(objList));
}
runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
if (isTableImmutable()) {
// Read should get 10 + 20 + 10 + 10 + 20 rows
runMRRead(70);
} else {
// +20 from the duplicate publish
runMRRead(90);
}
}
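The first publish above writes columns in the order (c1, c3, c2), which does not match the table's column order, so runMRCreate fails with ERROR_SCHEMA_COLUMN_MISMATCH; reordering the writer schema to (c1, c2) lets the second publish succeed. As a hedged illustration (not part of HCatalog), a caller could pre-check its writer columns against the table schema with a small helper like the one below; the class and method names are invented for the example, and it relies only on HCatSchema.getFields() and HCatFieldSchema.getName(), which the test itself uses.

import java.util.List;

import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

public class SchemaOrderCheck {
  /**
   * Returns true if the writer columns appear in the same relative order as in the
   * table schema. Table columns the writer does not supply (e.g. partition columns,
   * which are carried in the partition map instead) are skipped.
   */
  public static boolean isColumnOrderCompatible(List<HCatFieldSchema> writerColumns, HCatSchema tableSchema) {
    List<HCatFieldSchema> tableColumns = tableSchema.getFields();
    int tablePos = 0;
    for (HCatFieldSchema writerColumn : writerColumns) {
      boolean found = false;
      while (tablePos < tableColumns.size()) {
        if (tableColumns.get(tablePos++).getName().equalsIgnoreCase(writerColumn.getName())) {
          found = true;
          break;
        }
      }
      if (!found) {
        // column is missing or appears out of order relative to the table schema
        return false;
      }
    }
    return true;
  }
}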
use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.
the class HCatBaseStorer method validateSchema.
/**
* This method encodes which Pig types can map to (i.e. be stored in) which HCat types.
* @throws HCatException
* @throws FrontendException
*/
private void validateSchema(FieldSchema pigField, HCatFieldSchema hcatField, Schema topLevelPigSchema, HCatSchema topLevelHCatSchema, int columnPos) throws HCatException, FrontendException {
validateAlias(pigField.alias);
byte type = pigField.type;
if (DataType.isComplex(type)) {
switch(type) {
case DataType.MAP:
if (hcatField != null) {
if (hcatField.getMapKeyType() != Type.STRING) {
throw new FrontendException("Key Type of map must be String " + hcatField, PigHCatUtil.PIG_EXCEPTION_CODE);
}
// Map values can be primitive or complex
}
break;
case DataType.BAG:
HCatSchema arrayElementSchema = hcatField == null ? null : hcatField.getArrayElementSchema();
for (FieldSchema innerField : pigField.schema.getField(0).schema.getFields()) {
validateSchema(innerField, getColFromSchema(pigField.alias, arrayElementSchema), topLevelPigSchema, topLevelHCatSchema, columnPos);
}
break;
case DataType.TUPLE:
HCatSchema structSubSchema = hcatField == null ? null : hcatField.getStructSubSchema();
for (FieldSchema innerField : pigField.schema.getFields()) {
validateSchema(innerField, getColFromSchema(pigField.alias, structSubSchema), topLevelPigSchema, topLevelHCatSchema, columnPos);
}
break;
default:
throw new FrontendException("Internal Error.", PigHCatUtil.PIG_EXCEPTION_CODE);
}
} else if (hcatField != null) {
// there is no point trying to validate further if we have no type info about target field
switch(type) {
case DataType.BIGDECIMAL:
throwTypeMismatchException(type, Lists.newArrayList(Type.DECIMAL), hcatField, columnPos);
break;
case DataType.DATETIME:
throwTypeMismatchException(type, Lists.newArrayList(Type.TIMESTAMP, Type.DATE), hcatField, columnPos);
break;
case DataType.BYTEARRAY:
throwTypeMismatchException(type, Lists.newArrayList(Type.BINARY), hcatField, columnPos);
break;
case DataType.BIGINTEGER:
throwTypeMismatchException(type, Collections.<Type>emptyList(), hcatField, columnPos);
break;
case DataType.BOOLEAN:
throwTypeMismatchException(type, Lists.newArrayList(Type.BOOLEAN), hcatField, columnPos);
break;
case DataType.CHARARRAY:
throwTypeMismatchException(type, Lists.newArrayList(Type.STRING, Type.CHAR, Type.VARCHAR), hcatField, columnPos);
break;
case DataType.DOUBLE:
throwTypeMismatchException(type, Lists.newArrayList(Type.DOUBLE), hcatField, columnPos);
break;
case DataType.FLOAT:
throwTypeMismatchException(type, Lists.newArrayList(Type.FLOAT), hcatField, columnPos);
break;
case DataType.INTEGER:
throwTypeMismatchException(type, Lists.newArrayList(Type.INT, Type.BIGINT, Type.TINYINT, Type.SMALLINT), hcatField, columnPos);
break;
case DataType.LONG:
throwTypeMismatchException(type, Lists.newArrayList(Type.BIGINT), hcatField, columnPos);
break;
default:
throw new FrontendException("'" + type + "' Pig datatype in column " + columnPos + "(0-based) is not supported by HCat", PigHCatUtil.PIG_EXCEPTION_CODE);
}
} else {
if (false) {
// see HIVE-6194
throw new FrontendException("(pigSch,hcatSchema)=(" + pigField + "," + "" + hcatField + ") (topPig, topHcat)=(" + topLevelPigSchema + "," + "" + topLevelHCatSchema + ")");
}
}
}
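The switch above is the single place that encodes which HCat column types each Pig primitive type may be stored into. As a hedged sketch (illustrative only, not HCatalog code), the same mapping can be written as a lookup table; the class name PigToHCatTypeMap is invented, while the DataType and Type constants are exactly the ones used in the method above.

import java.util.Collections;
import java.util.List;
import java.util.Map;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;

import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.pig.data.DataType;

public class PigToHCatTypeMap {
  /** Allowed HCat target types per Pig primitive type, mirroring the switch in validateSchema(). */
  static final Map<Byte, List<Type>> ALLOWED = ImmutableMap.<Byte, List<Type>>builder()
      .put(DataType.BIGDECIMAL, Lists.newArrayList(Type.DECIMAL))
      .put(DataType.DATETIME, Lists.newArrayList(Type.TIMESTAMP, Type.DATE))
      .put(DataType.BYTEARRAY, Lists.newArrayList(Type.BINARY))
      .put(DataType.BIGINTEGER, Collections.<Type>emptyList()) // no HCat counterpart
      .put(DataType.BOOLEAN, Lists.newArrayList(Type.BOOLEAN))
      .put(DataType.CHARARRAY, Lists.newArrayList(Type.STRING, Type.CHAR, Type.VARCHAR))
      .put(DataType.DOUBLE, Lists.newArrayList(Type.DOUBLE))
      .put(DataType.FLOAT, Lists.newArrayList(Type.FLOAT))
      .put(DataType.INTEGER, Lists.newArrayList(Type.INT, Type.BIGINT, Type.TINYINT, Type.SMALLINT))
      .put(DataType.LONG, Lists.newArrayList(Type.BIGINT))
      .build();
}

A validator built this way would look up ALLOWED.get(pigType) and raise the mismatch error whenever the target HCat type is not in the returned list.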
use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.
the class HCatLoader method setLocation.
@Override
public void setLocation(String location, Job job) throws IOException {
HCatContext.INSTANCE.setConf(job.getConfiguration()).getConf().get().setBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, true);
UDFContext udfContext = UDFContext.getUDFContext();
Properties udfProps = udfContext.getUDFProperties(this.getClass(), new String[] { signature });
job.getConfiguration().set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + signature);
Pair<String, String> dbTablePair = PigHCatUtil.getDBTableNames(location);
dbName = dbTablePair.first;
tableName = dbTablePair.second;
RequiredFieldList requiredFieldsInfo = (RequiredFieldList) udfProps.get(PRUNE_PROJECTION_INFO);
// If setLocation has already run for this loader (its properties are cached in the UDFContext), restore them into the Configuration.
if (udfProps.containsKey(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET)) {
for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements(); ) {
PigHCatUtil.getConfigFromUDFProperties(udfProps, job.getConfiguration(), emr.nextElement().toString());
}
if (!HCatUtil.checkJobContextIfRunningFromBackend(job)) {
// Combine credentials; credentials from the job take precedence for freshness
Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + signature);
job.getCredentials().addAll(crd);
}
} else {
Job clone = new Job(job.getConfiguration());
HCatInputFormat.setInput(job, dbName, tableName, getPartitionFilterString());
InputJobInfo inputJobInfo = (InputJobInfo) HCatUtil.deserialize(job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO));
SpecialCases.addSpecialCasesParametersForHCatLoader(job.getConfiguration(), inputJobInfo.getTableInfo());
// Cache any new or changed Configuration entries in the UDF properties so that setInput need not be called many times.
for (Entry<String, String> keyValue : job.getConfiguration()) {
String oldValue = clone.getConfiguration().getRaw(keyValue.getKey());
if ((oldValue == null) || (keyValue.getValue().equals(oldValue) == false)) {
udfProps.put(keyValue.getKey(), keyValue.getValue());
}
}
udfProps.put(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET, true);
// Store credentials in a private hash map and not the udf context to
// make sure they are not public.
Credentials crd = new Credentials();
crd.addAll(job.getCredentials());
jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + signature, crd);
}
if (requiredFieldsInfo != null) {
// convert to HCatSchema and pass to HCatInputFormat
try {
// push down projections to the columnar store; this works for RCFile and ORC files
ArrayList<Integer> list = new ArrayList<Integer>(requiredFieldsInfo.getFields().size());
for (RequiredField rf : requiredFieldsInfo.getFields()) {
list.add(rf.getIndex());
}
ColumnProjectionUtils.setReadColumns(job.getConfiguration(), list);
outputSchema = phutil.getHCatSchema(requiredFieldsInfo.getFields(), signature, this.getClass());
HCatInputFormat.setOutputSchema(job, outputSchema);
} catch (Exception e) {
throw new IOException(e);
}
} else {
// else - this means pig's optimizer never invoked the pushProjection
// method - so we need all fields and hence we should not call the
// setOutputSchema on HCatInputFormat
ColumnProjectionUtils.setReadAllColumns(job.getConfiguration());
if (HCatUtil.checkJobContextIfRunningFromBackend(job)) {
try {
HCatSchema hcatTableSchema = (HCatSchema) udfProps.get(HCatConstants.HCAT_TABLE_SCHEMA);
outputSchema = hcatTableSchema;
HCatInputFormat.setOutputSchema(job, outputSchema);
} catch (Exception e) {
throw new IOException(e);
}
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("outputSchema=" + outputSchema);
}
}
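Stripped of the UDFContext caching, the projection-pushdown path reduces to three calls on HCatInputFormat: setInput to resolve the table, ColumnProjectionUtils.setReadColumns for the columnar readers, and setOutputSchema with the pruned schema. The sketch below shows that front-end sequence in plain MapReduce terms; it is an illustration, not HCatLoader code: the database/table names and the choice of the first two columns are placeholders, and it assumes HCatInputFormat exposes getTableSchema(Configuration) via HCatBaseInputFormat.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class ProjectionPushdownSketch {
  public static void configureRead(Job job) throws Exception {
    // Resolve the table on the front end; this call talks to the metastore.
    HCatInputFormat.setInput(job, "default", "mytable", null /* partition filter */);

    // Full table schema as recorded by setInput().
    Configuration conf = job.getConfiguration();
    HCatSchema tableSchema = HCatInputFormat.getTableSchema(conf);

    // Keep only the first two columns, mirroring what HCatLoader does when Pig
    // pushes a projection down to it.
    HCatSchema projected = new HCatSchema(Arrays.asList(tableSchema.get(0), tableSchema.get(1)));
    HCatInputFormat.setOutputSchema(job, projected);
  }
}

HCatLoader performs this work only once on the front end and caches the resulting configuration changes in the UDFContext, which is what the HCAT_PIG_LOADER_LOCATION_SET branch above restores on later calls.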
use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.
the class HCatStorer method setStoreLocation.
/**
* @param location databaseName.tableName
*/
@Override
public void setStoreLocation(String location, Job job) throws IOException {
Configuration config = job.getConfiguration();
config.set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + sign);
Properties udfProps = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[] { sign });
String[] userStr = location.split("\\.");
if (udfProps.containsKey(HCatConstants.HCAT_PIG_STORER_LOCATION_SET)) {
for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements(); ) {
PigHCatUtil.getConfigFromUDFProperties(udfProps, config, emr.nextElement().toString());
}
Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + sign);
if (crd != null) {
job.getCredentials().addAll(crd);
}
} else {
Job clone = new Job(job.getConfiguration());
OutputJobInfo outputJobInfo;
if (userStr.length == 2) {
outputJobInfo = OutputJobInfo.create(userStr[0], userStr[1], partitions);
} else if (userStr.length == 1) {
outputJobInfo = OutputJobInfo.create(null, userStr[0], partitions);
} else {
throw new FrontendException("location " + location + " is invalid. It must be of the form [db.]table", PigHCatUtil.PIG_EXCEPTION_CODE);
}
Schema schema = (Schema) ObjectSerializer.deserialize(udfProps.getProperty(PIG_SCHEMA));
if (schema != null) {
pigSchema = schema;
}
if (pigSchema == null) {
throw new FrontendException("Schema for data cannot be determined.", PigHCatUtil.PIG_EXCEPTION_CODE);
}
String externalLocation = (String) udfProps.getProperty(HCatConstants.HCAT_PIG_STORER_EXTERNAL_LOCATION);
if (externalLocation != null) {
outputJobInfo.setLocation(externalLocation);
}
try {
HCatOutputFormat.setOutput(job, outputJobInfo);
} catch (HCatException he) {
// information passed to HCatOutputFormat was not right
throw new PigException(he.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, he);
}
HCatSchema hcatTblSchema = HCatOutputFormat.getTableSchema(job.getConfiguration());
try {
doSchemaValidations(pigSchema, hcatTblSchema);
} catch (HCatException he) {
throw new FrontendException(he.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, he);
}
computedSchema = convertPigSchemaToHCatSchema(pigSchema, hcatTblSchema);
HCatOutputFormat.setSchema(job, computedSchema);
udfProps.setProperty(COMPUTED_OUTPUT_SCHEMA, ObjectSerializer.serialize(computedSchema));
// Cache any new or changed Configuration entries in the UDF properties so that the setOutput/setSchema methods need not be called many times.
for (Entry<String, String> keyValue : job.getConfiguration()) {
String oldValue = clone.getConfiguration().getRaw(keyValue.getKey());
if ((oldValue == null) || (keyValue.getValue().equals(oldValue) == false)) {
udfProps.put(keyValue.getKey(), keyValue.getValue());
}
}
// Store credentials in a private hash map and not the udf context to
// make sure they are not public.
jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + sign, job.getCredentials());
udfProps.put(HCatConstants.HCAT_PIG_STORER_LOCATION_SET, true);
}
}
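The storer's write path is symmetrical: OutputJobInfo.create names the target table and partition, HCatOutputFormat.setOutput registers it with the job, and getTableSchema/setSchema fix the schema of the records to be written. Below is a hedged, stand-alone sketch of that sequence using only calls that appear above; the database, table, and partition values are placeholders.

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;

public class WritePathSketch {
  public static void configureWrite(Job job) throws Exception {
    // Target partition; keys must match the table's partition columns.
    Map<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("part0", "508");

    // Describe the output table and register it with the job.
    OutputJobInfo outputJobInfo = OutputJobInfo.create("default", "mytable", partitionValues);
    HCatOutputFormat.setOutput(job, outputJobInfo);

    // Fetch the table schema recorded by setOutput() and declare it as the
    // schema of the records this job will write.
    HCatSchema tableSchema = HCatOutputFormat.getTableSchema(job.getConfiguration());
    HCatOutputFormat.setSchema(job, tableSchema);
  }
}

In HCatStorer itself the schema handed to setSchema is not the raw table schema but computedSchema, the Pig schema converted and validated against the table.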
use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.
the class JsonSerDe method extractCurrentField.
/**
* Utility method to extract the current expected field from the given JsonParser.
*
* isTokenCurrent indicates whether the JsonParser is already positioned at the
* token we expect to read next, or needs to be advanced to it before we read.
*/
private Object extractCurrentField(JsonParser p, HCatFieldSchema hcatFieldSchema, boolean isTokenCurrent) throws IOException {
Object val = null;
JsonToken valueToken;
if (isTokenCurrent) {
valueToken = p.getCurrentToken();
} else {
valueToken = p.nextToken();
}
switch(hcatFieldSchema.getType()) {
case INT:
val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue();
break;
case TINYINT:
val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue();
break;
case SMALLINT:
val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue();
break;
case BIGINT:
val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue();
break;
case BOOLEAN:
String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
if (bval != null) {
val = Boolean.valueOf(bval);
} else {
val = null;
}
break;
case FLOAT:
val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue();
break;
case DOUBLE:
val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue();
break;
case STRING:
val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
break;
case BINARY:
String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
if (b != null) {
try {
String t = Text.decode(b.getBytes(), 0, b.getBytes().length);
return t.getBytes();
} catch (CharacterCodingException e) {
LOG.warn("Error generating json binary type from object.", e);
return null;
}
} else {
val = null;
}
break;
case DATE:
val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText());
break;
case TIMESTAMP:
val = (valueToken == JsonToken.VALUE_NULL) ? null : tsParser.parseTimestamp(p.getText());
break;
case DECIMAL:
val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText());
break;
case VARCHAR:
int vLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen);
break;
case CHAR:
int cLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen);
break;
case ARRAY:
if (valueToken == JsonToken.VALUE_NULL) {
val = null;
break;
}
if (valueToken != JsonToken.START_ARRAY) {
throw new IOException("Start of Array expected");
}
List<Object> arr = new ArrayList<Object>();
while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
arr.add(extractCurrentField(p, hcatFieldSchema.getArrayElementSchema().get(0), true));
}
val = arr;
break;
case MAP:
if (valueToken == JsonToken.VALUE_NULL) {
val = null;
break;
}
if (valueToken != JsonToken.START_OBJECT) {
throw new IOException("Start of Object expected");
}
Map<Object, Object> map = new LinkedHashMap<Object, Object>();
HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0);
while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(), hcatFieldSchema.getMapKeyTypeInfo());
Object v = extractCurrentField(p, valueSchema, false);
map.put(k, v);
}
val = map;
break;
case STRUCT:
if (valueToken == JsonToken.VALUE_NULL) {
val = null;
break;
}
if (valueToken != JsonToken.START_OBJECT) {
throw new IOException("Start of Object expected");
}
HCatSchema subSchema = hcatFieldSchema.getStructSubSchema();
int sz = subSchema.getFieldNames().size();
List<Object> struct = new ArrayList<Object>(Collections.nCopies(sz, null));
while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
populateRecord(struct, valueToken, p, subSchema);
}
val = struct;
break;
default:
LOG.error("Unknown type found: " + hcatFieldSchema.getType());
return null;
}
return val;
}
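extractCurrentField descends recursively, letting the HCatFieldSchema decide how each JSON token is interpreted: primitives are read straight off the parser, while ARRAY, MAP, and STRUCT loop over tokens until the matching END_ARRAY/END_OBJECT and recurse with the element, value, or field schema. The stand-alone sketch below reproduces the ARRAY branch's token walk for a simple array of ints; it is illustrative only, and since the snippet does not show its imports, the Jackson 2 package (com.fasterxml.jackson.core) and the class name are assumptions.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;

public class TokenWalkSketch {
  /** Reads a JSON array of ints the same way the ARRAY branch above does. */
  public static List<Integer> readIntArray(String json) throws IOException {
    JsonParser p = new JsonFactory().createParser(json);
    JsonToken valueToken = p.nextToken();
    if (valueToken == JsonToken.VALUE_NULL) {
      return null; // null column
    }
    if (valueToken != JsonToken.START_ARRAY) {
      throw new IOException("Start of Array expected");
    }
    List<Integer> arr = new ArrayList<Integer>();
    while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
      // element extraction: the parser is already positioned at the value token
      arr.add(valueToken == JsonToken.VALUE_NULL ? null : p.getIntValue());
    }
    return arr;
  }

  public static void main(String[] args) throws IOException {
    System.out.println(readIntArray("[1, 2, null, 4]")); // [1, 2, null, 4]
  }
}

In the real method the recursion is driven by getArrayElementSchema(), getMapValueSchema(), and getStructSubSchema() rather than a hard-coded element type.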