Example 41 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

The class TestHCatPartitioned, method columnOrderChangeTest.

// Check behavior when changing the order of columns
private void columnOrderChangeTest() throws Exception {
    HCatSchema tableSchema = getTableSchema();
    assertEquals(5, tableSchema.getFields().size());
    partitionColumns = new ArrayList<HCatFieldSchema>();
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c3", serdeConstants.STRING_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
    writeRecords = new ArrayList<HCatRecord>();
    for (int i = 0; i < 10; i++) {
        List<Object> objList = new ArrayList<Object>();
        objList.add(i);
        objList.add("co strvalue" + i);
        objList.add("co str2value" + i);
        writeRecords.add(new DefaultHCatRecord(objList));
    }
    Map<String, String> partitionMap = new HashMap<String, String>();
    partitionMap.put("part1", "p1value8");
    partitionMap.put("part0", "508");
    Exception exc = null;
    try {
        runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
    } catch (IOException e) {
        exc = e;
    }
    assertTrue(exc != null);
    assertTrue(exc instanceof HCatException);
    assertEquals(ErrorType.ERROR_SCHEMA_COLUMN_MISMATCH, ((HCatException) exc).getErrorType());
    partitionColumns = new ArrayList<HCatFieldSchema>();
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
    writeRecords = new ArrayList<HCatRecord>();
    for (int i = 0; i < 10; i++) {
        List<Object> objList = new ArrayList<Object>();
        objList.add(i);
        objList.add("co strvalue" + i);
        writeRecords.add(new DefaultHCatRecord(objList));
    }
    runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
    if (isTableImmutable()) {
        // Read should get 10 + 20 + 10 + 10 + 20 rows
        runMRRead(70);
    } else {
        // +20 from the duplicate publish
        runMRRead(90);
    }
}
Also used : HashMap(java.util.HashMap) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) HCatException(org.apache.hive.hcatalog.common.HCatException) IOException(java.io.IOException) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) DefaultHCatRecord(org.apache.hive.hcatalog.data.DefaultHCatRecord) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
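
The conversion the test leans on, from a metastore FieldSchema to an HCatFieldSchema, can be exercised on its own. A minimal sketch, assuming the same Hive/HCatalog jars on the classpath; the class name and column values are illustrative:

import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;

public class SchemaConversionSketch {
    public static void main(String[] args) throws HCatException {
        // Metastore-level column definition: name, type name, comment.
        FieldSchema metastoreCol = new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "");
        // Convert to the HCat view of the column, as the test above does
        // when building its partition schema.
        HCatFieldSchema hcatCol = HCatSchemaUtils.getHCatFieldSchema(metastoreCol);
        System.out.println(hcatCol.getName() + " : " + hcatCol.getType());
    }
}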

Example 42 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

The class HCatBaseStorer, method validateSchema.

/**
 * This method encodes which Pig type can map to (be stored in) which HCat type.
 * @throws HCatException
 * @throws FrontendException
 */
private void validateSchema(FieldSchema pigField, HCatFieldSchema hcatField, Schema topLevelPigSchema, HCatSchema topLevelHCatSchema, int columnPos) throws HCatException, FrontendException {
    validateAlias(pigField.alias);
    byte type = pigField.type;
    if (DataType.isComplex(type)) {
        switch(type) {
            case DataType.MAP:
                if (hcatField != null) {
                    if (hcatField.getMapKeyType() != Type.STRING) {
                        throw new FrontendException("Key Type of map must be String " + hcatField, PigHCatUtil.PIG_EXCEPTION_CODE);
                    }
                // Map values can be primitive or complex
                }
                break;
            case DataType.BAG:
                HCatSchema arrayElementSchema = hcatField == null ? null : hcatField.getArrayElementSchema();
                for (FieldSchema innerField : pigField.schema.getField(0).schema.getFields()) {
                    validateSchema(innerField, getColFromSchema(pigField.alias, arrayElementSchema), topLevelPigSchema, topLevelHCatSchema, columnPos);
                }
                break;
            case DataType.TUPLE:
                HCatSchema structSubSchema = hcatField == null ? null : hcatField.getStructSubSchema();
                for (FieldSchema innerField : pigField.schema.getFields()) {
                    validateSchema(innerField, getColFromSchema(pigField.alias, structSubSchema), topLevelPigSchema, topLevelHCatSchema, columnPos);
                }
                break;
            default:
                throw new FrontendException("Internal Error.", PigHCatUtil.PIG_EXCEPTION_CODE);
        }
    } else if (hcatField != null) {
        // there is no point trying to validate further if we have no type info about target field
        switch(type) {
            case DataType.BIGDECIMAL:
                throwTypeMismatchException(type, Lists.newArrayList(Type.DECIMAL), hcatField, columnPos);
                break;
            case DataType.DATETIME:
                throwTypeMismatchException(type, Lists.newArrayList(Type.TIMESTAMP, Type.DATE), hcatField, columnPos);
                break;
            case DataType.BYTEARRAY:
                throwTypeMismatchException(type, Lists.newArrayList(Type.BINARY), hcatField, columnPos);
                break;
            case DataType.BIGINTEGER:
                throwTypeMismatchException(type, Collections.<Type>emptyList(), hcatField, columnPos);
                break;
            case DataType.BOOLEAN:
                throwTypeMismatchException(type, Lists.newArrayList(Type.BOOLEAN), hcatField, columnPos);
                break;
            case DataType.CHARARRAY:
                throwTypeMismatchException(type, Lists.newArrayList(Type.STRING, Type.CHAR, Type.VARCHAR), hcatField, columnPos);
                break;
            case DataType.DOUBLE:
                throwTypeMismatchException(type, Lists.newArrayList(Type.DOUBLE), hcatField, columnPos);
                break;
            case DataType.FLOAT:
                throwTypeMismatchException(type, Lists.newArrayList(Type.FLOAT), hcatField, columnPos);
                break;
            case DataType.INTEGER:
                throwTypeMismatchException(type, Lists.newArrayList(Type.INT, Type.BIGINT, Type.TINYINT, Type.SMALLINT), hcatField, columnPos);
                break;
            case DataType.LONG:
                throwTypeMismatchException(type, Lists.newArrayList(Type.BIGINT), hcatField, columnPos);
                break;
            default:
                throw new FrontendException("'" + type + "' Pig datatype in column " + columnPos + "(0-based) is not supported by HCat", PigHCatUtil.PIG_EXCEPTION_CODE);
        }
    } else {
        // hcatField is null; this diagnostic is intentionally disabled (see HIVE-6194).
        if (false) {
            throw new FrontendException("(pigSch,hcatSchema)=(" + pigField + "," + hcatField + ") (topPig, topHcat)=(" + topLevelPigSchema + "," + topLevelHCatSchema + ")");
        }
    }
}
Also used : DataType(org.apache.pig.data.DataType) Type(org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) FieldSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema) FrontendException(org.apache.pig.impl.logicalLayer.FrontendException)
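
The primitive branches of the switch above boil down to a lookup from each Pig type to the HCat types it may be stored in. A hedged restatement of that table as plain data; the class and field names are illustrative, not part of Hive:

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.pig.data.DataType;

public class PigToHCatMappings {
    // Each Pig primitive type maps to the set of HCat types it may be stored in;
    // the empty list for BIGINTEGER means there is no compatible HCat target.
    static final Map<Byte, List<Type>> ALLOWED = new HashMap<Byte, List<Type>>();
    static {
        ALLOWED.put(DataType.BIGDECIMAL, Arrays.asList(Type.DECIMAL));
        ALLOWED.put(DataType.DATETIME, Arrays.asList(Type.TIMESTAMP, Type.DATE));
        ALLOWED.put(DataType.BYTEARRAY, Arrays.asList(Type.BINARY));
        ALLOWED.put(DataType.BIGINTEGER, Collections.<Type>emptyList());
        ALLOWED.put(DataType.BOOLEAN, Arrays.asList(Type.BOOLEAN));
        ALLOWED.put(DataType.CHARARRAY, Arrays.asList(Type.STRING, Type.CHAR, Type.VARCHAR));
        ALLOWED.put(DataType.DOUBLE, Arrays.asList(Type.DOUBLE));
        ALLOWED.put(DataType.FLOAT, Arrays.asList(Type.FLOAT));
        ALLOWED.put(DataType.INTEGER, Arrays.asList(Type.INT, Type.BIGINT, Type.TINYINT, Type.SMALLINT));
        ALLOWED.put(DataType.LONG, Arrays.asList(Type.BIGINT));
    }
}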

Example 43 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

The class HCatLoader, method setLocation.

@Override
public void setLocation(String location, Job job) throws IOException {
    HCatContext.INSTANCE.setConf(job.getConfiguration()).getConf().get().setBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, true);
    UDFContext udfContext = UDFContext.getUDFContext();
    Properties udfProps = udfContext.getUDFProperties(this.getClass(), new String[] { signature });
    job.getConfiguration().set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + signature);
    Pair<String, String> dbTablePair = PigHCatUtil.getDBTableNames(location);
    dbName = dbTablePair.first;
    tableName = dbTablePair.second;
    RequiredFieldList requiredFieldsInfo = (RequiredFieldList) udfProps.get(PRUNE_PROJECTION_INFO);
    // If the location was already set for this signature, push the saved properties back into the Configuration.
    if (udfProps.containsKey(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET)) {
        for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements(); ) {
            PigHCatUtil.getConfigFromUDFProperties(udfProps, job.getConfiguration(), emr.nextElement().toString());
        }
        if (!HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            // Combine credentials; credentials from the job take precedence for freshness.
            Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + signature);
            job.getCredentials().addAll(crd);
        }
    } else {
        Job clone = new Job(job.getConfiguration());
        HCatInputFormat.setInput(job, dbName, tableName, getPartitionFilterString());
        InputJobInfo inputJobInfo = (InputJobInfo) HCatUtil.deserialize(job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO));
        SpecialCases.addSpecialCasesParametersForHCatLoader(job.getConfiguration(), inputJobInfo.getTableInfo());
        // Save all new or changed properties in the UDF context so that
        // HCatInputFormat.setInput need not be called many times.
        for (Entry<String, String> keyValue : job.getConfiguration()) {
            String oldValue = clone.getConfiguration().getRaw(keyValue.getKey());
            if ((oldValue == null) || !keyValue.getValue().equals(oldValue)) {
                udfProps.put(keyValue.getKey(), keyValue.getValue());
            }
        }
        udfProps.put(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET, true);
        // Store credentials in a private hash map and not the udf context to
        // make sure they are not public.
        Credentials crd = new Credentials();
        crd.addAll(job.getCredentials());
        jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + signature, crd);
    }
    if (requiredFieldsInfo != null) {
        // Convert to an HCatSchema and pass it to HCatInputFormat.
        try {
            // Push-down of projections to the columnar store works for RCFile and ORCFile.
            ArrayList<Integer> list = new ArrayList<Integer>(requiredFieldsInfo.getFields().size());
            for (RequiredField rf : requiredFieldsInfo.getFields()) {
                list.add(rf.getIndex());
            }
            ColumnProjectionUtils.setReadColumns(job.getConfiguration(), list);
            outputSchema = phutil.getHCatSchema(requiredFieldsInfo.getFields(), signature, this.getClass());
            HCatInputFormat.setOutputSchema(job, outputSchema);
        } catch (Exception e) {
            throw new IOException(e);
        }
    } else {
        // else - this means pig's optimizer never invoked the pushProjection
        // method - so we need all fields and hence we should not call the
        // setOutputSchema on HCatInputFormat
        ColumnProjectionUtils.setReadAllColumns(job.getConfiguration());
        if (HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            try {
                HCatSchema hcatTableSchema = (HCatSchema) udfProps.get(HCatConstants.HCAT_TABLE_SCHEMA);
                outputSchema = hcatTableSchema;
                HCatInputFormat.setOutputSchema(job, outputSchema);
            } catch (Exception e) {
                throw new IOException(e);
            }
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("outputSchema=" + outputSchema);
    }
}
Also used : ArrayList(java.util.ArrayList) UDFContext(org.apache.pig.impl.util.UDFContext) IOException(java.io.IOException) Properties(java.util.Properties) HCatException(org.apache.hive.hcatalog.common.HCatException) PigException(org.apache.pig.PigException) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Job(org.apache.hadoop.mapreduce.Job) InputJobInfo(org.apache.hive.hcatalog.mapreduce.InputJobInfo) Credentials(org.apache.hadoop.security.Credentials)
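
Outside of Pig, the wiring that setLocation performs can be done directly against HCatInputFormat in a MapReduce driver. A minimal sketch, assuming a table default.mytable exists; the job name and table names are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatInputSketch {
    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "hcat-read-sketch");
        // Bind the job to a table; the fourth argument is an optional
        // partition filter string (null reads all partitions).
        HCatInputFormat.setInput(job, "default", "mytable", null);
        job.setInputFormatClass(HCatInputFormat.class);
        // If only some columns are needed, hand HCatInputFormat a pruned
        // schema, as the projection branch above does:
        // HCatInputFormat.setOutputSchema(job, prunedSchema);
    }
}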

Example 44 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

The class HCatStorer, method setStoreLocation.

/**
 * @param location databaseName.tableName
 */
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    Configuration config = job.getConfiguration();
    config.set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + sign);
    Properties udfProps = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[] { sign });
    String[] userStr = location.split("\\.");
    if (udfProps.containsKey(HCatConstants.HCAT_PIG_STORER_LOCATION_SET)) {
        for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements(); ) {
            PigHCatUtil.getConfigFromUDFProperties(udfProps, config, emr.nextElement().toString());
        }
        Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + sign);
        if (crd != null) {
            job.getCredentials().addAll(crd);
        }
    } else {
        Job clone = new Job(job.getConfiguration());
        OutputJobInfo outputJobInfo;
        if (userStr.length == 2) {
            outputJobInfo = OutputJobInfo.create(userStr[0], userStr[1], partitions);
        } else if (userStr.length == 1) {
            outputJobInfo = OutputJobInfo.create(null, userStr[0], partitions);
        } else {
            throw new FrontendException("location " + location + " is invalid. It must be of the form [db.]table", PigHCatUtil.PIG_EXCEPTION_CODE);
        }
        Schema schema = (Schema) ObjectSerializer.deserialize(udfProps.getProperty(PIG_SCHEMA));
        if (schema != null) {
            pigSchema = schema;
        }
        if (pigSchema == null) {
            throw new FrontendException("Schema for data cannot be determined.", PigHCatUtil.PIG_EXCEPTION_CODE);
        }
        String externalLocation = udfProps.getProperty(HCatConstants.HCAT_PIG_STORER_EXTERNAL_LOCATION);
        if (externalLocation != null) {
            outputJobInfo.setLocation(externalLocation);
        }
        try {
            HCatOutputFormat.setOutput(job, outputJobInfo);
        } catch (HCatException he) {
            // information passed to HCatOutputFormat was not right
            throw new PigException(he.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, he);
        }
        HCatSchema hcatTblSchema = HCatOutputFormat.getTableSchema(job.getConfiguration());
        try {
            doSchemaValidations(pigSchema, hcatTblSchema);
        } catch (HCatException he) {
            throw new FrontendException(he.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, he);
        }
        computedSchema = convertPigSchemaToHCatSchema(pigSchema, hcatTblSchema);
        HCatOutputFormat.setSchema(job, computedSchema);
        udfProps.setProperty(COMPUTED_OUTPUT_SCHEMA, ObjectSerializer.serialize(computedSchema));
        // Save all new or changed properties in the UDF context so that the
        // setOutput and setSchema methods need not be called many times.
        for (Entry<String, String> keyValue : job.getConfiguration()) {
            String oldValue = clone.getConfiguration().getRaw(keyValue.getKey());
            if ((oldValue == null) || !keyValue.getValue().equals(oldValue)) {
                udfProps.put(keyValue.getKey(), keyValue.getValue());
            }
        }
        // Store credentials in a private hash map and not the udf context to
        // make sure they are not public.
        jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + sign, job.getCredentials());
        udfProps.put(HCatConstants.HCAT_PIG_STORER_LOCATION_SET, true);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.pig.impl.logicalLayer.schema.Schema) ResourceSchema(org.apache.pig.ResourceSchema) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HCatException(org.apache.hive.hcatalog.common.HCatException) Properties(java.util.Properties) OutputJobInfo(org.apache.hive.hcatalog.mapreduce.OutputJobInfo) Job(org.apache.hadoop.mapreduce.Job) Credentials(org.apache.hadoop.security.Credentials) FrontendException(org.apache.pig.impl.logicalLayer.FrontendException) PigException(org.apache.pig.PigException)
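
The write-side sequence above (setOutput, then getTableSchema, then setSchema) can likewise be driven directly. A minimal sketch with placeholder database, table, and partition values:

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;

public class HCatOutputSketch {
    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "hcat-write-sketch");
        Map<String, String> partitionValues = new HashMap<String, String>();
        partitionValues.put("ds", "2024-01-01");
        // create(db, table, partitionValues); a null db means the default
        // database, as in the single-component location branch above.
        OutputJobInfo info = OutputJobInfo.create("default", "mytable", partitionValues);
        HCatOutputFormat.setOutput(job, info);
        // The table schema becomes available only after setOutput() has run.
        HCatSchema tableSchema = HCatOutputFormat.getTableSchema(job.getConfiguration());
        HCatOutputFormat.setSchema(job, tableSchema);
    }
}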

Example 45 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

The class JsonSerDe, method extractCurrentField.

/**
 * Utility method to extract the currently expected field from the given JsonParser.
 *
 * isTokenCurrent indicates whether the JsonParser is already positioned at the
 * token we expect to read next, or whether it must first be advanced to that
 * token before reading.
 */
private Object extractCurrentField(JsonParser p, HCatFieldSchema hcatFieldSchema, boolean isTokenCurrent) throws IOException {
    Object val = null;
    JsonToken valueToken;
    if (isTokenCurrent) {
        valueToken = p.getCurrentToken();
    } else {
        valueToken = p.nextToken();
    }
    switch(hcatFieldSchema.getType()) {
        case INT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue();
            break;
        case TINYINT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue();
            break;
        case SMALLINT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue();
            break;
        case BIGINT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue();
            break;
        case BOOLEAN:
            String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
            if (bval != null) {
                val = Boolean.valueOf(bval);
            } else {
                val = null;
            }
            break;
        case FLOAT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue();
            break;
        case DOUBLE:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue();
            break;
        case STRING:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
            break;
        case BINARY:
            String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
            if (b != null) {
                try {
                    String t = Text.decode(b.getBytes(), 0, b.getBytes().length);
                    return t.getBytes();
                } catch (CharacterCodingException e) {
                    LOG.warn("Error generating json binary type from object.", e);
                    return null;
                }
            } else {
                val = null;
            }
            break;
        case DATE:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText());
            break;
        case TIMESTAMP:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : tsParser.parseTimestamp(p.getText());
            break;
        case DECIMAL:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText());
            break;
        case VARCHAR:
            int vLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
            val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen);
            break;
        case CHAR:
            int cLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
            val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen);
            break;
        case ARRAY:
            if (valueToken == JsonToken.VALUE_NULL) {
                val = null;
                break;
            }
            if (valueToken != JsonToken.START_ARRAY) {
                throw new IOException("Start of Array expected");
            }
            List<Object> arr = new ArrayList<Object>();
            while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
                arr.add(extractCurrentField(p, hcatFieldSchema.getArrayElementSchema().get(0), true));
            }
            val = arr;
            break;
        case MAP:
            if (valueToken == JsonToken.VALUE_NULL) {
                val = null;
                break;
            }
            if (valueToken != JsonToken.START_OBJECT) {
                throw new IOException("Start of Object expected");
            }
            Map<Object, Object> map = new LinkedHashMap<Object, Object>();
            HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0);
            while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
                Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(), hcatFieldSchema.getMapKeyTypeInfo());
                Object v = extractCurrentField(p, valueSchema, false);
                map.put(k, v);
            }
            val = map;
            break;
        case STRUCT:
            if (valueToken == JsonToken.VALUE_NULL) {
                val = null;
                break;
            }
            if (valueToken != JsonToken.START_OBJECT) {
                throw new IOException("Start of Object expected");
            }
            HCatSchema subSchema = hcatFieldSchema.getStructSubSchema();
            int sz = subSchema.getFieldNames().size();
            List<Object> struct = new ArrayList<Object>(Collections.nCopies(sz, null));
            while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
                populateRecord(struct, valueToken, p, subSchema);
            }
            val = struct;
            break;
        default:
            LOG.error("Unknown type found: " + hcatFieldSchema.getType());
            return null;
    }
    return val;
}
Also used : BaseCharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo) HiveChar(org.apache.hadoop.hive.common.type.HiveChar) ArrayList(java.util.ArrayList) CharacterCodingException(java.nio.charset.CharacterCodingException) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) IOException(java.io.IOException) LinkedHashMap(java.util.LinkedHashMap) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) JsonToken(org.codehaus.jackson.JsonToken)
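
The token-driven style of extractCurrentField can be tried standalone with the same Jackson 1 API (org.codehaus.jackson) that JsonSerDe imports. A small sketch of the ARRAY branch, assuming integer elements; the class name is illustrative:

import java.util.ArrayList;
import java.util.List;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;

public class JsonArraySketch {
    public static void main(String[] args) throws Exception {
        JsonParser p = new JsonFactory().createJsonParser("[1, 2, null, 4]");
        // Advance to the first token and insist on an array start, as the
        // ARRAY case does above.
        if (p.nextToken() != JsonToken.START_ARRAY) {
            throw new IllegalStateException("Start of Array expected");
        }
        List<Integer> values = new ArrayList<Integer>();
        JsonToken t;
        while ((t = p.nextToken()) != JsonToken.END_ARRAY) {
            // JSON nulls map to Java null, matching the VALUE_NULL checks above.
            values.add(t == JsonToken.VALUE_NULL ? null : Integer.valueOf(p.getIntValue()));
        }
        System.out.println(values); // prints [1, 2, null, 4]
    }
}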

Aggregations

HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema): 45 uses
HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema): 21 uses
Job (org.apache.hadoop.mapreduce.Job): 17 uses
ArrayList (java.util.ArrayList): 14 uses
Configuration (org.apache.hadoop.conf.Configuration): 13 uses
HashMap (java.util.HashMap): 10 uses
GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser): 10 uses
IOException (java.io.IOException): 8 uses
HCatException (org.apache.hive.hcatalog.common.HCatException): 8 uses
Table (org.apache.hadoop.hive.ql.metadata.Table): 6 uses
Test (org.junit.Test): 6 uses
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 5 uses
Properties (java.util.Properties): 4 uses
Path (org.apache.hadoop.fs.Path): 4 uses
ResourceSchema (org.apache.pig.ResourceSchema): 4 uses
FrontendException (org.apache.pig.impl.logicalLayer.FrontendException): 4 uses
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 3 uses
HCatRecord (org.apache.hive.hcatalog.data.HCatRecord): 3 uses
PigException (org.apache.pig.PigException): 3 uses
Map (java.util.Map): 2 uses