
Example 56 with HCatFieldSchema

Use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.

The class HCatBaseInputFormat, method getColValsNotInDataColumns.

/**
 * Gets values for fields requested by the output schema that will not be present in the data.
 */
private static Map<String, Object> getColValsNotInDataColumns(HCatSchema outputSchema, PartInfo partInfo) throws HCatException {
    HCatSchema dataSchema = partInfo.getPartitionSchema();
    Map<String, Object> vals = new HashMap<String, Object>();
    for (String fieldName : outputSchema.getFieldNames()) {
        if (dataSchema.getPosition(fieldName) == null) {
            // The field is not among the data columns, so check whether it is a partition column
            if (partInfo.getPartitionValues().containsKey(fieldName)) {
                // First, get the appropriate field schema for this field
                HCatFieldSchema fschema = outputSchema.get(fieldName);
                // For a partition key type, this will be a primitive typeinfo.
                // Obtain relevant object inspector for this typeinfo
                ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(fschema.getTypeInfo());
                // Partition values are represented as strings; convert the string
                // form to the actual object type associated with the column
                Object objVal = ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi).convert(partInfo.getPartitionValues().get(fieldName));
                vals.put(fieldName, objVal);
            } else {
                vals.put(fieldName, null);
            }
        }
    }
    return vals;
}
Also used: ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HashMap(java.util.HashMap) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema)
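
To make the conversion step concrete, here is a minimal standalone sketch (not part of the Hive source above) of the same ObjectInspector-based conversion: a partition value arrives as a string and is converted to the Java object matching the column type. The "int" type string, the sample value, and the class name are assumptions for illustration; the API calls are the ones used in the method above.

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class PartitionValueConversionSketch {
    public static void main(String[] args) {
        // Assume an int partition column; HCatalog hands partition values around as strings.
        ObjectInspector targetOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
                TypeInfoUtils.getTypeInfoFromTypeString("int"));
        // Convert the string form to the typed Java object, as in getColValsNotInDataColumns.
        Object objVal = ObjectInspectorConverters.getConverter(
                PrimitiveObjectInspectorFactory.javaStringObjectInspector, targetOI)
                .convert("508");
        System.out.println(objVal.getClass().getName() + " -> " + objVal); // java.lang.Integer -> 508
    }
}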

Example 57 with HCatFieldSchema

Use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.

The class SpecialCases, method addSpecialCasesParametersToOutputJobProperties.

/**
 * Method to do any file-format-specific special casing while
 * instantiating a storage handler to write. Any parameters that
 * should be visible to the job are set in jobProperties, which
 * the job can read from the jobconf at run time.
 *
 * This is mostly intended for StorageHandlers that wrap file-based
 * OutputFormats, such as FosterStorageHandler wrapping RCFile, ORC, etc.
 *
 * @param jobProperties map to write to
 * @param jobInfo information about this output job to read from
 * @param ofclass the output format in use
 */
public static void addSpecialCasesParametersToOutputJobProperties(Map<String, String> jobProperties, OutputJobInfo jobInfo, Class<? extends OutputFormat> ofclass) {
    if (ofclass == RCFileOutputFormat.class) {
        // RCFile-specific parameter: the column count (note it is written as an octal string)
        jobProperties.put(HiveConf.ConfVars.HIVE_RCFILE_COLUMN_NUMBER_CONF.varname, Integer.toOctalString(jobInfo.getOutputSchema().getFields().size()));
    } else if (ofclass == OrcOutputFormat.class) {
        // Special cases for ORC
        // Check table properties to see if parameters such as compression
        // settings are defined. If they are, copy them to the job properties
        // so that they are available in the jobconf at runtime.
        // See HIVE-5504 for details.
        Map<String, String> tableProps = jobInfo.getTableInfo().getTable().getParameters();
        for (OrcConf property : OrcConf.values()) {
            String propName = property.getAttribute();
            if (tableProps.containsKey(propName)) {
                jobProperties.put(propName, tableProps.get(propName));
            }
        }
    } else if (ofclass == AvroContainerOutputFormat.class) {
        // Special cases for Avro. As with ORC, make the table properties that
        // Avro is interested in available in the jobconf at runtime.
        Map<String, String> tableProps = jobInfo.getTableInfo().getTable().getParameters();
        for (AvroSerdeUtils.AvroTableProperties property : AvroSerdeUtils.AvroTableProperties.values()) {
            String propName = property.getPropName();
            if (tableProps.containsKey(propName)) {
                String propVal = tableProps.get(propName);
                jobProperties.put(propName, propVal);
            }
        }
        Properties properties = new Properties();
        properties.put("name", jobInfo.getTableName());
        List<String> colNames = jobInfo.getOutputSchema().getFieldNames();
        List<TypeInfo> colTypes = new ArrayList<TypeInfo>();
        for (HCatFieldSchema field : jobInfo.getOutputSchema().getFields()) {
            colTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getTypeString()));
        }
        if (jobProperties.get(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName()) == null || jobProperties.get(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName()).isEmpty()) {
            jobProperties.put(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName(), AvroSerDe.getSchemaFromCols(properties, colNames, colTypes, null).toString());
        }
    } else if (ofclass == MapredParquetOutputFormat.class) {
        // Handle table properties
        Properties tblProperties = new Properties();
        Map<String, String> tableProps = jobInfo.getTableInfo().getTable().getParameters();
        for (String key : tableProps.keySet()) {
            if (ParquetTableUtils.isParquetProperty(key)) {
                tblProperties.put(key, tableProps.get(key));
            }
        }
        // Handle table schema
        List<String> colNames = jobInfo.getOutputSchema().getFieldNames();
        List<TypeInfo> colTypes = new ArrayList<TypeInfo>();
        for (HCatFieldSchema field : jobInfo.getOutputSchema().getFields()) {
            colTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getTypeString()));
        }
        String parquetSchema = HiveSchemaConverter.convert(colNames, colTypes).toString();
        jobProperties.put(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA, parquetSchema);
        jobProperties.putAll(Maps.fromProperties(tblProperties));
    }
}
Also used: OrcConf(org.apache.orc.OrcConf) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) ArrayList(java.util.ArrayList) Properties(java.util.Properties) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) List(java.util.List) Map(java.util.Map) AvroSerdeUtils(org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils)
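
The ORC branch above is an instance of a simple copy-through pattern: iterate over the property names the format understands and promote only matching table properties into the job properties. A hedged, standalone sketch of that pattern follows (the class name and sample values are illustrative; OrcConf.getAttribute() is the call used in the code above):

import java.util.HashMap;
import java.util.Map;
import org.apache.orc.OrcConf;

public class OrcPropertyCopySketch {
    public static void main(String[] args) {
        Map<String, String> tableProps = new HashMap<>();
        tableProps.put("orc.compress", "SNAPPY");        // matches OrcConf.COMPRESS
        tableProps.put("comment", "not an ORC setting"); // ignored by the loop
        Map<String, String> jobProperties = new HashMap<>();
        // Only keys that OrcConf enumerates are promoted into jobProperties.
        for (OrcConf property : OrcConf.values()) {
            String propName = property.getAttribute();
            if (tableProps.containsKey(propName)) {
                jobProperties.put(propName, tableProps.get(propName));
            }
        }
        System.out.println(jobProperties); // {orc.compress=SNAPPY}
    }
}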

Example 58 with HCatFieldSchema

Use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.

The class TestHCatUtil, method testGetTableSchemaWithPtnColsApi.

@Test
public void testGetTableSchemaWithPtnColsApi() throws IOException {
    // Check the schema of a table with one field & no partition keys.
    StorageDescriptor sd = new StorageDescriptor(Lists.newArrayList(new FieldSchema("username", serdeConstants.STRING_TYPE_NAME, null)), "location", "org.apache.hadoop.mapred.TextInputFormat", "org.apache.hadoop.mapred.TextOutputFormat", false, -1, new SerDeInfo(), new ArrayList<String>(), new ArrayList<Order>(), new HashMap<String, String>());
    org.apache.hadoop.hive.metastore.api.Table apiTable = new org.apache.hadoop.hive.metastore.api.Table("test_tblname", "test_dbname", "test_owner", 0, 0, 0, sd, new ArrayList<FieldSchema>(), new HashMap<String, String>(), "viewOriginalText", "viewExpandedText", TableType.EXTERNAL_TABLE.name());
    Table table = new Table(apiTable);
    List<HCatFieldSchema> expectedHCatSchema = Lists.newArrayList(new HCatFieldSchema("username", HCatFieldSchema.Type.STRING, null));
    Assert.assertEquals(new HCatSchema(expectedHCatSchema), HCatUtil.getTableSchemaWithPtnCols(table));
    // Add a partition key & ensure its reflected in the schema.
    List<FieldSchema> partitionKeys = Lists.newArrayList(new FieldSchema("dt", serdeConstants.STRING_TYPE_NAME, null));
    table.getTTable().setPartitionKeys(partitionKeys);
    expectedHCatSchema.add(new HCatFieldSchema("dt", HCatFieldSchema.Type.STRING, null));
    Assert.assertEquals(new HCatSchema(expectedHCatSchema), HCatUtil.getTableSchemaWithPtnCols(table));
}
Also used: Order(org.apache.hadoop.hive.metastore.api.Order) Table(org.apache.hadoop.hive.ql.metadata.Table) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Test(org.junit.Test)
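
The test relies on HCatSchema being an ordered list of HCatFieldSchema entries, with partition keys appended after the data columns. A minimal standalone sketch of that construction, using only the constructors already seen in the test (the class name is illustrative):

import java.util.List;
import com.google.common.collect.Lists;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

public class SchemaWithPtnColsSketch {
    public static void main(String[] args) throws HCatException {
        // Data columns first...
        List<HCatFieldSchema> fields = Lists.newArrayList(
                new HCatFieldSchema("username", HCatFieldSchema.Type.STRING, null));
        // ...then partition keys appended at the end, mirroring getTableSchemaWithPtnCols.
        fields.add(new HCatFieldSchema("dt", HCatFieldSchema.Type.STRING, null));
        HCatSchema schema = new HCatSchema(fields);
        System.out.println(schema.getFieldNames()); // [username, dt]
    }
}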

Example 59 with HCatFieldSchema

Use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.

The class TestHCatPartitioned, method columnOrderChangeTest.

// Check behavior when changing the order of columns.
private void columnOrderChangeTest() throws Exception {
    HCatSchema tableSchema = getTableSchema();
    assertEquals(5, tableSchema.getFields().size());
    partitionColumns = new ArrayList<HCatFieldSchema>();
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c3", serdeConstants.STRING_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
    writeRecords = new ArrayList<HCatRecord>();
    for (int i = 0; i < 10; i++) {
        List<Object> objList = new ArrayList<Object>();
        objList.add(i);
        objList.add("co strvalue" + i);
        objList.add("co str2value" + i);
        writeRecords.add(new DefaultHCatRecord(objList));
    }
    Map<String, String> partitionMap = new HashMap<String, String>();
    partitionMap.put("part1", "p1value8");
    partitionMap.put("part0", "508");
    Exception exc = null;
    try {
        runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
    } catch (IOException e) {
        exc = e;
    }
    assertTrue(exc != null);
    assertTrue(exc instanceof HCatException);
    assertEquals(ErrorType.ERROR_SCHEMA_COLUMN_MISMATCH, ((HCatException) exc).getErrorType());
    partitionColumns = new ArrayList<HCatFieldSchema>();
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
    writeRecords = new ArrayList<HCatRecord>();
    for (int i = 0; i < 10; i++) {
        List<Object> objList = new ArrayList<Object>();
        objList.add(i);
        objList.add("co strvalue" + i);
        writeRecords.add(new DefaultHCatRecord(objList));
    }
    runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
    if (isTableImmutable()) {
        // Read should get 10 + 20 + 10 + 10 + 20 rows
        runMRRead(70);
    } else {
        // +20 from the duplicate publish
        runMRRead(90);
    }
}
Also used: HashMap(java.util.HashMap) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) HCatException(org.apache.hive.hcatalog.common.HCatException) IOException(java.io.IOException) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) DefaultHCatRecord(org.apache.hive.hcatalog.data.DefaultHCatRecord) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
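
The ERROR_SCHEMA_COLUMN_MISMATCH in the first half of the test arises because DefaultHCatRecord is positional: the values in the record must line up with the declared column order, and the writer validates that order against the table schema. A small sketch of the positional construction (values and class name are illustrative):

import java.util.ArrayList;
import java.util.List;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;

public class PositionalRecordSketch {
    public static void main(String[] args) {
        List<Object> objList = new ArrayList<>();
        objList.add(7);              // position 0 -> c1 (int)
        objList.add("co strvalue7"); // position 1 -> c2 (string)
        // The record carries no field names; only positions matter.
        DefaultHCatRecord record = new DefaultHCatRecord(objList);
        System.out.println(record.size() + " fields: " + record);
    }
}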

Example 60 with HCatFieldSchema

Use of org.apache.hive.hcatalog.data.schema.HCatFieldSchema in project hive by apache.

The class HCatBaseStorer, method validateSchema.

/**
 * Encodes which Pig types can map to (be stored in) which HCat types.
 * @throws HCatException
 * @throws FrontendException
 */
private void validateSchema(FieldSchema pigField, HCatFieldSchema hcatField, Schema topLevelPigSchema, HCatSchema topLevelHCatSchema, int columnPos) throws HCatException, FrontendException {
    validateAlias(pigField.alias);
    byte type = pigField.type;
    if (DataType.isComplex(type)) {
        switch(type) {
            case DataType.MAP:
                if (hcatField != null) {
                    if (hcatField.getMapKeyType() != Type.STRING) {
                        throw new FrontendException("Key Type of map must be String " + hcatField, PigHCatUtil.PIG_EXCEPTION_CODE);
                    }
                // Map values can be primitive or complex
                }
                break;
            case DataType.BAG:
                HCatSchema arrayElementSchema = hcatField == null ? null : hcatField.getArrayElementSchema();
                for (FieldSchema innerField : pigField.schema.getField(0).schema.getFields()) {
                    validateSchema(innerField, getColFromSchema(pigField.alias, arrayElementSchema), topLevelPigSchema, topLevelHCatSchema, columnPos);
                }
                break;
            case DataType.TUPLE:
                HCatSchema structSubSchema = hcatField == null ? null : hcatField.getStructSubSchema();
                for (FieldSchema innerField : pigField.schema.getFields()) {
                    validateSchema(innerField, getColFromSchema(pigField.alias, structSubSchema), topLevelPigSchema, topLevelHCatSchema, columnPos);
                }
                break;
            default:
                throw new FrontendException("Internal Error.", PigHCatUtil.PIG_EXCEPTION_CODE);
        }
    } else if (hcatField != null) {
        // there is no point trying to validate further if we have no type info about target field
        switch(type) {
            case DataType.BIGDECIMAL:
                throwTypeMismatchException(type, Lists.newArrayList(Type.DECIMAL), hcatField, columnPos);
                break;
            case DataType.DATETIME:
                throwTypeMismatchException(type, Lists.newArrayList(Type.TIMESTAMP, Type.DATE), hcatField, columnPos);
                break;
            case DataType.BYTEARRAY:
                throwTypeMismatchException(type, Lists.newArrayList(Type.BINARY), hcatField, columnPos);
                break;
            case DataType.BIGINTEGER:
                throwTypeMismatchException(type, Collections.<Type>emptyList(), hcatField, columnPos);
                break;
            case DataType.BOOLEAN:
                throwTypeMismatchException(type, Lists.newArrayList(Type.BOOLEAN), hcatField, columnPos);
                break;
            case DataType.CHARARRAY:
                throwTypeMismatchException(type, Lists.newArrayList(Type.STRING, Type.CHAR, Type.VARCHAR), hcatField, columnPos);
                break;
            case DataType.DOUBLE:
                throwTypeMismatchException(type, Lists.newArrayList(Type.DOUBLE), hcatField, columnPos);
                break;
            case DataType.FLOAT:
                throwTypeMismatchException(type, Lists.newArrayList(Type.FLOAT), hcatField, columnPos);
                break;
            case DataType.INTEGER:
                throwTypeMismatchException(type, Lists.newArrayList(Type.INT, Type.BIGINT, Type.TINYINT, Type.SMALLINT), hcatField, columnPos);
                break;
            case DataType.LONG:
                throwTypeMismatchException(type, Lists.newArrayList(Type.BIGINT), hcatField, columnPos);
                break;
            default:
                throw new FrontendException("'" + type + "' Pig datatype in column " + columnPos + "(0-based) is not supported by HCat", PigHCatUtil.PIG_EXCEPTION_CODE);
        }
    } else {
        if (false) {
            // see HIVE-6194
            throw new FrontendException("(pigSch,hcatSchema)=(" + pigField + "," + "" + hcatField + ") (topPig, topHcat)=(" + topLevelPigSchema + "," + "" + topLevelHCatSchema + ")");
        }
    }
}
Also used: DataType(org.apache.pig.data.DataType) Type(org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) FieldSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema) FrontendException(org.apache.pig.impl.logicalLayer.FrontendException)
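
For reference, the primitive branches of the switch above boil down to a Pig-to-HCat compatibility table. Here is a hedged sketch expressing the same mappings as a lookup map (the class and field names are illustrative; an empty list, as for BIGINTEGER, means no compatible HCat type):

import java.util.Collections;
import java.util.List;
import java.util.Map;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.pig.data.DataType;

public class PigToHCatTypeMapSketch {
    // Each Pig primitive type mapped to the HCat types it may be stored in.
    static final Map<Byte, List<Type>> COMPATIBLE =
            ImmutableMap.<Byte, List<Type>>builder()
                    .put(DataType.BIGDECIMAL, Lists.newArrayList(Type.DECIMAL))
                    .put(DataType.DATETIME, Lists.newArrayList(Type.TIMESTAMP, Type.DATE))
                    .put(DataType.BYTEARRAY, Lists.newArrayList(Type.BINARY))
                    .put(DataType.BIGINTEGER, Collections.<Type>emptyList())
                    .put(DataType.BOOLEAN, Lists.newArrayList(Type.BOOLEAN))
                    .put(DataType.CHARARRAY, Lists.newArrayList(Type.STRING, Type.CHAR, Type.VARCHAR))
                    .put(DataType.DOUBLE, Lists.newArrayList(Type.DOUBLE))
                    .put(DataType.FLOAT, Lists.newArrayList(Type.FLOAT))
                    .put(DataType.INTEGER, Lists.newArrayList(Type.INT, Type.BIGINT, Type.TINYINT, Type.SMALLINT))
                    .put(DataType.LONG, Lists.newArrayList(Type.BIGINT))
                    .build();
}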

Aggregations

HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema): 61
ArrayList (java.util.ArrayList): 34
Test (org.junit.Test): 30
HCatException (org.apache.hive.hcatalog.common.HCatException): 22
IOException (java.io.IOException): 21
HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema): 21
HashMap (java.util.HashMap): 19
Configuration (org.apache.hadoop.conf.Configuration): 15
MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 15
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 7
ResourceSchema (org.apache.pig.ResourceSchema): 6
HCatTable (org.apache.hive.hcatalog.api.HCatTable): 5
ResourceFieldSchema (org.apache.pig.ResourceSchema.ResourceFieldSchema): 5
List (java.util.List): 4
Map (java.util.Map): 4
Properties (java.util.Properties): 4
Path (org.apache.hadoop.fs.Path): 4
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 3
FrontendException (org.apache.pig.impl.logicalLayer.FrontendException): 3
FieldSchema (org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema): 3