Search in sources :

Example 36 with HCatSchema

use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

the class HCatBaseInputFormat method getColValsNotInDataColumns.

/**
 * Gets values for fields requested by the output schema that will not be present in the data.
 */
private static Map<String, Object> getColValsNotInDataColumns(HCatSchema outputSchema, PartInfo partInfo) throws HCatException {
    HCatSchema dataSchema = partInfo.getPartitionSchema();
    Map<String, Object> vals = new HashMap<String, Object>();
    for (String fieldName : outputSchema.getFieldNames()) {
        if (dataSchema.getPosition(fieldName) == null) {
            // the field is not in the data schema, so check whether it is a partition column
            if (partInfo.getPartitionValues().containsKey(fieldName)) {
                // First, get the appropriate field schema for this field
                HCatFieldSchema fschema = outputSchema.get(fieldName);
                // For a partition key type, this will be a primitive typeinfo.
                // Obtain relevant object inspector for this typeinfo
                ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(fschema.getTypeInfo());
                // get appropriate object from the string representation of the value in partInfo.getPartitionValues()
                // Essentially, partition values are represented as strings, but we want an object of the field's actual type
                Object objVal = ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi).convert(partInfo.getPartitionValues().get(fieldName));
                vals.put(fieldName, objVal);
            } else {
                vals.put(fieldName, null);
            }
        }
    }
    return vals;
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HashMap(java.util.HashMap) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema)
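
The interesting step above is the conversion: partition values arrive as strings and are converted to the Java type implied by the field's TypeInfo via an ObjectInspector converter. Below is a minimal, self-contained sketch of that same converter pattern; the int type and the literal "42" are illustrative values, not taken from the Hive source.

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class PartitionValueConversionSketch {
    public static void main(String[] args) {
        // TypeInfo as it would appear on the HCatFieldSchema of an int partition column.
        TypeInfo intTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString("int");
        // Object inspector for the standard Java representation of that type (java.lang.Integer).
        ObjectInspector targetOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(intTypeInfo);
        // Partition values are stored as strings; convert "42" into a typed object.
        Object typed = ObjectInspectorConverters
            .getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, targetOI)
            .convert("42");
        // Prints: 42 (Integer)
        System.out.println(typed + " (" + typed.getClass().getSimpleName() + ")");
    }
}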

Example 37 with HCatSchema

use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

the class HCatTableInfo method valueOf.

/**
 * Creates an HCatTableInfo instance from the supplied Hive Table instance.
 * @param table the table to create an instance from
 * @return HCatTableInfo
 * @throws IOException
 */
static HCatTableInfo valueOf(Table table) throws IOException {
    // Explicitly use {@link org.apache.hadoop.hive.ql.metadata.Table} when getting the schema,
    // but store {@link org.apache.hadoop.hive.metastore.api.Table} as this class is serialized
    // into the job conf.
    org.apache.hadoop.hive.ql.metadata.Table mTable = new org.apache.hadoop.hive.ql.metadata.Table(table);
    HCatSchema schema = HCatUtil.extractSchema(mTable);
    StorerInfo storerInfo = InternalUtil.extractStorerInfo(table.getSd(), table.getParameters());
    HCatSchema partitionColumns = HCatUtil.getPartitionColumns(mTable);
    return new HCatTableInfo(table.getDbName(), table.getTableName(), schema, partitionColumns, storerInfo, table);
}
Also used : Table(org.apache.hadoop.hive.metastore.api.Table) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema)
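
valueOf is package-private, so it is only reachable from within org.apache.hive.hcatalog.mapreduce. As a rough sketch of how the resulting HCatTableInfo might be inspected, assuming the usual getters (getDataColumns, getPartitionColumns, getDatabaseName, getTableName) and a metastore Table obtained elsewhere:

package org.apache.hive.hcatalog.mapreduce;

import java.io.IOException;

import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

class TableInfoInspectionSketch {
    // msTable is assumed to come from the metastore client, e.g.
    // HCatUtil.getTable(client, dbName, tableName).getTTable().
    static void describe(Table msTable) throws IOException {
        HCatTableInfo info = HCatTableInfo.valueOf(msTable);
        HCatSchema dataColumns = info.getDataColumns();           // columns from the storage descriptor
        HCatSchema partitionColumns = info.getPartitionColumns(); // partition key columns
        System.out.println(info.getDatabaseName() + "." + info.getTableName()
            + ": " + dataColumns.size() + " data columns, "
            + partitionColumns.size() + " partition columns");
    }
}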

Example 38 with HCatSchema

use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

the class InitializeInput method getInputJobInfo.

/**
 * Returns the given InputJobInfo after populating it with data queried from the metadata service.
 */
private static InputJobInfo getInputJobInfo(Configuration conf, InputJobInfo inputJobInfo, String locationFilter) throws Exception {
    IMetaStoreClient client = null;
    HiveConf hiveConf = null;
    try {
        if (conf != null) {
            hiveConf = HCatUtil.getHiveConf(conf);
        } else {
            hiveConf = new HiveConf(HCatInputFormat.class);
        }
        client = HCatUtil.getHiveMetastoreClient(hiveConf);
        Table table = HCatUtil.getTable(client, inputJobInfo.getDatabaseName(), inputJobInfo.getTableName());
        List<PartInfo> partInfoList = new ArrayList<PartInfo>();
        inputJobInfo.setTableInfo(HCatTableInfo.valueOf(table.getTTable()));
        if (table.getPartitionKeys().size() != 0) {
            // Partitioned table
            List<Partition> parts = client.listPartitionsByFilter(inputJobInfo.getDatabaseName(), inputJobInfo.getTableName(), inputJobInfo.getFilter(), (short) -1);
            // Default to 100,000 partitions if hcat.metastore.maxpartitions is not defined
            int maxPart = hiveConf.getInt("hcat.metastore.maxpartitions", 100000);
            if (parts != null && parts.size() > maxPart) {
                throw new HCatException(ErrorType.ERROR_EXCEED_MAXPART, "total number of partitions is " + parts.size());
            }
            // populate partition info
            for (Partition ptn : parts) {
                HCatSchema schema = HCatUtil.extractSchema(new org.apache.hadoop.hive.ql.metadata.Partition(table, ptn));
                PartInfo partInfo = extractPartInfo(schema, ptn.getSd(), ptn.getParameters(), conf, inputJobInfo);
                partInfo.setPartitionValues(InternalUtil.createPtnKeyValueMap(table, ptn));
                partInfoList.add(partInfo);
            }
        } else {
            // Non partitioned table
            HCatSchema schema = HCatUtil.extractSchema(table);
            PartInfo partInfo = extractPartInfo(schema, table.getTTable().getSd(), table.getParameters(), conf, inputJobInfo);
            partInfo.setPartitionValues(new HashMap<String, String>());
            partInfoList.add(partInfo);
        }
        inputJobInfo.setPartitions(partInfoList);
        return inputJobInfo;
    } finally {
        HCatUtil.closeHiveClientQuietly(client);
    }
}
Also used : Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) ArrayList(java.util.ArrayList) HCatException(org.apache.hive.hcatalog.common.HCatException) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HiveConf(org.apache.hadoop.hive.conf.HiveConf)
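
getInputJobInfo is internal plumbing; in client code the metadata lookup above is normally triggered through HCatInputFormat.setInput, optionally with a partition filter. A hedged job-setup sketch follows; the database, table, and filter values are made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatReadJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "hcat read sketch");
        // Database, table, and partition filter are illustrative values only.
        HCatInputFormat.setInput(job, "default", "web_logs")
            .setFilter("ds=\"2015-01-01\"");
        job.setInputFormatClass(HCatInputFormat.class);
        // ... configure the mapper and output, then job.waitForCompletion(true).
    }
}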

Example 39 with HCatSchema

use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

the class TestHCatUtil method testGetTableSchemaWithPtnColsApi.

@Test
public void testGetTableSchemaWithPtnColsApi() throws IOException {
    // Check the schema of a table with one field & no partition keys.
    StorageDescriptor sd = new StorageDescriptor(Lists.newArrayList(new FieldSchema("username", serdeConstants.STRING_TYPE_NAME, null)), "location", "org.apache.hadoop.mapred.TextInputFormat", "org.apache.hadoop.mapred.TextOutputFormat", false, -1, new SerDeInfo(), new ArrayList<String>(), new ArrayList<Order>(), new HashMap<String, String>());
    org.apache.hadoop.hive.metastore.api.Table apiTable = new org.apache.hadoop.hive.metastore.api.Table("test_tblname", "test_dbname", "test_owner", 0, 0, 0, sd, new ArrayList<FieldSchema>(), new HashMap<String, String>(), "viewOriginalText", "viewExpandedText", TableType.EXTERNAL_TABLE.name());
    Table table = new Table(apiTable);
    List<HCatFieldSchema> expectedHCatSchema = Lists.newArrayList(new HCatFieldSchema("username", HCatFieldSchema.Type.STRING, null));
    Assert.assertEquals(new HCatSchema(expectedHCatSchema), HCatUtil.getTableSchemaWithPtnCols(table));
    // Add a partition key & ensure its reflected in the schema.
    List<FieldSchema> partitionKeys = Lists.newArrayList(new FieldSchema("dt", serdeConstants.STRING_TYPE_NAME, null));
    table.getTTable().setPartitionKeys(partitionKeys);
    expectedHCatSchema.add(new HCatFieldSchema("dt", HCatFieldSchema.Type.STRING, null));
    Assert.assertEquals(new HCatSchema(expectedHCatSchema), HCatUtil.getTableSchemaWithPtnCols(table));
}
Also used : Order(org.apache.hadoop.hive.metastore.api.Order) Table(org.apache.hadoop.hive.ql.metadata.Table) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Test(org.junit.Test)
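
The assertions above rely on HCatSchema value equality against a schema built directly from HCatFieldSchema objects. A small standalone sketch of that direct construction and of the lookup methods used elsewhere in these examples (the field names are arbitrary):

import java.util.ArrayList;
import java.util.List;

import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

public class HCatSchemaLookupSketch {
    public static void main(String[] args) throws HCatException {
        List<HCatFieldSchema> fields = new ArrayList<HCatFieldSchema>();
        fields.add(new HCatFieldSchema("username", HCatFieldSchema.Type.STRING, null));
        fields.add(new HCatFieldSchema("dt", HCatFieldSchema.Type.STRING, null));
        HCatSchema schema = new HCatSchema(fields);
        System.out.println(schema.getPosition("dt"));                // 1
        System.out.println(schema.get("username").getTypeString());  // string
        System.out.println(schema.getFieldNames());                  // [username, dt]
    }
}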

Example 40 with HCatSchema

use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

the class HCatMapReduceTest method runMRCreate.

/**
 * Runs a local MapReduce job to load data from in-memory records into an HCatalog table.
 * @param partitionValues static partition key/value pairs, or null for dynamic partitioning
 * @param partitionColumns schema of the columns being written
 * @param records data to be written to the HCatalog table
 * @param writeCount number of input records to generate and expect to be written
 * @param assertWrite whether to assert that writeCount records were actually written
 * @param asSingleMapTask whether to feed the job a single input path so only one map task runs
 * @param customDynamicPathPattern custom dynamic-partitioning path pattern, or null to use the default
 * @return the completed Job
 * @throws Exception
 */
Job runMRCreate(Map<String, String> partitionValues, List<HCatFieldSchema> partitionColumns, List<HCatRecord> records, int writeCount, boolean assertWrite, boolean asSingleMapTask, String customDynamicPathPattern) throws Exception {
    writeRecords = records;
    MapCreate.writeCount = 0;
    Configuration conf = new Configuration();
    Job job = new Job(conf, "hcat mapreduce write test");
    job.setJarByClass(this.getClass());
    job.setMapperClass(HCatMapReduceTest.MapCreate.class);
    // input/output settings
    job.setInputFormatClass(TextInputFormat.class);
    if (asSingleMapTask) {
        // One input path would mean only one map task
        Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
        createInputFile(path, writeCount);
        TextInputFormat.setInputPaths(job, path);
    } else {
        // Create two input paths so that two map tasks get triggered. There could be other ways
        // to trigger two map tasks.
        Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
        createInputFile(path, writeCount / 2);
        Path path2 = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput2");
        createInputFile(path2, (writeCount - writeCount / 2));
        TextInputFormat.setInputPaths(job, path, path2);
    }
    job.setOutputFormatClass(HCatOutputFormat.class);
    OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partitionValues);
    if (customDynamicPathPattern != null) {
        job.getConfiguration().set(HCatConstants.HCAT_DYNAMIC_CUSTOM_PATTERN, customDynamicPathPattern);
    }
    HCatOutputFormat.setOutput(job, outputJobInfo);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(DefaultHCatRecord.class);
    job.setNumReduceTasks(0);
    HCatOutputFormat.setSchema(job, new HCatSchema(partitionColumns));
    boolean success = job.waitForCompletion(true);
    // Ensure counters are set when data has actually been read.
    if (partitionValues != null) {
        assertTrue(job.getCounters().getGroup("FileSystemCounters").findCounter("FILE_BYTES_READ").getValue() > 0);
    }
    if (!HCatUtil.isHadoop23()) {
        // Local mode outputcommitter hook is not invoked in Hadoop 1.x
        if (success) {
            new FileOutputCommitterContainer(job, null).commitJob(job);
        } else {
            new FileOutputCommitterContainer(job, null).abortJob(job, JobStatus.State.FAILED);
        }
    }
    if (assertWrite) {
        // we assert only if we expected to assert with this call.
        Assert.assertEquals(writeCount, MapCreate.writeCount);
    }
    if (isTableExternal()) {
        externalTableLocation = outputJobInfo.getTableInfo().getTableLocation();
    }
    return job;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Job(org.apache.hadoop.mapreduce.Job)
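
The test's MapCreate mapper is not shown here. A hedged sketch of what a mapper emitting DefaultHCatRecord values for HCatOutputFormat typically looks like follows; the class name and two-column layout are illustrative, not the actual test code.

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;

public class WriteToHCatMapperSketch
        extends Mapper<LongWritable, Text, BytesWritable, HCatRecord> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Build a two-column record; the layout must match the schema passed to
        // HCatOutputFormat.setSchema(...).
        DefaultHCatRecord record = new DefaultHCatRecord(2);
        record.set(0, (int) key.get());
        record.set(1, value.toString());
        // HCatOutputFormat ignores the output key, so null is fine here.
        context.write(null, record);
    }
}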

Aggregations

HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema): 45 usages
HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema): 21 usages
Job (org.apache.hadoop.mapreduce.Job): 17 usages
ArrayList (java.util.ArrayList): 14 usages
Configuration (org.apache.hadoop.conf.Configuration): 13 usages
HashMap (java.util.HashMap): 10 usages
GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser): 10 usages
IOException (java.io.IOException): 8 usages
HCatException (org.apache.hive.hcatalog.common.HCatException): 8 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 6 usages
Test (org.junit.Test): 6 usages
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 5 usages
Properties (java.util.Properties): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
ResourceSchema (org.apache.pig.ResourceSchema): 4 usages
FrontendException (org.apache.pig.impl.logicalLayer.FrontendException): 4 usages
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 3 usages
HCatRecord (org.apache.hive.hcatalog.data.HCatRecord): 3 usages
PigException (org.apache.pig.PigException): 3 usages
Map (java.util.Map): 2 usages