
Example 6 with HiveStorageHandler

Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

The class HCatBaseInputFormat, method getSplits.

/**
   * Logically split the set of input files for the job. Returns the
   * underlying InputFormat's splits
   * @param jobContext the job context object
   * @return the splits, an HCatInputSplit wrapper over the storage
   *         handler InputSplits
   * @throws IOException
   * @throws InterruptedException
   */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();
    //Get the job info from the configuration,
    //throws exception if not initialized
    InputJobInfo inputJobInfo;
    try {
        inputJobInfo = getJobInfo(conf);
    } catch (Exception e) {
        throw new IOException(e);
    }
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<PartInfo> partitionInfoList = inputJobInfo.getPartitions();
    if (partitionInfoList == null) {
        //No partitions match the specified partition filter
        return splits;
    }
    HiveStorageHandler storageHandler;
    JobConf jobConf;
    //For each matching partition, call getSplits on the underlying InputFormat
    for (PartInfo partitionInfo : partitionInfoList) {
        jobConf = HCatUtil.getJobConfFromContext(jobContext);
        List<String> setInputPath = setInputPath(jobConf, partitionInfo.getLocation());
        if (setInputPath.isEmpty()) {
            continue;
        }
        Map<String, String> jobProperties = partitionInfo.getJobProperties();
        HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);
        storageHandler = HCatUtil.getStorageHandler(jobConf, partitionInfo);
        //Get the input format
        Class inputFormatClass = storageHandler.getInputFormatClass();
        org.apache.hadoop.mapred.InputFormat inputFormat = getMapRedInputFormat(jobConf, inputFormatClass);
        //Call getSplit on the InputFormat, create an HCatSplit for each
        //underlying split. When the desired number of input splits is missing,
        //use a default number (denoted by zero).
        //TODO(malewicz): Currently each partition is split independently into
        //a desired number. However, we want the union of all partitions to be
        //split into a desired number while maintaining balanced sizes of input
        //splits.
        int desiredNumSplits = conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 0);
        org.apache.hadoop.mapred.InputSplit[] baseSplits = inputFormat.getSplits(jobConf, desiredNumSplits);
        for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
            splits.add(new HCatSplit(partitionInfo, split));
        }
    }
    return splits;
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) IOException(java.io.IOException) HCatException(org.apache.hive.hcatalog.common.HCatException) InputSplit(org.apache.hadoop.mapreduce.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf)
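
For context, a minimal read-side driver sketch showing how the getSplits() path above is usually reached from a MapReduce job. The database name, table name, and split-count hint are illustrative placeholders, not values taken from the example.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatReadDriverSketch {

    public static Job buildJob(Configuration conf) throws Exception {
        // Optional hint read back by getSplits(); 0 (the default) lets the underlying
        // InputFormat choose the number of splits on its own.
        conf.setInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 4);
        Job job = Job.getInstance(conf, "hcat-read-sketch");
        // Resolves the matching partitions and serializes the InputJobInfo into the
        // configuration; getJobInfo(conf) deserializes it later in getSplits().
        HCatInputFormat.setInput(job, "default", "my_table");
        job.setInputFormatClass(HCatInputFormat.class);
        return job;
    }
}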

Example 7 with HiveStorageHandler

Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

The class HCatBaseOutputFormat, method getOutputFormat.

/**
   * Gets the output format instance.
   * @param context the job context
   * @return the output format instance
   * @throws IOException
   */
protected OutputFormat<WritableComparable<?>, HCatRecord> getOutputFormat(JobContext context) throws IOException {
    OutputJobInfo jobInfo = getJobInfo(context.getConfiguration());
    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), jobInfo.getTableInfo().getStorerInfo());
    // Always configure storage handler with jobproperties/jobconf before calling any methods on it
    configureOutputStorageHandler(context);
    if (storageHandler instanceof FosterStorageHandler) {
        return new FileOutputFormatContainer(ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), context.getConfiguration()));
    } else {
        return new DefaultOutputFormatContainer(ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), context.getConfiguration()));
    }
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler)
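
A hedged sketch of when the FosterStorageHandler branch above is taken: for a plain file-backed table with no STORED BY handler, HCatUtil.getStorageHandler is expected to fall back to a FosterStorageHandler that wraps the table's file formats, so writes go through the file-based container. The SerDe and format class names below are the usual text-table defaults, used here only as assumptions for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hive.hcatalog.common.HCatUtil;

public class FosterHandlerProbe {

    public static String resolveHandlerClass(Configuration conf) throws Exception {
        HiveStorageHandler handler = HCatUtil.getStorageHandler(
            conf,
            // no STORED BY handler on the table
            null,
            "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
            "org.apache.hadoop.mapred.TextInputFormat",
            "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");
        // Expected result (assumption): "FosterStorageHandler", which getOutputFormat()
        // above wraps in a FileOutputFormatContainer.
        return handler.getClass().getSimpleName();
    }
}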

Example 8 with HiveStorageHandler

Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

The class HCatOutputFormat, method setOutput.

/**
   * Set the information about the output to write for the job. This queries the metadata server
   * to find the StorageHandler to use for the table.  It throws an error if the
   * partition is already published.
   * @param conf the Configuration object
   * @param credentials the Credentials object
   * @param outputJobInfo the table output information for the job
   * @throws IOException the exception in communicating with the metadata server
   */
@SuppressWarnings("unchecked")
public static void setOutput(Configuration conf, Credentials credentials, OutputJobInfo outputJobInfo) throws IOException {
    IMetaStoreClient client = null;
    try {
        HiveConf hiveConf = HCatUtil.getHiveConf(conf);
        client = HCatUtil.getHiveMetastoreClient(hiveConf);
        Table table = HCatUtil.getTable(client, outputJobInfo.getDatabaseName(), outputJobInfo.getTableName());
        List<String> indexList = client.listIndexNames(outputJobInfo.getDatabaseName(), outputJobInfo.getTableName(), Short.MAX_VALUE);
        for (String indexName : indexList) {
            Index index = client.getIndex(outputJobInfo.getDatabaseName(), outputJobInfo.getTableName(), indexName);
            if (!index.isDeferredRebuild()) {
                throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a table with an automatic index from Pig/Mapreduce is not supported");
            }
        }
        StorageDescriptor sd = table.getTTable().getSd();
        if (sd.isCompressed()) {
            throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a compressed partition from Pig/Mapreduce is not supported");
        }
        if (sd.getBucketCols() != null && !sd.getBucketCols().isEmpty()) {
            throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a partition with bucket definition from Pig/Mapreduce is not supported");
        }
        if (sd.getSortCols() != null && !sd.getSortCols().isEmpty()) {
            throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a partition with sorted column definition from Pig/Mapreduce is not supported");
        }
        // Set up a common id hash for this job, so that when we create any temporary directory
        // later on, it is guaranteed to be unique.
        String idHash;
        if ((idHash = conf.get(HCatConstants.HCAT_OUTPUT_ID_HASH)) == null) {
            idHash = String.valueOf(Math.random());
        }
        conf.set(HCatConstants.HCAT_OUTPUT_ID_HASH, idHash);
        if (table.getTTable().getPartitionKeysSize() == 0) {
            if ((outputJobInfo.getPartitionValues() != null) && (!outputJobInfo.getPartitionValues().isEmpty())) {
                // attempt made to save partition values in non-partitioned table - throw error.
                throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES, "Partition values specified for non-partitioned table");
            }
            // non-partitioned table
            outputJobInfo.setPartitionValues(new HashMap<String, String>());
        } else {
            // partitioned table, we expect partition values
            // convert user specified map to have lower case key names
            Map<String, String> valueMap = new HashMap<String, String>();
            if (outputJobInfo.getPartitionValues() != null) {
                for (Map.Entry<String, String> entry : outputJobInfo.getPartitionValues().entrySet()) {
                    valueMap.put(entry.getKey().toLowerCase(), entry.getValue());
                }
            }
            if ((outputJobInfo.getPartitionValues() == null) || (outputJobInfo.getPartitionValues().size() < table.getTTable().getPartitionKeysSize())) {
                // dynamic partition usecase - partition values were null, or not all were specified
                // need to figure out which keys are not specified.
                List<String> dynamicPartitioningKeys = new ArrayList<String>();
                boolean firstItem = true;
                for (FieldSchema fs : table.getPartitionKeys()) {
                    if (!valueMap.containsKey(fs.getName().toLowerCase())) {
                        dynamicPartitioningKeys.add(fs.getName().toLowerCase());
                    }
                }
                if (valueMap.size() + dynamicPartitioningKeys.size() != table.getTTable().getPartitionKeysSize()) {
                    // If this isn't equal, then bogus key values have been inserted, error out.
                    throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES, "Invalid partition keys specified");
                }
                outputJobInfo.setDynamicPartitioningKeys(dynamicPartitioningKeys);
                String dynHash;
                if ((dynHash = conf.get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID)) == null) {
                    dynHash = String.valueOf(Math.random());
                }
                conf.set(HCatConstants.HCAT_DYNAMIC_PTN_JOBID, dynHash);
                // if custom pattern is set in case of dynamic partitioning, configure custom path
                String customPattern = conf.get(HCatConstants.HCAT_DYNAMIC_CUSTOM_PATTERN);
                if (customPattern != null) {
                    HCatFileUtil.setCustomPath(customPattern, outputJobInfo);
                }
            }
            outputJobInfo.setPartitionValues(valueMap);
        }
        // To get around hbase failure on single node, see BUG-4383
        conf.set("dfs.client.read.shortcircuit", "false");
        HCatSchema tableSchema = HCatUtil.extractSchema(table);
        StorerInfo storerInfo = InternalUtil.extractStorerInfo(table.getTTable().getSd(), table.getParameters());
        List<String> partitionCols = new ArrayList<String>();
        for (FieldSchema schema : table.getPartitionKeys()) {
            partitionCols.add(schema.getName());
        }
        HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, storerInfo);
        //Serialize the output info into the configuration
        outputJobInfo.setTableInfo(HCatTableInfo.valueOf(table.getTTable()));
        outputJobInfo.setOutputSchema(tableSchema);
        harRequested = getHarRequested(hiveConf);
        outputJobInfo.setHarRequested(harRequested);
        maxDynamicPartitions = getMaxDynamicPartitions(hiveConf);
        outputJobInfo.setMaximumDynamicPartitions(maxDynamicPartitions);
        HCatUtil.configureOutputStorageHandler(storageHandler, conf, outputJobInfo);
        Path tblPath = new Path(table.getTTable().getSd().getLocation());
        /* Set the umask in conf such that files/dirs get created with table-dir
         * permissions. The following three assumptions are made:
         * 1. Actual file/dir creation is done by the RecordWriter of the underlying
         *    output format; it is assumed to use default permissions at creation time.
         * 2. Default permissions = FsPermission.getDefault() = 777.
         * 3. The umask is honored by the underlying filesystem.
         */
        FsPermission.setUMask(conf, FsPermission.getDefault().applyUMask(tblPath.getFileSystem(conf).getFileStatus(tblPath).getPermission()));
        if (Security.getInstance().isSecurityEnabled()) {
            Security.getInstance().handleSecurity(credentials, outputJobInfo, client, conf, harRequested);
        }
    } catch (Exception e) {
        if (e instanceof HCatException) {
            throw (HCatException) e;
        } else {
            throw new HCatException(ErrorType.ERROR_SET_OUTPUT, e);
        }
    } finally {
        HCatUtil.closeHiveClientQuietly(client);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) Table(org.apache.hadoop.hive.ql.metadata.Table) HashMap(java.util.HashMap) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) HCatException(org.apache.hive.hcatalog.common.HCatException) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ArrayList(java.util.ArrayList) Index(org.apache.hadoop.hive.metastore.api.Index) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) IOException(java.io.IOException) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Map(java.util.Map)
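
For orientation, a minimal write-side driver sketch showing how setOutput() is usually reached. It uses the Job-based overload, which is assumed to delegate to the method above; the database, table, and partition values are placeholders.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;

public class HCatWriteDriverSketch {

    public static Job buildJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "hcat-write-sketch");
        // Static partition value for the write (placeholder key/value).
        Map<String, String> partitionValues = new HashMap<String, String>();
        partitionValues.put("ds", "2024-01-01");
        // Queries the metastore, validates the target table/partition, and serializes
        // the OutputJobInfo into the job configuration (the method shown above).
        HCatOutputFormat.setOutput(job, OutputJobInfo.create("default", "my_table", partitionValues));
        // Reuse the table schema recorded by setOutput() as the writer schema.
        HCatSchema schema = HCatOutputFormat.getTableSchema(job.getConfiguration());
        HCatOutputFormat.setSchema(job, schema);
        job.setOutputFormatClass(HCatOutputFormat.class);
        return job;
    }
}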

Example 9 with HiveStorageHandler

Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

The class CreateTableHook, method postAnalyze.

@Override
public void postAnalyze(HiveSemanticAnalyzerHookContext context, List<Task<? extends Serializable>> rootTasks) throws SemanticException {
    if (rootTasks.size() == 0) {
        // No DDL task was created (e.g. CREATE TABLE IF NOT EXISTS on an existing table);
        // nothing to do.
        return;
    }
    CreateTableDesc desc = ((DDLTask) rootTasks.get(rootTasks.size() - 1)).getWork().getCreateTblDesc();
    if (desc == null) {
        // desc is set by the pre-hook, so it is not expected to be null here;
        // return defensively if it is.
        return;
    }
    Map<String, String> tblProps = desc.getTblProps();
    if (tblProps == null) {
        // tblProps will be null if the user did not specify TBLPROPERTIES in the
        // CREATE TABLE command.
        tblProps = new HashMap<String, String>();
    }
    // First check whether the user is allowed to create the table.
    String storageHandler = desc.getStorageHandler();
    if (!StringUtils.isEmpty(storageHandler)) {
        try {
            // Instantiating the handler validates it. Authorization checks are performed by
            // storageHandler.getAuthorizationProvider() if StorageDelegationAuthorizationProvider is used.
            HiveStorageHandler storageHandlerInst = HCatUtil.getStorageHandler(context.getConf(), desc.getStorageHandler(), desc.getSerName(), desc.getInputFormat(), desc.getOutputFormat());
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
    if (desc != null) {
        try {
            Table table = context.getHive().newTable(desc.getTableName());
            if (desc.getLocation() != null) {
                table.setDataLocation(new Path(desc.getLocation()));
            }
            if (desc.getStorageHandler() != null) {
                table.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE, desc.getStorageHandler());
            }
            for (Map.Entry<String, String> prop : tblProps.entrySet()) {
                table.setProperty(prop.getKey(), prop.getValue());
            }
            for (Map.Entry<String, String> prop : desc.getSerdeProps().entrySet()) {
                table.setSerdeParam(prop.getKey(), prop.getValue());
            }
            if (HCatAuthUtil.isAuthorizationEnabled(context.getConf())) {
                authorize(table, Privilege.CREATE);
            }
        } catch (HiveException ex) {
            throw new SemanticException(ex);
        }
    }
    desc.setTblProps(tblProps);
    context.getConf().set(HCatConstants.HCAT_CREATE_TBL_NAME, tableName);
}
Also used : Path(org.apache.hadoop.fs.Path) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) CreateTableDesc(org.apache.hadoop.hive.ql.plan.CreateTableDesc) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) HashMap(java.util.HashMap) Map(java.util.Map) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
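
A small sketch of the validation the hook performs: resolving a storage handler class by name through HCatUtil.getStorageHandler, as the hook does with values taken from CreateTableDesc. The HBase handler class name in the usage note is only an illustrative assumption; a class that cannot be instantiated surfaces as an IOException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hive.hcatalog.common.HCatUtil;

public class StorageHandlerValidationSketch {

    public static HiveStorageHandler resolve(Configuration conf, String handlerClass) throws Exception {
        // Mirrors the hook's call; in the hook the SerDe and input/output format arguments
        // come from CreateTableDesc, here they are left null on the assumption that the
        // named handler supplies its own defaults.
        return HCatUtil.getStorageHandler(conf, handlerClass, null, null, null);
    }
}

Hypothetical usage: resolve(conf, "org.apache.hadoop.hive.hbase.HBaseStorageHandler").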

Example 10 with HiveStorageHandler

Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

The class InitializeInput, method extractPartInfo.

private static PartInfo extractPartInfo(HCatSchema schema, StorageDescriptor sd, Map<String, String> parameters, Configuration conf, InputJobInfo inputJobInfo) throws IOException {
    StorerInfo storerInfo = InternalUtil.extractStorerInfo(sd, parameters);
    Properties hcatProperties = new Properties();
    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, storerInfo);
    // copy the properties from storageHandler to jobProperties
    Map<String, String> jobProperties = HCatUtil.getInputJobProperties(storageHandler, inputJobInfo);
    for (String key : parameters.keySet()) {
        hcatProperties.put(key, parameters.get(key));
    }
    // Bloating partinfo with inputJobInfo is not good
    return new PartInfo(schema, storageHandler, sd.getLocation(), hcatProperties, jobProperties, inputJobInfo.getTableInfo());
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) Properties(java.util.Properties)
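
The PartInfo built here is attached to an InputJobInfo; a minimal sketch of how such an InputJobInfo is created on the client side (the names and the null partition filter are placeholders). InitializeInput is then assumed to resolve the matching partitions and call extractPartInfo() once per partition.

import java.util.Properties;
import org.apache.hive.hcatalog.mapreduce.InputJobInfo;

public class InputJobInfoSketch {

    public static InputJobInfo forTable() {
        // A null filter means "read all partitions"; extra read properties can be passed
        // through the Properties object.
        return InputJobInfo.create("default", "my_table", null, new Properties());
    }
}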

Aggregations

HiveStorageHandler (org.apache.hadoop.hive.ql.metadata.HiveStorageHandler): 15
IOException (java.io.IOException): 6
Path (org.apache.hadoop.fs.Path): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
Map (java.util.Map): 4
Configuration (org.apache.hadoop.conf.Configuration): 4
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 4
Table (org.apache.hadoop.hive.ql.metadata.Table): 4
ArrayList (java.util.ArrayList): 3
HashMap (java.util.HashMap): 3
HCatException (org.apache.hive.hcatalog.common.HCatException): 3
Properties (java.util.Properties): 2
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 2
LazySimpleSerDe (org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe): 2
InputFormat (org.apache.hadoop.mapred.InputFormat): 2
ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 1
EOFException (java.io.EOFException): 1
FileNotFoundException (java.io.FileNotFoundException): 1
SQLException (java.sql.SQLException): 1
SQLFeatureNotSupportedException (java.sql.SQLFeatureNotSupportedException): 1