
Example 1 with HiveStorageHandler

use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

Class HCatTable, method storageHandler:

/**
   * Setter for StorageHandler class.
   */
public HCatTable storageHandler(String storageHandler) throws HCatException {
    this.tblProps.put(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE, storageHandler);
    LOG.warn("HiveStorageHandlers can't be reliably instantiated on the client-side. " + "Attempting to derive Input/OutputFormat settings from StorageHandler, on best effort: ");
    try {
        HiveStorageHandler sh = HiveUtils.getStorageHandler(getConf(), storageHandler);
        this.sd.setInputFormat(sh.getInputFormatClass().getName());
        this.sd.setOutputFormat(sh.getOutputFormatClass().getName());
        this.sd.getSerdeInfo().setSerializationLib(sh.getSerDeClass().getName());
    } catch (HiveException e) {
        LOG.warn("Could not derive Input/OutputFormat and SerDe settings from storageHandler. " + "These values need to be set explicitly.", e);
    }
    return this;
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
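For context, a minimal, hypothetical usage sketch (not part of the Hive sources above). It assumes the HCatTable(dbName, tableName) constructor from org.apache.hive.hcatalog.api; the HBase handler class name is only an illustrative choice.

// Hypothetical helper showing how the fluent setter above might be used.
// If the handler cannot be instantiated client-side, the Input/OutputFormat
// and SerDe must then be set explicitly on the table definition.
public static HCatTable hbaseBackedTable(String db, String name) throws HCatException {
    return new HCatTable(db, name)
        .storageHandler("org.apache.hadoop.hive.hbase.HBaseStorageHandler");
}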

Example 2 with HiveStorageHandler

use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

Class Utilities, method getInputSummary:

/**
   * Calculate the total size of input files.
   *
   * @param ctx
   *          the hadoop job context
   * @param work
   *          map reduce job plan
   * @param filter
   *          filter to apply to the input paths before calculating size
   * @return the summary of all the input paths.
   * @throws IOException
   */
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
    long[] summary = { 0, 0, 0 };
    final Set<Path> pathNeedProcess = new HashSet<>();
    // Synchronize so that concurrent calls do not spawn an unbounded number of threads.
    synchronized (INPUT_SUMMARY_LOCK) {
        // For each input path, calculate the total size.
        for (Path path : work.getPathToAliases().keySet()) {
            Path p = path;
            if (filter != null && !filter.accept(p)) {
                continue;
            }
            ContentSummary cs = ctx.getCS(path);
            if (cs == null) {
                if (path == null) {
                    continue;
                }
                pathNeedProcess.add(path);
            } else {
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
            }
        }
        // Process the case when name node call is needed
        final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
        ArrayList<Future<?>> results = new ArrayList<Future<?>>();
        final ExecutorService executor;
        int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
        if (numExecutors > 1) {
            LOG.info("Using " + numExecutors + " threads for getContentSummary");
            executor = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Summary-%d").build());
        } else {
            executor = null;
        }
        HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {

            @Override
            public void interrupt() {
                for (Path path : pathNeedProcess) {
                    try {
                        path.getFileSystem(ctx.getConf()).close();
                    } catch (IOException ignore) {
                        LOG.debug("Failed to close filesystem", ignore);
                    }
                }
                if (executor != null) {
                    executor.shutdownNow();
                }
            }
        });
        try {
            Configuration conf = ctx.getConf();
            JobConf jobConf = new JobConf(conf);
            for (Path path : pathNeedProcess) {
                final Path p = path;
                final String pathStr = path.toString();
                // All threads share the same Configuration and JobConf, on the
                // assumption that they are thread safe as long as only read
                // operations are performed. Hadoop's javadoc does not state this,
                // but the source code clearly shows an effort to make it so, and
                // we believe the assumption holds. Revisit this code if that
                // turns out to be incorrect.
                final Configuration myConf = conf;
                final JobConf myJobConf = jobConf;
                final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
                final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
                final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
                Runnable r = new Runnable() {

                    @Override
                    public void run() {
                        try {
                            Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
                            InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
                            if (inputFormatObj instanceof ContentSummaryInputFormat) {
                                ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
                                resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
                                return;
                            }
                            String metaTableStorage = null;
                            if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
                                metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
                            }
                            if (partDesc.getProperties() != null) {
                                metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
                            }
                            HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
                            if (handler instanceof InputEstimator) {
                                long total = 0;
                                TableDesc tableDesc = partDesc.getTableDesc();
                                InputEstimator estimator = (InputEstimator) handler;
                                for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
                                    JobConf jobConf = new JobConf(myJobConf);
                                    TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
                                    Utilities.setColumnNameList(jobConf, scanOp, true);
                                    Utilities.setColumnTypeList(jobConf, scanOp, true);
                                    PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
                                    Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
                                    total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
                                }
                                resultMap.put(pathStr, new ContentSummary(total, -1, -1));
                            } else {
                                // TODO: the summary should be nullified for non-native tables,
                                // so that they are not selected as a map-join target.
                                FileSystem fs = p.getFileSystem(myConf);
                                resultMap.put(pathStr, fs.getContentSummary(p));
                            }
                        } catch (Exception e) {
                            // It is safe to ignore this exception for summary data.
                            // The cache is not updated, to keep it from polluting other
                            // usages. In the worst case the IOException simply causes the
                            // summary to be recomputed on the next getInputSummary() call,
                            // which is acceptable since IOException is not a common case here.
                            LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
                        }
                    }
                };
                if (executor == null) {
                    r.run();
                } else {
                    Future<?> result = executor.submit(r);
                    results.add(result);
                }
            }
            if (executor != null) {
                for (Future<?> result : results) {
                    boolean executorDone = false;
                    do {
                        try {
                            result.get();
                            executorDone = true;
                        } catch (InterruptedException e) {
                            LOG.info("Interrupted when waiting threads: ", e);
                            Thread.currentThread().interrupt();
                            break;
                        } catch (ExecutionException e) {
                            throw new IOException(e);
                        }
                    } while (!executorDone);
                }
                executor.shutdown();
            }
            HiveInterruptUtils.checkInterrupted();
            for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
                ContentSummary cs = entry.getValue();
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
                ctx.addCS(entry.getKey(), cs);
                LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength() + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
            }
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
            return new ContentSummary(summary[0], summary[1], summary[2]);
        } finally {
            HiveInterruptUtils.remove(interrup);
        }
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) ExecutionException(java.util.concurrent.ExecutionException) JobConf(org.apache.hadoop.mapred.JobConf) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) InputEstimator(org.apache.hadoop.hive.ql.metadata.InputEstimator) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) HiveInterruptCallback(org.apache.hadoop.hive.common.HiveInterruptCallback) IOException(java.io.IOException) SQLFeatureNotSupportedException(java.sql.SQLFeatureNotSupportedException) SQLTransientException(java.sql.SQLTransientException) SQLException(java.sql.SQLException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ContentSummary(org.apache.hadoop.fs.ContentSummary) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap)
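For reference, a hedged sketch of a call site for this helper; ctx, mapWork, and LOG are assumed to come from the surrounding compilation code, and the path-name check in the filter is purely illustrative.

// Illustrative only: summarize the inputs of a MapWork, skipping temporary paths.
PathFilter skipTemp = new PathFilter() {
    @Override
    public boolean accept(Path path) {
        return !path.getName().startsWith("_tmp");   // hypothetical naming convention
    }
};
ContentSummary total = Utilities.getInputSummary(ctx, mapWork, skipTemp);
LOG.info("Input size: " + total.getLength() + " bytes in " + total.getFileCount()
    + " files and " + total.getDirectoryCount() + " directories");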

Example 3 with HiveStorageHandler

use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

Class HCatUtil, method getStorageHandler:

/**
   * Create an instance of a storage handler. If storageHandler == null,
   * a surrogate StorageHandler is used to encapsulate the InputFormat, OutputFormat and SerDe.
   * This StorageHandler assumes the other supplied storage artifacts are for a file-based storage system.
   * @param conf job configuration, used to configure the Configurable StorageHandler
   * @param storageHandler fully qualified class name of the desired StorageHandler instance
   * @param serDe fully qualified class name of the desired SerDe instance
   * @param inputFormat fully qualified class name of the desired InputFormat instance
   * @param outputFormat fully qualified class name of the desired OutputFormat instance
   * @return storageHandler instance
   * @throws IOException
   */
public static HiveStorageHandler getStorageHandler(Configuration conf, String storageHandler, String serDe, String inputFormat, String outputFormat) throws IOException {
    if ((storageHandler == null) || (storageHandler.equals(FosterStorageHandler.class.getName()))) {
        try {
            FosterStorageHandler fosterStorageHandler = new FosterStorageHandler(inputFormat, outputFormat, serDe);
            fosterStorageHandler.setConf(conf);
            return fosterStorageHandler;
        } catch (ClassNotFoundException e) {
            throw new IOException("Failed to load " + "foster storage handler", e);
        }
    }
    try {
        Class<? extends HiveStorageHandler> handlerClass = (Class<? extends HiveStorageHandler>) Class.forName(storageHandler, true, Utilities.getSessionSpecifiedClassLoader());
        return (HiveStorageHandler) ReflectionUtils.newInstance(handlerClass, conf);
    } catch (ClassNotFoundException e) {
        throw new IOException("Error in loading storage handler." + e.getMessage(), e);
    }
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) FosterStorageHandler(org.apache.hive.hcatalog.mapreduce.FosterStorageHandler) IOException(java.io.IOException)
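A hedged usage sketch: passing null as the handler class name makes the method fall back to the file-oriented FosterStorageHandler. The SerDe and format class names below are ordinary Hive/Hadoop classes chosen for illustration; the call site itself is not from the sources above.

// Illustrative: no explicit handler, so a FosterStorageHandler wrapping the
// supplied file-based InputFormat/OutputFormat/SerDe is returned.
HiveStorageHandler handler = HCatUtil.getStorageHandler(
    conf,                                                // job Configuration
    null,                                                // no handler class -> foster handler
    "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
    "org.apache.hadoop.mapred.TextInputFormat",
    "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");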

Example 4 with HiveStorageHandler

use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

Class FileOutputFormatContainer, method getRecordWriter:

@Override
public RecordWriter<WritableComparable<?>, HCatRecord> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    // This needs to be set manually; under normal circumstances the MR Task does this.
    setWorkOutputPath(context);
    // Configure the output key and value classes.
    // This is required for writing null as the key for file-based tables.
    context.getConfiguration().set("mapred.output.key.class", NullWritable.class.getName());
    String jobInfoString = context.getConfiguration().get(HCatConstants.HCAT_KEY_OUTPUT_INFO);
    OutputJobInfo jobInfo = (OutputJobInfo) HCatUtil.deserialize(jobInfoString);
    StorerInfo storeInfo = jobInfo.getTableInfo().getStorerInfo();
    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), storeInfo);
    Class<? extends AbstractSerDe> serde = storageHandler.getSerDeClass();
    AbstractSerDe sd = (AbstractSerDe) ReflectionUtils.newInstance(serde, context.getConfiguration());
    context.getConfiguration().set("mapred.output.value.class", sd.getSerializedClass().getName());
    RecordWriter<WritableComparable<?>, HCatRecord> rw;
    if (HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed()) {
        // When Dynamic partitioning is used, the RecordWriter instance initialized here isn't used. Can use null.
        // (That's because records can't be written until the values of the dynamic partitions are deduced.
        // By that time, a new local instance of RecordWriter, with the correct output-path, will be constructed.)
        rw = new DynamicPartitionFileRecordWriterContainer((org.apache.hadoop.mapred.RecordWriter) null, context);
    } else {
        Path parentDir = new Path(context.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueName(new JobConf(context.getConfiguration()), context.getConfiguration().get("mapreduce.output.basename", "part")));
        rw = new StaticPartitionFileRecordWriterContainer(getBaseOutputFormat().getRecordWriter(parentDir.getFileSystem(context.getConfiguration()), new JobConf(context.getConfiguration()), childPath.toString(), InternalUtil.createReporter(context)), context);
    }
    return rw;
}
Also used : Path(org.apache.hadoop.fs.Path) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) NullWritable(org.apache.hadoop.io.NullWritable) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) WritableComparable(org.apache.hadoop.io.WritableComparable) JobConf(org.apache.hadoop.mapred.JobConf) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
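For orientation, a hedged sketch of the driver-side job setup that eventually exercises this getRecordWriter path. The database/table names are placeholders, and the exact HCatOutputFormat/OutputJobInfo signatures should be checked against the HCatalog version in use.

// Illustrative job setup; the framework later asks the configured output format
// for a record writer in each task, which is where the method above runs.
Job job = Job.getInstance(conf, "write-to-hcat");
HCatOutputFormat.setOutput(job, OutputJobInfo.create("default", "events", null)); // null = unpartitioned
HCatOutputFormat.setSchema(job, HCatOutputFormat.getTableSchema(job.getConfiguration()));
job.setOutputFormatClass(HCatOutputFormat.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DefaultHCatRecord.class);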

Example 5 with HiveStorageHandler

use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.

Class HCatBaseInputFormat, method createRecordReader:

/**
   * Create the RecordReader for the given InputSplit. Returns the underlying
   * RecordReader if the required operations are supported and the schema matches
   * the HCatTable schema. Returns an HCatRecordReader if the operations need to
   * be implemented in HCat.
   * @param split the split
   * @param taskContext the task attempt context
   * @return the record reader instance, either an HCatRecordReader (later) or
   *         the underlying storage handler's RecordReader
   * @throws IOException or InterruptedException
   */
@Override
public RecordReader<WritableComparable, HCatRecord> createRecordReader(InputSplit split, TaskAttemptContext taskContext) throws IOException, InterruptedException {
    HCatSplit hcatSplit = InternalUtil.castToHCatSplit(split);
    PartInfo partitionInfo = hcatSplit.getPartitionInfo();
    // Ensure PartInfo's TableInfo is initialized.
    if (partitionInfo.getTableInfo() == null) {
        partitionInfo.setTableInfo(((InputJobInfo) HCatUtil.deserialize(taskContext.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO))).getTableInfo());
    }
    JobContext jobContext = taskContext;
    Configuration conf = jobContext.getConfiguration();
    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, partitionInfo);
    JobConf jobConf = HCatUtil.getJobConfFromContext(jobContext);
    Map<String, String> jobProperties = partitionInfo.getJobProperties();
    HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);
    Map<String, Object> valuesNotInDataCols = getColValsNotInDataColumns(getOutputSchema(conf), partitionInfo);
    return new HCatRecordReader(storageHandler, valuesNotInDataCols);
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) Configuration(org.apache.hadoop.conf.Configuration) JobContext(org.apache.hadoop.mapreduce.JobContext) JobConf(org.apache.hadoop.mapred.JobConf)
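And a hedged sketch of the read-side counterpart: HCatInputFormat.setInput configures the table to read, and the per-task record readers are then created by the method above. The database, table, and mapper names are placeholders.

// Illustrative: configure an MR job to read through HCatalog.
Job job = Job.getInstance(conf, "read-from-hcat");
HCatInputFormat.setInput(job, "default", "events");
job.setInputFormatClass(HCatInputFormat.class);
job.setMapperClass(EventsMapper.class);   // hypothetical Mapper<WritableComparable, HCatRecord, ?, ?>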

Aggregations

HiveStorageHandler (org.apache.hadoop.hive.ql.metadata.HiveStorageHandler): 15 usages
IOException (java.io.IOException): 6 usages
Path (org.apache.hadoop.fs.Path): 5 usages
JobConf (org.apache.hadoop.mapred.JobConf): 5 usages
Map (java.util.Map): 4 usages
Configuration (org.apache.hadoop.conf.Configuration): 4 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 4 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 4 usages
ArrayList (java.util.ArrayList): 3 usages
HashMap (java.util.HashMap): 3 usages
HCatException (org.apache.hive.hcatalog.common.HCatException): 3 usages
Properties (java.util.Properties): 2 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 2 usages
LazySimpleSerDe (org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe): 2 usages
InputFormat (org.apache.hadoop.mapred.InputFormat): 2 usages
ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 1 usage
EOFException (java.io.EOFException): 1 usage
FileNotFoundException (java.io.FileNotFoundException): 1 usage
SQLException (java.sql.SQLException): 1 usage
SQLFeatureNotSupportedException (java.sql.SQLFeatureNotSupportedException): 1 usage