Example 41 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

In class Utilities, method reworkMapRedWork:

/**
   * The check here is not clean. It first uses a for loop to go through all
   * input formats, collecting the ones that extend ReworkMapredInputFormat
   * into a set, and then goes through that set and calls rework on each one.
   *
   * Technically all of this could be avoided if all of Hive's input formats
   * shared a common interface. In today's Hive and Hadoop that is not
   * possible, because many of the input formats Hive uses live in Hadoop's
   * code base, and most of Hadoop's input formats just implement the
   * InputFormat interface.
   *
   * @param task
   * @param reworkMapredWork
   * @param conf
   * @throws SemanticException
   */
public static void reworkMapRedWork(Task<? extends Serializable> task, boolean reworkMapredWork, HiveConf conf) throws SemanticException {
    if (reworkMapredWork && (task instanceof MapRedTask)) {
        try {
            MapredWork mapredWork = ((MapRedTask) task).getWork();
            Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
            for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
                Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
                if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
                    reworkInputFormats.add(inputFormatCls);
                }
            }
            if (reworkInputFormats.size() > 0) {
                for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
                    ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
                    inst.rework(conf, mapredWork);
                }
            }
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) IOException(java.io.IOException) HashSet(java.util.HashSet) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
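
For context, a format opts into this rework pass by implementing ReworkMapredInputFormat. The signature below is inferred from the call site above (inst.rework(conf, mapredWork), catching IOException); the class itself is a hypothetical sketch, not part of Hive:

import java.io.IOException;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical input format: extends a concrete InputFormat so it can appear
// in pathToPartitionInfo, and opts into the rework pass via the interface.
// ReflectionUtil.newInstance requires the no-arg constructor this inherits.
public class MyReworkableInputFormat extends TextInputFormat implements ReworkMapredInputFormat {

    @Override
    public void rework(HiveConf job, MapredWork work) throws IOException {
        // Rewrite the plan before execution. Hive's SymlinkTextInputFormat,
        // for example, resolves symlink target paths in its rework method.
    }
}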

Example 42 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

In class Utilities, method createDummyFileForEmptyTable:

@SuppressWarnings("rawtypes")
private static Path createDummyFileForEmptyTable(JobConf job, MapWork work, Path hiveScratchDir, String alias) throws Exception {
    TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
    if (tableDesc.isNonNative()) {
        // a non-native table (one backed by a storage handler) does not store its data as plain files, so we cannot create an empty file for it.
        return null;
    }
    Properties props = tableDesc.getProperties();
    HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, tableDesc);
    Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false);
    if (LOG.isInfoEnabled()) {
        LOG.info("Changed input file for alias " + alias + " to " + newPath);
    }
    // update the work
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = work.getPathToAliases();
    ArrayList<String> newList = new ArrayList<String>();
    newList.add(alias);
    pathToAliases.put(newPath, newList);
    work.setPathToAliases(pathToAliases);
    PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
    work.addPathToPartitionInfo(newPath, pDesc);
    return newPath;
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Properties(java.util.Properties)
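
A hedged sketch of the calling pattern: the surrounding Utilities code substitutes this dummy path for aliases that end up with no real input, so the job still schedules a mapper for their operator trees. Everything here except the method signature is hypothetical:

// Hypothetical caller: back every alias that has no real input with an
// empty file so the MapReduce job still runs a mapper for its operators.
for (String alias : aliasesWithNoInput) {  // hypothetical collection
    Path dummy = createDummyFileForEmptyTable(job, work, hiveScratchDir, alias);
    // dummy == null means a non-native table: there is no file-based
    // storage to substitute, so the alias is left untouched.
}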

Example 43 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

In class HiveIndexedInputFormat, method doGetSplits:

public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException {
    super.init(job);
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    // for each dir, get the InputFormat, and do getSplits.
    PartitionDesc part;
    for (Path dir : dirs) {
        part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, IOPrepareCache.get().allocatePartitionDescMap(), true);
        // create a new InputFormat instance if this is the first time this class is seen
        Class inputFormatClass = part.getInputFileFormatClass();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);
        FileInputFormat.setInputPaths(newjob, dir);
        newjob.setInputFormat(inputFormat.getClass());
        InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
        for (InputSplit is : iss) {
            result.add(new HiveInputSplit(is, inputFormatClass.getName()));
        }
    }
    return result.toArray(new HiveInputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) IOException(java.io.IOException) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
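
One detail worth flagging: numSplits / dirs.length is integer division, so when there are more input directories than requested splits each underlying format receives a hint of 0, which InputFormat implementations generally treat as "choose your own default". If a format honored the hint literally, a defensive variant, shown purely as a sketch, would clamp it:

// Sketch: guarantee each directory a split hint of at least 1, in case the
// underlying format takes the hint literally.
int perDirHint = Math.max(1, numSplits / dirs.length);
InputSplit[] iss = inputFormat.getSplits(newjob, perDirHint);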

Example 44 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

In class TableBasedIndexHandler, method generateIndexBuildTaskList:

@Override
public List<Task<?>> generateIndexBuildTaskList(org.apache.hadoop.hive.ql.metadata.Table baseTbl, org.apache.hadoop.hive.metastore.api.Index index, List<Partition> indexTblPartitions, List<Partition> baseTblPartitions, org.apache.hadoop.hive.ql.metadata.Table indexTbl, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws HiveException {
    try {
        TableDesc desc = Utilities.getTableDesc(indexTbl);
        List<Partition> newBaseTblPartitions = new ArrayList<Partition>();
        List<Task<?>> indexBuilderTasks = new ArrayList<Task<?>>();
        if (!baseTbl.isPartitioned()) {
            // the table has no partitions, so create the index for the whole table
            Task<?> indexBuilder = getIndexBuilderMapRedTask(inputs, outputs, index, false, new PartitionDesc(desc, null), indexTbl.getTableName(), new PartitionDesc(Utilities.getTableDesc(baseTbl), null), baseTbl.getTableName(), indexTbl.getDbName());
            indexBuilderTasks.add(indexBuilder);
        } else {
            // the table is partitioned: build the index partition by partition, matching each index partition to its base-table partition
            for (int i = 0; i < indexTblPartitions.size(); i++) {
                Partition indexPart = indexTblPartitions.get(i);
                Partition basePart = null;
                for (int j = 0; j < baseTblPartitions.size(); j++) {
                    if (baseTblPartitions.get(j).getName().equals(indexPart.getName())) {
                        basePart = baseTblPartitions.get(j);
                        newBaseTblPartitions.add(baseTblPartitions.get(j));
                        break;
                    }
                }
                if (basePart == null) {
                    throw new RuntimeException("Partitions of base table and index table are inconsistent.");
                }
                // for each partition, spawn a map reduce task.
                Task<?> indexBuilder = getIndexBuilderMapRedTask(inputs, outputs, index, true, new PartitionDesc(indexPart), indexTbl.getTableName(), new PartitionDesc(basePart), baseTbl.getTableName(), indexTbl.getDbName());
                indexBuilderTasks.add(indexBuilder);
            }
        }
        return indexBuilderTasks;
    } catch (Exception e) {
        throw new SemanticException(e);
    }
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) Task(org.apache.hadoop.hive.ql.exec.Task) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
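
The partition-matching loop above is quadratic in the number of partitions. An equivalent linear-time lookup indexes the base-table partitions by name first; a sketch (it additionally needs java.util.HashMap and java.util.Map imports, and relies only on the Partition.getName() contract already used above):

// Index base-table partitions by name once, then match each index partition
// with a single map lookup instead of a nested scan.
Map<String, Partition> basePartsByName = new HashMap<String, Partition>();
for (Partition p : baseTblPartitions) {
    basePartsByName.put(p.getName(), p);
}
for (Partition indexPart : indexTblPartitions) {
    Partition basePart = basePartsByName.get(indexPart.getName());
    if (basePart == null) {
        throw new RuntimeException("Partitions of base table and index table are inconsistent.");
    }
    newBaseTblPartitions.add(basePart);
    // ... create the per-partition index-builder task exactly as in the
    // original loop body ...
}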

Example 45 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

In class CombineHiveInputFormat, method getSplits:

/**
   * Create Hive splits based on CombineFileSplit.
   */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    Path[] paths = getInputPaths(job);
    List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
    List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
    int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM, (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
    // numThreads can be 0 when there are no input paths; in that case, Executors.newFixedThreadPool would fail.
    if (numThreads > 0) {
        try {
            Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
            for (int i = 0; i < paths.length; i++) {
                if (nonCombinablePathIndices.contains(i)) {
                    nonCombinablePaths.add(paths[i]);
                } else {
                    combinablePaths.add(paths[i]);
                }
            }
        } catch (Exception e) {
            LOG.error("Error checking non-combinable path", e);
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
            throw new IOException(e);
        }
    }
    // Store the previous value for the path specification
    String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    if (LOG.isDebugEnabled()) {
        LOG.debug("The received input paths are: [" + oldPaths + "] against the property " + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    }
    // Process the normal splits
    if (nonCombinablePaths.size() > 0) {
        FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
        InputSplit[] splits = super.getSplits(job, numSplits);
        for (InputSplit split : splits) {
            result.add(split);
        }
    }
    // Process the combine splits
    if (combinablePaths.size() > 0) {
        FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[combinablePaths.size()]));
        Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
        InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
        for (InputSplit split : splits) {
            result.add(split);
        }
    }
    // restore the original path specification in case some application depends on the original value being set.
    if (oldPaths != null) {
        job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
    }
    // clear the work map from the ThreadLocal after splits are generated, in case the thread is reused in a pool.
    Utilities.clearWorkMapForConf(job);
    LOG.info("Number of all splits " + result.size());
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new InputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)
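
The method saves the raw input-path property up front and writes it back at the end because both split passes call FileInputFormat.setInputPaths on the shared JobConf. One caveat: an exception thrown between the save and the restore leaves the JobConf repointed. A try/finally sketch makes the restore unconditional (whether that matters depends on how callers reuse the conf; this is a variant, not the code above):

// Sketch: restore the original input-path property even if split
// generation throws, so a reused JobConf is never left repointed.
String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
try {
    // ... set input paths and generate both normal and combine splits ...
} finally {
    if (oldPaths != null) {
        job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
    }
}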

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 58 usages
Path (org.apache.hadoop.fs.Path): 47 usages
ArrayList (java.util.ArrayList): 31 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 27 usages
LinkedHashMap (java.util.LinkedHashMap): 19 usages
HashMap (java.util.HashMap): 14 usages
Map (java.util.Map): 13 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 13 usages
JobConf (org.apache.hadoop.mapred.JobConf): 13 usages
IOException (java.io.IOException): 11 usages
Properties (java.util.Properties): 10 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 10 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 10 usages
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 10 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8 usages
Configuration (org.apache.hadoop.conf.Configuration): 7 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 7 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 7 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 7 usages