
Example 11 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMapRedUtils method createMergeTask.

/**
   * Create a block-level merge task for RCFile tables or a stripe-level
   * merge task for ORC tables.
   *
   * @param fsInputDesc the file sink descriptor whose output is to be merged
   * @param finalName the final destination path of the merged files
   * @param hasDynamicPartitions whether the sink uses dynamic partitioning
   * @param ctx the compilation context used to create the merge operator
   * @return a MapWork (concretely a MergeFileWork) describing the merge
   * @throws SemanticException if the table is stored as neither RCFile nor ORCFile
   */
public static MapWork createMergeTask(FileSinkDesc fsInputDesc, Path finalName, boolean hasDynamicPartitions, CompilationOpContext ctx) throws SemanticException {
    Path inputDir = fsInputDesc.getFinalDirName();
    TableDesc tblDesc = fsInputDesc.getTableInfo();
    List<Path> inputDirs = new ArrayList<Path>(1);
    ArrayList<String> inputDirstr = new ArrayList<String>(1);
    // for dynamic partitioning and list bucketing, inputDirs is left empty here;
    // the actual input directories are resolved at run time
    if (!hasDynamicPartitions && !GenMapRedUtils.isSkewedStoredAsDirs(fsInputDesc)) {
        inputDirs.add(inputDir);
    }
    inputDirstr.add(inputDir.toString());
    // internal input format class for CombineHiveInputFormat
    final Class<? extends InputFormat> internalIFClass;
    if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
        internalIFClass = RCFileBlockMergeInputFormat.class;
    } else if (tblDesc.getInputFileFormatClass().equals(OrcInputFormat.class)) {
        internalIFClass = OrcFileStripeMergeInputFormat.class;
    } else {
        throw new SemanticException("createMergeTask called on a table with file" + " format other than RCFile or ORCFile");
    }
    // create the merge file work
    MergeFileWork work = new MergeFileWork(inputDirs, finalName, hasDynamicPartitions, tblDesc.getInputFileFormatClass().getName());
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    pathToAliases.put(inputDir, inputDirstr);
    work.setMapperCannotSpanPartns(true);
    work.setPathToAliases(pathToAliases);
    PartitionDesc pDesc = new PartitionDesc(tblDesc, null);
    pDesc.setInputFileFormatClass(internalIFClass);
    work.addPathToPartitionInfo(inputDir, pDesc);
    work.setListBucketingCtx(fsInputDesc.getLbCtx());
    // create alias to work which contains the merge operator
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    final FileMergeDesc fmd;
    if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
        fmd = new RCFileMergeDesc();
    } else {
        fmd = new OrcFileMergeDesc();
    }
    fmd.setDpCtx(fsInputDesc.getDynPartCtx());
    fmd.setOutputPath(finalName);
    fmd.setHasDynamicPartitions(work.hasDynamicPartitions());
    fmd.setListBucketingAlterTableConcatenate(work.isListBucketingAlterTableConcatenate());
    int lbLevel = work.getListBucketingCtx() == null ? 0 : work.getListBucketingCtx().calculateListBucketingLevel();
    fmd.setListBucketingDepth(lbLevel);
    Operator<? extends OperatorDesc> mergeOp = OperatorFactory.get(ctx, fmd);
    aliasToWork.put(inputDir.toString(), mergeOp);
    work.setAliasToWork(aliasToWork);
    return work;
}
Also used : Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) FileMergeDesc(org.apache.hadoop.hive.ql.plan.FileMergeDesc) ArrayList(java.util.ArrayList) OrcFileStripeMergeInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcFileStripeMergeInputFormat) LinkedHashMap(java.util.LinkedHashMap) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
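
A minimal usage sketch (not from the Hive source): a compiler-side caller could invoke createMergeTask as below. The names fsDesc, mergeTarget, and opCtx are illustrative stand-ins for objects the query compiler supplies.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;

public class MergeTaskSketch {
    // Builds merge work for a file sink; as the method above enforces, this
    // throws SemanticException unless the sink's table is RCFile or ORC.
    static MapWork buildMergeWork(FileSinkDesc fsDesc, Path mergeTarget,
            CompilationOpContext opCtx) throws SemanticException {
        return GenMapRedUtils.createMergeTask(fsDesc, mergeTarget,
                /* hasDynamicPartitions */ false, opCtx);
    }
}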

Example 12 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMapRedUtils method replaceMapWork.

/**
   * Replace the Map-side operator tree associated with targetAlias in
   * target with the Map-side operator tree associated with sourceAlias in source.
   * @param sourceAlias the alias whose operator tree is taken from source
   * @param targetAlias the alias whose operator tree is replaced in target
   * @param source the MapWork supplying the replacement operator tree
   * @param target the MapWork being rewritten
   */
public static void replaceMapWork(String sourceAlias, String targetAlias, MapWork source, MapWork target) {
    Map<Path, ArrayList<String>> sourcePathToAliases = source.getPathToAliases();
    Map<Path, PartitionDesc> sourcePathToPartitionInfo = source.getPathToPartitionInfo();
    Map<String, Operator<? extends OperatorDesc>> sourceAliasToWork = source.getAliasToWork();
    Map<String, PartitionDesc> sourceAliasToPartnInfo = source.getAliasToPartnInfo();
    LinkedHashMap<Path, ArrayList<String>> targetPathToAliases = target.getPathToAliases();
    LinkedHashMap<Path, PartitionDesc> targetPathToPartitionInfo = target.getPathToPartitionInfo();
    Map<String, Operator<? extends OperatorDesc>> targetAliasToWork = target.getAliasToWork();
    Map<String, PartitionDesc> targetAliasToPartnInfo = target.getAliasToPartnInfo();
    if (!sourceAliasToWork.containsKey(sourceAlias) || !targetAliasToWork.containsKey(targetAlias)) {
        // Nothing to do if there is no operator tree associated with
        // sourceAlias in source or with targetAlias in target.
        return;
    }
    if (sourceAliasToWork.size() > 1) {
        // If there are multiple aliases in source, we do not know
        // how to merge.
        return;
    }
    // Remove unnecessary information from target
    targetAliasToWork.remove(targetAlias);
    targetAliasToPartnInfo.remove(targetAlias);
    List<Path> pathsToRemove = new ArrayList<>();
    for (Entry<Path, ArrayList<String>> entry : targetPathToAliases.entrySet()) {
        ArrayList<String> aliases = entry.getValue();
        aliases.remove(targetAlias);
        if (aliases.isEmpty()) {
            pathsToRemove.add(entry.getKey());
        }
    }
    for (Path pathToRemove : pathsToRemove) {
        targetPathToAliases.remove(pathToRemove);
        targetPathToPartitionInfo.remove(pathToRemove);
    }
    // Add new information from source to target
    targetAliasToWork.put(sourceAlias, sourceAliasToWork.get(sourceAlias));
    targetAliasToPartnInfo.putAll(sourceAliasToPartnInfo);
    targetPathToPartitionInfo.putAll(sourcePathToPartitionInfo);
    List<Path> pathsToAdd = new ArrayList<>();
    for (Entry<Path, ArrayList<String>> entry : sourcePathToAliases.entrySet()) {
        ArrayList<String> aliases = entry.getValue();
        if (aliases.contains(sourceAlias)) {
            pathsToAdd.add(entry.getKey());
        }
    }
    for (Path pathToAdd : pathsToAdd) {
        if (!targetPathToAliases.containsKey(pathToAdd)) {
            targetPathToAliases.put(pathToAdd, new ArrayList<String>());
        }
        targetPathToAliases.get(pathToAdd).add(sourceAlias);
    }
    target.setPathToAliases(targetPathToAliases);
    target.setPathToPartitionInfo(targetPathToPartitionInfo);
}
Also used : Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
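
A hedged sketch of the calling convention the early returns imply: replaceMapWork silently does nothing unless source holds exactly one alias and both aliases are present, so a caller may want to check first. All variable names here are illustrative.

import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.plan.MapWork;

public class ReplaceMapWorkSketch {
    static void graft(String sourceAlias, String targetAlias,
            MapWork source, MapWork target) {
        // Guard mirrors the early returns in replaceMapWork: a single-alias
        // source and a known target alias are required for the graft to happen.
        if (source.getAliasToWork().size() == 1
                && source.getAliasToWork().containsKey(sourceAlias)
                && target.getAliasToWork().containsKey(targetAlias)) {
            GenMapRedUtils.replaceMapWork(sourceAlias, targetAlias, source, target);
        }
    }
}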

Example 13 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class MapOperator method setChildren.

public void setChildren(Configuration hconf) throws Exception {
    List<Operator<? extends OperatorDesc>> children = new ArrayList<Operator<? extends OperatorDesc>>();
    Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);
    Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);
    for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
        Path onefile = entry.getKey();
        List<String> aliases = entry.getValue();
        PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);
        TableDesc tableDesc = partDesc.getTableDesc();
        Configuration newConf = tableNameToConf.get(tableDesc.getTableName());
        for (String alias : aliases) {
            Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
            if (isLogDebugEnabled) {
                LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
            }
            Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(onefile.toString());
            if (contexts == null) {
                opCtxMap.put(onefile.toString(), contexts = new LinkedHashMap<Operator<?>, MapOpCtx>());
            }
            if (contexts.containsKey(op)) {
                continue;
            }
            MapOpCtx context = new MapOpCtx(alias, op, partDesc);
            StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc());
            contexts.put(op, initObjectInspector(newConf, context, tableRowOI));
            if (!children.contains(op)) {
                op.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
                op.getParentOperators().add(this);
                children.add(op);
            }
        }
    }
    initOperatorContext(children);
    // we found all the operators that we are supposed to process.
    setChildOperators(children);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) HashMap(java.util.HashMap) Map(java.util.Map) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
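
For orientation, a small sketch (hypothetical warehouse path and aliases) of the plan shape setChildren() walks: each input path fans out to the aliases that read it, and each alias maps in turn to a root operator.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import org.apache.hadoop.fs.Path;

public class PlanShapeSketch {
    public static void main(String[] args) {
        // pathToAliases: each input directory fans out to the aliases that read it.
        LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
        pathToAliases.put(new Path("/warehouse/t"),
                new ArrayList<>(Arrays.asList("t", "t_self_join")));
        // setChildren() additionally consults pathToPartitionInfo (path -> PartitionDesc)
        // and aliasToWork (alias -> root Operator) to wire each root under MapOperator.
        System.out.println(pathToAliases);
    }
}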

Example 14 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class MapOperator method cloneConfsForNestedColPruning.

/**
   * For each source table, combine the nested column pruning information from all its
   * table scan descriptors and set it in a configuration copy. This is necessary since
   * the configuration property "READ_NESTED_COLUMN_PATH_CONF_STR" is set on a per-table
   * basis, so we can't just use a single configuration for all the tables.
   */
private Map<String, Configuration> cloneConfsForNestedColPruning(Configuration hconf) {
    Map<String, Configuration> tableNameToConf = new HashMap<>();
    for (Map.Entry<Path, ArrayList<String>> e : conf.getPathToAliases().entrySet()) {
        List<String> aliases = e.getValue();
        if (aliases == null || aliases.isEmpty()) {
            continue;
        }
        String tableName = conf.getPathToPartitionInfo().get(e.getKey()).getTableName();
        for (String alias : aliases) {
            Operator<?> rootOp = conf.getAliasToWork().get(alias);
            if (!(rootOp instanceof TableScanOperator)) {
                continue;
            }
            TableScanDesc tableScanDesc = ((TableScanOperator) rootOp).getConf();
            List<String> nestedColumnPaths = tableScanDesc.getNeededNestedColumnPaths();
            if (nestedColumnPaths == null || nestedColumnPaths.isEmpty()) {
                continue;
            }
            if (!tableNameToConf.containsKey(tableName)) {
                Configuration clonedConf = new Configuration(hconf);
                clonedConf.unset(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
                tableNameToConf.put(tableName, clonedConf);
            }
            Configuration newConf = tableNameToConf.get(tableName);
            ColumnProjectionUtils.appendNestedColumnPaths(newConf, nestedColumnPaths);
        }
    }
    // Assign tables without nested column pruning info to the default conf
    for (PartitionDesc pd : conf.getPathToPartitionInfo().values()) {
        if (!tableNameToConf.containsKey(pd.getTableName())) {
            tableNameToConf.put(pd.getTableName(), hconf);
        }
    }
    for (PartitionDesc pd : conf.getAliasToPartnInfo().values()) {
        if (!tableNameToConf.containsKey(pd.getTableName())) {
            tableNameToConf.put(pd.getTableName(), hconf);
        }
    }
    return tableNameToConf;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Map(java.util.Map)
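
The per-table cloning step is the heart of the method; below is a minimal sketch of the same pattern, using only the ColumnProjectionUtils calls that appear in the snippet. The nested path "s.address.zip" is invented for illustration.

import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class NestedPruningConfSketch {
    static Configuration cloneWithNestedPaths(Configuration base) {
        // Copy the conf so the per-table nested-column property does not leak
        // into other tables sharing the original Configuration.
        Configuration cloned = new Configuration(base);
        cloned.unset(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
        ColumnProjectionUtils.appendNestedColumnPaths(cloned,
                Arrays.asList("s.address.zip")); // hypothetical nested path
        return cloned;
    }
}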

Example 15 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class SplitGrouper method schemaEvolved.

private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean groupAcrossFiles, MapWork work) throws IOException {
    boolean retval = false;
    Path path = ((FileSplit) s).getPath();
    PartitionDesc pd = HiveFileFormatUtils.getPartitionDescFromPathRecursively(work.getPathToPartitionInfo(), path, cache);
    String currentDeserializerClass = pd.getDeserializerClassName();
    Class<?> currentInputFormatClass = pd.getInputFileFormatClass();
    Class<?> previousInputFormatClass = null;
    String previousDeserializerClass = null;
    if (prevSplit != null) {
        Path prevPath = ((FileSplit) prevSplit).getPath();
        if (!groupAcrossFiles) {
            return !path.equals(prevPath);
        }
        PartitionDesc prevPD = HiveFileFormatUtils.getPartitionDescFromPathRecursively(work.getPathToPartitionInfo(), prevPath, cache);
        previousDeserializerClass = prevPD.getDeserializerClassName();
        previousInputFormatClass = prevPD.getInputFileFormatClass();
    }
    if ((currentInputFormatClass != previousInputFormatClass) || (!currentDeserializerClass.equals(previousDeserializerClass))) {
        retval = true;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Adding split " + path + " to src new group? " + retval);
    }
    return retval;
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) FileSplit(org.apache.hadoop.mapred.FileSplit)
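
A hedged illustration of how schemaEvolved() could drive grouping: a helper, written as if added inside SplitGrouper, that partitions an ordered split list into schema-consistent runs. The helper itself is not part of the Hive source.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.InputSplit;

// Sketch only: relies on the private schemaEvolved(...) of the enclosing class.
private List<List<InputSplit>> groupBySchema(List<InputSplit> splits,
        boolean groupAcrossFiles, MapWork work) throws IOException {
    List<List<InputSplit>> groups = new ArrayList<>();
    InputSplit prev = null;
    for (InputSplit split : splits) {
        if (prev == null || schemaEvolved(split, prev, groupAcrossFiles, work)) {
            groups.add(new ArrayList<>()); // schema boundary: start a new group
        }
        groups.get(groups.size() - 1).add(split);
        prev = split;
    }
    return groups;
}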

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc) 58
Path (org.apache.hadoop.fs.Path) 47
ArrayList (java.util.ArrayList) 31
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 27
LinkedHashMap (java.util.LinkedHashMap) 19
HashMap (java.util.HashMap) 14
Map (java.util.Map) 13
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 13
JobConf (org.apache.hadoop.mapred.JobConf) 13
IOException (java.io.IOException) 11
Properties (java.util.Properties) 10
Operator (org.apache.hadoop.hive.ql.exec.Operator) 10
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 10
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 10
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork) 10
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 8
Configuration (org.apache.hadoop.conf.Configuration) 7
FileSystem (org.apache.hadoop.fs.FileSystem) 7
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator) 7
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 7