Search in sources :

Example 26 with ColumnInfo

use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.

the class GenMRSkewJoinProcessor method processSkewJoin.

/**
   * Create tasks for processing skew joins. The idea is (HIVE-964) to use
   * separated jobs and map-joins to handle skew joins.
   * <p>
   * <ul>
   * <li>
   * Number of mr jobs to handle skew keys is the number of table minus 1 (we
   * can stream the last table, so big keys in the last table will not be a
   * problem).
   * <li>
   * At runtime in Join, we output big keys in one table into one corresponding
   * directories, and all same keys in other tables into different dirs(one for
   * each table). The directories will look like:
   * <ul>
   * <li>
   * dir-T1-bigkeys(containing big keys in T1), dir-T2-keys(containing keys
   * which is big in T1),dir-T3-keys(containing keys which is big in T1), ...
   * <li>
   * dir-T1-keys(containing keys which is big in T2), dir-T2-bigkeys(containing
   * big keys in T2),dir-T3-keys(containing keys which is big in T2), ...
   * <li>
   * dir-T1-keys(containing keys which is big in T3), dir-T2-keys(containing big
   * keys in T3),dir-T3-bigkeys(containing keys which is big in T3), ... .....
   * </ul>
   * </ul>
   * For each table, we launch one mapjoin job, taking the directory containing
   * big keys in this table and corresponding dirs in other tables as input.
   * (Actally one job for one row in the above.)
   *
   * <p>
   * For more discussions, please check
   * https://issues.apache.org/jira/browse/HIVE-964.
   *
   */
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask, ParseContext parseCtx) throws SemanticException {
    // now does not work with outer joins
    if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
        return;
    }
    List<Task<? extends Serializable>> children = currTask.getChildTasks();
    Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
    JoinDesc joinDescriptor = joinOp.getConf();
    Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
    int numAliases = joinValues.size();
    Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
    Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
    Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
    Byte[] tags = joinDescriptor.getTagOrder();
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
        Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
        smallKeysDirMap.put(alias, smallKeysMap);
        for (Byte src2 : tags) {
            if (!src2.equals(alias)) {
                smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
            }
        }
        skewJoinJobResultsDir.put(alias, getBigKeysSkewJoinResultDir(baseTmpDir, alias));
    }
    joinDescriptor.setHandleSkewJoin(true);
    joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
    joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
    joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
    HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap = new HashMap<Path, Task<? extends Serializable>>();
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    MapredWork currPlan = (MapredWork) currTask.getWork();
    TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
    List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
    List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
    Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
    Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
    Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
    // used for create mapJoinDesc, should be in order
    List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
    for (Byte tag : tags) {
        newJoinValueTblDesc.add(null);
    }
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        List<ExprNodeDesc> valueCols = joinValues.get(alias);
        String colNames = "";
        String colTypes = "";
        int columnSize = valueCols.size();
        List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
        List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
        ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
        boolean first = true;
        for (int k = 0; k < columnSize; k++) {
            TypeInfo type = valueCols.get(k).getTypeInfo();
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
            columnInfos.add(columnInfo);
            newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + newColName;
            colTypes = colTypes + valueCols.get(k).getTypeString();
        }
        // we are putting join keys at last part of the spilled table
        for (int k = 0; k < joinKeys.size(); k++) {
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + joinKeys.get(k);
            colTypes = colTypes + joinKeyTypes.get(k);
            ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
            columnInfos.add(columnInfo);
            newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
        }
        newJoinValues.put(alias, newValueExpr);
        newJoinKeys.put(alias, newKeyExpr);
        tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
        rowSchemaList.put(alias, new RowSchema(columnInfos));
        // construct value table Desc
        String valueColNames = "";
        String valueColTypes = "";
        first = true;
        for (int k = 0; k < columnSize; k++) {
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            if (!first) {
                valueColNames = valueColNames + ",";
                valueColTypes = valueColTypes + ",";
            }
            valueColNames = valueColNames + newColName;
            valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
            first = false;
        }
        newJoinValueTblDesc.set(Byte.valueOf((byte) i), Utilities.getTableDesc(valueColNames, valueColTypes));
    }
    joinDescriptor.setSkewKeysValuesTables(tableDescList);
    joinDescriptor.setKeyTableDesc(keyTblDesc);
    for (int i = 0; i < numAliases - 1; i++) {
        Byte src = tags[i];
        MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
        // This code has been only added for testing
        boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
        newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);
        MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);
        Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
        for (int k = 0; k < tags.length; k++) {
            Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
            ((TableScanOperator) ts).setTableDesc(tableDescList.get((byte) k));
            parentOps[k] = ts;
        }
        Operator<? extends OperatorDesc> tblScan_op = parentOps[i];
        ArrayList<String> aliases = new ArrayList<String>();
        String alias = src.toString().intern();
        aliases.add(alias);
        Path bigKeyDirPath = bigKeysDirMap.get(src);
        newPlan.addPathToAlias(bigKeyDirPath, aliases);
        newPlan.getAliasToWork().put(alias, tblScan_op);
        PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
        newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
        newPlan.getAliasToPartnInfo().put(alias, part);
        Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
        assert reducer instanceof JoinOperator;
        JoinOperator cloneJoinOp = (JoinOperator) reducer;
        String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
        MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix);
        mapJoinDescriptor.setTagOrder(tags);
        mapJoinDescriptor.setHandleSkewJoin(false);
        mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
        MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
        for (int j = 0; j < numAliases; j++) {
            if (j == i) {
                continue;
            }
            Byte small_alias = tags[j];
            Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
            localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
            Path tblDir = smallTblDirs.get(small_alias);
            localPlan.getAliasToFetchWork().put(small_alias.toString(), new FetchWork(tblDir, tableDescList.get(small_alias)));
        }
        newPlan.setMapRedLocalWork(localPlan);
        // construct a map join and set it as the child operator of tblScan_op
        MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
        // change the children of the original join operator to point to the map
        // join operator
        List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
        for (Operator<? extends OperatorDesc> childOp : childOps) {
            childOp.replaceParent(cloneJoinOp, mapJoinOp);
        }
        mapJoinOp.setChildOperators(childOps);
        HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
        newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
        newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
        newPlan.setInputformat(HiveInputFormat.class.getName());
        MapredWork w = new MapredWork();
        w.setMapWork(newPlan);
        Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w, jc);
        skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
        bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
        listWorks.add(skewJoinMapJoinTask.getWork());
        listTasks.add(skewJoinMapJoinTask);
    }
    if (children != null) {
        for (Task<? extends Serializable> tsk : listTasks) {
            for (Task<? extends Serializable> oldChild : children) {
                tsk.addDependentTask(oldChild);
            }
        }
        currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
        for (Task<? extends Serializable> oldChild : children) {
            oldChild.getParentTasks().remove(currTask);
        }
        listTasks.addAll(children);
    }
    ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
    cndTsk.setListTasks(listTasks);
    cndTsk.setResolver(new ConditionalResolverSkewJoin());
    cndTsk.setResolverCtx(context);
    currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
    currTask.addDependentTask(cndTsk);
    return;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) List(java.util.List) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ConditionalResolverSkewJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) Path(org.apache.hadoop.fs.Path) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) ConditionalResolverSkewJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)

Example 27 with ColumnInfo

use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.

the class SemanticAnalyzer method genFileSinkPlan.

@SuppressWarnings("nls")
protected Operator genFileSinkPlan(String dest, QB qb, Operator input) throws SemanticException {
    RowResolver inputRR = opParseCtx.get(input).getRowResolver();
    QBMetaData qbm = qb.getMetaData();
    Integer dest_type = qbm.getDestTypeForAlias(dest);
    // destination table if any
    Table dest_tab = null;
    // should the destination table be written to using ACID
    boolean destTableIsAcid = false;
    boolean destTableIsTemporary = false;
    boolean destTableIsMaterialization = false;
    // destination partition if any
    Partition dest_part = null;
    // the intermediate destination directory
    Path queryTmpdir = null;
    // the final destination directory
    Path dest_path = null;
    TableDesc table_desc = null;
    int currentTableId = 0;
    boolean isLocal = false;
    SortBucketRSCtx rsCtx = new SortBucketRSCtx();
    DynamicPartitionCtx dpCtx = null;
    LoadTableDesc ltd = null;
    ListBucketingCtx lbCtx = null;
    Map<String, String> partSpec = null;
    switch(dest_type.intValue()) {
        case QBMetaData.DEST_TABLE:
            {
                dest_tab = qbm.getDestTableForAlias(dest);
                destTableIsAcid = AcidUtils.isAcidTable(dest_tab);
                destTableIsTemporary = dest_tab.isTemporary();
                // Is the user trying to insert into a external tables
                if ((!conf.getBoolVar(HiveConf.ConfVars.HIVE_INSERT_INTO_EXTERNAL_TABLES)) && (dest_tab.getTableType().equals(TableType.EXTERNAL_TABLE))) {
                    throw new SemanticException(ErrorMsg.INSERT_EXTERNAL_TABLE.getMsg(dest_tab.getTableName()));
                }
                partSpec = qbm.getPartSpecForAlias(dest);
                dest_path = dest_tab.getPath();
                // verify that our destination is empty before proceeding
                if (dest_tab.isImmutable() && qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(), dest_tab.getTableName())) {
                    try {
                        FileSystem fs = dest_path.getFileSystem(conf);
                        if (!MetaStoreUtils.isDirEmpty(fs, dest_path)) {
                            LOG.warn("Attempted write into an immutable table : " + dest_tab.getTableName() + " : " + dest_path);
                            throw new SemanticException(ErrorMsg.INSERT_INTO_IMMUTABLE_TABLE.getMsg(dest_tab.getTableName()));
                        }
                    } catch (IOException ioe) {
                        LOG.warn("Error while trying to determine if immutable table has any data : " + dest_tab.getTableName() + " : " + dest_path);
                        throw new SemanticException(ErrorMsg.INSERT_INTO_IMMUTABLE_TABLE.getMsg(ioe.getMessage()));
                    }
                }
                // check for partition
                List<FieldSchema> parts = dest_tab.getPartitionKeys();
                if (parts != null && parts.size() > 0) {
                    // table is partitioned
                    if (partSpec == null || partSpec.size() == 0) {
                        // user did NOT specify partition
                        throw new SemanticException(generateErrorMessage(qb.getParseInfo().getDestForClause(dest), ErrorMsg.NEED_PARTITION_ERROR.getMsg()));
                    }
                    dpCtx = qbm.getDPCtx(dest);
                    if (dpCtx == null) {
                        dest_tab.validatePartColumnNames(partSpec, false);
                        dpCtx = new DynamicPartitionCtx(dest_tab, partSpec, conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME), conf.getIntVar(HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTSPERNODE));
                        qbm.setDPCtx(dest, dpCtx);
                    }
                    if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONING)) {
                        // allow DP
                        throw new SemanticException(generateErrorMessage(qb.getParseInfo().getDestForClause(dest), ErrorMsg.DYNAMIC_PARTITION_DISABLED.getMsg()));
                    }
                    if (dpCtx.getSPPath() != null) {
                        dest_path = new Path(dest_tab.getPath(), dpCtx.getSPPath());
                    }
                    if ((dest_tab.getNumBuckets() > 0)) {
                        dpCtx.setNumBuckets(dest_tab.getNumBuckets());
                    }
                }
                boolean isNonNativeTable = dest_tab.isNonNative();
                if (isNonNativeTable) {
                    queryTmpdir = dest_path;
                } else {
                    queryTmpdir = ctx.getTempDirForPath(dest_path, true);
                }
                if (dpCtx != null) {
                    // set the root of the temporary path where dynamic partition columns will populate
                    dpCtx.setRootPath(queryTmpdir);
                }
                // this table_desc does not contain the partitioning columns
                table_desc = Utilities.getTableDesc(dest_tab);
                // Add sorting/bucketing if needed
                input = genBucketingSortingDest(dest, input, qb, table_desc, dest_tab, rsCtx);
                idToTableNameMap.put(String.valueOf(destTableId), dest_tab.getTableName());
                currentTableId = destTableId;
                destTableId++;
                lbCtx = constructListBucketingCtx(dest_tab.getSkewedColNames(), dest_tab.getSkewedColValues(), dest_tab.getSkewedColValueLocationMaps(), dest_tab.isStoredAsSubDirectories(), conf);
                // NOTE: specify Dynamic partitions in dest_tab for WriteEntity
                if (!isNonNativeTable) {
                    AcidUtils.Operation acidOp = AcidUtils.Operation.NOT_ACID;
                    if (destTableIsAcid) {
                        acidOp = getAcidType(table_desc.getOutputFileFormatClass(), dest);
                        checkAcidConstraints(qb, table_desc, dest_tab);
                    }
                    ltd = new LoadTableDesc(queryTmpdir, table_desc, dpCtx, acidOp);
                    ltd.setReplace(!qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(), dest_tab.getTableName()));
                    ltd.setLbCtx(lbCtx);
                    loadTableWork.add(ltd);
                } else {
                    // This is a non-native table.
                    // We need to set stats as inaccurate.
                    setStatsForNonNativeTable(dest_tab);
                    // true if it is insert overwrite.
                    boolean overwrite = !qb.getParseInfo().isInsertIntoTable(String.format("%s.%s", dest_tab.getDbName(), dest_tab.getTableName()));
                    createInsertDesc(dest_tab, overwrite);
                }
                WriteEntity output = null;
                // list of dynamically created partitions are known.
                if ((dpCtx == null || dpCtx.getNumDPCols() == 0)) {
                    output = new WriteEntity(dest_tab, determineWriteType(ltd, isNonNativeTable, dest));
                    if (!outputs.add(output)) {
                        throw new SemanticException(ErrorMsg.OUTPUT_SPECIFIED_MULTIPLE_TIMES.getMsg(dest_tab.getTableName()));
                    }
                }
                if ((dpCtx != null) && (dpCtx.getNumDPCols() >= 0)) {
                    // No static partition specified
                    if (dpCtx.getNumSPCols() == 0) {
                        output = new WriteEntity(dest_tab, determineWriteType(ltd, isNonNativeTable, dest), false);
                        outputs.add(output);
                        output.setDynamicPartitionWrite(true);
                    } else // part of the partition specified
                    // Create a DummyPartition in this case. Since, the metastore does not store partial
                    // partitions currently, we need to store dummy partitions
                    {
                        try {
                            String ppath = dpCtx.getSPPath();
                            ppath = ppath.substring(0, ppath.length() - 1);
                            DummyPartition p = new DummyPartition(dest_tab, dest_tab.getDbName() + "@" + dest_tab.getTableName() + "@" + ppath, partSpec);
                            output = new WriteEntity(p, getWriteType(dest), false);
                            output.setDynamicPartitionWrite(true);
                            outputs.add(output);
                        } catch (HiveException e) {
                            throw new SemanticException(e.getMessage(), e);
                        }
                    }
                }
                ctx.getLoadTableOutputMap().put(ltd, output);
                break;
            }
        case QBMetaData.DEST_PARTITION:
            {
                dest_part = qbm.getDestPartitionForAlias(dest);
                dest_tab = dest_part.getTable();
                destTableIsAcid = AcidUtils.isAcidTable(dest_tab);
                if ((!conf.getBoolVar(HiveConf.ConfVars.HIVE_INSERT_INTO_EXTERNAL_TABLES)) && dest_tab.getTableType().equals(TableType.EXTERNAL_TABLE)) {
                    throw new SemanticException(ErrorMsg.INSERT_EXTERNAL_TABLE.getMsg(dest_tab.getTableName()));
                }
                Path tabPath = dest_tab.getPath();
                Path partPath = dest_part.getDataLocation();
                // verify that our destination is empty before proceeding
                if (dest_tab.isImmutable() && qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(), dest_tab.getTableName())) {
                    try {
                        FileSystem fs = partPath.getFileSystem(conf);
                        if (!MetaStoreUtils.isDirEmpty(fs, partPath)) {
                            LOG.warn("Attempted write into an immutable table partition : " + dest_tab.getTableName() + " : " + partPath);
                            throw new SemanticException(ErrorMsg.INSERT_INTO_IMMUTABLE_TABLE.getMsg(dest_tab.getTableName()));
                        }
                    } catch (IOException ioe) {
                        LOG.warn("Error while trying to determine if immutable table partition has any data : " + dest_tab.getTableName() + " : " + partPath);
                        throw new SemanticException(ErrorMsg.INSERT_INTO_IMMUTABLE_TABLE.getMsg(ioe.getMessage()));
                    }
                }
                // if the table is in a different dfs than the partition,
                // replace the partition's dfs with the table's dfs.
                dest_path = new Path(tabPath.toUri().getScheme(), tabPath.toUri().getAuthority(), partPath.toUri().getPath());
                queryTmpdir = ctx.getTempDirForPath(dest_path, true);
                table_desc = Utilities.getTableDesc(dest_tab);
                // Add sorting/bucketing if needed
                input = genBucketingSortingDest(dest, input, qb, table_desc, dest_tab, rsCtx);
                idToTableNameMap.put(String.valueOf(destTableId), dest_tab.getTableName());
                currentTableId = destTableId;
                destTableId++;
                lbCtx = constructListBucketingCtx(dest_part.getSkewedColNames(), dest_part.getSkewedColValues(), dest_part.getSkewedColValueLocationMaps(), dest_part.isStoredAsSubDirectories(), conf);
                AcidUtils.Operation acidOp = AcidUtils.Operation.NOT_ACID;
                if (destTableIsAcid) {
                    acidOp = getAcidType(table_desc.getOutputFileFormatClass(), dest);
                    checkAcidConstraints(qb, table_desc, dest_tab);
                }
                ltd = new LoadTableDesc(queryTmpdir, table_desc, dest_part.getSpec(), acidOp);
                ltd.setReplace(!qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(), dest_tab.getTableName()));
                ltd.setLbCtx(lbCtx);
                loadTableWork.add(ltd);
                if (!outputs.add(new WriteEntity(dest_part, determineWriteType(ltd, dest_tab.isNonNative(), dest)))) {
                    throw new SemanticException(ErrorMsg.OUTPUT_SPECIFIED_MULTIPLE_TIMES.getMsg(dest_tab.getTableName() + "@" + dest_part.getName()));
                }
                break;
            }
        case QBMetaData.DEST_LOCAL_FILE:
            isLocal = true;
        // fall through
        case QBMetaData.DEST_DFS_FILE:
            {
                dest_path = new Path(qbm.getDestFileForAlias(dest));
                if (isLocal) {
                    // for local directory - we always write to map-red intermediate
                    // store and then copy to local fs
                    queryTmpdir = ctx.getMRTmpPath();
                } else {
                    try {
                        Path qPath = FileUtils.makeQualified(dest_path, conf);
                        queryTmpdir = ctx.getTempDirForPath(qPath, true);
                    } catch (Exception e) {
                        throw new SemanticException("Error creating temporary folder on: " + dest_path, e);
                    }
                }
                String cols = "";
                String colTypes = "";
                ArrayList<ColumnInfo> colInfos = inputRR.getColumnInfos();
                // CTAS case: the file output format and serde are defined by the create
                // table command rather than taking the default value
                List<FieldSchema> field_schemas = null;
                CreateTableDesc tblDesc = qb.getTableDesc();
                CreateViewDesc viewDesc = qb.getViewDesc();
                if (tblDesc != null) {
                    field_schemas = new ArrayList<FieldSchema>();
                    destTableIsTemporary = tblDesc.isTemporary();
                    destTableIsMaterialization = tblDesc.isMaterialization();
                } else if (viewDesc != null) {
                    field_schemas = new ArrayList<FieldSchema>();
                    destTableIsTemporary = false;
                }
                boolean first = true;
                for (ColumnInfo colInfo : colInfos) {
                    String[] nm = inputRR.reverseLookup(colInfo.getInternalName());
                    if (nm[1] != null) {
                        // non-null column alias
                        colInfo.setAlias(nm[1]);
                    }
                    //default column name
                    String colName = colInfo.getInternalName();
                    if (field_schemas != null) {
                        FieldSchema col = new FieldSchema();
                        if (!("".equals(nm[0])) && nm[1] != null) {
                            // remove ``
                            colName = unescapeIdentifier(colInfo.getAlias()).toLowerCase();
                        }
                        colName = fixCtasColumnName(colName);
                        col.setName(colName);
                        String typeName = colInfo.getType().getTypeName();
                        // CTAS should NOT create a VOID type
                        if (typeName.equals(serdeConstants.VOID_TYPE_NAME)) {
                            throw new SemanticException(ErrorMsg.CTAS_CREATES_VOID_TYPE.getMsg(colName));
                        }
                        col.setType(typeName);
                        field_schemas.add(col);
                    }
                    if (!first) {
                        cols = cols.concat(",");
                        colTypes = colTypes.concat(":");
                    }
                    first = false;
                    cols = cols.concat(colName);
                    // Replace VOID type with string when the output is a temp table or
                    // local files.
                    // A VOID type can be generated under the query:
                    //
                    // select NULL from tt;
                    // or
                    // insert overwrite local directory "abc" select NULL from tt;
                    //
                    // where there is no column type to which the NULL value should be
                    // converted.
                    //
                    String tName = colInfo.getType().getTypeName();
                    if (tName.equals(serdeConstants.VOID_TYPE_NAME)) {
                        colTypes = colTypes.concat(serdeConstants.STRING_TYPE_NAME);
                    } else {
                        colTypes = colTypes.concat(tName);
                    }
                }
                // update the create table descriptor with the resulting schema.
                if (tblDesc != null) {
                    tblDesc.setCols(new ArrayList<FieldSchema>(field_schemas));
                } else if (viewDesc != null) {
                    viewDesc.setSchema(new ArrayList<FieldSchema>(field_schemas));
                }
                boolean isDestTempFile = true;
                if (!ctx.isMRTmpFileURI(dest_path.toUri().toString())) {
                    idToTableNameMap.put(String.valueOf(destTableId), dest_path.toUri().toString());
                    currentTableId = destTableId;
                    destTableId++;
                    isDestTempFile = false;
                }
                boolean isDfsDir = (dest_type.intValue() == QBMetaData.DEST_DFS_FILE);
                loadFileWork.add(new LoadFileDesc(tblDesc, viewDesc, queryTmpdir, dest_path, isDfsDir, cols, colTypes));
                if (tblDesc == null) {
                    if (viewDesc != null) {
                        table_desc = PlanUtils.getTableDesc(viewDesc, cols, colTypes);
                    } else if (qb.getIsQuery()) {
                        String fileFormat;
                        if (SessionState.get().getIsUsingThriftJDBCBinarySerDe()) {
                            fileFormat = "SequenceFile";
                            HiveConf.setVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT, fileFormat);
                            table_desc = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, fileFormat, ThriftJDBCBinarySerDe.class);
                            // Set the fetch formatter to be a no-op for the ListSinkOperator, since we'll
                            // write out formatted thrift objects to SequenceFile
                            conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
                        } else {
                            fileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
                            table_desc = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, fileFormat, LazySimpleSerDe.class);
                        }
                    } else {
                        table_desc = PlanUtils.getDefaultTableDesc(qb.getDirectoryDesc(), cols, colTypes);
                    }
                } else {
                    table_desc = PlanUtils.getTableDesc(tblDesc, cols, colTypes);
                }
                if (!outputs.add(new WriteEntity(dest_path, !isDfsDir, isDestTempFile))) {
                    throw new SemanticException(ErrorMsg.OUTPUT_SPECIFIED_MULTIPLE_TIMES.getMsg(dest_path.toUri().toString()));
                }
                break;
            }
        default:
            throw new SemanticException("Unknown destination type: " + dest_type);
    }
    input = genConversionSelectOperator(dest, qb, input, table_desc, dpCtx);
    inputRR = opParseCtx.get(input).getRowResolver();
    ArrayList<ColumnInfo> vecCol = new ArrayList<ColumnInfo>();
    if (updating(dest) || deleting(dest)) {
        vecCol.add(new ColumnInfo(VirtualColumn.ROWID.getName(), VirtualColumn.ROWID.getTypeInfo(), "", true));
    } else {
        try {
            StructObjectInspector rowObjectInspector = (StructObjectInspector) table_desc.getDeserializer(conf).getObjectInspector();
            List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
            for (int i = 0; i < fields.size(); i++) {
                vecCol.add(new ColumnInfo(fields.get(i).getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(fields.get(i).getFieldObjectInspector()), "", false));
            }
        } catch (Exception e) {
            throw new SemanticException(e.getMessage(), e);
        }
    }
    RowSchema fsRS = new RowSchema(vecCol);
    // The output files of a FileSink can be merged if they are either not being written to a table
    // or are being written to a table which is not bucketed
    // and table the table is not sorted
    boolean canBeMerged = (dest_tab == null || !((dest_tab.getNumBuckets() > 0) || (dest_tab.getSortCols() != null && dest_tab.getSortCols().size() > 0)));
    // If this table is working with ACID semantics, turn off merging
    canBeMerged &= !destTableIsAcid;
    // Generate the partition columns from the parent input
    if (dest_type.intValue() == QBMetaData.DEST_TABLE || dest_type.intValue() == QBMetaData.DEST_PARTITION) {
        genPartnCols(dest, input, qb, table_desc, dest_tab, rsCtx);
    }
    FileSinkDesc fileSinkDesc = new FileSinkDesc(queryTmpdir, table_desc, conf.getBoolVar(HiveConf.ConfVars.COMPRESSRESULT), currentTableId, rsCtx.isMultiFileSpray(), canBeMerged, rsCtx.getNumFiles(), rsCtx.getTotalFiles(), rsCtx.getPartnCols(), dpCtx, dest_path);
    boolean isHiveServerQuery = SessionState.get().isHiveServerQuery();
    fileSinkDesc.setHiveServerQuery(isHiveServerQuery);
    // FileSinkOperator knows how to properly write to it.
    if (destTableIsAcid) {
        AcidUtils.Operation wt = updating(dest) ? AcidUtils.Operation.UPDATE : (deleting(dest) ? AcidUtils.Operation.DELETE : AcidUtils.Operation.INSERT);
        fileSinkDesc.setWriteType(wt);
        acidFileSinks.add(fileSinkDesc);
    }
    fileSinkDesc.setTemporary(destTableIsTemporary);
    fileSinkDesc.setMaterialization(destTableIsMaterialization);
    /* Set List Bucketing context. */
    if (lbCtx != null) {
        lbCtx.processRowSkewedIndex(fsRS);
        lbCtx.calculateSkewedValueSubDirList();
    }
    fileSinkDesc.setLbCtx(lbCtx);
    // set the stats publishing/aggregating key prefix
    // the same as directory name. The directory name
    // can be changed in the optimizer but the key should not be changed
    // it should be the same as the MoveWork's sourceDir.
    fileSinkDesc.setStatsAggPrefix(fileSinkDesc.getDirName().toString());
    if (!destTableIsMaterialization && HiveConf.getVar(conf, HIVESTATSDBCLASS).equalsIgnoreCase(StatDB.fs.name())) {
        String statsTmpLoc = ctx.getTempDirForPath(dest_path).toString();
        fileSinkDesc.setStatsTmpDir(statsTmpLoc);
        LOG.debug("Set stats collection dir : " + statsTmpLoc);
    }
    if (dest_part != null) {
        try {
            String staticSpec = Warehouse.makePartPath(dest_part.getSpec());
            fileSinkDesc.setStaticSpec(staticSpec);
        } catch (MetaException e) {
            throw new SemanticException(e);
        }
    } else if (dpCtx != null) {
        fileSinkDesc.setStaticSpec(dpCtx.getSPPath());
    }
    if (isHiveServerQuery && null != table_desc && table_desc.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName()) && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_SERVER2_THRIFT_RESULTSET_SERIALIZE_IN_TASKS)) {
        fileSinkDesc.setIsUsingThriftJDBCBinarySerDe(true);
    } else {
        fileSinkDesc.setIsUsingThriftJDBCBinarySerDe(false);
    }
    Operator output = putOpInsertMap(OperatorFactory.getAndMakeChild(fileSinkDesc, fsRS, input), inputRR);
    if (ltd != null && SessionState.get() != null) {
        SessionState.get().getLineageState().mapDirToOp(ltd.getSourcePath(), (FileSinkOperator) output);
    } else if (queryState.getCommandType().equals(HiveOperation.CREATETABLE_AS_SELECT.getOperationName())) {
        Path tlocation = null;
        String tName = Utilities.getDbTableName(tableDesc.getTableName())[1];
        try {
            Warehouse wh = new Warehouse(conf);
            tlocation = wh.getTablePath(db.getDatabase(tableDesc.getDatabaseName()), tName);
        } catch (MetaException | HiveException e) {
            throw new SemanticException(e);
        }
        SessionState.get().getLineageState().mapDirToOp(tlocation, (FileSinkOperator) output);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Created FileSink Plan for clause: " + dest + "dest_path: " + dest_path + " row schema: " + inputRR.toString());
    }
    FileSinkOperator fso = (FileSinkOperator) output;
    fso.getConf().setTable(dest_tab);
    fsopToTable.put(fso, dest_tab);
    // and it is an insert overwrite or insert into table
    if (dest_tab != null && conf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER) && conf.getBoolVar(ConfVars.HIVESTATSCOLAUTOGATHER) && ColumnStatsAutoGatherContext.canRunAutogatherStats(fso)) {
        if (dest_type.intValue() == QBMetaData.DEST_TABLE) {
            genAutoColumnStatsGatheringPipeline(qb, table_desc, partSpec, input, qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(), dest_tab.getTableName()));
        } else if (dest_type.intValue() == QBMetaData.DEST_PARTITION) {
            genAutoColumnStatsGatheringPipeline(qb, table_desc, dest_part.getSpec(), input, qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(), dest_tab.getTableName()));
        }
    }
    return output;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) Warehouse(org.apache.hadoop.hive.metastore.Warehouse) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) DynamicPartitionCtx(org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) Operation(org.apache.hadoop.hive.ql.io.AcidUtils.Operation) HiveOperation(org.apache.hadoop.hive.ql.plan.HiveOperation) CreateViewDesc(org.apache.hadoop.hive.ql.plan.CreateViewDesc) ThriftJDBCBinarySerDe(org.apache.hadoop.hive.serde2.thrift.ThriftJDBCBinarySerDe) FileSystem(org.apache.hadoop.fs.FileSystem) NoOpFetchFormatter(org.apache.hadoop.hive.serde2.NoOpFetchFormatter) ListBucketingCtx(org.apache.hadoop.hive.ql.plan.ListBucketingCtx) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) DummyPartition(org.apache.hadoop.hive.ql.metadata.DummyPartition) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Table(org.apache.hadoop.hive.ql.metadata.Table) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) IOException(java.io.IOException) IOException(java.io.IOException) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) PatternSyntaxException(java.util.regex.PatternSyntaxException) FileNotFoundException(java.io.FileNotFoundException) AccessControlException(java.security.AccessControlException) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) DummyPartition(org.apache.hadoop.hive.ql.metadata.DummyPartition) CreateTableDesc(org.apache.hadoop.hive.ql.plan.CreateTableDesc) Operation(org.apache.hadoop.hive.ql.io.AcidUtils.Operation) CreateTableDesc(org.apache.hadoop.hive.ql.plan.CreateTableDesc) InsertTableDesc(org.apache.hadoop.hive.ql.plan.InsertTableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) AlterTableDesc(org.apache.hadoop.hive.ql.plan.AlterTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)

Example 28 with ColumnInfo

use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanGroupByOperator1.

/**
   * Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
   * The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
   *
   * @param parseInfo
   * @param dest
   * @param reduceSinkOperatorInfo
   * @param mode
   *          The mode of the aggregation (MERGEPARTIAL, PARTIAL2)
   * @param genericUDAFEvaluators
   *          The mapping from Aggregation StringTree to the
   *          genericUDAFEvaluator.
   * @param groupingSets
   *          list of grouping sets
   * @param groupingSetsPresent
   *          whether grouping sets are present in this query
   * @param groupingSetsNeedAdditionalMRJob
   *          whether grouping sets are consumed by this group by
   * @return the new GroupByOperator
   */
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Integer> groupingSets, boolean groupingSetsPresent, boolean groupingSetsNeedAdditionalMRJob) throws SemanticException {
    ArrayList<String> outputColumnNames = new ArrayList<String>();
    RowResolver groupByInputRowResolver = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
    RowResolver groupByOutputRowResolver = new RowResolver();
    groupByOutputRowResolver.setIsExprResolver(true);
    ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    for (int i = 0; i < grpByExprs.size(); ++i) {
        ASTNode grpbyExpr = grpByExprs.get(i);
        ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
        if (exprInfo == null) {
            throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(grpbyExpr));
        }
        groupByKeys.add(new ExprNodeColumnDesc(exprInfo));
        String field = getColumnInternalName(i);
        outputColumnNames.add(field);
        ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
        groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
        addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo, groupByOutputRowResolver);
        colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
    }
    // This is only needed if a new grouping set key is being created
    int groupingSetsPosition = -1;
    // For grouping sets, add a dummy grouping key
    if (groupingSetsPresent) {
        groupingSetsPosition = groupByKeys.size();
        // This function is called for GroupBy2 to add grouping id as part of the groupby keys
        if (!groupingSetsNeedAdditionalMRJob) {
            addGroupingSetKey(groupByKeys, groupByInputRowResolver, groupByOutputRowResolver, outputColumnNames, colExprMap);
        } else {
            // The grouping set has not yet been processed. Create a new grouping key
            // Consider the query: select a,b, count(1) from T group by a,b with cube;
            // where it is being executed in 2 map-reduce jobs
            // The plan for 1st MR is TableScan -> GroupBy1 -> ReduceSink -> GroupBy2 -> FileSink
            // GroupBy1/ReduceSink worked as if grouping sets were not present
            // This function is called for GroupBy2 to create new rows for grouping sets
            // For each input row (a,b), 4 rows are created for the example above:
            // (a,b), (a,null), (null, b), (null, null)
            createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
        }
    }
    HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
    // get the last colName for the reduce KEY
    // it represents the column name corresponding to distinct aggr, if any
    String lastKeyColName = null;
    List<ExprNodeDesc> reduceValues = null;
    if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) {
        List<String> inputKeyCols = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames();
        if (inputKeyCols.size() > 0) {
            lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
        }
        reduceValues = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getValueCols();
    }
    int numDistinctUDFs = 0;
    boolean containsDistinctAggr = false;
    for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
        ASTNode value = entry.getValue();
        String aggName = unescapeIdentifier(value.getChild(0).getText());
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI);
        containsDistinctAggr = containsDistinctAggr || isDistinct;
        // side, so always look for the parameters: d+e
        if (isDistinct) {
            // 0 is the function name
            for (int i = 1; i < value.getChildCount(); i++) {
                ASTNode paraExpr = (ASTNode) value.getChild(i);
                ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
                if (paraExprInfo == null) {
                    throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(paraExpr));
                }
                String paraExpression = paraExprInfo.getInternalName();
                assert (paraExpression != null);
                if (isDistinct && lastKeyColName != null) {
                    // if aggr is distinct, the parameter is name is constructed as
                    // KEY.lastKeyColName:<tag>._colx
                    paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1);
                }
                ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
                ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(paraExprInfo.getInternalName(), reduceValues);
                if (reduceValue != null) {
                    // this parameter is a constant
                    expr = reduceValue;
                }
                aggParameters.add(expr);
            }
        } else {
            ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(value);
            if (paraExprInfo == null) {
                throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(value));
            }
            String paraExpression = paraExprInfo.getInternalName();
            assert (paraExpression != null);
            aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
        }
        if (isDistinct) {
            numDistinctUDFs++;
        }
        Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
        GenericUDAFEvaluator genericUDAFEvaluator = null;
        genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
        assert (genericUDAFEvaluator != null);
        GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
        aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && isDistinct), amode));
        String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
        outputColumnNames.add(field);
        groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
    }
    float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
    // Nothing special needs to be done for grouping sets if
    // this is the final group by operator, and multiple rows corresponding to the
    // grouping sets have been generated upstream.
    // However, if an addition MR job has been created to handle grouping sets,
    // additional rows corresponding to grouping sets need to be created here.
    Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, groupByMemoryUsage, memoryThreshold, groupingSets, groupingSetsPresent && groupingSetsNeedAdditionalMRJob, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), reduceSinkOperatorInfo), groupByOutputRowResolver);
    op.setColumnExprMap(colExprMap);
    return op;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap)

Example 29 with ColumnInfo

use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanReduceSinkOperator.

/**
   * Generate the ReduceSinkOperator for the Group By Query Block
   * (qb.getPartInfo().getXXX(dest)). The new ReduceSinkOperator will be a child
   * of inputOperatorInfo.
   *
   * It will put all Group By keys and the distinct field (if any) in the
   * map-reduce sort key, and all other fields in the map-reduce value.
   *
   * @param numPartitionFields
   *          the number of fields for map-reduce partitioning. This is usually
   *          the number of fields in the Group By keys.
   * @return the new ReduceSinkOperator.
   * @throws SemanticException
   */
@SuppressWarnings("nls")
private ReduceSinkOperator genGroupByPlanReduceSinkOperator(QB qb, String dest, Operator inputOperatorInfo, List<ASTNode> grpByExprs, int numPartitionFields, boolean changeNumPartitionFields, int numReducers, boolean mapAggrDone, boolean groupingSetsPresent) throws SemanticException {
    RowResolver reduceSinkInputRowResolver = opParseCtx.get(inputOperatorInfo).getRowResolver();
    QBParseInfo parseInfo = qb.getParseInfo();
    RowResolver reduceSinkOutputRowResolver = new RowResolver();
    reduceSinkOutputRowResolver.setIsExprResolver(true);
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    // Pre-compute group-by keys and store in reduceKeys
    List<String> outputKeyColumnNames = new ArrayList<String>();
    List<String> outputValueColumnNames = new ArrayList<String>();
    ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForReduceSink(grpByExprs, dest, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap);
    int keyLength = reduceKeys.size();
    int numOfColsRmedFromkey = grpByExprs.size() - keyLength;
    // add a key for reduce sink
    if (groupingSetsPresent) {
        // Process grouping set for the reduce sink operator
        processGroupingSetReduceSinkOperator(reduceSinkInputRowResolver, reduceSinkOutputRowResolver, reduceKeys, outputKeyColumnNames, colExprMap);
        if (changeNumPartitionFields) {
            numPartitionFields++;
        }
    }
    List<List<Integer>> distinctColIndices = getDistinctColIndicesForReduceSink(parseInfo, dest, reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap);
    ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
    HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
    if (!mapAggrDone) {
        getReduceValuesForReduceSinkNoMapAgg(parseInfo, dest, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputValueColumnNames, reduceValues, colExprMap);
    } else {
        // Put partial aggregation results in reduceValues
        int inputField = reduceKeys.size() + numOfColsRmedFromkey;
        for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
            TypeInfo type = reduceSinkInputRowResolver.getColumnInfos().get(inputField).getType();
            ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(type, getColumnInternalName(inputField), "", false);
            reduceValues.add(exprDesc);
            inputField++;
            String outputColName = getColumnInternalName(reduceValues.size() - 1);
            outputValueColumnNames.add(outputColName);
            String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName;
            reduceSinkOutputRowResolver.putExpression(entry.getValue(), new ColumnInfo(internalName, type, null, false));
            colExprMap.put(internalName, exprDesc);
        }
    }
    ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, groupingSetsPresent ? keyLength + 1 : keyLength, reduceValues, distinctColIndices, outputKeyColumnNames, outputValueColumnNames, true, -1, numPartitionFields, numReducers, AcidUtils.Operation.NOT_ACID), new RowSchema(reduceSinkOutputRowResolver.getColumnInfos()), inputOperatorInfo), reduceSinkOutputRowResolver);
    rsOp.setColumnExprMap(colExprMap);
    return rsOp;
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap)

Example 30 with ColumnInfo

use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.

the class SemanticAnalyzer method handleInsertStatementSpec.

/**
   * This modifies the Select projections when the Select is part of an insert statement and
   * the insert statement specifies a column list for the target table, e.g.
   * create table source (a int, b int);
   * create table target (x int, y int, z int);
   * insert into target(z,x) select * from source
   *
   * Once the * is resolved to 'a,b', this list needs to rewritten to 'b,null,a' so that it looks
   * as if the original query was written as
   * insert into target select b, null, a from source
   *
   * if target schema is not specified, this is no-op
   *
   * @see #handleInsertStatementSpecPhase1(ASTNode, QBParseInfo, org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.Phase1Ctx)
   * @throws SemanticException
   */
public RowResolver handleInsertStatementSpec(List<ExprNodeDesc> col_list, String dest, RowResolver outputRR, RowResolver inputRR, QB qb, ASTNode selExprList) throws SemanticException {
    //(z,x)
    //specified in the query
    List<String> targetTableSchema = qb.getParseInfo().getDestSchemaForClause(dest);
    if (targetTableSchema == null) {
        //no insert schema was specified
        return outputRR;
    }
    if (targetTableSchema.size() != col_list.size()) {
        Table target = qb.getMetaData().getDestTableForAlias(dest);
        Partition partition = target == null ? qb.getMetaData().getDestPartitionForAlias(dest) : null;
        throw new SemanticException(generateErrorMessage(selExprList, "Expected " + targetTableSchema.size() + " columns for " + dest + (target != null ? "/" + target.getCompleteName() : (partition != null ? "/" + partition.getCompleteName() : "")) + "; select produces " + col_list.size() + " columns"));
    }
    //e.g. map z->expr for a
    Map<String, ExprNodeDesc> targetCol2Projection = new HashMap<String, ExprNodeDesc>();
    //e.g. map z->ColumnInfo for a
    Map<String, ColumnInfo> targetCol2ColumnInfo = new HashMap<String, ColumnInfo>();
    int colListPos = 0;
    for (String targetCol : targetTableSchema) {
        targetCol2ColumnInfo.put(targetCol, outputRR.getColumnInfos().get(colListPos));
        targetCol2Projection.put(targetCol, col_list.get(colListPos++));
    }
    Table target = qb.getMetaData().getDestTableForAlias(dest);
    Partition partition = target == null ? qb.getMetaData().getDestPartitionForAlias(dest) : null;
    if (target == null && partition == null) {
        throw new SemanticException(generateErrorMessage(selExprList, "No table/partition found in QB metadata for dest='" + dest + "'"));
    }
    ArrayList<ExprNodeDesc> new_col_list = new ArrayList<ExprNodeDesc>();
    colListPos = 0;
    List<FieldSchema> targetTableCols = target != null ? target.getCols() : partition.getCols();
    List<String> targetTableColNames = new ArrayList<String>();
    List<TypeInfo> targetTableColTypes = new ArrayList<TypeInfo>();
    for (FieldSchema fs : targetTableCols) {
        targetTableColNames.add(fs.getName());
        targetTableColTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(fs.getType()));
    }
    Map<String, String> partSpec = qb.getMetaData().getPartSpecForAlias(dest);
    if (partSpec != null) {
        //relies on consistent order via LinkedHashMap
        for (Map.Entry<String, String> partKeyVal : partSpec.entrySet()) {
            if (partKeyVal.getValue() == null) {
                //these must be after non-partition cols
                targetTableColNames.add(partKeyVal.getKey());
                targetTableColTypes.add(TypeInfoFactory.stringTypeInfo);
            }
        }
    }
    RowResolver newOutputRR = new RowResolver();
    //where missing columns are NULL-filled
    for (int i = 0; i < targetTableColNames.size(); i++) {
        String f = targetTableColNames.get(i);
        if (targetCol2Projection.containsKey(f)) {
            //put existing column in new list to make sure it is in the right position
            new_col_list.add(targetCol2Projection.get(f));
            //todo: is this OK?
            ColumnInfo ci = targetCol2ColumnInfo.get(f);
            ci.setInternalName(getColumnInternalName(colListPos));
            newOutputRR.put(ci.getTabAlias(), ci.getInternalName(), ci);
        } else {
            //add new 'synthetic' columns for projections not provided by Select
            ExprNodeDesc exp = new ExprNodeConstantDesc(targetTableColTypes.get(i), null);
            new_col_list.add(exp);
            //this column doesn't come from any table
            final String tableAlias = null;
            ColumnInfo colInfo = new ColumnInfo(getColumnInternalName(colListPos), exp.getWritableObjectInspector(), tableAlias, false);
            newOutputRR.put(colInfo.getTabAlias(), colInfo.getInternalName(), colInfo);
        }
        colListPos++;
    }
    col_list.clear();
    col_list.addAll(new_col_list);
    return newOutputRR;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) DummyPartition(org.apache.hadoop.hive.ql.metadata.DummyPartition) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) Table(org.apache.hadoop.hive.ql.metadata.Table) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)

Aggregations

ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)117 ArrayList (java.util.ArrayList)75 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)69 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)56 HashMap (java.util.HashMap)46 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)43 LinkedHashMap (java.util.LinkedHashMap)35 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)34 Operator (org.apache.hadoop.hive.ql.exec.Operator)28 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)27 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)27 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)26 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)25 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)24 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)21 Map (java.util.Map)20 CalciteSemanticException (org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)20 AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator)19 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)19 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)18