
Example 16 with TableScanDesc

Use of org.apache.hadoop.hive.ql.plan.TableScanDesc in project hive by apache.

The class IndexWhereProcessor, method process:

/**
 * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator operator = (TableScanOperator) nd;
    List<Node> opChildren = operator.getChildren();
    TableScanDesc operatorDesc = operator.getConf();
    if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
        return null;
    }
    List<Index> indexes = tsToIndices.get(operator);
    ExprNodeDesc predicate = operatorDesc.getFilterExpr();
    IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
    ParseContext pctx = context.getParseContext();
    LOG.info("Processing predicate for index optimization");
    if (predicate == null) {
        LOG.info("null predicate pushed down");
        return null;
    }
    LOG.info(predicate.getExprString());
    // check if we have indexes on all partitions in this table scan
    Set<Partition> queryPartitions;
    try {
        queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
        if (queryPartitions == null) {
            // partitions not covered
            return null;
        }
    } catch (HiveException e) {
        LOG.error("Fatal Error: problem accessing metastore", e);
        throw new SemanticException(e);
    }
    // we can only process MapReduce tasks to check input size
    if (!context.getCurrentTask().isMapRedTask()) {
        return null;
    }
    MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
    // get potential reentrant index queries from each index
    Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
    // make sure we have an index on the table being scanned
    TableDesc tblDesc = operator.getTableDesc();
    Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
    for (Index indexOnTable : indexes) {
        if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
            List<Index> newType = new ArrayList<Index>();
            newType.add(indexOnTable);
            indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
        } else {
            indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
        }
    }
    // choose the index type with the most indexes of the same type on the table
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    List<Index> bestIndexes = indexesByType.values().iterator().next();
    for (List<Index> indexTypes : indexesByType.values()) {
        if (bestIndexes.size() < indexTypes.size()) {
            bestIndexes = indexTypes;
        }
    }
    // rewrite index queries for the chosen index type
    HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
    tmpQueryContext.setQueryPartitions(queryPartitions);
    rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
    List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
    if (indexTasks != null && indexTasks.size() > 0) {
        queryContexts.put(bestIndexes.get(0), tmpQueryContext);
    }
    // choose an index rewrite to use
    if (queryContexts.size() > 0) {
        // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
        Index chosenIndex = queryContexts.keySet().iterator().next();
        // modify the parse context to use indexing
        // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
        HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
        // prepare the map reduce job to use indexing
        MapWork work = currentTask.getWork().getMapWork();
        work.setInputformat(queryContext.getIndexInputFormat());
        work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
        // modify inputs based on index query
        Set<ReadEntity> inputs = pctx.getSemanticInputs();
        inputs.addAll(queryContext.getAdditionalSemanticInputs());
        List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
        // add dependencies so index query runs first
        insertIndexQuery(pctx, context, chosenRewrite);
    }
    return null;
}
Also used : HiveIndexQueryContext(org.apache.hadoop.hive.ql.index.HiveIndexQueryContext) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HashMap(java.util.HashMap) Node(org.apache.hadoop.hive.ql.lib.Node) ArrayList(java.util.ArrayList) Index(org.apache.hadoop.hive.metastore.api.Index) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) ArrayList(java.util.ArrayList) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
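
The grouping loop above is the classic pre-Java-8 idiom for bucketing values under a key. A minimal standalone sketch of the same pattern with Collectors.groupingBy, using plain Strings as hypothetical stand-ins for the Index.getIndexHandlerClass() values (illustration only, not Hive code; needs java.util.* and java.util.stream.Collectors):

    // hypothetical handler-class names, one per index on the table
    List<String> handlers = Arrays.asList("CompactIndexHandler", "BitmapIndexHandler", "CompactIndexHandler");
    // bucket the indexes by handler class, as the loop above does by hand
    Map<String, List<String>> byType = handlers.stream().collect(Collectors.groupingBy(h -> h));
    // pick the bucket with the most entries, mirroring the bestIndexes choice
    List<String> best = byType.values().stream()
            .max(Comparator.comparingInt(List::size))
            .orElse(Collections.emptyList());
    // best now holds ["CompactIndexHandler", "CompactIndexHandler"]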

Example 17 with TableScanDesc

Use of org.apache.hadoop.hive.ql.plan.TableScanDesc in project hive by apache.

The class RewriteQueryUsingAggregateIndexCtx, method replaceTableScanProcess:

/**
   * This method replaces the original TableScanOperator with the new
   * TableScanOperator and metadata that scans over the index table rather than
   * scanning over the original table.
   *
   */
private void replaceTableScanProcess(TableScanOperator scanOperator) throws SemanticException {
    RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx = this;
    String alias = rewriteQueryCtx.getAlias();
    // Need to remove the original TableScanOperators from these data structures
    // and add new ones
    HashMap<String, TableScanOperator> topOps = rewriteQueryCtx.getParseContext().getTopOps();
    // remove original TableScanOperator
    topOps.remove(alias);
    String indexTableName = rewriteQueryCtx.getIndexName();
    Table indexTableHandle = null;
    try {
        indexTableHandle = rewriteQueryCtx.getHiveDb().getTable(indexTableName);
    } catch (HiveException e) {
        LOG.error("Error while getting the table handle for index table.");
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
    }
    // construct a new descriptor for the index table scan
    TableScanDesc indexTableScanDesc = new TableScanDesc(indexTableHandle);
    indexTableScanDesc.setGatherStats(false);
    String k = MetaStoreUtils.encodeTableName(indexTableName) + Path.SEPARATOR;
    indexTableScanDesc.setStatsAggPrefix(k);
    scanOperator.setConf(indexTableScanDesc);
    // Construct the new RowResolver for the new TableScanOperator
    ArrayList<ColumnInfo> sigRS = new ArrayList<ColumnInfo>();
    try {
        StructObjectInspector rowObjectInspector = (StructObjectInspector) indexTableHandle.getDeserializer().getObjectInspector();
        StructField field = rowObjectInspector.getStructFieldRef(rewriteQueryCtx.getIndexKey());
        sigRS.add(new ColumnInfo(field.getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(field.getFieldObjectInspector()), indexTableName, false));
    } catch (SerDeException e) {
        LOG.error("Error while creating the RowResolver for new TableScanOperator.");
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
    }
    RowSchema rs = new RowSchema(sigRS);
    // Set row resolver for new table
    String newAlias = indexTableName;
    int index = alias.lastIndexOf(":");
    if (index >= 0) {
        newAlias = alias.substring(0, index) + ":" + indexTableName;
    }
    // Scan operator now points to the index table
    scanOperator.getConf().setAlias(newAlias);
    scanOperator.setAlias(indexTableName);
    topOps.put(newAlias, scanOperator);
    rewriteQueryCtx.getParseContext().setTopOps(topOps);
    ColumnPrunerProcFactory.setupNeededColumns(scanOperator, rs, Arrays.asList(new FieldNode(rewriteQueryCtx.getIndexKey())));
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldNode(org.apache.hadoop.hive.ql.optimizer.FieldNode) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
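
A small illustration of the alias rewrite near the end of the method above, with hypothetical values: everything before the last ':' (the enclosing query-block prefix) is preserved, and only the innermost table name is swapped for the index table name.

    String alias = "sub:q1:src";                  // hypothetical original alias
    String indexTableName = "default__src_idx__"; // hypothetical index table name
    String newAlias = indexTableName;
    int index = alias.lastIndexOf(":");
    if (index >= 0) {
        // keep the query-block prefix, replace only the table part
        newAlias = alias.substring(0, index) + ":" + indexTableName;
    }
    // newAlias is now "sub:q1:default__src_idx__"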

Example 18 with TableScanDesc

Use of org.apache.hadoop.hive.ql.plan.TableScanDesc in project hive by apache.

The class HiveInputFormat, method pushFilters:

public static void pushFilters(JobConf jobConf, TableScanOperator tableScan, final MapWork mrwork) {
    // ensure filters are not set from previous pushFilters
    jobConf.unset(TableScanDesc.FILTER_TEXT_CONF_STR);
    jobConf.unset(TableScanDesc.FILTER_EXPR_CONF_STR);
    Utilities.unsetSchemaEvolution(jobConf);
    TableScanDesc scanDesc = tableScan.getConf();
    if (scanDesc == null) {
        return;
    }
    Utilities.addTableSchemaToConf(jobConf, tableScan);
    // construct column name list and types for reference by filter push down
    Utilities.setColumnNameList(jobConf, tableScan);
    Utilities.setColumnTypeList(jobConf, tableScan);
    // push down filters
    ExprNodeGenericFuncDesc filterExpr = (ExprNodeGenericFuncDesc) scanDesc.getFilterExpr();
    if (filterExpr == null) {
        return;
    }
    // since we don't clone jobConf per alias
    if (mrwork != null && mrwork.getAliases() != null && mrwork.getAliases().size() > 1 && jobConf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname).equals("mr")) {
        return;
    }
    String serializedFilterObj = scanDesc.getSerializedFilterObject();
    String serializedFilterExpr = scanDesc.getSerializedFilterExpr();
    boolean hasObj = serializedFilterObj != null, hasExpr = serializedFilterExpr != null;
    if (!hasObj) {
        Serializable filterObject = scanDesc.getFilterObject();
        if (filterObject != null) {
            serializedFilterObj = SerializationUtilities.serializeObject(filterObject);
        }
    }
    if (serializedFilterObj != null) {
        jobConf.set(TableScanDesc.FILTER_OBJECT_CONF_STR, serializedFilterObj);
    }
    if (!hasExpr) {
        serializedFilterExpr = SerializationUtilities.serializeExpression(filterExpr);
    }
    String filterText = filterExpr.getExprString();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Pushdown initiated with filterText = " + filterText + ", filterExpr = " + filterExpr + ", serializedFilterExpr = " + serializedFilterExpr + " (" + (hasExpr ? "desc" : "new") + ")" + (serializedFilterObj == null ? "" : (", serializedFilterObj = " + serializedFilterObj + " (" + (hasObj ? "desc" : "new") + ")")));
    }
    jobConf.set(TableScanDesc.FILTER_TEXT_CONF_STR, filterText);
    jobConf.set(TableScanDesc.FILTER_EXPR_CONF_STR, serializedFilterExpr);
}
Also used : Serializable(java.io.Serializable) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc)
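
On the read side, an input format can recover what pushFilters placed in the JobConf. A minimal sketch, assuming the same Hive version and imports as the example above, a jobConf already populated by pushFilters, and a LOG field as in HiveInputFormat; SerializationUtilities.deserializeExpression is assumed to be the counterpart of the serialization call above:

    String filterText = jobConf.get(TableScanDesc.FILTER_TEXT_CONF_STR);
    String serializedFilterExpr = jobConf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
    if (serializedFilterExpr != null) {
        // rebuild the expression tree that was serialized above
        ExprNodeGenericFuncDesc filterExpr = SerializationUtilities.deserializeExpression(serializedFilterExpr);
        LOG.debug("Recovered pushed-down filter: " + filterExpr.getExprString() + " (text: " + filterText + ")");
    }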

Example 19 with TableScanDesc

Use of org.apache.hadoop.hive.ql.plan.TableScanDesc in project parquet-mr by apache.

The class Hive010Binding, method pushFilters:

private void pushFilters(final JobConf jobConf, final TableScanOperator tableScan) {
    final TableScanDesc scanDesc = tableScan.getConf();
    if (scanDesc == null) {
        LOG.debug("Not pushing filters because TableScanDesc is null");
        return;
    }
    // construct column name list for reference by filter push down
    Utilities.setColumnNameList(jobConf, tableScan);
    // push down filters
    final ExprNodeDesc filterExpr = scanDesc.getFilterExpr();
    if (filterExpr == null) {
        LOG.debug("Not pushing filters because FilterExpr is null");
        return;
    }
    final String filterText = filterExpr.getExprString();
    final String filterExprSerialized = Utilities.serializeExpression(filterExpr);
    jobConf.set(TableScanDesc.FILTER_TEXT_CONF_STR, filterText);
    jobConf.set(TableScanDesc.FILTER_EXPR_CONF_STR, filterExprSerialized);
}
Also used : TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)

Example 20 with TableScanDesc

Use of org.apache.hadoop.hive.ql.plan.TableScanDesc in project hive by apache.

The class MapOperator, method initObjectInspector:

private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx, StructObjectInspector tableRowOI) throws Exception {
    PartitionDesc pd = opCtx.partDesc;
    TableDesc td = pd.getTableDesc();
    // Use table properties in case of unpartitioned tables,
    // and the union of table properties and partition properties, with partition
    // taking precedence, in the case of partitioned tables
    Properties overlayedProps = SerDeUtils.createOverlayedProperties(td.getProperties(), pd.getProperties());
    Map<String, String> partSpec = pd.getPartSpec();
    opCtx.tableName = String.valueOf(overlayedProps.getProperty("name"));
    opCtx.partName = String.valueOf(partSpec);
    opCtx.deserializer = pd.getDeserializer(hconf);
    StructObjectInspector partRawRowObjectInspector;
    boolean isAcid = AcidUtils.isTablePropertyTransactional(td.getProperties());
    if (Utilities.isSchemaEvolutionEnabled(hconf, isAcid) && Utilities.isInputFileFormatSelfDescribing(pd)) {
        partRawRowObjectInspector = tableRowOI;
    } else {
        partRawRowObjectInspector = (StructObjectInspector) opCtx.deserializer.getObjectInspector();
    }
    opCtx.partTblObjectInspectorConverter = ObjectInspectorConverters.getConverter(partRawRowObjectInspector, tableRowOI);
    // Next check if this table has partitions and if so
    // get the list of partition names as well as allocate
    // the serdes for the partition columns
    String pcols = overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
    if (pcols != null && pcols.length() > 0) {
        String[] partKeys = pcols.trim().split("/");
        String pcolTypes = overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
        String[] partKeyTypes = pcolTypes.trim().split(":");
        if (partKeys.length > partKeyTypes.length) {
            throw new HiveException("Internal error : partKeys length, " + partKeys.length + " greater than partKeyTypes length, " + partKeyTypes.length);
        }
        List<String> partNames = new ArrayList<String>(partKeys.length);
        Object[] partValues = new Object[partKeys.length];
        List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(partKeys.length);
        for (int i = 0; i < partKeys.length; i++) {
            String key = partKeys[i];
            partNames.add(key);
            ObjectInspector oi = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));
            // Partitions do not exist for this table
            if (partSpec == null) {
                // for partitionless table, initialize partValue to null
                partValues[i] = null;
            } else {
                partValues[i] = ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi).convert(partSpec.get(key));
            }
            partObjectInspectors.add(oi);
        }
        opCtx.rowWithPart = new Object[] { null, partValues };
        opCtx.partObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(partNames, partObjectInspectors);
    }
    // In that case, it will be a Select, but the rowOI need not be amended
    if (opCtx.op instanceof TableScanOperator) {
        TableScanOperator tsOp = (TableScanOperator) opCtx.op;
        TableScanDesc tsDesc = tsOp.getConf();
        if (tsDesc != null && tsDesc.hasVirtualCols()) {
            opCtx.vcs = tsDesc.getVirtualCols();
            opCtx.vcValues = new Object[opCtx.vcs.size()];
            opCtx.vcsObjectInspector = VirtualColumn.getVCSObjectInspector(opCtx.vcs);
            if (opCtx.isPartitioned()) {
                opCtx.rowWithPartAndVC = Arrays.copyOfRange(opCtx.rowWithPart, 0, 3);
            } else {
                opCtx.rowWithPartAndVC = new Object[2];
            }
        }
    }
    if (!opCtx.hasVC() && !opCtx.isPartitioned()) {
        opCtx.rowObjectInspector = tableRowOI;
        return opCtx;
    }
    List<StructObjectInspector> inspectors = new ArrayList<StructObjectInspector>();
    inspectors.add(tableRowOI);
    if (opCtx.isPartitioned()) {
        inspectors.add(opCtx.partObjectInspector);
    }
    if (opCtx.hasVC()) {
        inspectors.add(opCtx.vcsObjectInspector);
    }
    opCtx.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(inspectors);
    return opCtx;
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) Properties(java.util.Properties) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
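
A toy illustration of the partition-column property layout the middle of the method above expects, for a hypothetical table partitioned by (ds string, hr int): column names arrive '/'-separated and their types ':'-separated.

    String pcols = "ds/hr";          // hypothetical META_TABLE_PARTITION_COLUMNS value
    String pcolTypes = "string:int"; // hypothetical META_TABLE_PARTITION_COLUMN_TYPES value
    String[] partKeys = pcols.trim().split("/");         // ["ds", "hr"]
    String[] partKeyTypes = pcolTypes.trim().split(":"); // ["string", "int"]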

Aggregations

TableScanDesc (org.apache.hadoop.hive.ql.plan.TableScanDesc): 28
ArrayList (java.util.ArrayList): 12
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 12
Table (org.apache.hadoop.hive.ql.metadata.Table): 8
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 7
ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc): 7
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 6
HashMap (java.util.HashMap): 5
LinkedHashMap (java.util.LinkedHashMap): 4
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 4
ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc): 4
Serializable (java.io.Serializable): 3
List (java.util.List): 3
Map (java.util.Map): 3
Path (org.apache.hadoop.fs.Path): 3
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 3
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 3
Operator (org.apache.hadoop.hive.ql.exec.Operator): 3
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 3
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 3