
Example 1 with FieldNode

Use of org.apache.hadoop.hive.ql.optimizer.FieldNode in project hive by apache.

Class DataWritableReadSupport, method getPrunedNestedColumns.

/**
   * Returns the columns that contain the required nested attribute levels.
   * E.g., given a struct a:<x:int, y:int> where 'x' is required and 'y' is not, the method
   * returns a pruned struct for 'a' that contains only the attribute 'x'.
   *
   * @param nestedColPaths the paths of the required nested attributes
   * @return a map from each column name to its pruned FieldNode; the keys are all lower-cased.
   */
private static Map<String, FieldNode> getPrunedNestedColumns(Set<String> nestedColPaths) {
    Map<String, FieldNode> resMap = new HashMap<>();
    if (nestedColPaths.isEmpty()) {
        return resMap;
    }
    for (String s : nestedColPaths) {
        // Group each path under its lower-cased top-level column name.
        String c = StringUtils.split(s, '.')[0].toLowerCase();
        if (!resMap.containsKey(c)) {
            FieldNode f = NestedColumnFieldPruningUtils.addNodeByPath(null, s);
            resMap.put(c, f);
        } else {
            resMap.put(c, NestedColumnFieldPruningUtils.addNodeByPath(resMap.get(c), s));
        }
    }
    return resMap;
}
Also used: FieldNode(org.apache.hadoop.hive.ql.optimizer.FieldNode) HashMap(java.util.HashMap)
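
To make the javadoc example concrete, here is a minimal, hypothetical sketch (not taken from the Hive sources). It builds the same pruned node with FieldNode.fromPath, the factory used in Example 4 below; the expected values in the comments are assumptions based on how FieldNode is consumed in these examples.

import java.util.Collection;
import org.apache.hadoop.hive.ql.optimizer.FieldNode;

public class PrunedNestedColumnSketch {
    public static void main(String[] args) {
        // Struct a:<x:int, y:int> where only 'a.x' is required.
        FieldNode pruned = FieldNode.fromPath("a.x");
        // getPrunedNestedColumns keys this node by the lower-cased top-level column name.
        String key = pruned.getFieldName().toLowerCase();
        // Assumed output: "a" -> [a.x]; the unused attribute 'a.y' never appears in the tree.
        Collection<String> paths = pruned.toPaths();
        System.out.println(key + " -> " + paths);
    }
}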

Example 2 with FieldNode

Use of org.apache.hadoop.hive.ql.optimizer.FieldNode in project hive by apache.

Class DataWritableReadSupport, method getProjectedSchema.

/**
   * Generates the projected schema from colIndexes and nested column paths. A column that needs
   * no nested pruning is added directly; otherwise a group type containing only the required
   * sub-types is built using nestedColumnPaths.
   * @param schema the original file schema
   * @param colNames the column names
   * @param colIndexes the indexes of the needed columns
   * @param nestedColumnPaths the paths of the required nested columns
   * @return the projected schema
   */
public static MessageType getProjectedSchema(MessageType schema, List<String> colNames, List<Integer> colIndexes, Set<String> nestedColumnPaths) {
    List<Type> schemaTypes = new ArrayList<Type>();
    Map<String, FieldNode> prunedCols = getPrunedNestedColumns(nestedColumnPaths);
    for (Integer i : colIndexes) {
        if (i < colNames.size()) {
            if (i < schema.getFieldCount()) {
                Type t = schema.getType(i);
                String tn = t.getName().toLowerCase();
                if (!prunedCols.containsKey(tn)) {
                    schemaTypes.add(schema.getType(i));
                } else {
                    if (t.isPrimitive()) {
                        // For primitive type, add directly.
                        schemaTypes.add(t);
                    } else {
                        // For group type, we need to build the projected group type with required leaves
                        List<Type> g = projectLeafTypes(Arrays.asList(t), Arrays.asList(prunedCols.get(tn)));
                        if (!g.isEmpty()) {
                            schemaTypes.addAll(g);
                        }
                    }
                }
            } else {
                //prefixing with '_mask_' to ensure no conflict with named
                //columns in the file schema
                schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
            }
        }
    }
    return new MessageType(schema.getName(), schemaTypes);
}
Also used: OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) FieldNode(org.apache.hadoop.hive.ql.optimizer.FieldNode) ArrayList(java.util.ArrayList)
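
A hypothetical usage sketch of the public method above; the sample schema, class name, and expected output are illustrative assumptions, not taken from the Hive sources.

import java.util.Arrays;
import java.util.Collections;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;

public class ProjectedSchemaSketch {
    public static void main(String[] args) {
        // File schema: message hive_schema { optional group a { optional int32 x; optional int32 y; } }
        GroupType a = new GroupType(Type.Repetition.OPTIONAL, "a",
            new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT32, "x"),
            new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT32, "y"));
        MessageType schema = new MessageType("hive_schema", a);
        // Project column 0 ("a"), but only the nested path "a.x".
        MessageType projected = DataWritableReadSupport.getProjectedSchema(
            schema, Arrays.asList("a"), Arrays.asList(0), Collections.singleton("a.x"));
        // Expected: the projected group "a" contains only the field "x".
        System.out.println(projected);
    }
}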

Example 3 with FieldNode

Use of org.apache.hadoop.hive.ql.optimizer.FieldNode in project hive by apache.

Class DataWritableReadSupport, method projectLeafTypes.

private static List<Type> projectLeafTypes(List<Type> types, List<FieldNode> nodes) {
    List<Type> res = new ArrayList<>();
    if (nodes.isEmpty()) {
        return res;
    }
    Map<String, FieldNode> fieldMap = new HashMap<>();
    for (FieldNode n : nodes) {
        fieldMap.put(n.getFieldName().toLowerCase(), n);
    }
    for (Type type : types) {
        String tn = type.getName().toLowerCase();
        if (fieldMap.containsKey(tn)) {
            FieldNode f = fieldMap.get(tn);
            if (f.getNodes().isEmpty()) {
                // no child, no need for pruning
                res.add(type);
            } else {
                if (type instanceof GroupType) {
                    GroupType groupType = type.asGroupType();
                    List<Type> ts = projectLeafTypes(groupType.getFields(), f.getNodes());
                    GroupType g = buildProjectedGroupType(groupType, ts);
                    if (g != null) {
                        res.add(g);
                    }
                } else {
                    throw new RuntimeException("Primitive type " + f.getFieldName() + " should not have nested fields, but got: " + f);
                }
            }
        }
    }
    return res;
}
Also used: OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) FieldNode(org.apache.hadoop.hive.ql.optimizer.FieldNode) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList)
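
The helper buildProjectedGroupType is not shown on this page. A minimal sketch of what such a helper plausibly does (an assumption for illustration, not the Hive implementation) is to rebuild the group with only the surviving children and signal an empty projection with null:

// Hedged sketch of a buildProjectedGroupType-style helper; the real Hive helper may differ.
private static GroupType buildProjectedGroupTypeSketch(GroupType original, List<Type> projectedFields) {
    if (projectedFields.isEmpty()) {
        // Nothing under this group survived pruning, so the caller drops the group entirely.
        return null;
    }
    // Keep the original repetition and name, but only the projected children.
    return new GroupType(original.getRepetition(), original.getName(), projectedFields);
}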

Example 4 with FieldNode

Use of org.apache.hadoop.hive.ql.optimizer.FieldNode in project hive by apache.

Class ParquetHiveSerDe, method processRawPrunedPaths.

/**
   * Given raw pruned paths separated by ',', returns a list of merged pruned paths.
   * For instance, if 'prunedPaths' is "s.a, s, s", this returns ["s"].
   */
private static List<String> processRawPrunedPaths(String prunedPaths) {
    List<FieldNode> fieldNodes = new ArrayList<>();
    for (String p : prunedPaths.split(",")) {
        fieldNodes = FieldNode.mergeFieldNodes(fieldNodes, FieldNode.fromPath(p));
    }
    List<String> prunedPathList = new ArrayList<>();
    for (FieldNode fn : fieldNodes) {
        prunedPathList.addAll(fn.toPaths());
    }
    return prunedPathList;
}
Also used: FieldNode(org.apache.hadoop.hive.ql.optimizer.FieldNode) ArrayList(java.util.ArrayList)
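
A hypothetical driver for the javadoc example above (the class name and the trim() call are illustrative additions, not part of the Hive sources); it reproduces the merge with the same FieldNode calls used in processRawPrunedPaths.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.optimizer.FieldNode;

public class MergePrunedPathsSketch {
    public static void main(String[] args) {
        List<FieldNode> merged = new ArrayList<>();
        for (String p : "s.a, s, s".split(",")) {
            // trim() removes the space after each ',' in this sample input.
            merged = FieldNode.mergeFieldNodes(merged, FieldNode.fromPath(p.trim()));
        }
        List<String> paths = new ArrayList<>();
        for (FieldNode fn : merged) {
            paths.addAll(fn.toPaths());
        }
        System.out.println(paths); // expected: [s] -- "s" subsumes "s.a"
    }
}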

Example 5 with FieldNode

Use of org.apache.hadoop.hive.ql.optimizer.FieldNode in project hive by apache.

Class RewriteQueryUsingAggregateIndexCtx, method replaceTableScanProcess.

/**
   * Replaces the original TableScanOperator with a new TableScanOperator and
   * metadata that scan over the index table rather than the original table.
   *
   */
private void replaceTableScanProcess(TableScanOperator scanOperator) throws SemanticException {
    RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx = this;
    String alias = rewriteQueryCtx.getAlias();
    // Need to remove the original TableScanOperators from these data structures
    // and add new ones
    HashMap<String, TableScanOperator> topOps = rewriteQueryCtx.getParseContext().getTopOps();
    // remove original TableScanOperator
    topOps.remove(alias);
    String indexTableName = rewriteQueryCtx.getIndexName();
    Table indexTableHandle = null;
    try {
        indexTableHandle = rewriteQueryCtx.getHiveDb().getTable(indexTableName);
    } catch (HiveException e) {
        LOG.error("Error while getting the table handle for index table.");
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
    }
    // construct a new descriptor for the index table scan
    TableScanDesc indexTableScanDesc = new TableScanDesc(indexTableHandle);
    indexTableScanDesc.setGatherStats(false);
    String k = MetaStoreUtils.encodeTableName(indexTableName) + Path.SEPARATOR;
    indexTableScanDesc.setStatsAggPrefix(k);
    scanOperator.setConf(indexTableScanDesc);
    // Construct the new RowResolver for the new TableScanOperator
    ArrayList<ColumnInfo> sigRS = new ArrayList<ColumnInfo>();
    try {
        StructObjectInspector rowObjectInspector = (StructObjectInspector) indexTableHandle.getDeserializer().getObjectInspector();
        StructField field = rowObjectInspector.getStructFieldRef(rewriteQueryCtx.getIndexKey());
        sigRS.add(new ColumnInfo(field.getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(field.getFieldObjectInspector()), indexTableName, false));
    } catch (SerDeException e) {
        LOG.error("Error while creating the RowResolver for new TableScanOperator.");
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
    }
    RowSchema rs = new RowSchema(sigRS);
    // Set row resolver for new table
    String newAlias = indexTableName;
    int index = alias.lastIndexOf(":");
    if (index >= 0) {
        newAlias = alias.substring(0, index) + ":" + indexTableName;
    }
    // Scan operator now points to other table
    scanOperator.getConf().setAlias(newAlias);
    scanOperator.setAlias(indexTableName);
    topOps.put(newAlias, scanOperator);
    rewriteQueryCtx.getParseContext().setTopOps(topOps);
    ColumnPrunerProcFactory.setupNeededColumns(scanOperator, rs, Arrays.asList(new FieldNode(rewriteQueryCtx.getIndexKey())));
}
Also used: TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldNode(org.apache.hadoop.hive.ql.optimizer.FieldNode) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)

Aggregations

FieldNode (org.apache.hadoop.hive.ql.optimizer.FieldNode)5 ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)2 GroupType (org.apache.parquet.schema.GroupType)2 MessageType (org.apache.parquet.schema.MessageType)2 OriginalType (org.apache.parquet.schema.OriginalType)2 Type (org.apache.parquet.schema.Type)2 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)1 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)1 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)1 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)1 Table (org.apache.hadoop.hive.ql.metadata.Table)1 SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)1 TableScanDesc (org.apache.hadoop.hive.ql.plan.TableScanDesc)1 SerDeException (org.apache.hadoop.hive.serde2.SerDeException)1 StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField)1 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)1