
Example 76 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

In the class Vectorizer, the method canSpecializeReduceSink:

private boolean canSpecializeReduceSink(ReduceSinkDesc desc, boolean isTezOrSpark, VectorizationContext vContext, VectorReduceSinkInfo vectorReduceSinkInfo) throws HiveException {
    // Allocate a VectorReduceSinkDesc initially with key type NONE so EXPLAIN can report that this
    // operator was vectorized, but not native, along with the conditions checked below.
    VectorReduceSinkDesc vectorDesc = new VectorReduceSinkDesc();
    desc.setVectorDesc(vectorDesc);
    boolean isVectorizationReduceSinkNativeEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCESINK_NEW_ENABLED);
    String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
    boolean hasBuckets = desc.getBucketCols() != null && !desc.getBucketCols().isEmpty();
    boolean hasTopN = desc.getTopN() >= 0;
    boolean useUniformHash = desc.getReducerTraits().contains(UNIFORM);
    boolean hasDistinctColumns = desc.getDistinctColumnIndices().size() > 0;
    TableDesc keyTableDesc = desc.getKeySerializeInfo();
    Class<? extends Deserializer> keySerializerClass = keyTableDesc.getDeserializerClass();
    boolean isKeyBinarySortable = (keySerializerClass == org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe.class);
    TableDesc valueTableDesc = desc.getValueSerializeInfo();
    Class<? extends Deserializer> valueDeserializerClass = valueTableDesc.getDeserializerClass();
    boolean isValueLazyBinary = (valueDeserializerClass == org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class);
    // Remember the condition variables for EXPLAIN regardless.
    vectorDesc.setIsVectorizationReduceSinkNativeEnabled(isVectorizationReduceSinkNativeEnabled);
    vectorDesc.setEngine(engine);
    vectorDesc.setHasBuckets(hasBuckets);
    vectorDesc.setHasTopN(hasTopN);
    vectorDesc.setUseUniformHash(useUniformHash);
    vectorDesc.setHasDistinctColumns(hasDistinctColumns);
    vectorDesc.setIsKeyBinarySortable(isKeyBinarySortable);
    vectorDesc.setIsValueLazyBinary(isValueLazyBinary);
    // Many restrictions.
    if (!isVectorizationReduceSinkNativeEnabled || !isTezOrSpark || hasBuckets || hasTopN || !useUniformHash || hasDistinctColumns || !isKeyBinarySortable || !isValueLazyBinary) {
        return false;
    }
    // We are doing work here we'd normally do in VectorGroupByCommonOperator's constructor.
    // So if we later decide not to specialize, we'll just waste any scratch columns allocated...
    List<ExprNodeDesc> keysDescs = desc.getKeyCols();
    VectorExpression[] allKeyExpressions = vContext.getVectorExpressions(keysDescs);
    // Since a key expression can be a calculation and the key will go into a scratch column,
    // we need the mapping and type information.
    int[] reduceSinkKeyColumnMap = new int[allKeyExpressions.length];
    TypeInfo[] reduceSinkKeyTypeInfos = new TypeInfo[allKeyExpressions.length];
    Type[] reduceSinkKeyColumnVectorTypes = new Type[allKeyExpressions.length];
    ArrayList<VectorExpression> groupByKeyExpressionsList = new ArrayList<VectorExpression>();
    VectorExpression[] reduceSinkKeyExpressions;
    for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) {
        VectorExpression ve = allKeyExpressions[i];
        reduceSinkKeyColumnMap[i] = ve.getOutputColumn();
        reduceSinkKeyTypeInfos[i] = keysDescs.get(i).getTypeInfo();
        reduceSinkKeyColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]);
        if (!IdentityExpression.isColumnOnly(ve)) {
            groupByKeyExpressionsList.add(ve);
        }
    }
    if (groupByKeyExpressionsList.size() == 0) {
        reduceSinkKeyExpressions = null;
    } else {
        reduceSinkKeyExpressions = groupByKeyExpressionsList.toArray(new VectorExpression[0]);
    }
    ArrayList<ExprNodeDesc> valueDescs = desc.getValueCols();
    VectorExpression[] allValueExpressions = vContext.getVectorExpressions(valueDescs);
    int[] reduceSinkValueColumnMap = new int[valueDescs.size()];
    TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[valueDescs.size()];
    Type[] reduceSinkValueColumnVectorTypes = new Type[valueDescs.size()];
    ArrayList<VectorExpression> reduceSinkValueExpressionsList = new ArrayList<VectorExpression>();
    VectorExpression[] reduceSinkValueExpressions;
    for (int i = 0; i < valueDescs.size(); ++i) {
        VectorExpression ve = allValueExpressions[i];
        reduceSinkValueColumnMap[i] = ve.getOutputColumn();
        reduceSinkValueTypeInfos[i] = valueDescs.get(i).getTypeInfo();
        reduceSinkValueColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]);
        if (!IdentityExpression.isColumnOnly(ve)) {
            reduceSinkValueExpressionsList.add(ve);
        }
    }
    if (reduceSinkValueExpressionsList.size() == 0) {
        reduceSinkValueExpressions = null;
    } else {
        reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]);
    }
    vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap);
    vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos);
    vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes);
    vectorReduceSinkInfo.setReduceSinkKeyExpressions(reduceSinkKeyExpressions);
    vectorReduceSinkInfo.setReduceSinkValueColumnMap(reduceSinkValueColumnMap);
    vectorReduceSinkInfo.setReduceSinkValueTypeInfos(reduceSinkValueTypeInfos);
    vectorReduceSinkInfo.setReduceSinkValueColumnVectorTypes(reduceSinkValueColumnVectorTypes);
    vectorReduceSinkInfo.setReduceSinkValueExpressions(reduceSinkValueExpressions);
    return true;
}
Also used : ArrayList(java.util.ArrayList) VectorReduceSinkDesc(org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc) LazyBinarySerDe(org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe) UDFToString(org.apache.hadoop.hive.ql.udf.UDFToString) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) InConstantType(org.apache.hadoop.hive.ql.exec.vector.VectorizationContext.InConstantType) HashTableImplementationType(org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType) HashTableKeyType(org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType) Type(org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type) VectorDeserializeType(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType) OperatorType(org.apache.hadoop.hive.ql.plan.api.OperatorType) VectorExpression(org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
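
The key and value loops above share one pattern: every expression's output column and type are recorded, but only expressions that are not plain column references (per IdentityExpression.isColumnOnly) are kept for evaluation, and null stands for "nothing extra to evaluate". A minimal plain-Java sketch of that filtering step, using hypothetical stand-in types instead of Hive's VectorExpression:

import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;

public class NonIdentityFilterSketch {

    // Keep only the expressions that actually compute something; identity
    // (column-only) expressions are dropped, and null signals "nothing to evaluate".
    static <T> T[] keepComputedOnly(T[] all, Predicate<T> isColumnOnly, T[] emptyArray) {
        List<T> computed = new ArrayList<>();
        for (T expr : all) {
            if (!isColumnOnly.test(expr)) {
                computed.add(expr);
            }
        }
        return computed.isEmpty() ? null : computed.toArray(emptyArray);
    }

    public static void main(String[] args) {
        String[] exprs = { "col:0", "col:1", "upper(col:2)" };
        String[] kept = keepComputedOnly(exprs, s -> s.startsWith("col:"), new String[0]);
        // Prints "upper(col:2)": the two plain column references were filtered out.
        System.out.println(kept == null ? "all identity" : kept[0]);
    }
}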

Example 77 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

In the class HCatUtil, the method getInputJobProperties:

public static Map<String, String> getInputJobProperties(HiveStorageHandler storageHandler, InputJobInfo inputJobInfo) {
    Properties props = inputJobInfo.getTableInfo().getStorerInfo().getProperties();
    props.put(serdeConstants.SERIALIZATION_LIB, storageHandler.getSerDeClass().getName());
    TableDesc tableDesc = new TableDesc(storageHandler.getInputFormatClass(), storageHandler.getOutputFormatClass(), props);
    if (tableDesc.getJobProperties() == null) {
        tableDesc.setJobProperties(new HashMap<String, String>());
    }
    Properties mytableProperties = tableDesc.getProperties();
    mytableProperties.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_NAME, inputJobInfo.getDatabaseName() + "." + inputJobInfo.getTableName());
    Map<String, String> jobProperties = new HashMap<String, String>();
    try {
        tableDesc.getJobProperties().put(HCatConstants.HCAT_KEY_JOB_INFO, HCatUtil.serialize(inputJobInfo));
        storageHandler.configureInputJobProperties(tableDesc, jobProperties);
    } catch (IOException e) {
        throw new IllegalStateException("Failed to configure StorageHandler", e);
    }
    return jobProperties;
}
Also used : HashMap(java.util.HashMap) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) IOException(java.io.IOException) Properties(java.util.Properties)
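
The map returned here is typically copied into the job's Configuration so the storage handler's settings reach the input format at runtime. A minimal sketch of that step, assuming hadoop-common on the classpath; the helper name applyJobProperties is hypothetical, not a Hive or HCatalog API:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;

public class JobPropertiesSketch {

    // Copy every configured property into the job Configuration so the storage
    // handler's settings are visible to the tasks that read the input splits.
    static void applyJobProperties(Configuration conf, Map<String, String> jobProperties) {
        for (Map.Entry<String, String> entry : jobProperties.entrySet()) {
            conf.set(entry.getKey(), entry.getValue());
        }
    }
}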

Example 78 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

In the class ReduceRecordProcessor, the method initializeSourceForTag:

private void initializeSourceForTag(ReduceWork redWork, int tag, ObjectInspector[] ois, ReduceRecordSource[] sources, TableDesc valueTableDesc, String inputName) throws Exception {
    reducer = redWork.getReducer();
    reducer.getParentOperators().clear();
    // clear out any parents as reducer is the root
    reducer.setParentOperators(null);
    TableDesc keyTableDesc = redWork.getKeyDesc();
    Reader reader = inputs.get(inputName).getReader();
    sources[tag] = new ReduceRecordSource();
    // Only the big table input source should be vectorized (if applicable)
    // Note this behavior may have to change if we ever implement a vectorized merge join
    boolean vectorizedRecordSource = (tag == bigTablePosition) && redWork.getVectorMode();
    sources[tag].init(jconf, redWork.getReducer(), vectorizedRecordSource, keyTableDesc, valueTableDesc, reader, tag == bigTablePosition, (byte) tag, redWork.getVectorizedRowBatchCtx(), redWork.getVectorizedVertexNum());
    ois[tag] = sources[tag].getObjectInspector();
}
Also used : Reader(org.apache.tez.runtime.api.Reader) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
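
initializeSourceForTag is called once per input tag; the only tag-dependent decision is whether the source is vectorized, which is reserved for the big-table input and only when vector mode is on. A sketch of that per-tag pattern with plain stand-in types (Source and its init signature are hypothetical, not Hive's ReduceRecordSource API):

import java.util.function.Supplier;

public class PerTagInitSketch {

    interface Source {
        void init(boolean vectorized, byte tag);
    }

    // One source per tag; only the big-table position takes the vectorized path,
    // and only if vector mode is enabled for the reduce work as a whole.
    static Source[] initializeAll(int numTags, int bigTablePosition, boolean vectorMode,
                                  Supplier<Source> newSource) {
        Source[] sources = new Source[numTags];
        for (int tag = 0; tag < numTags; tag++) {
            sources[tag] = newSource.get();
            boolean vectorized = (tag == bigTablePosition) && vectorMode;
            sources[tag].init(vectorized, (byte) tag);
        }
        return sources;
    }
}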

Example 79 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

In the class SemanticAnalyzer, the method genScriptPlan:

@SuppressWarnings("nls")
private Operator genScriptPlan(ASTNode trfm, QB qb, Operator input) throws SemanticException {
    // If there is no "AS" clause, the output schema will be "key,value"
    ArrayList<ColumnInfo> outputCols = new ArrayList<ColumnInfo>();
    int inputSerDeNum = 1, inputRecordWriterNum = 2;
    int outputSerDeNum = 4, outputRecordReaderNum = 5;
    int outputColsNum = 6;
    boolean outputColNames = false, outputColSchemas = false;
    int execPos = 3;
    boolean defaultOutputCols = false;
    // Go over all the children
    if (trfm.getChildCount() > outputColsNum) {
        ASTNode outCols = (ASTNode) trfm.getChild(outputColsNum);
        if (outCols.getType() == HiveParser.TOK_ALIASLIST) {
            outputColNames = true;
        } else if (outCols.getType() == HiveParser.TOK_TABCOLLIST) {
            outputColSchemas = true;
        }
    }
    // If column type is not specified, use a string
    if (!outputColNames && !outputColSchemas) {
        String intName = getColumnInternalName(0);
        ColumnInfo colInfo = new ColumnInfo(intName, TypeInfoFactory.stringTypeInfo, null, false);
        colInfo.setAlias("key");
        outputCols.add(colInfo);
        intName = getColumnInternalName(1);
        colInfo = new ColumnInfo(intName, TypeInfoFactory.stringTypeInfo, null, false);
        colInfo.setAlias("value");
        outputCols.add(colInfo);
        defaultOutputCols = true;
    } else {
        ASTNode collist = (ASTNode) trfm.getChild(outputColsNum);
        int ccount = collist.getChildCount();
        Set<String> colAliasNamesDuplicateCheck = new HashSet<String>();
        if (outputColNames) {
            for (int i = 0; i < ccount; ++i) {
                String colAlias = unescapeIdentifier(((ASTNode) collist.getChild(i)).getText());
                failIfColAliasExists(colAliasNamesDuplicateCheck, colAlias);
                String intName = getColumnInternalName(i);
                ColumnInfo colInfo = new ColumnInfo(intName, TypeInfoFactory.stringTypeInfo, null, false);
                colInfo.setAlias(colAlias);
                outputCols.add(colInfo);
            }
        } else {
            for (int i = 0; i < ccount; ++i) {
                ASTNode child = (ASTNode) collist.getChild(i);
                assert child.getType() == HiveParser.TOK_TABCOL;
                String colAlias = unescapeIdentifier(((ASTNode) child.getChild(0)).getText());
                failIfColAliasExists(colAliasNamesDuplicateCheck, colAlias);
                String intName = getColumnInternalName(i);
                ColumnInfo colInfo = new ColumnInfo(intName, TypeInfoUtils.getTypeInfoFromTypeString(getTypeStringFromAST((ASTNode) child.getChild(1))), null, false);
                colInfo.setAlias(colAlias);
                outputCols.add(colInfo);
            }
        }
    }
    RowResolver out_rwsch = new RowResolver();
    StringBuilder columns = new StringBuilder();
    StringBuilder columnTypes = new StringBuilder();
    for (int i = 0; i < outputCols.size(); ++i) {
        if (i != 0) {
            columns.append(",");
            columnTypes.append(",");
        }
        columns.append(outputCols.get(i).getInternalName());
        columnTypes.append(outputCols.get(i).getType().getTypeName());
        out_rwsch.put(qb.getParseInfo().getAlias(), outputCols.get(i).getAlias(), outputCols.get(i));
    }
    StringBuilder inpColumns = new StringBuilder();
    StringBuilder inpColumnTypes = new StringBuilder();
    ArrayList<ColumnInfo> inputSchema = opParseCtx.get(input).getRowResolver().getColumnInfos();
    for (int i = 0; i < inputSchema.size(); ++i) {
        if (i != 0) {
            inpColumns.append(",");
            inpColumnTypes.append(",");
        }
        inpColumns.append(inputSchema.get(i).getInternalName());
        inpColumnTypes.append(inputSchema.get(i).getType().getTypeName());
    }
    TableDesc outInfo;
    TableDesc errInfo;
    TableDesc inInfo;
    String defaultSerdeName = conf.getVar(HiveConf.ConfVars.HIVESCRIPTSERDE);
    Class<? extends Deserializer> serde;
    try {
        serde = (Class<? extends Deserializer>) Class.forName(defaultSerdeName, true, Utilities.getSessionSpecifiedClassLoader());
    } catch (ClassNotFoundException e) {
        throw new SemanticException(e);
    }
    int fieldSeparator = Utilities.tabCode;
    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPE)) {
        fieldSeparator = Utilities.ctrlaCode;
    }
    // Input and Output Serdes
    if (trfm.getChild(inputSerDeNum).getChildCount() > 0) {
        inInfo = getTableDescFromSerDe((ASTNode) (((ASTNode) trfm.getChild(inputSerDeNum))).getChild(0), inpColumns.toString(), inpColumnTypes.toString(), false);
    } else {
        inInfo = PlanUtils.getTableDesc(serde, Integer.toString(fieldSeparator), inpColumns.toString(), inpColumnTypes.toString(), false, true);
    }
    if (trfm.getChild(outputSerDeNum).getChildCount() > 0) {
        outInfo = getTableDescFromSerDe((ASTNode) (((ASTNode) trfm.getChild(outputSerDeNum))).getChild(0), columns.toString(), columnTypes.toString(), false);
    // This is for backward compatibility. If the user did not specify the
    // output column list, we assume that there are 2 columns: key and value.
    // However, if the script outputs col1, col2, col3 separated by TAB, the
    // requirement is: key is col1 and value is (col2 TAB col3).
    } else {
        outInfo = PlanUtils.getTableDesc(serde, Integer.toString(fieldSeparator), columns.toString(), columnTypes.toString(), defaultOutputCols);
    }
    // Error stream always uses the default serde with a single column
    errInfo = PlanUtils.getTableDesc(serde, Integer.toString(Utilities.tabCode), "KEY");
    // Output record readers
    Class<? extends RecordReader> outRecordReader = getRecordReader((ASTNode) trfm.getChild(outputRecordReaderNum));
    Class<? extends RecordWriter> inRecordWriter = getRecordWriter((ASTNode) trfm.getChild(inputRecordWriterNum));
    Class<? extends RecordReader> errRecordReader = getDefaultRecordReader();
    Operator output = putOpInsertMap(OperatorFactory.getAndMakeChild(new ScriptDesc(fetchFilesNotInLocalFilesystem(stripQuotes(trfm.getChild(execPos).getText())), inInfo, inRecordWriter, outInfo, outRecordReader, errRecordReader, errInfo), new RowSchema(out_rwsch.getColumnInfos()), input), out_rwsch);
    // disable backtracking
    output.setColumnExprMap(new HashMap<String, ExprNodeDesc>());
    // Add a URI entity for the transform script; the script is assumed to be local unless it is downloadable.
    if (conf.getBoolVar(ConfVars.HIVE_CAPTURE_TRANSFORM_ENTITY)) {
        String scriptCmd = getScriptProgName(stripQuotes(trfm.getChild(execPos).getText()));
        getInputs().add(new ReadEntity(new Path(scriptCmd), ResourceDownloader.isFileUri(scriptCmd)));
    }
    return output;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) ScriptDesc(org.apache.hadoop.hive.ql.plan.ScriptDesc) Path(org.apache.hadoop.fs.Path) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) CreateTableDesc(org.apache.hadoop.hive.ql.plan.CreateTableDesc) InsertTableDesc(org.apache.hadoop.hive.ql.plan.InsertTableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) AlterTableDesc(org.apache.hadoop.hive.ql.plan.AlterTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) HashSet(java.util.HashSet) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
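
A large part of genScriptPlan is bookkeeping: building parallel comma-separated column-name and column-type strings that are later handed to PlanUtils.getTableDesc. The same construction in isolation, with a hypothetical (name, type) pair standing in for ColumnInfo:

import java.util.List;
import java.util.StringJoiner;

public class ColumnListSketch {

    // Stand-in for ColumnInfo: just an internal name and a type name.
    record Col(String internalName, String typeName) {}

    // Build the two parallel comma-separated strings the planner needs:
    // one with column names, one with the matching type names.
    static String[] buildColumnStrings(List<Col> cols) {
        StringJoiner names = new StringJoiner(",");
        StringJoiner types = new StringJoiner(",");
        for (Col c : cols) {
            names.add(c.internalName());
            types.add(c.typeName());
        }
        return new String[] { names.toString(), types.toString() };
    }

    public static void main(String[] args) {
        String[] out = buildColumnStrings(
                List.of(new Col("_col0", "string"), new Col("_col1", "int")));
        // Prints "_col0,_col1 | string,int"
        System.out.println(out[0] + " | " + out[1]);
    }
}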

Example 80 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project haivvreo by jghoman.

In the class AvroSerDe, the method determineCorrectProperties:

// Hive passes different properties in at different times.  If we're in an MR job,
// we'll get properties for the partition rather than the table, which will give
// us stale values for the schema (if it has evolved).  Therefore, in an MR job
// we need to extract the table-level properties.
// Also, in join queries, multiple tables' properties will be included, so we need
// to extract the one appropriate to the table we're serde'ing.
private Properties determineCorrectProperties(Configuration configuration, Properties properties) {
    if ((configuration instanceof JobConf) && HaivvreoUtils.insideMRJob((JobConf) configuration)) {
        LOG.info("In MR job, extracting table-level properties");
        MapWork mapWork = Utilities.getMapWork(configuration);
        LinkedHashMap<String, PartitionDesc> a = mapWork.getAliasToPartnInfo();
        if (a.size() == 1) {
            LOG.info("Only one PartitionDesc found.  Returning that Properties");
            PartitionDesc p = a.values().iterator().next();
            TableDesc tableDesc = p.getTableDesc();
            return tableDesc.getProperties();
        } else {
            String tableName = properties.getProperty("name");
            LOG.info("Multiple PartitionDescs.  Return properties for " + tableName);
            for (Map.Entry<String, PartitionDesc> partitionDescs : a.entrySet()) {
                Properties p = partitionDescs.getValue().getTableDesc().getProperties();
                if (p.get("name").equals(tableName)) {
                    // We've found the matching table partition
                    LOG.info("Matched table name against " + partitionDescs.getKey() + ", return its properties");
                    return p;
                }
            }
            // Didn't find anything in partitions to match on.  WARN, at least.
            LOG.warn("Couldn't find any matching properties for table: " + tableName + ". Returning original properties");
        }
    }
    return properties;
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)
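
Stripped of the Hive plan classes, determineCorrectProperties is a lookup: with a single partition, return its table properties; with several, match on the "name" property; otherwise fall back to what was passed in. A plain-Java sketch of that lookup (the method name selectForTable is made up for illustration); note it also null-checks the table name before comparing, which the original does not:

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;

public class TablePropertiesLookupSketch {

    // With a single partition, its table properties win outright; otherwise match
    // the "name" property against the table being (de)serialized, falling back to
    // the properties that were originally passed in when nothing matches.
    static Properties selectForTable(LinkedHashMap<String, Properties> aliasToTableProps,
                                     Properties original) {
        if (aliasToTableProps.size() == 1) {
            return aliasToTableProps.values().iterator().next();
        }
        String tableName = original.getProperty("name");
        for (Map.Entry<String, Properties> entry : aliasToTableProps.entrySet()) {
            Properties candidate = entry.getValue();
            if (tableName != null && tableName.equals(candidate.getProperty("name"))) {
                return candidate;
            }
        }
        return original;
    }
}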

Aggregations

TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 80
ArrayList (java.util.ArrayList): 40
Path (org.apache.hadoop.fs.Path): 33
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 27
HashMap (java.util.HashMap): 24
LinkedHashMap (java.util.LinkedHashMap): 21
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 21
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 19
Properties (java.util.Properties): 16
Operator (org.apache.hadoop.hive.ql.exec.Operator): 16
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 16
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 16
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc): 14
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 14
List (java.util.List): 13
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 13
JobConf (org.apache.hadoop.mapred.JobConf): 13
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 11
IOException (java.io.IOException): 10
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 10