
Example 36 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class TestOperators method testScriptOperator.

public void testScriptOperator() throws Throwable {
    try {
        System.out.println("Testing Script Operator");
        // col1
        ExprNodeDesc exprDesc1 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "", false);
        // col2: concat(col0, "1")
        ExprNodeDesc expr1 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col0", "", false);
        ExprNodeDesc expr2 = new ExprNodeConstantDesc("1");
        ExprNodeDesc exprDesc2 = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("concat", expr1, expr2);
        // select operator to project these two columns
        ArrayList<ExprNodeDesc> earr = new ArrayList<ExprNodeDesc>();
        earr.add(exprDesc1);
        earr.add(exprDesc2);
        ArrayList<String> outputCols = new ArrayList<String>();
        for (int i = 0; i < earr.size(); i++) {
            outputCols.add("_col" + i);
        }
        SelectDesc selectCtx = new SelectDesc(earr, outputCols);
        Operator<SelectDesc> op = OperatorFactory.get(new CompilationOpContext(), SelectDesc.class);
        op.setConf(selectCtx);
        // scriptOperator to echo the output of the select
        TableDesc scriptOutput = PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "a,b");
        TableDesc scriptInput = PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "a,b");
        ScriptDesc sd = new ScriptDesc("cat", scriptOutput, TextRecordWriter.class, scriptInput, TextRecordReader.class, TextRecordReader.class, PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "key"));
        Operator<ScriptDesc> sop = OperatorFactory.getAndMakeChild(sd, op);
        // Collect operator to observe the output of the script
        CollectDesc cd = new CollectDesc(Integer.valueOf(10));
        CollectOperator cdop = (CollectOperator) OperatorFactory.getAndMakeChild(cd, sop);
        op.initialize(new JobConf(TestOperators.class), new ObjectInspector[] { r[0].oi });
        // evaluate on row
        for (int i = 0; i < 5; i++) {
            op.process(r[i].o, 0);
        }
        op.close(false);
        InspectableObject io = new InspectableObject();
        for (int i = 0; i < 5; i++) {
            cdop.retrieve(io);
            System.out.println("[" + i + "] io.o=" + io.o);
            System.out.println("[" + i + "] io.oi=" + io.oi);
            StructObjectInspector soi = (StructObjectInspector) io.oi;
            assert (soi != null);
            StructField a = soi.getStructFieldRef("a");
            StructField b = soi.getStructFieldRef("b");
            assertEquals("" + (i + 1), ((PrimitiveObjectInspector) a.getFieldObjectInspector()).getPrimitiveJavaObject(soi.getStructFieldData(io.o, a)));
            assertEquals((i) + "1", ((PrimitiveObjectInspector) b.getFieldObjectInspector()).getPrimitiveJavaObject(soi.getStructFieldData(io.o, b)));
        }
        System.out.println("Script Operator ok");
    } catch (Throwable e) {
        e.printStackTrace();
        throw e;
    }
}
Also used : ScriptDesc(org.apache.hadoop.hive.ql.plan.ScriptDesc) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) CollectDesc(org.apache.hadoop.hive.ql.plan.CollectDesc) ArrayList(java.util.ArrayList) InspectableObject(org.apache.hadoop.hive.serde2.objectinspector.InspectableObject) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
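
For reference, the TableDesc returned by PlanUtils.getDefaultTableDesc above is an ordinary delimited-text descriptor whose configuration lives in its Properties. Below is a minimal standalone sketch that prints a few of those properties; the class name DefaultTableDescSketch is hypothetical, only the Hive calls already shown in the test plus the serdeConstants keys are assumed, and exactly which keys are populated can vary by Hive version.

import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;

public class DefaultTableDescSketch {
    public static void main(String[] args) {
        // Same call as in the test: tab-separated columns "a" and "b".
        TableDesc td = PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "a,b");
        // The descriptor carries its serde configuration as plain Properties.
        System.out.println("serialization.lib    = "
            + td.getProperties().getProperty(serdeConstants.SERIALIZATION_LIB));
        System.out.println("columns              = "
            + td.getProperties().getProperty(serdeConstants.LIST_COLUMNS));
        System.out.println("serialization.format = "
            + td.getProperties().getProperty(serdeConstants.SERIALIZATION_FORMAT));
    }
}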

Example 37 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class DemuxOperator method initializeOp.

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    // A DemuxOperator should have at least one child
    if (childOperatorsArray.length == 0) {
        throw new HiveException("Expected number of children is at least 1. Found : " + childOperatorsArray.length);
    }
    newTagToOldTag = toArray(conf.getNewTagToOldTag());
    newTagToChildIndex = toArray(conf.getNewTagToChildIndex());
    childInputObjInspectors = new ObjectInspector[childOperators.size()][];
    cntrs = new long[newTagToOldTag.length];
    nextCntrs = new long[newTagToOldTag.length];
    try {
        // Populate the input ObjectInspectors for every child of this DemuxOperator;
        // they are stored in childInputObjInspectors.
        for (int i = 0; i < newTagToOldTag.length; i++) {
            int newTag = i;
            int oldTag = newTagToOldTag[i];
            int childIndex = newTagToChildIndex[newTag];
            cntrs[newTag] = 0;
            nextCntrs[newTag] = 0;
            TableDesc keyTableDesc = conf.getKeysSerializeInfos().get(newTag);
            Deserializer inputKeyDeserializer = ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
            TableDesc valueTableDesc = conf.getValuesSerializeInfos().get(newTag);
            Deserializer inputValueDeserializer = ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(inputValueDeserializer, null, valueTableDesc.getProperties(), null);
            List<ObjectInspector> oi = new ArrayList<ObjectInspector>();
            oi.add(inputKeyDeserializer.getObjectInspector());
            oi.add(inputValueDeserializer.getObjectInspector());
            int childParentsCount = conf.getChildIndexToOriginalNumParents().get(childIndex);
            // Multiple new tags can map to the same child, so first check whether
            // childInputObjInspectors already has an entry for childIndex.
            if (childInputObjInspectors[childIndex] == null) {
                childInputObjInspectors[childIndex] = new ObjectInspector[childParentsCount];
            }
            ObjectInspector[] ois = childInputObjInspectors[childIndex];
            ois[oldTag] = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, oi);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    childrenDone = 0;
    newChildOperatorsTag = new int[childOperators.size()][];
    for (int i = 0; i < childOperators.size(); i++) {
        Operator<? extends OperatorDesc> child = childOperators.get(i);
        List<Integer> childOperatorTags = new ArrayList<Integer>();
        if (child instanceof MuxOperator) {
            // This DemuxOperator can appear multiple times in MuxOperator's
            // parentOperators
            int index = 0;
            for (Operator<? extends OperatorDesc> parent : child.getParentOperators()) {
                if (this == parent) {
                    childOperatorTags.add(index);
                }
                index++;
            }
        } else {
            childOperatorTags.add(child.getParentOperators().indexOf(this));
        }
        newChildOperatorsTag[i] = toArray(childOperatorTags);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("newChildOperatorsTag " + Arrays.toString(newChildOperatorsTag));
    }
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) Deserializer(org.apache.hadoop.hive.serde2.Deserializer) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
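
The key and value handling in the loop above boils down to one pattern: the TableDesc names a deserializer class, the descriptor's Properties configure it, and the resulting ObjectInspector describes the rows. A hedged helper sketch of just that pattern follows; the class and method names are hypothetical, while the calls are the ones used in the operator above.

import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hive.common.util.ReflectionUtil;

public final class TableDescInspectors {
    // Build the deserializer a TableDesc names and return the ObjectInspector
    // describing the rows it produces (null Configuration, as in DemuxOperator).
    static ObjectInspector inspectorFor(TableDesc desc) throws Exception {
        Deserializer d = ReflectionUtil.newInstance(desc.getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(d, null, desc.getProperties(), null);
        return d.getObjectInspector();
    }
}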

Example 38 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class SparkDynamicPartitionPruner method initialize.

public void initialize(MapWork work, JobConf jobConf) throws SerDeException {
    Map<String, SourceInfo> columnMap = new HashMap<String, SourceInfo>();
    Set<String> sourceWorkIds = work.getEventSourceTableDescMap().keySet();
    for (String id : sourceWorkIds) {
        List<TableDesc> tables = work.getEventSourceTableDescMap().get(id);
        // Real column name - on which the operation is being performed
        List<String> columnNames = work.getEventSourceColumnNameMap().get(id);
        // Column type
        List<String> columnTypes = work.getEventSourceColumnTypeMap().get(id);
        List<ExprNodeDesc> partKeyExprs = work.getEventSourcePartKeyExprMap().get(id);
        Iterator<String> cit = columnNames.iterator();
        Iterator<String> typit = columnTypes.iterator();
        Iterator<ExprNodeDesc> pit = partKeyExprs.iterator();
        for (TableDesc t : tables) {
            String columnName = cit.next();
            String columnType = typit.next();
            ExprNodeDesc partKeyExpr = pit.next();
            SourceInfo si = new SourceInfo(t, partKeyExpr, columnName, columnType, jobConf);
            if (!sourceInfoMap.containsKey(id)) {
                sourceInfoMap.put(id, new ArrayList<SourceInfo>());
            }
            sourceInfoMap.get(id).add(si);
            // Multiple sources may restrict the same column; in that case, share the
            // value set so the result is the union of the values from all sources.
            if (columnMap.containsKey(columnName)) {
                si.values = columnMap.get(columnName).values;
            }
            columnMap.put(columnName, si);
        }
    }
}
Also used : HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
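
The union works because the SourceInfo entries for one column end up sharing a single value set: the assignment si.values = columnMap.get(columnName).values points the new source at the set the earlier source is already filling. A tiny standalone illustration of that aliasing follows, using plain java.util collections and made-up column/date values rather than Hive types.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class SharedValueSetSketch {
    public static void main(String[] args) {
        Map<String, Set<Object>> columnToValues = new HashMap<>();
        Set<Object> first = new HashSet<>();
        columnToValues.put("ds", first);                  // first source registers column "ds"
        Set<Object> second = columnToValues.get("ds");    // second source reuses the same set
        first.add("2024-01-01");
        second.add("2024-01-02");
        // Both sources now observe the union {2024-01-01, 2024-01-02}.
        System.out.println(first);
    }
}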

Example 39 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class Utilities method getInputSummaryWithPool.

@VisibleForTesting
static ContentSummary getInputSummaryWithPool(final Context ctx, Set<Path> pathNeedProcess, MapWork work, long[] summary, ExecutorService executor) throws IOException {
    List<Future<?>> results = new ArrayList<Future<?>>();
    final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
    HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {

        @Override
        public void interrupt() {
            for (Path path : pathNeedProcess) {
                try {
                    path.getFileSystem(ctx.getConf()).close();
                } catch (IOException ignore) {
                    LOG.debug("Failed to close filesystem", ignore);
                }
            }
            if (executor != null) {
                executor.shutdownNow();
            }
        }
    });
    try {
        Configuration conf = ctx.getConf();
        JobConf jobConf = new JobConf(conf);
        for (Path path : pathNeedProcess) {
            final Path p = path;
            final String pathStr = path.toString();
            // All threads share the same Configuration and JobConf, on the
            // assumption that both are thread safe as long as only read
            // operations are performed. Hadoop's javadoc does not state this,
            // but the source code clearly shows an effort to make them thread
            // safe, so we believe the assumption holds. Revisit this code if
            // the assumption turns out to be incorrect.
            final Configuration myConf = conf;
            final JobConf myJobConf = jobConf;
            final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
            final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
            final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
            Runnable r = new Runnable() {

                @Override
                public void run() {
                    try {
                        Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
                        InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
                        if (inputFormatObj instanceof ContentSummaryInputFormat) {
                            ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
                            resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
                            return;
                        }
                        String metaTableStorage = null;
                        if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
                            metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
                        }
                        if (partDesc.getProperties() != null) {
                            metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
                        }
                        HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
                        if (handler instanceof InputEstimator) {
                            long total = 0;
                            TableDesc tableDesc = partDesc.getTableDesc();
                            InputEstimator estimator = (InputEstimator) handler;
                            for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
                                JobConf jobConf = new JobConf(myJobConf);
                                TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
                                Utilities.setColumnNameList(jobConf, scanOp, true);
                                Utilities.setColumnTypeList(jobConf, scanOp, true);
                                PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
                                Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
                                total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
                            }
                            resultMap.put(pathStr, new ContentSummary(total, -1, -1));
                        } else {
                            // todo: should nullify summary for non-native tables,
                            // not to be selected as a mapjoin target
                            FileSystem fs = p.getFileSystem(myConf);
                            resultMap.put(pathStr, fs.getContentSummary(p));
                        }
                    } catch (Exception e) {
                        // We safely ignore this exception for summary data.
                        // We don't update the cache, to avoid polluting it for other
                        // usages. The worst case is that the IOException is retried on
                        // the next getInputSummary(), which is acceptable because
                        // IOException is not the common case here.
                        LOG.info("Cannot get size of {}. Safely ignored.", pathStr);
                    }
                }
            };
            if (executor == null) {
                r.run();
            } else {
                Future<?> result = executor.submit(r);
                results.add(result);
            }
        }
        if (executor != null) {
            for (Future<?> result : results) {
                boolean executorDone = false;
                do {
                    try {
                        result.get();
                        executorDone = true;
                    } catch (InterruptedException e) {
                        LOG.info("Interrupted when waiting threads: ", e);
                        Thread.currentThread().interrupt();
                        break;
                    } catch (ExecutionException e) {
                        throw new IOException(e);
                    }
                } while (!executorDone);
            }
            executor.shutdown();
        }
        HiveInterruptUtils.checkInterrupted();
        for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
            ContentSummary cs = entry.getValue();
            summary[0] += cs.getLength();
            summary[1] += cs.getFileCount();
            summary[2] += cs.getDirectoryCount();
            ctx.addCS(entry.getKey(), cs);
            if (LOG.isInfoEnabled()) {
                LOG.info("Cache Content Summary for {} length: {} file count: {} " + " directory count: {}", entry.getKey(), cs.getLength(), cs.getFileCount(), cs.getDirectoryCount());
            }
        }
        return new ContentSummary(summary[0], summary[1], summary[2]);
    } finally {
        if (executor != null) {
            executor.shutdownNow();
        }
        HiveInterruptUtils.remove(interrup);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) ExecutionException(java.util.concurrent.ExecutionException) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) InputEstimator(org.apache.hadoop.hive.ql.metadata.InputEstimator) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) HiveInterruptCallback(org.apache.hadoop.hive.common.HiveInterruptCallback) IOException(java.io.IOException) SQLFeatureNotSupportedException(java.sql.SQLFeatureNotSupportedException) SQLTransientException(java.sql.SQLTransientException) SQLException(java.sql.SQLException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ContentSummary(org.apache.hadoop.fs.ContentSummary) Future(java.util.concurrent.Future) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
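
In the InputEstimator branch above, the TableDesc is used to push the table's storage-handler job properties into a per-alias JobConf before the estimator is consulted. The sketch below isolates that step; the class name, method name, and parameters are hypothetical wrappers around the same two calls used in the code above.

import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.mapred.JobConf;

public final class StorageHandlerConfSketch {
    // Copy a table's storage-handler job properties into a fresh JobConf,
    // as the estimator path above does for each alias.
    static JobConf confFor(TableDesc tableDesc, JobConf baseConf) throws Exception {
        JobConf perAliasConf = new JobConf(baseConf);
        PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
        Utilities.copyTableJobPropertiesToConf(tableDesc, perAliasConf);
        return perAliasConf;
    }
}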

Example 40 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class ReduceSinkOperator method initializeOp.

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    try {
        numRows = 0;
        cntr = 1;
        logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
        List<ExprNodeDesc> keys = conf.getKeyCols();
        if (LOG.isDebugEnabled()) {
            LOG.debug("keys size is " + keys.size());
            for (ExprNodeDesc k : keys) {
                LOG.debug("Key exprNodeDesc " + k.getExprString());
            }
        }
        keyEval = new ExprNodeEvaluator[keys.size()];
        int i = 0;
        for (ExprNodeDesc e : keys) {
            if (e instanceof ExprNodeConstantDesc && (BUCKET_NUMBER_COL_NAME).equals(((ExprNodeConstantDesc) e).getValue())) {
                buckColIdxInKeyForSdpo = i;
            }
            keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
        }
        numDistributionKeys = conf.getNumDistributionKeys();
        distinctColIndices = conf.getDistinctColumnIndices();
        numDistinctExprs = distinctColIndices.size();
        valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
        i = 0;
        for (ExprNodeDesc e : conf.getValueCols()) {
            valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
        }
        partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
        i = 0;
        for (ExprNodeDesc e : conf.getPartitionCols()) {
            int index = ExprNodeDescUtils.indexOf(e, keys);
            partitionEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
        }
        if (conf.getBucketCols() != null && !conf.getBucketCols().isEmpty()) {
            bucketEval = new ExprNodeEvaluator[conf.getBucketCols().size()];
            i = 0;
            for (ExprNodeDesc e : conf.getBucketCols()) {
                int index = ExprNodeDescUtils.indexOf(e, keys);
                bucketEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
            }
            buckColIdxInKey = conf.getPartitionCols().size();
        }
        tag = conf.getTag();
        tagByte[0] = (byte) tag;
        skipTag = conf.getSkipTag();
        if (LOG.isInfoEnabled()) {
            LOG.info("Using tag = " + tag);
        }
        TableDesc keyTableDesc = conf.getKeySerializeInfo();
        keySerializer = (Serializer) keyTableDesc.getDeserializerClass().newInstance();
        keySerializer.initialize(null, keyTableDesc.getProperties());
        keyIsText = keySerializer.getSerializedClass().equals(Text.class);
        TableDesc valueTableDesc = conf.getValueSerializeInfo();
        valueSerializer = (Serializer) valueTableDesc.getDeserializerClass().newInstance();
        valueSerializer.initialize(null, valueTableDesc.getProperties());
        int limit = conf.getTopN();
        float memUsage = conf.getTopNMemoryUsage();
        if (limit >= 0 && memUsage > 0) {
            reducerHash = conf.isPTFReduceSink() ? new PTFTopNHash() : new TopNHash();
            reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this, conf, hconf);
        }
        useUniformHash = conf.getReducerTraits().contains(UNIFORM);
        firstRow = true;
    } catch (Exception e) {
        String msg = "Error initializing ReduceSinkOperator: " + e.getMessage();
        LOG.error(msg, e);
        throw new RuntimeException(e);
    }
}
Also used : ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) Text(org.apache.hadoop.io.Text) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
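
The serializer setup mirrors the deserializer pattern from Example 37: the same TableDesc supplies both the class (cast to Serializer here, since the serde implements both sides) and the Properties used to initialize it. A hedged helper sketch follows; the class and method names are hypothetical, while the calls are the ones ReduceSinkOperator makes above.

import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.Serializer;

public final class TableDescSerializers {
    // Instantiate and configure the serializer a TableDesc names, as
    // ReduceSinkOperator does for its key and value descriptors above.
    static Serializer serializerFor(TableDesc desc) throws Exception {
        Serializer s = (Serializer) desc.getDeserializerClass().newInstance();
        s.initialize(null, desc.getProperties());
        return s;
    }
}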

Aggregations

TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 93
ArrayList (java.util.ArrayList): 47
Path (org.apache.hadoop.fs.Path): 34
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 29
HashMap (java.util.HashMap): 26
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 26
LinkedHashMap (java.util.LinkedHashMap): 23
Properties (java.util.Properties): 19
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 19
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc): 18
Operator (org.apache.hadoop.hive.ql.exec.Operator): 16
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 16
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 16
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 16
JobConf (org.apache.hadoop.mapred.JobConf): 15
List (java.util.List): 14
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 14
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 14
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 11
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 11