
Example 6 with TableDesc

Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache, from the class TestGenTezWork, method setUp.

/**
   * @throws java.lang.Exception
   */
@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
    // Init conf
    final HiveConf conf = new HiveConf(SemanticAnalyzer.class);
    SessionState.start(conf);
    // Init parse context
    final ParseContext pctx = new ParseContext();
    pctx.setContext(new Context(conf));
    ctx = new GenTezProcContext(conf, pctx, Collections.EMPTY_LIST, new ArrayList<Task<? extends Serializable>>(), Collections.EMPTY_SET, Collections.EMPTY_SET);
    proc = new GenTezWork(new GenTezUtils() {

        @Override
        protected void setupMapWork(MapWork mapWork, GenTezProcContext context, PrunedPartitionList partitions, TableScanOperator root, String alias) throws SemanticException {
            LinkedHashMap<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
            map.put("foo", root);
            mapWork.setAliasToWork(map);
            return;
        }
    });
    CompilationOpContext cCtx = new CompilationOpContext();
    fs = new FileSinkOperator(cCtx);
    fs.setConf(new FileSinkDesc());
    rs = new ReduceSinkOperator(cCtx);
    rs.setConf(new ReduceSinkDesc());
    TableDesc tableDesc = new TableDesc();
    tableDesc.setProperties(new Properties());
    rs.getConf().setKeySerializeInfo(tableDesc);
    ts = new TableScanOperator(cCtx);
    ts.setConf(new TableScanDesc(null));
    ts.getChildOperators().add(rs);
    rs.getParentOperators().add(ts);
    rs.getChildOperators().add(fs);
    fs.getParentOperators().add(rs);
    ctx.preceedingWork = null;
    ctx.currentRootOperator = ts;
}
Also used: Context (org.apache.hadoop.hive.ql.Context), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc), ArrayList (java.util.ArrayList), TableScanDesc (org.apache.hadoop.hive.ql.plan.TableScanDesc), Properties (java.util.Properties), LinkedHashMap (java.util.LinkedHashMap), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), HiveConf (org.apache.hadoop.hive.conf.HiveConf), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc), Before (org.junit.Before)
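What this test setup illustrates is how little a TableDesc needs before it can stand in for real key-serialization metadata at plan level. A minimal sketch of that wiring, pulled out of the setUp above (classes from org.apache.hadoop.hive.ql.plan and java.util; the claim that an empty Properties object suffices is an assumption that holds for this plan-generation test, not a general rule):

// Minimal key-serialization wiring, mirroring the test setup above.
TableDesc keyDesc = new TableDesc();
keyDesc.setProperties(new Properties());   // empty properties: no rows are ever serialized in this test

ReduceSinkDesc rsDesc = new ReduceSinkDesc();
rsDesc.setKeySerializeInfo(keyDesc);       // the descriptor is all the plan-level code inspects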

Example 7 with TableDesc

Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache, from the class DemuxOperator, method initializeOp.

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    // A DemuxOperator should have at least one child
    if (childOperatorsArray.length == 0) {
        throw new HiveException("Expected number of children is at least 1. Found : " + childOperatorsArray.length);
    }
    newTagToOldTag = toArray(conf.getNewTagToOldTag());
    newTagToChildIndex = toArray(conf.getNewTagToChildIndex());
    childInputObjInspectors = new ObjectInspector[childOperators.size()][];
    cntrs = new long[newTagToOldTag.length];
    nextCntrs = new long[newTagToOldTag.length];
    try {
        // For every new tag, build the key/value ObjectInspectors for the corresponding child;
        // they are stored in childInputObjInspectors.
        for (int i = 0; i < newTagToOldTag.length; i++) {
            int newTag = i;
            int oldTag = newTagToOldTag[i];
            int childIndex = newTagToChildIndex[newTag];
            cntrs[newTag] = 0;
            nextCntrs[newTag] = 0;
            TableDesc keyTableDesc = conf.getKeysSerializeInfos().get(newTag);
            Deserializer inputKeyDeserializer = ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
            TableDesc valueTableDesc = conf.getValuesSerializeInfos().get(newTag);
            Deserializer inputValueDeserializer = ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(inputValueDeserializer, null, valueTableDesc.getProperties(), null);
            List<ObjectInspector> oi = new ArrayList<ObjectInspector>();
            oi.add(inputKeyDeserializer.getObjectInspector());
            oi.add(inputValueDeserializer.getObjectInspector());
            int childParentsCount = conf.getChildIndexToOriginalNumParents().get(childIndex);
            // Multiple new tags can map to the same child, so first check whether
            // childInputObjInspectors already has an entry for this childIndex.
            if (childInputObjInspectors[childIndex] == null) {
                childInputObjInspectors[childIndex] = new ObjectInspector[childParentsCount];
            }
            ObjectInspector[] ois = childInputObjInspectors[childIndex];
            ois[oldTag] = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, oi);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    childrenDone = 0;
    newChildOperatorsTag = new int[childOperators.size()][];
    for (int i = 0; i < childOperators.size(); i++) {
        Operator<? extends OperatorDesc> child = childOperators.get(i);
        List<Integer> childOperatorTags = new ArrayList<Integer>();
        if (child instanceof MuxOperator) {
            // This DemuxOperator can appear multiple times in MuxOperator's
            // parentOperators
            int index = 0;
            for (Operator<? extends OperatorDesc> parent : child.getParentOperators()) {
                if (this == parent) {
                    childOperatorTags.add(index);
                }
                index++;
            }
        } else {
            childOperatorTags.add(child.getParentOperators().indexOf(this));
        }
        newChildOperatorsTag[i] = toArray(childOperatorTags);
    }
    if (isLogInfoEnabled) {
        LOG.info("newChildOperatorsTag " + Arrays.toString(newChildOperatorsTag));
    }
}
Also used: ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), ArrayList (java.util.ArrayList), Deserializer (org.apache.hadoop.hive.serde2.Deserializer), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)
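The part of initializeOp that actually concerns TableDesc is the three-step chain from descriptor to ObjectInspector. A condensed sketch of that chain, using the same calls as the loop above (conf is the operator's DemuxDesc, as in the original; both SerDe calls can throw SerDeException, which is why the original wraps them in a try/catch):

// The TableDesc supplies both the SerDe class and the Properties needed to initialize it.
TableDesc keyTableDesc = conf.getKeysSerializeInfos().get(newTag);
Deserializer keyDeserializer =
        ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
SerDeUtils.initializeSerDe(keyDeserializer, null, keyTableDesc.getProperties(), null);
// The resulting ObjectInspector describes the rows this deserializer will produce.
ObjectInspector keyOI = keyDeserializer.getObjectInspector();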

Example 8 with TableDesc

Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache, from the class Utilities, method getInputSummary.

/**
   * Calculate the total size of input files.
   *
   * @param ctx
   *          the hadoop job context
   * @param work
   *          map reduce job plan
   * @param filter
   *          filter to apply to the input paths before calculating size
   * @return the summary of all the input paths.
   * @throws IOException
   */
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
    long[] summary = { 0, 0, 0 };
    final Set<Path> pathNeedProcess = new HashSet<>();
    // Synchronize so that the number of threads spawned for content summaries stays under control.
    synchronized (INPUT_SUMMARY_LOCK) {
        // For each input path, calculate the total size.
        for (Path path : work.getPathToAliases().keySet()) {
            Path p = path;
            if (filter != null && !filter.accept(p)) {
                continue;
            }
            ContentSummary cs = ctx.getCS(path);
            if (cs == null) {
                if (path == null) {
                    continue;
                }
                pathNeedProcess.add(path);
            } else {
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
            }
        }
        // Process the case when name node call is needed
        final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
        ArrayList<Future<?>> results = new ArrayList<Future<?>>();
        final ExecutorService executor;
        int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
        if (numExecutors > 1) {
            LOG.info("Using " + numExecutors + " threads for getContentSummary");
            executor = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Summary-%d").build());
        } else {
            executor = null;
        }
        HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {

            @Override
            public void interrupt() {
                for (Path path : pathNeedProcess) {
                    try {
                        path.getFileSystem(ctx.getConf()).close();
                    } catch (IOException ignore) {
                        LOG.debug("Failed to close filesystem", ignore);
                    }
                }
                if (executor != null) {
                    executor.shutdownNow();
                }
            }
        });
        try {
            Configuration conf = ctx.getConf();
            JobConf jobConf = new JobConf(conf);
            for (Path path : pathNeedProcess) {
                final Path p = path;
                final String pathStr = path.toString();
                // All threads share the same Configuration and JobConf based on the
                // assumption that they are thread safe as long as only read operations are
                // performed. This is not stated in Hadoop's javadoc, but the source code
                // clearly shows that effort was made to keep them thread safe, so we believe
                // the assumption holds. Revisit this code if it turns out to be incorrect.
                final Configuration myConf = conf;
                final JobConf myJobConf = jobConf;
                final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
                final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
                final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
                Runnable r = new Runnable() {

                    @Override
                    public void run() {
                        try {
                            Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
                            InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
                            if (inputFormatObj instanceof ContentSummaryInputFormat) {
                                ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
                                resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
                                return;
                            }
                            String metaTableStorage = null;
                            if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
                                metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
                            }
                            if (partDesc.getProperties() != null) {
                                metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
                            }
                            HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
                            if (handler instanceof InputEstimator) {
                                long total = 0;
                                TableDesc tableDesc = partDesc.getTableDesc();
                                InputEstimator estimator = (InputEstimator) handler;
                                for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
                                    JobConf jobConf = new JobConf(myJobConf);
                                    TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
                                    Utilities.setColumnNameList(jobConf, scanOp, true);
                                    Utilities.setColumnTypeList(jobConf, scanOp, true);
                                    PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
                                    Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
                                    total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
                                }
                                resultMap.put(pathStr, new ContentSummary(total, -1, -1));
                            } else {
                                // todo: should nullify summary for non-native tables,
                                // not to be selected as a mapjoin target
                                FileSystem fs = p.getFileSystem(myConf);
                                resultMap.put(pathStr, fs.getContentSummary(p));
                            }
                        } catch (Exception e) {
                            // We safely ignore this exception for summary data.
                            // We don't update the cache to protect it from polluting other
                            // usages. The worst case is that IOException will always be
                            // retried for another getInputSummary(), which is fine as
                            // IOException is not considered as a common case.
                            LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
                        }
                    }
                };
                if (executor == null) {
                    r.run();
                } else {
                    Future<?> result = executor.submit(r);
                    results.add(result);
                }
            }
            if (executor != null) {
                for (Future<?> result : results) {
                    boolean executorDone = false;
                    do {
                        try {
                            result.get();
                            executorDone = true;
                        } catch (InterruptedException e) {
                            LOG.info("Interrupted when waiting threads: ", e);
                            Thread.currentThread().interrupt();
                            break;
                        } catch (ExecutionException e) {
                            throw new IOException(e);
                        }
                    } while (!executorDone);
                }
                executor.shutdown();
            }
            HiveInterruptUtils.checkInterrupted();
            for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
                ContentSummary cs = entry.getValue();
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
                ctx.addCS(entry.getKey(), cs);
                LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength() + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
            }
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
            return new ContentSummary(summary[0], summary[1], summary[2]);
        } finally {
            HiveInterruptUtils.remove(interrup);
        }
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), PerfLogger (org.apache.hadoop.hive.ql.log.PerfLogger), ArrayList (java.util.ArrayList), ContentSummaryInputFormat (org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), ExecutionException (java.util.concurrent.ExecutionException), JobConf (org.apache.hadoop.mapred.JobConf), HashSet (java.util.HashSet), Path (org.apache.hadoop.fs.Path), InputEstimator (org.apache.hadoop.hive.ql.metadata.InputEstimator), HiveStorageHandler (org.apache.hadoop.hive.ql.metadata.HiveStorageHandler), HiveInterruptCallback (org.apache.hadoop.hive.common.HiveInterruptCallback), IOException (java.io.IOException), SQLFeatureNotSupportedException (java.sql.SQLFeatureNotSupportedException), SQLTransientException (java.sql.SQLTransientException), SQLException (java.sql.SQLException), SerDeException (org.apache.hadoop.hive.serde2.SerDeException), SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException), EOFException (java.io.EOFException), FileNotFoundException (java.io.FileNotFoundException), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat), ReworkMapredInputFormat (org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat), InputFormat (org.apache.hadoop.mapred.InputFormat), FileInputFormat (org.apache.hadoop.mapred.FileInputFormat), OneNullRowInputFormat (org.apache.hadoop.hive.ql.io.OneNullRowInputFormat), HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat), ContentSummary (org.apache.hadoop.fs.ContentSummary), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), Map (java.util.Map), LinkedHashMap (java.util.LinkedHashMap), HashMap (java.util.HashMap)
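In the InputEstimator branch, the TableDesc-specific work is the property hand-off into the per-alias JobConf before the estimator runs. A condensed sketch of that hand-off, using the same calls as the Runnable above (the estimator, scanOp, tableDesc, and myJobConf are assumed to be already resolved, as they are in the original; the local JobConf is renamed aliasJobConf here purely for clarity):

JobConf aliasJobConf = new JobConf(myJobConf);
Utilities.setColumnNameList(aliasJobConf, scanOp, true);
Utilities.setColumnTypeList(aliasJobConf, scanOp, true);
// Let the storage handler add its input properties to the TableDesc,
// then copy the table's job properties into the JobConf the estimator will see.
PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
Utilities.copyTableJobPropertiesToConf(tableDesc, aliasJobConf);
long length = estimator.estimate(aliasJobConf, scanOp, -1).getTotalLength();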

Example 9 with TableDesc

Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache, from the class SparkDynamicPartitionPruner, method initialize.

public void initialize(MapWork work, JobConf jobConf) throws SerDeException {
    Map<String, SourceInfo> columnMap = new HashMap<String, SourceInfo>();
    Set<String> sourceWorkIds = work.getEventSourceTableDescMap().keySet();
    for (String id : sourceWorkIds) {
        List<TableDesc> tables = work.getEventSourceTableDescMap().get(id);
        List<String> columnNames = work.getEventSourceColumnNameMap().get(id);
        List<ExprNodeDesc> partKeyExprs = work.getEventSourcePartKeyExprMap().get(id);
        Iterator<String> cit = columnNames.iterator();
        Iterator<ExprNodeDesc> pit = partKeyExprs.iterator();
        for (TableDesc t : tables) {
            String columnName = cit.next();
            ExprNodeDesc partKeyExpr = pit.next();
            SourceInfo si = new SourceInfo(t, partKeyExpr, columnName, jobConf);
            if (!sourceInfoMap.containsKey(id)) {
                sourceInfoMap.put(id, new ArrayList<SourceInfo>());
            }
            sourceInfoMap.get(id).add(si);
            // Multiple sources may restrict the same column; take the union of their values in that case.
            if (columnMap.containsKey(columnName)) {
                si.values = columnMap.get(columnName).values;
            }
            columnMap.put(columnName, si);
        }
    }
}
Also used: HashMap (java.util.HashMap), LinkedHashMap (java.util.LinkedHashMap), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)

Example 10 with TableDesc

Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache, from the class PTFRowContainer, method createTableDesc.

public static TableDesc createTableDesc(StructObjectInspector oI) {
    Map<String, String> props = new HashMap<String, String>();
    PTFDeserializer.addOIPropertiestoSerDePropsMap(oI, props);
    String colNames = props.get(serdeConstants.LIST_COLUMNS);
    String colTypes = props.get(serdeConstants.LIST_COLUMN_TYPES);
    TableDesc tblDesc = new TableDesc(PTFSequenceFileInputFormat.class, PTFHiveSequenceFileOutputFormat.class, Utilities.makeProperties(serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode, serdeConstants.LIST_COLUMNS, colNames.toString(), serdeConstants.LIST_COLUMN_TYPES, colTypes.toString(), serdeConstants.SERIALIZATION_LIB, LazyBinarySerDe.class.getName()));
    return tblDesc;
}
Also used: HashMap (java.util.HashMap), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)
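A hypothetical caller of this helper only needs a StructObjectInspector describing the rows it wants to spill. The sketch below builds one with the standard ObjectInspector factories (org.apache.hadoop.hive.serde2.objectinspector); the column names and types ("id" as a long, "name" as a string) are illustrative assumptions, not taken from Hive:

// Describe a two-column row shape, then derive a spill TableDesc from it.
List<ObjectInspector> colOIs = new ArrayList<ObjectInspector>();
colOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
colOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
StructObjectInspector rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("id", "name"), colOIs);
// The resulting TableDesc pairs the PTF SequenceFile input/output formats with LazyBinarySerDe.
TableDesc spillDesc = PTFRowContainer.createTableDesc(rowOI);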

Aggregations

TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 80
ArrayList (java.util.ArrayList): 40
Path (org.apache.hadoop.fs.Path): 33
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 27
HashMap (java.util.HashMap): 24
LinkedHashMap (java.util.LinkedHashMap): 21
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 21
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 19
Properties (java.util.Properties): 16
Operator (org.apache.hadoop.hive.ql.exec.Operator): 16
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 16
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 16
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc): 14
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 14
List (java.util.List): 13
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 13
JobConf (org.apache.hadoop.mapred.JobConf): 13
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 11
IOException (java.io.IOException): 10
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 10