
Example 1 with ExecMapperContext

Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.

From class FetchOperator, method setupExecContext:

private ExecMapperContext setupExecContext(Operator operator, List<Path> paths) {
    ExecMapperContext context = null;
    // A context is only needed when virtual columns are referenced or the
    // fetch uses split sampling; otherwise it stays null and wiring is skipped.
    if (hasVC || work.getSplitSample() != null) {
        context = new ExecMapperContext(job);
        if (operator != null) {
            operator.passExecContext(context);
        }
    }
    setFetchOperatorContext(job, paths);
    return context;
}
Also used: ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext)
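
Taken together with Example 4 below, this shows the whole ExecMapperContext lifecycle: build it from a JobConf, push it down the operator tree, then query it per row. A minimal sketch of that wiring, assuming only the Hive calls already visible in these examples (the class and method names here are invented for illustration):

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.mapred.JobConf;

// Illustrative sketch only; ExecContextWiringSketch is not a Hive class.
public class ExecContextWiringSketch {
    void wire(JobConf job, Operator<?> root) {
        // Build the per-mapper context from the job configuration.
        ExecMapperContext context = new ExecMapperContext(job);
        // Push it down the operator tree, as FetchOperator does above.
        root.passExecContext(context);
        // Operators can later call context.inputFileChanged() (Example 4)
        // to reset per-file state when the mapper moves to a new split.
    }
}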

Example 2 with ExecMapperContext

Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.

From class SparkMapRecordHandler, method init:

@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
    perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
    super.init(job, output, reporter);
    try {
        jc = job;
        execContext = new ExecMapperContext(jc);
        // create map and fetch operators
        MapWork mrwork = Utilities.getMapWork(job);
        for (PartitionDesc part : mrwork.getAliasToPartnInfo().values()) {
            TableDesc tableDesc = part.getTableDesc();
            Utilities.copyJobSecretToTableProperties(tableDesc);
        }
        CompilationOpContext runtimeCtx = new CompilationOpContext();
        if (mrwork.getVectorMode()) {
            mo = new VectorMapOperator(runtimeCtx);
        } else {
            mo = new MapOperator(runtimeCtx);
        }
        mo.setConf(mrwork);
        // initialize map operator
        mo.initialize(jc, null);
        mo.setChildren(job);
        LOG.info(mo.dump(0));
        // initialize map local work
        localWork = mrwork.getMapRedLocalWork();
        execContext.setLocalWork(localWork);
        MapredContext.init(true, new JobConf(jc));
        MapredContext.get().setReporter(reporter);
        mo.passExecContext(execContext);
        mo.initializeLocalWork(jc);
        mo.initializeMapOperator(jc);
        mo.setReporter(rp);
        if (localWork == null) {
            return;
        }
        // The following code is for mapjoin
        // initialize all the dummy ops
        LOG.info("Initializing dummy operator");
        List<Operator<? extends OperatorDesc>> dummyOps = localWork.getDummyParentOp();
        for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
            dummyOp.setExecContext(execContext);
            dummyOp.initialize(jc, null);
        }
    } catch (Throwable e) {
        abort = true;
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else {
            throw new RuntimeException("Map operator initialization failed: " + e, e);
        }
    }
    perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
}
Also used: Operator (org.apache.hadoop.hive.ql.exec.Operator) AbstractMapOperator (org.apache.hadoop.hive.ql.exec.AbstractMapOperator) MapOperator (org.apache.hadoop.hive.ql.exec.MapOperator) VectorMapOperator (org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) MapWork (org.apache.hadoop.hive.ql.plan.MapWork) CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext) PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) JobConf (org.apache.hadoop.mapred.JobConf) OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
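
One detail in this handler generalizes beyond Spark: the catch block rethrows OutOfMemoryError as-is instead of wrapping it, because constructing a new exception while the heap is exhausted can itself fail. A self-contained sketch of that idiom, with doInit() as an invented placeholder for the real initialization work:

// Sketch of the OOM-safe error handling from Example 2; doInit() is a
// hypothetical stand-in for the real initialization work.
public class InitErrorHandlingSketch {
    private volatile boolean abort = false;

    public void init() {
        try {
            doInit();
        } catch (Throwable e) {
            abort = true;
            if (e instanceof OutOfMemoryError) {
                // Don't allocate a new object when already out of memory.
                throw (OutOfMemoryError) e;
            }
            throw new RuntimeException("initialization failed: " + e, e);
        }
    }

    private void doInit() throws Exception {
        // Real setup (operator creation, local work, etc.) would go here.
    }
}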

Example 3 with ExecMapperContext

Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.

From class MapOperator, method initializeAsRoot:

/**
 * Initializes this map operator as the root of the operator tree. It sets the
 * JobConf and MapWork, then starts initialization of the operator tree rooted
 * at this operator.
 *
 * @param hconf the job configuration
 * @param mapWork the MapWork describing the map-side plan
 * @throws HiveException
 */
@VisibleForTesting
void initializeAsRoot(JobConf hconf, MapWork mapWork) throws Exception {
    setConf(mapWork);
    setChildren(hconf);
    passExecContext(new ExecMapperContext(hconf));
    initializeMapOperator(hconf);
}
Also used: ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) VisibleForTesting (com.google.common.annotations.VisibleForTesting)
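
Since initializeAsRoot is package-private and annotated @VisibleForTesting, the intended caller is a test in the same package. A plausible shape for such a caller is sketched below; the MapWork is taken as a parameter because building one is plan-specific, and the class name is invented:

package org.apache.hadoop.hive.ql.exec;

import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical test sketch; it must live in this package to reach the
// package-private initializeAsRoot.
public class MapOperatorRootInitSketch {
    void exercise(MapWork mapWork) throws Exception {
        JobConf hconf = new JobConf();
        MapOperator mo = new MapOperator(new CompilationOpContext());
        // Sets conf and children, passes a fresh ExecMapperContext, and
        // initializes the operator tree rooted at mo (see the method above).
        mo.initializeAsRoot(hconf, mapWork);
    }
}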

Example 4 with ExecMapperContext

Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.

From class MapOperator, method process:

public void process(Writable value) throws HiveException {
    // A mapper can span multiple files/partitions.
    // The serializers need to be reset if the input file changed
    ExecMapperContext context = getExecContext();
    if (context != null && context.inputFileChanged()) {
        // The child operators cleanup if input file has changed
        cleanUpInputFileChanged();
    }
    int childrenDone = 0;
    for (MapOpCtx current : currentCtxs) {
        Object row = null;
        try {
            row = current.readRow(value, context);
            if (!current.forward(row)) {
                childrenDone++;
            }
        } catch (Exception e) {
            // TODO: policy on deserialization errors
            String message = null;
            try {
                message = toErrorMessage(value, row, current.rowObjectInspector);
            } catch (Throwable t) {
                message = "[" + row + ", " + value + "]: cannot get error message " + t.getMessage();
            }
            if (row == null) {
                deserialize_error_count.set(deserialize_error_count.get() + 1);
                LOG.trace("Hive Runtime Error while processing writable " + message);
                throw new HiveException("Hive Runtime Error while processing writable", e);
            }
            // Log the contents of the row that caused the exception so that they are
            // available for debugging. Exposed through an error message, the same data
            // could leak sensitive information, even to the client application.
            LOG.trace("Hive Runtime Error while processing row " + message);
            throw new HiveException("Hive Runtime Error while processing row", e);
        }
    }
    rowsForwarded(childrenDone, 1);
}
Also used: ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) SerDeException (org.apache.hadoop.hive.serde2.SerDeException)
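
The catch block encodes a deliberate split: row == null means deserialization itself failed, so the deserialize error counter is bumped and the error mentions the writable; a non-null row means a downstream operator failed on a row that deserialized fine, and in both cases the row contents are confined to TRACE logging so they cannot leak through the thrown message. A standalone sketch of that split, with readRow and forward as invented stand-ins for the Hive calls:

// Sketch of the two failure modes handled in Example 4; readRow() and
// forward() are hypothetical stand-ins for the real Hive calls.
public class RowErrorSplitSketch {
    private long deserializeErrors = 0;

    void process(Object value) {
        Object row = null;
        try {
            row = readRow(value);  // can fail while deserializing
            forward(row);          // can fail in a downstream operator
        } catch (Exception e) {
            if (row == null) {
                // Deserialization failed before any row existed: count it
                // and report an error about the writable, not the row.
                deserializeErrors++;
                throw new RuntimeException("error while processing writable", e);
            }
            // The row deserialized but processing failed; its contents
            // belong in TRACE logs only, never in the thrown message.
            throw new RuntimeException("error while processing row", e);
        }
    }

    private Object readRow(Object value) throws Exception { return value; }
    private void forward(Object row) throws Exception { }
}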

Example 5 with ExecMapperContext

Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.

From class MapJoinOperator, method initializeOp:

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    this.hconf = hconf;
    unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];
    super.initializeOp(hconf);
    int tagLen = conf.getTagLength();
    // On Tez only: The hash map might already be cached in the container we run
    // the task in. On MR: The cache is a no-op.
    String queryId = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID);
    // The cacheKey may have already been defined in the MapJoin conf spec
    // as part of the Shared Work Optimization if it can be reused among
    // multiple mapjoin operators. In that case, we take that key from conf
    // and append this.getClass().getName() to disambiguate between different
    // classes that may be using the same source data, e.g.
    // VectorMapJoinInnerGenerateResultOperator and VectorMapJoinLeftSemiLongOperator.
    // If the cacheKey is not defined in the conf, then we generate it.
    cacheKey = conf.getCacheKey() == null ? MapJoinDesc.generateCacheKey(this.getOperatorId()) : conf.getCacheKey() + "_" + this.getClass().getName();
    cache = ObjectCacheFactory.getCache(hconf, queryId, false);
    loader = getHashTableLoader(hconf);
    bucketId = hconf.getInt(Constants.LLAP_BUCKET_ID, -1);
    numBuckets = hconf.getInt(Constants.LLAP_NUM_BUCKETS, -1);
    hashMapRowGetters = null;
    mapJoinTables = new MapJoinTableContainer[tagLen];
    mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
    hashTblInitedOnce = false;
    // Reset grace hashjoin context so that there is no state maintained when operator/work is
    // retrieved from object cache
    hybridMapJoinLeftover = false;
    firstSmallTable = null;
    doFullOuterMapJoinInit();
    generateMapMetaData();
    isTestingNoHashTableLoad = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD);
    if (isTestingNoHashTableLoad) {
        return;
    }
    final ExecMapperContext mapContext = getExecContext();
    final MapredContext mrContext = MapredContext.get();
    if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
        /*
         * The issue with caching in case of bucket map join is that different tasks
         * process different buckets and if the container is reused to join a different bucket,
         * join results can be incorrect. The cache is keyed on operator id and for bucket map join
         * the operator does not change but data needed is different. For a proper fix, this
         * requires changes in the Tez API with regard to finding bucket id and
         * also ability to schedule tasks to re-use containers that have cached the specific bucket.
         */
        LOG.debug("This is not bucket map join, so cache");
        Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future = cache.retrieveAsync(cacheKey, () -> loadHashTable(mapContext, mrContext));
        asyncInitOperations.add(future);
    } else if (!isInputFileChangeSensitive(mapContext)) {
        loadHashTable(mapContext, mrContext);
        hashTblInitedOnce = true;
    }
}
Also used: ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) MapJoinTableContainerSerDe (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe) MapJoinTableContainer (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer) Pair (org.apache.commons.lang3.tuple.Pair) ImmutablePair (org.apache.commons.lang3.tuple.ImmutablePair)
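
The cacheKey computation is the part worth internalizing: when the Shared Work Optimization has placed a key in the conf, the operator appends its own class name so that different operator classes reusing the same source data (the comment's VectorMapJoin example) get distinct cache entries; otherwise a key is derived from the operator id alone. A simplified standalone sketch of that scheme (the fallback string format is an assumption; Hive's real generator is MapJoinDesc.generateCacheKey):

// Sketch of the cache-key disambiguation in Example 5. The fallback key
// format is assumed for illustration; see MapJoinDesc.generateCacheKey.
public final class CacheKeySketch {
    static String cacheKeyFor(String sharedKey, String operatorId, Class<?> opClass) {
        if (sharedKey == null) {
            // No shared key in the conf: derive one from the operator id.
            return "HASH_MAP_" + operatorId + "_container";  // assumed format
        }
        // Shared key present: append the concrete class name so that two
        // operator classes reusing the same source data cannot collide.
        return sharedKey + "_" + opClass.getName();
    }
}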

Aggregations

ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext): 9
ArrayList (java.util.ArrayList): 3
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 3
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 3
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 3
JobConf (org.apache.hadoop.mapred.JobConf): 3
Configuration (org.apache.hadoop.conf.Configuration): 2
CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext): 2
AbstractMapOperator (org.apache.hadoop.hive.ql.exec.AbstractMapOperator): 2
MapOperator (org.apache.hadoop.hive.ql.exec.MapOperator): 2
Operator (org.apache.hadoop.hive.ql.exec.Operator): 2
VectorMapOperator (org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator): 2
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 2
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 2
LogicalOutput (org.apache.tez.runtime.api.LogicalOutput): 2
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1
IOException (java.io.IOException): 1
Map (java.util.Map): 1
Callable (java.util.concurrent.Callable): 1
ImmutablePair (org.apache.commons.lang3.tuple.ImmutablePair): 1