Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.
In the class FetchOperator, the method setupExecContext:
private ExecMapperContext setupExecContext(Operator operator, List<Path> paths) {
  ExecMapperContext context = null;
  if (hasVC || work.getSplitSample() != null) {
    context = new ExecMapperContext(job);
    if (operator != null) {
      operator.passExecContext(context);
    }
  }
  setFetchOperatorContext(job, paths);
  return context;
}
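FetchOperator only builds the context when virtual columns or split sampling are in play, but the wiring pattern is the same across all of the usages on this page: create the context from the JobConf, optionally attach map-local work, then push it down the operator tree. A minimal sketch of that pattern, using only calls that appear in these examples (the helper name wireContext and its arguments are placeholders, not Hive API):

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.mapred.JobConf;

// Illustrative helper, not part of Hive: bind a context to the job conf,
// attach any map-local work, and propagate it down the operator tree.
static ExecMapperContext wireContext(JobConf job, Operator<?> rootOp, MapredLocalWork localWork) {
  ExecMapperContext ctx = new ExecMapperContext(job);
  if (localWork != null) {
    ctx.setLocalWork(localWork);
  }
  rootOp.passExecContext(ctx);
  return ctx;
}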
Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.
In the class SparkMapRecordHandler, the method init:
@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
  perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
  super.init(job, output, reporter);
  try {
    jc = job;
    execContext = new ExecMapperContext(jc);
    // create map and fetch operators
    MapWork mrwork = Utilities.getMapWork(job);
    for (PartitionDesc part : mrwork.getAliasToPartnInfo().values()) {
      TableDesc tableDesc = part.getTableDesc();
      Utilities.copyJobSecretToTableProperties(tableDesc);
    }
    CompilationOpContext runtimeCtx = new CompilationOpContext();
    if (mrwork.getVectorMode()) {
      mo = new VectorMapOperator(runtimeCtx);
    } else {
      mo = new MapOperator(runtimeCtx);
    }
    mo.setConf(mrwork);
    // initialize map operator
    mo.initialize(jc, null);
    mo.setChildren(job);
    LOG.info(mo.dump(0));
    // initialize map local work
    localWork = mrwork.getMapRedLocalWork();
    execContext.setLocalWork(localWork);
    MapredContext.init(true, new JobConf(jc));
    MapredContext.get().setReporter(reporter);
    mo.passExecContext(execContext);
    mo.initializeLocalWork(jc);
    mo.initializeMapOperator(jc);
    mo.setReporter(rp);
    if (localWork == null) {
      return;
    }
    // The following code is for mapjoin
    // initialize all the dummy ops
    LOG.info("Initializing dummy operator");
    List<Operator<? extends OperatorDesc>> dummyOps = localWork.getDummyParentOp();
    for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
      dummyOp.setExecContext(execContext);
      dummyOp.initialize(jc, null);
    }
  } catch (Throwable e) {
    abort = true;
    if (e instanceof OutOfMemoryError) {
      // Don't create a new object if we are already out of memory
      throw (OutOfMemoryError) e;
    } else {
      throw new RuntimeException("Map operator initialization failed: " + e, e);
    }
  }
  perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
}
Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.
In the class MapOperator, the method initializeAsRoot:
/**
 * Initializes this map op as the root of the tree. It sets JobConf &
 * MapRedWork and starts initialization of the operator tree rooted at this
 * op.
 *
 * @param hconf
 * @param mapWork
 * @throws HiveException
 */
@VisibleForTesting
void initializeAsRoot(JobConf hconf, MapWork mapWork) throws Exception {
  setConf(mapWork);
  setChildren(hconf);
  passExecContext(new ExecMapperContext(hconf));
  initializeMapOperator(hconf);
}
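Because initializeAsRoot is marked @VisibleForTesting, a test can stand up a MapOperator directly and let it wire in a fresh ExecMapperContext. A minimal sketch, assuming a JobConf and MapWork have been prepared elsewhere (the variable names jobConf and mapWork are placeholders):

// Illustrative test-style usage; jobConf and mapWork are assumed to be built elsewhere.
MapOperator mapOp = new MapOperator(new CompilationOpContext());
mapOp.initializeAsRoot(jobConf, mapWork); // sets the conf, children, and a new ExecMapperContext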
Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.
In the class MapOperator, the method process:
public void process(Writable value) throws HiveException {
  // A mapper can span multiple files/partitions.
  // The serializers need to be reset if the input file changed
  ExecMapperContext context = getExecContext();
  if (context != null && context.inputFileChanged()) {
    // The child operators cleanup if input file has changed
    cleanUpInputFileChanged();
  }
  int childrenDone = 0;
  for (MapOpCtx current : currentCtxs) {
    Object row = null;
    try {
      row = current.readRow(value, context);
      if (!current.forward(row)) {
        childrenDone++;
      }
    } catch (Exception e) {
      // TODO: policy on deserialization errors
      String message = null;
      try {
        message = toErrorMessage(value, row, current.rowObjectInspector);
      } catch (Throwable t) {
        message = "[" + row + ", " + value + "]: cannot get error message " + t.getMessage();
      }
      if (row == null) {
        deserialize_error_count.set(deserialize_error_count.get() + 1);
        LOG.trace("Hive Runtime Error while processing writable " + message);
        throw new HiveException("Hive Runtime Error while processing writable", e);
      }
      // Log the contents of the row that caused exception so that it's available for debugging. But
      // when exposed through an error message it can leak sensitive information, even to the
      // client application.
      LOG.trace("Hive Runtime Error while processing row " + message);
      throw new HiveException("Hive Runtime Error while processing row", e);
    }
  }
  rowsForwarded(childrenDone, 1);
}
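The inputFileChanged() call above is what lets a single mapper span multiple files or partitions: conceptually, the context compares the input path currently reported by the I/O layer against the last path it saw, and the operators reset per-file state when the two differ. A simplified sketch of that kind of check (field and method shapes here are illustrative, not the verbatim ExecMapperContext implementation):

// Illustrative sketch only, not the actual ExecMapperContext code.
private Path lastInputPath;

boolean inputFileChanged(Path currentInputPath) {
  if (lastInputPath == null || !lastInputPath.equals(currentInputPath)) {
    lastInputPath = currentInputPath;
    return true; // operators should clean up per-file state now
  }
  return false;
}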
Use of org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext in project hive by apache.
In the class MapJoinOperator, the method initializeOp:
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  this.hconf = hconf;
  unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];
  super.initializeOp(hconf);
  int tagLen = conf.getTagLength();
  // On Tez only: The hash map might already be cached in the container we run
  // the task in. On MR: The cache is a no-op.
  String queryId = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID);
  // The cacheKey may have already been defined in the MapJoin conf spec
  // as part of the Shared Work Optimization if it can be reused among
  // multiple mapjoin operators. In that case, we take that key from conf
  // and append this.getClass().getName() to disambiguate between different
  // classes that may be using the same source data, e.g.
  // VectorMapJoinInnerGenerateResultOperator and VectorMapJoinLeftSemiLongOperator.
  // If the cacheKey is not defined in the conf, then we generate it.
  cacheKey = conf.getCacheKey() == null
      ? MapJoinDesc.generateCacheKey(this.getOperatorId())
      : conf.getCacheKey() + "_" + this.getClass().getName();
  cache = ObjectCacheFactory.getCache(hconf, queryId, false);
  loader = getHashTableLoader(hconf);
  bucketId = hconf.getInt(Constants.LLAP_BUCKET_ID, -1);
  numBuckets = hconf.getInt(Constants.LLAP_NUM_BUCKETS, -1);
  hashMapRowGetters = null;
  mapJoinTables = new MapJoinTableContainer[tagLen];
  mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
  hashTblInitedOnce = false;
  // Reset grace hashjoin context so that there is no state maintained when operator/work is
  // retrieved from object cache
  hybridMapJoinLeftover = false;
  firstSmallTable = null;
  doFullOuterMapJoinInit();
  generateMapMetaData();
  isTestingNoHashTableLoad = HiveConf.getBoolVar(hconf,
      HiveConf.ConfVars.HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD);
  if (isTestingNoHashTableLoad) {
    return;
  }
  final ExecMapperContext mapContext = getExecContext();
  final MapredContext mrContext = MapredContext.get();
  if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
    /*
     * The issue with caching in case of bucket map join is that different tasks
     * process different buckets and if the container is reused to join a different bucket,
     * join results can be incorrect. The cache is keyed on operator id and for bucket map join
     * the operator does not change but data needed is different. For a proper fix, this
     * requires changes in the Tez API with regard to finding bucket id and
     * also ability to schedule tasks to re-use containers that have cached the specific bucket.
     */
    LOG.debug("This is not bucket map join, so cache");
    Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future =
        cache.retrieveAsync(cacheKey, () -> loadHashTable(mapContext, mrContext));
    asyncInitOperations.add(future);
  } else if (!isInputFileChangeSensitive(mapContext)) {
    loadHashTable(mapContext, mrContext);
    hashTblInitedOnce = true;
  }
}
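On the non-cached branch, the hash table is loaded eagerly only when the join is not sensitive to input-file changes, and that question is essentially answered by the ExecMapperContext's map-local work. A hedged sketch of what such a check could look like, assuming MapredLocalWork exposes getInputFileChangeSensitive(); this exact body is an approximation, not quoted Hive source:

// Approximation of the sensitivity check: the context's map-local work decides
// whether hash-table loading must be deferred until the input file is known.
private boolean isInputFileChangeSensitive(ExecMapperContext mapContext) {
  return mapContext != null
      && mapContext.getLocalWork() != null
      && mapContext.getLocalWork().getInputFileChangeSensitive();
}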