Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class MapRecordProcessor, method init.
@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
super.init(mrReporter, inputs, outputs);
checkAbortCondition();
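// The cache key is scoped to this vertex's name, so the deserialized MapWork can be
// shared by task attempts of the same vertex running in the same container.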
String key = processorContext.getTaskVertexName() + MAP_PLAN_KEY;
cacheKeys.add(key);
// create map and fetch operators
mapWork = (MapWork) cache.retrieve(key, new Callable<Object>() {
@Override
public Object call() {
return Utilities.getMapWork(jconf);
}
});
// TODO HIVE-14042. Cleanup may be required if exiting early.
Utilities.setMapWork(jconf, mapWork);
String prefixes = jconf.get(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES);
if (prefixes != null) {
mergeWorkList = new ArrayList<MapWork>();
for (final String prefix : prefixes.split(",")) {
if (prefix == null || prefix.isEmpty()) {
continue;
}
key = processorContext.getTaskVertexName() + prefix;
cacheKeys.add(key);
checkAbortCondition();
mergeWorkList.add((MapWork) cache.retrieve(key, new Callable<Object>() {
@Override
public Object call() {
return Utilities.getMergeWork(jconf, prefix);
}
}));
}
}
MapredContext.init(true, new JobConf(jconf));
((TezContext) MapredContext.get()).setInputs(inputs);
((TezContext) MapredContext.get()).setTezProcessorContext(processorContext);
// Update JobConf using MRInput, info like filename comes via this
checkAbortCondition();
legacyMRInput = getMRInput(inputs);
if (legacyMRInput != null) {
Configuration updatedConf = legacyMRInput.getConfigUpdates();
if (updatedConf != null) {
for (Entry<String, String> entry : updatedConf) {
jconf.set(entry.getKey(), entry.getValue());
}
}
}
checkAbortCondition();
createOutputMap();
// Start all the Outputs.
for (Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
l4j.debug("Starting Output: " + outputEntry.getKey());
outputEntry.getValue().start();
((TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
}
checkAbortCondition();
try {
CompilationOpContext runtimeCtx = new CompilationOpContext();
if (mapWork.getVectorMode()) {
mapOp = new VectorMapOperator(runtimeCtx);
} else {
mapOp = new MapOperator(runtimeCtx);
}
// Not synchronizing creation of mapOp with an invocation. Check immediately
// after creation in case abort has been set.
// Relying on the regular flow to clean up the actual operator. i.e. If an exception is
// thrown, an attempt will be made to cleanup the op.
// If we are here, exit out via an exception. If we're in the middle of the operator.initialize
// call further down, we rely upon op.abort().
checkAbortCondition();
mapOp.clearConnectedOperators();
mapOp.setExecContext(execContext);
boolean fromCache = false;
if (mergeWorkList != null) {
AbstractMapOperator mergeMapOp = null;
for (BaseWork mergeWork : mergeWorkList) {
// TODO HIVE-14042. What is mergeWork, and why is it not part of the regular operator chain.
// The mergeMapOp.initialize call further down can block, and will not receive information
// about an abort request.
MapWork mergeMapWork = (MapWork) mergeWork;
if (mergeMapWork.getVectorMode()) {
mergeMapOp = new VectorMapOperator(runtimeCtx);
} else {
mergeMapOp = new MapOperator(runtimeCtx);
}
mergeMapOpList.add(mergeMapOp);
// initialize the merge operators first.
if (mergeMapOp != null) {
mergeMapOp.setConf(mergeMapWork);
l4j.info("Input name is " + mergeMapWork.getName());
jconf.set(Utilities.INPUT_NAME, mergeMapWork.getName());
mergeMapOp.initialize(jconf, null);
// if there are no files/partitions to read, we need to skip trying to read
MultiMRInput multiMRInput = multiMRInputMap.get(mergeMapWork.getName());
boolean skipRead = false;
if (multiMRInput == null) {
l4j.info("Multi MR Input for work " + mergeMapWork.getName() + " is null. Skipping read.");
skipRead = true;
} else {
Collection<KeyValueReader> keyValueReaders = multiMRInput.getKeyValueReaders();
if ((keyValueReaders == null) || (keyValueReaders.isEmpty())) {
l4j.info("Key value readers are null or empty and hence skipping read. " + "KeyValueReaders = " + keyValueReaders);
skipRead = true;
}
}
if (skipRead) {
List<Operator<?>> children = new ArrayList<Operator<?>>();
children.addAll(mergeMapOp.getConf().getAliasToWork().values());
// do the same thing as setChildren when there is nothing to read.
// the setChildren method initializes the object inspector needed by the operators
// based on path and partition information which we don't have in this case.
mergeMapOp.initEmptyInputChildren(children, jconf);
} else {
// the setChildren method initializes the object inspector needed by the operators
// based on path and partition information.
mergeMapOp.setChildren(jconf);
}
Operator<? extends OperatorDesc> finalOp = getFinalOp(mergeMapOp);
if (finalOp instanceof TezDummyStoreOperator) {
// we ensure that we don't try to read any data in case of skip read.
((TezDummyStoreOperator) finalOp).setFetchDone(skipRead);
mapOp.setConnectedOperators(mergeMapWork.getTag(), (DummyStoreOperator) finalOp);
} else {
// found the plan is already connected which means this is derived from the cache.
fromCache = true;
}
mergeMapOp.passExecContext(new ExecMapperContext(jconf));
mergeMapOp.initializeLocalWork(jconf);
}
}
}
if (!fromCache) {
// if not from cache, we still need to hook up the plans.
((TezContext) (MapredContext.get())).setDummyOpsMap(mapOp.getConnectedOperators());
}
// initialize map operator
mapOp.setConf(mapWork);
l4j.info("Main input name is " + mapWork.getName());
jconf.set(Utilities.INPUT_NAME, mapWork.getName());
mapOp.initialize(jconf, null);
checkAbortCondition();
mapOp.setChildren(jconf);
mapOp.passExecContext(execContext);
l4j.info(mapOp.dump(0));
// set memory available for operators
long memoryAvailableToTask = processorContext.getTotalMemoryAvailableToTask();
if (mapOp.getConf() != null) {
mapOp.getConf().setMaxMemoryAvailable(memoryAvailableToTask);
l4j.info("Memory available for operators set to {}", LlapUtil.humanReadableByteCount(memoryAvailableToTask));
}
OperatorUtils.setMemoryAvailable(mapOp.getChildOperators(), memoryAvailableToTask);
mapOp.initializeLocalWork(jconf);
// Setup values registry
checkAbortCondition();
String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY;
// On LLAP, the dynamic value registry might already be cached.
final DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, new Callable<DynamicValueRegistryTez>() {
@Override
public DynamicValueRegistryTez call() {
return new DynamicValueRegistryTez();
}
});
dynamicValueCacheKeys.add(valueRegistryKey);
RegistryConfTez registryConf = new RegistryConfTez(jconf, mapWork, processorContext, inputs);
registryTez.init(registryConf);
checkAbortCondition();
initializeMapRecordSources();
mapOp.initializeMapOperator(jconf);
if ((mergeMapOpList != null) && !mergeMapOpList.isEmpty()) {
for (AbstractMapOperator mergeMapOp : mergeMapOpList) {
jconf.set(Utilities.INPUT_NAME, mergeMapOp.getConf().getName());
// TODO HIVE-14042. abort handling: Handling of mergeMapOp
mergeMapOp.initializeMapOperator(jconf);
}
}
// Initialization isn't finished until all parents of all operators
// are initialized. For broadcast joins that means initializing the
// dummy parent operators as well.
List<HashTableDummyOperator> dummyOps = mapWork.getDummyOps();
jconf.set(Utilities.INPUT_NAME, mapWork.getName());
if (dummyOps != null) {
for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
dummyOp.setExecContext(execContext);
// TODO HIVE-14042. Handling of dummyOps, and propagating abort information to them
dummyOp.initialize(jconf, null);
}
}
OperatorUtils.setChildrenCollector(mapOp.getChildOperators(), outMap);
mapOp.setReporter(reporter);
MapredContext.get().setReporter(reporter);
} catch (Throwable e) {
setAborted(true);
if (e instanceof OutOfMemoryError) {
// Don't create a new object if we are already out of memory
throw (OutOfMemoryError) e;
} else if (e instanceof InterruptedException) {
l4j.info("Hit an interrupt while initializing MapRecordProcessor. Message={}", e.getMessage());
throw (InterruptedException) e;
} else {
throw new RuntimeException("Map operator initialization failed", e);
}
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
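The retrieve-or-compute idiom above (build a vertex-scoped key, remember it in cacheKeys for later release, and hand cache.retrieve a Callable that deserializes the plan) is what lets a container running several task attempts of the same vertex deserialize MapWork only once. A minimal sketch of that contract, using an illustrative stand-in rather than Hive's actual ObjectCache implementation:

import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative retrieve-or-compute cache; the class and its semantics are a sketch,
// not Hive's real ObjectCache.
class PlanCache {
    private final ConcurrentHashMap<String, Object> cache = new ConcurrentHashMap<>();

    @SuppressWarnings("unchecked")
    <T> T retrieve(String key, Callable<T> loader) throws Exception {
        Object value = cache.get(key);
        if (value == null) {
            // Plan deserialization is idempotent, so a benign race between two callers
            // computing the same key is acceptable; the first published value wins.
            value = loader.call();
            Object previous = cache.putIfAbsent(key, value);
            if (previous != null) {
                value = previous;
            }
        }
        return (T) value;
    }

    // release(key) would evict the entries recorded in cacheKeys once the task finishes.
    void release(String key) {
        cache.remove(key);
    }
}

The same pattern is applied twice in the processor: once for the main MapWork under MAP_PLAN_KEY, and once per merge-work prefix listed in TEZ_MERGE_WORK_FILE_PREFIXES.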
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class DagUtils, method createVertex.
private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
if (mergeJoinWork.getMainWork() instanceof MapWork) {
List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
Vertex mergeVx = createVertex(conf, mapWork, appJarLr, additionalLr, fs, mrScratchDir, ctx, vertexType);
conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
// mapreduce.tez.input.initializer.serialize.event.payload should be set
// to false when using this plug-in to avoid getting a serialized event at run-time.
conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
for (int i = 0; i < mapWorkList.size(); i++) {
mapWork = (MapWork) (mapWorkList.get(i));
conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
conf.set(Utilities.INPUT_NAME, mapWork.getName());
LOG.info("Going through each work and adding MultiMRInput");
mergeVx.addDataSource(mapWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
}
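// The custom vertex manager groups the splits of the merged inputs so that
// corresponding buckets of each input are processed by the same task.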
VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
// the +1 to the size is because of the main work.
CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1);
DataOutputBuffer dob = new DataOutputBuffer();
vertexConf.write(dob);
byte[] userPayload = dob.getData();
desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
mergeVx.setVertexManagerPlugin(desc);
return mergeVx;
} else {
Vertex mergeVx = createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), appJarLr, additionalLr, fs, mrScratchDir, ctx);
return mergeVx;
}
}
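The vertex-manager payload built above is just a Hadoop Writable serialized into a DataOutputBuffer and wrapped into a Tez UserPayload; the CustomPartitionVertex plugin reads the same bytes back on its side. A self-contained sketch of that round trip, with a hypothetical BucketConfig standing in for CustomVertexConfiguration:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;

// Hypothetical Writable standing in for CustomVertexConfiguration.
class BucketConfig implements Writable {
    int numBuckets;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(numBuckets);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        numBuckets = in.readInt();
    }
}

public class PayloadRoundTrip {
    public static void main(String[] args) throws IOException {
        BucketConfig conf = new BucketConfig();
        conf.numBuckets = 4;

        // Producer side: serialize the config, as createVertex does before wrapping
        // the bytes into a Tez UserPayload for the vertex manager.
        DataOutputBuffer dob = new DataOutputBuffer();
        conf.write(dob);
        ByteBuffer payload = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());

        // Consumer side: the plugin reads the payload bytes back into a fresh config.
        DataInputBuffer dib = new DataInputBuffer();
        dib.reset(dob.getData(), 0, dob.getLength());
        BucketConfig restored = new BucketConfig();
        restored.readFields(dib);

        System.out.println("numBuckets = " + restored.numBuckets + ", payload bytes = " + payload.remaining());
    }
}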
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class SparkMergeFileRecordHandler, method init.
@SuppressWarnings("unchecked")
@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
super.init(job, output, reporter);
try {
jc = job;
MapWork mapWork = Utilities.getMapWork(job);
if (mapWork instanceof MergeFileWork) {
MergeFileWork mergeFileWork = (MergeFileWork) mapWork;
String alias = mergeFileWork.getAliasToWork().keySet().iterator().next();
op = mergeFileWork.getAliasToWork().get(alias);
if (op instanceof AbstractFileMergeOperator) {
mergeOp = (AbstractFileMergeOperator<? extends FileMergeDesc>) op;
mergeOp.initializeOp(jc);
row = new Object[2];
abort = false;
} else {
abort = true;
throw new IllegalStateException("Merge file work's top operator should be an" + " instance of AbstractFileMergeOperator");
}
} else {
abort = true;
throw new IllegalStateException("Map work should be a merge file work.");
}
LOG.info(mergeOp.dump(0));
} catch (HiveException e) {
abort = true;
throw new RuntimeException(e);
}
}
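The two-slot row array allocated in init() is there so the per-record path can pack each (key, value) pair and push it into the file-merge operator. A hedged sketch of what that companion processRow can look like, modeled on Hive's other merge-file record handlers rather than quoted from this class:

// Sketch only; the exact upstream signature and error handling may differ.
public void processRow(Object key, Object value) throws IOException {
    row[0] = key;
    row[1] = value;
    try {
        // Tag 0: the merge operator consumes a single input.
        mergeOp.process(row, 0);
    } catch (HiveException e) {
        abort = true;
        throw new IOException("Error while running the file merge operator", e);
    }
}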
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class IndexWhereProcessor, method process.
/**
 * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
TableScanOperator operator = (TableScanOperator) nd;
List<Node> opChildren = operator.getChildren();
TableScanDesc operatorDesc = operator.getConf();
if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
return null;
}
List<Index> indexes = tsToIndices.get(operator);
ExprNodeDesc predicate = operatorDesc.getFilterExpr();
IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
ParseContext pctx = context.getParseContext();
LOG.info("Processing predicate for index optimization");
if (predicate == null) {
LOG.info("null predicate pushed down");
return null;
}
LOG.info(predicate.getExprString());
// check that we have indexes on all partitions in this table scan
Set<Partition> queryPartitions;
try {
queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
if (queryPartitions == null) {
// partitions not covered
return null;
}
} catch (HiveException e) {
LOG.error("Fatal Error: problem accessing metastore", e);
throw new SemanticException(e);
}
// we can only process MapReduce tasks to check input size
if (!context.getCurrentTask().isMapRedTask()) {
return null;
}
MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
// get potential reentrant index queries from each index
Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
// make sure we have an index on the table being scanned
TableDesc tblDesc = operator.getTableDesc();
Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
for (Index indexOnTable : indexes) {
if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
List<Index> newType = new ArrayList<Index>();
newType.add(indexOnTable);
indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
} else {
indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
}
}
// choose the index type with the most indexes of the same type on the table
// TODO HIVE-2130 This would be a good place for some sort of cost based choice?
List<Index> bestIndexes = indexesByType.values().iterator().next();
for (List<Index> indexTypes : indexesByType.values()) {
if (bestIndexes.size() < indexTypes.size()) {
bestIndexes = indexTypes;
}
}
// rewrite index queries for the chosen index type
HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
tmpQueryContext.setQueryPartitions(queryPartitions);
rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
if (indexTasks != null && indexTasks.size() > 0) {
queryContexts.put(bestIndexes.get(0), tmpQueryContext);
}
// choose an index rewrite to use
if (queryContexts.size() > 0) {
// TODO HIVE-2130 This would be a good place for some sort of cost based choice?
Index chosenIndex = queryContexts.keySet().iterator().next();
// modify the parse context to use indexing
// we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
// prepare the map reduce job to use indexing
MapWork work = currentTask.getWork().getMapWork();
work.setInputformat(queryContext.getIndexInputFormat());
work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
// modify inputs based on index query
Set<ReadEntity> inputs = pctx.getSemanticInputs();
inputs.addAll(queryContext.getAdditionalSemanticInputs());
List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
// add dependencies so index query runs first
insertIndexQuery(pctx, context, chosenRewrite);
}
return null;
}
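The grouping loop above buckets the candidate indexes by handler class and keeps the largest bucket. The same selection can be restated compactly with streams (illustrative only; it assumes the Index type is the metastore API class used above):

import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.hive.metastore.api.Index;

class BestIndexChooser {
    // Group the candidate indexes by handler class and keep the largest group,
    // mirroring the loop in IndexWhereProcessor.process above.
    static List<Index> chooseBestIndexes(List<Index> indexes) {
        Map<String, List<Index>> indexesByType = indexes.stream()
            .collect(Collectors.groupingBy(Index::getIndexHandlerClass));
        return indexesByType.values().stream()
            .max(Comparator.comparingInt(List::size))
            .orElse(Collections.emptyList());
    }
}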
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class SplitSparkWorkResolver, method setStatistics.
// we lost statistics & opTraits through cloning, try to get them back
private void setStatistics(BaseWork origin, BaseWork clone) {
if (origin instanceof MapWork && clone instanceof MapWork) {
MapWork originMW = (MapWork) origin;
MapWork cloneMW = (MapWork) clone;
for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : originMW.getAliasToWork().entrySet()) {
String alias = entry.getKey();
Operator<? extends OperatorDesc> cloneOP = cloneMW.getAliasToWork().get(alias);
if (cloneOP != null) {
setStatistics(entry.getValue(), cloneOP);
}
}
} else if (origin instanceof ReduceWork && clone instanceof ReduceWork) {
setStatistics(((ReduceWork) origin).getReducer(), ((ReduceWork) clone).getReducer());
}
}
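The operator-level overload called from the loop above is not shown here; a hedged sketch of what it does follows. It assumes the operator descriptor exposes getStatistics/setStatistics and that the original and cloned operator trees have matching shapes; the traversal details and the handling of opTraits are assumptions, not the upstream code:

private void setStatistics(Operator<? extends OperatorDesc> origin, Operator<? extends OperatorDesc> clone) {
    // Copy the statistics stored on the descriptor; opTraits lost in cloning would be
    // restored in the same fashion (assumed accessors, see the note above).
    if (origin.getConf() != null && clone.getConf() != null) {
        clone.getConf().setStatistics(origin.getConf().getStatistics());
    }
    // Descend through both trees in lockstep so child operators get their statistics back too.
    List<Operator<? extends OperatorDesc>> originChildren = origin.getChildOperators();
    List<Operator<? extends OperatorDesc>> cloneChildren = clone.getChildOperators();
    if (originChildren != null && cloneChildren != null && originChildren.size() == cloneChildren.size()) {
        for (int i = 0; i < originChildren.size(); i++) {
            setStatistics(originChildren.get(i), cloneChildren.get(i));
        }
    }
}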