Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
The class GenTezWork, method process:
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
GenTezProcContext context = (GenTezProcContext) procContext;
assert context != null && context.currentTask != null && context.currentRootOperator != null;
// Operator is a file sink or reduce sink. Something that forces
// a new vertex.
Operator<?> operator = (Operator<?>) nd;
// root is the start of the operator pipeline we're currently
// packing into a vertex, typically a table scan, union or join
Operator<?> root = context.currentRootOperator;
LOG.debug("Root operator: " + root);
LOG.debug("Leaf operator: " + operator);
if (context.clonedReduceSinks.contains(operator)) {
// just skip and keep going
return null;
}
TezWork tezWork = context.currentTask.getWork();
// Right now the work graph is pretty simple. If there is no
// preceding work we have a root and will generate a map
// vertex. If there is preceding work we will generate
// a reduce vertex.
BaseWork work;
if (context.rootToWorkMap.containsKey(root)) {
// having seen the root operator before means there was a branch in the
// operator graph. There are typically two reasons for that: a) mux/demux,
// b) multi insert. Mux/demux will hit the same leaf again; multi insert
// will result in a vertex with multiple FS or RS operators.
if (context.childToWorkMap.containsKey(operator)) {
// if we've seen both root and child, we can bail.
// clear out the mapjoin set. we don't need it anymore.
context.currentMapJoinOperators.clear();
// clear out the union set. we don't need it anymore.
context.currentUnionOperators.clear();
return null;
} else {
// At this point we don't have to do anything special. Just
// run through the regular paces w/o creating a new task.
work = context.rootToWorkMap.get(root);
}
} else {
// create a new vertex
if (context.preceedingWork == null) {
work = utils.createMapWork(context, root, tezWork, null);
} else {
work = GenTezUtils.createReduceWork(context, root, tezWork);
}
context.rootToWorkMap.put(root, work);
}
// this is where we set the sort columns that we will be using for KeyValueInputMerge
if (operator instanceof DummyStoreOperator) {
work.addSortCols(root.getOpTraits().getSortCols().get(0));
}
if (!context.childToWorkMap.containsKey(operator)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.childToWorkMap.put(operator, workItems);
} else {
context.childToWorkMap.get(operator).add(work);
}
// This transformation needs to run first because it changes the work item itself,
// which can affect the working of all downstream transformations.
if (context.currentMergeJoinOperator != null) {
// we are currently walking the big table side of the merge join. we need to create or hook up
// merge join work.
MergeJoinWork mergeJoinWork = null;
if (context.opMergeJoinWorkMap.containsKey(context.currentMergeJoinOperator)) {
// we have found a merge work corresponding to this closing operator. Hook up this work.
mergeJoinWork = context.opMergeJoinWorkMap.get(context.currentMergeJoinOperator);
} else {
// we need to create the merge join work
mergeJoinWork = new MergeJoinWork();
mergeJoinWork.setMergeJoinOperator(context.currentMergeJoinOperator);
tezWork.add(mergeJoinWork);
context.opMergeJoinWorkMap.put(context.currentMergeJoinOperator, mergeJoinWork);
}
// connect the work correctly.
work.addSortCols(root.getOpTraits().getSortCols().get(0));
mergeJoinWork.addMergedWork(work, null, context.leafOperatorToFollowingWork);
Operator<? extends OperatorDesc> parentOp = getParentFromStack(context.currentMergeJoinOperator, stack);
// Set the big table position. Both the reduce work and merge join operator
// should be set with the same value.
// int pos = context.currentMergeJoinOperator.getTagForOperator(parentOp);
int pos = context.currentMergeJoinOperator.getConf().getBigTablePosition();
work.setTag(pos);
context.currentMergeJoinOperator.getConf().setBigTablePosition(pos);
tezWork.setVertexType(work, VertexType.MULTI_INPUT_UNINITIALIZED_EDGES);
for (BaseWork parentWork : tezWork.getParents(work)) {
TezEdgeProperty edgeProp = tezWork.getEdgeProperty(parentWork, work);
tezWork.disconnect(parentWork, work);
tezWork.connect(parentWork, mergeJoinWork, edgeProp);
}
for (BaseWork childWork : tezWork.getChildren(work)) {
TezEdgeProperty edgeProp = tezWork.getEdgeProperty(work, childWork);
tezWork.disconnect(work, childWork);
tezWork.connect(mergeJoinWork, childWork, edgeProp);
}
tezWork.remove(work);
context.rootToWorkMap.put(root, mergeJoinWork);
context.childToWorkMap.get(operator).remove(work);
context.childToWorkMap.get(operator).add(mergeJoinWork);
work = mergeJoinWork;
context.currentMergeJoinOperator = null;
}
// remember which mapjoin operator links with which work
if (!context.currentMapJoinOperators.isEmpty()) {
for (MapJoinOperator mj : context.currentMapJoinOperators) {
// For a dynamic partitioned hash join, record the small-table ReduceSink parents here
// so we can later run the same logic that is run in ReduceSinkMapJoinProc.
if (mj.getConf().isDynamicPartitionHashJoin()) {
// Since this is a dynamic partitioned hash join, the work for this join should be a ReduceWork
ReduceWork reduceWork = (ReduceWork) work;
int bigTablePosition = mj.getConf().getPosBigTable();
reduceWork.setTag(bigTablePosition);
// Use context.mapJoinParentMap to get the original RS parents, because
// the MapJoin's parents may have been replaced by dummy operator.
List<Operator<?>> mapJoinOriginalParents = context.mapJoinParentMap.get(mj);
if (mapJoinOriginalParents == null) {
throw new SemanticException("Unexpected error - context.mapJoinParentMap did not have an entry for " + mj);
}
for (int pos = 0; pos < mapJoinOriginalParents.size(); ++pos) {
// This processing only needs to happen for the small tables
if (pos == bigTablePosition) {
continue;
}
Operator<?> parentOp = mapJoinOriginalParents.get(pos);
context.smallTableParentToMapJoinMap.put(parentOp, mj);
ReduceSinkOperator parentRS = (ReduceSinkOperator) parentOp;
// TableDesc needed for dynamic partitioned hash join
GenMapRedUtils.setKeyAndValueDesc(reduceWork, parentRS);
// Register the tag-to-input mapping for any small-table ReduceSink that was
// already processed, since by this point the map join
// has its ReduceSink parent removed.
if (!context.mapJoinToUnprocessedSmallTableReduceSinks.get(mj).contains(parentRS)) {
// This reduce sink has been processed already, so the work for the parentRS exists
BaseWork parentWork = ReduceSinkMapJoinProc.getMapJoinParentWork(context, parentRS);
int tag = parentRS.getConf().getTag();
tag = (tag == -1 ? 0 : tag);
reduceWork.getTagToInput().put(tag, parentWork.getName());
}
}
}
LOG.debug("Processing map join: " + mj);
// remember the mapping in case we scan another branch of the mapjoin later
if (!context.mapJoinWorkMap.containsKey(mj)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.mapJoinWorkMap.put(mj, workItems);
} else {
context.mapJoinWorkMap.get(mj).add(work);
}
/*
 * this happens in case of map join operations.
 * The tree looks like this:
 *
 *        RS <--- we are here perhaps
 *        |
 *     MapJoin
 *     /     \
 *   RS       TS
 *  /
 * TS
 *
 * If we are at the RS pointed to above, we may have already visited the
 * RS following the TS, in which case work for the TS-RS branch has already
 * been generated. We need to hook the current work to this generated work.
 */
if (context.linkOpWithWorkMap.containsKey(mj)) {
Map<BaseWork, TezEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
if (linkWorkMap != null) {
// Note: it's not quite clear why this is done inside this if. Seems like it should be on the top level.
if (context.linkChildOpWithDummyOp.containsKey(mj)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Adding dummy ops to work: " + work.getName() + ": " + context.linkChildOpWithDummyOp.get(mj));
}
for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
work.addDummyOp((HashTableDummyOperator) dummy);
}
}
for (Entry<BaseWork, TezEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
BaseWork parentWork = parentWorkMap.getKey();
LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
TezEdgeProperty edgeProp = parentWorkMap.getValue();
tezWork.connect(parentWork, work, edgeProp);
if (edgeProp.getEdgeType() == EdgeType.CUSTOM_EDGE) {
tezWork.setVertexType(work, VertexType.INITIALIZED_EDGES);
}
// need to set up the output name for the reduce sink now that we know the name
// of the downstream work
for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
if (!context.mapJoinParentMap.get(mj).contains(r)) {
// already connected this RS operator or we will connect it at subsequent pass.
continue;
}
if (r.getConf().getOutputName() != null) {
LOG.debug("Cloning reduce sink " + r + " for multi-child broadcast edge");
// we've already set this one up. Need to clone for the next work.
r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(), new RowSchema(r.getSchema()), r.getParentOperators());
context.clonedReduceSinks.add(r);
}
r.getConf().setOutputName(work.getName());
context.connectedReduceSinks.add(r);
}
}
}
}
}
// clear out the set. we don't need it anymore.
context.currentMapJoinOperators.clear();
}
// This is where we cut the tree as described above. We also remember that
// we might have to connect parent work with this work later.
for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
if (LOG.isDebugEnabled()) {
LOG.debug("Removing " + parent + " as parent from " + root);
}
context.leafOperatorToFollowingWork.remove(parent);
context.leafOperatorToFollowingWork.put(parent, work);
root.removeParent(parent);
}
if (!context.currentUnionOperators.isEmpty()) {
// union all operators were encountered while walking this branch;
// see the notes on context.currentUnionOperators in GenTezWorkWalker for details
UnionWork unionWork;
if (context.unionWorkMap.containsKey(operator)) {
// we've seen this terminal before and have created a union work object;
// we just need to add this work to it. There will be no children of this
// terminal operator since we've passed this operator before.
assert operator.getChildOperators().isEmpty();
unionWork = (UnionWork) context.unionWorkMap.get(operator);
// finally connect the union work with work
connectUnionWorkWithWork(unionWork, work, tezWork, context);
} else {
// we've not seen this terminal before. we need to check
// rootUnionWorkMap which contains the information of mapping the root
// operator of a union work to a union work
unionWork = context.rootUnionWorkMap.get(root);
if (unionWork == null) {
// if unionWork is null, it means it is the first time. we need to
// create a union work object and add this work to it. Subsequent
// work should reference the union and not the actual work.
unionWork = GenTezUtils.createUnionWork(context, root, operator, tezWork);
// finally connect the union work with work
connectUnionWorkWithWork(unionWork, work, tezWork, context);
}
}
context.currentUnionOperators.clear();
work = unionWork;
}
// Note: the concept of leaf and root is reversed in hive for historical
// reasons. Roots are data sources, leaves are data sinks. I know.
if (context.leafOperatorToFollowingWork.containsKey(operator)) {
BaseWork followingWork = context.leafOperatorToFollowingWork.get(operator);
long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
LOG.debug("Second pass. Leaf operator: " + operator + " has common downstream work: " + followingWork);
if (operator instanceof DummyStoreOperator) {
// this is the small table side.
assert (followingWork instanceof MergeJoinWork);
MergeJoinWork mergeJoinWork = (MergeJoinWork) followingWork;
CommonMergeJoinOperator mergeJoinOp = mergeJoinWork.getMergeJoinOperator();
work.setTag(mergeJoinOp.getTagForOperator(operator));
mergeJoinWork.addMergedWork(null, work, context.leafOperatorToFollowingWork);
tezWork.setVertexType(mergeJoinWork, VertexType.MULTI_INPUT_UNINITIALIZED_EDGES);
for (BaseWork parentWork : tezWork.getParents(work)) {
TezEdgeProperty edgeProp = tezWork.getEdgeProperty(parentWork, work);
tezWork.disconnect(parentWork, work);
tezWork.connect(parentWork, mergeJoinWork, edgeProp);
}
work = mergeJoinWork;
} else {
// need to add this branch to the key + value info
assert operator instanceof ReduceSinkOperator && ((followingWork instanceof ReduceWork) || (followingWork instanceof MergeJoinWork) || followingWork instanceof UnionWork);
ReduceSinkOperator rs = (ReduceSinkOperator) operator;
ReduceWork rWork = null;
if (followingWork instanceof MergeJoinWork) {
MergeJoinWork mergeJoinWork = (MergeJoinWork) followingWork;
rWork = (ReduceWork) mergeJoinWork.getMainWork();
} else if (followingWork instanceof UnionWork) {
// this is only possible when merge join work follows the union
UnionWork unionWork = (UnionWork) followingWork;
int index = getFollowingWorkIndex(tezWork, unionWork, rs);
BaseWork baseWork = tezWork.getChildren(unionWork).get(index);
if (baseWork instanceof MergeJoinWork) {
MergeJoinWork mergeJoinWork = (MergeJoinWork) baseWork;
// disconnect the connection to union work and connect to merge work
followingWork = mergeJoinWork;
rWork = (ReduceWork) mergeJoinWork.getMainWork();
} else {
rWork = (ReduceWork) baseWork;
}
} else {
rWork = (ReduceWork) followingWork;
}
GenMapRedUtils.setKeyAndValueDesc(rWork, rs);
// remember which parent belongs to which tag
int tag = rs.getConf().getTag();
rWork.getTagToInput().put(tag == -1 ? 0 : tag, work.getName());
// remember the output name of the reduce sink
rs.getConf().setOutputName(rWork.getName());
// For dynamic partitioned hash join, run the ReduceSinkMapJoinProc logic for any
// ReduceSink parents that we missed.
MapJoinOperator mj = context.smallTableParentToMapJoinMap.get(rs);
if (mj != null) {
// Only need to run the logic for tables we missed
if (context.mapJoinToUnprocessedSmallTableReduceSinks.get(mj).contains(rs)) {
// ReduceSinkMapJoinProc logic does not work unless the ReduceSink is connected as
// a parent of the MapJoin, but at this point we have already removed all of the
// parents from the MapJoin.
// Try temporarily adding the RS as a parent
ArrayList<Operator<?>> tempMJParents = new ArrayList<Operator<?>>();
tempMJParents.add(rs);
mj.setParentOperators(tempMJParents);
// ReduceSink also needs MapJoin as child
List<Operator<?>> rsChildren = rs.getChildOperators();
rsChildren.add(mj);
// Since the MapJoin has had all of its other parents removed at this point,
// it would be bad here if processReduceSinkToHashJoin() tries to do anything
// with the RS parent based on its position in the list of parents.
ReduceSinkMapJoinProc.processReduceSinkToHashJoin(rs, mj, context);
// Remove any parents from MapJoin again
mj.removeParents();
// TODO: do we also need to remove the MapJoin from the list of RS's children?
}
}
if (!context.connectedReduceSinks.contains(rs)) {
// add dependency between the two work items
TezEdgeProperty edgeProp;
EdgeType edgeType = GenTezUtils.determineEdgeType(work, followingWork, rs);
if (rWork.isAutoReduceParallelism()) {
edgeProp = new TezEdgeProperty(context.conf, edgeType, true, rWork.isSlowStart(), rWork.getMinReduceTasks(), rWork.getMaxReduceTasks(), bytesPerReducer);
} else {
edgeProp = new TezEdgeProperty(edgeType);
edgeProp.setSlowStart(rWork.isSlowStart());
}
tezWork.connect(work, followingWork, edgeProp);
context.connectedReduceSinks.add(rs);
}
}
} else {
LOG.debug("First pass. Leaf operator: " + operator);
}
// the next item will be a new root.
if (!operator.getChildOperators().isEmpty()) {
assert operator.getChildOperators().size() == 1;
context.parentOfRoot = operator;
context.currentRootOperator = operator.getChildOperators().get(0);
context.preceedingWork = work;
}
return null;
}
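The method above drives everything through a small TezWork graph API: add, connect, disconnect, getParents/getChildren, and setVertexType. The following is a minimal sketch, not taken from the Hive source, of how two vertices are wired together; the vertex names, the single-argument TezWork constructor, and the SIMPLE_EDGE choice are illustrative assumptions that may differ across Hive versions.

import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.TezEdgeProperty;
import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType;
import org.apache.hadoop.hive.ql.plan.TezWork;

public class TezWorkGraphSketch {
    public static TezWork buildTwoVertexGraph() {
        TezWork tezWork = new TezWork("query-1"); // constructor arity is an assumption
        BaseWork mapWork = new MapWork("Map 1");
        BaseWork reduceWork = new ReduceWork("Reducer 2");
        tezWork.add(mapWork);
        tezWork.add(reduceWork);
        // process() derives the edge type from the reduce sink; a plain
        // shuffle edge is assumed here.
        tezWork.connect(mapWork, reduceWork, new TezEdgeProperty(EdgeType.SIMPLE_EDGE));
        // The same parent/child views the merge-join rewiring above iterates over:
        assert tezWork.getParents(reduceWork).contains(mapWork);
        assert tezWork.getChildren(mapWork).contains(reduceWork);
        return tezWork;
    }
}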
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
The class GenMapRedUtils, method addStatsTask:
/**
 * Add the StatsTask as a dependent task of the MoveTask,
 * because the StatsTask will change the Table/Partition metadata. For atomicity,
 * the metadata should not change before the data is actually in place,
 * which is the MoveTask's job.
 *
 * @param nd
 * the FileSinkOperator whose results are taken care of by the MoveTask.
 * @param mvTask
 * the MoveTask that moves the FileSinkOperator's results.
 * @param currTask
 * the MapRedTask that the FileSinkOperator belongs to.
 * @param hconf
 * HiveConf
 */
public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {
MoveWork mvWork = mvTask.getWork();
BasicStatsWork statsWork = null;
Table table = null;
boolean truncate = false;
if (mvWork.getLoadTableWork() != null) {
statsWork = new BasicStatsWork(mvWork.getLoadTableWork());
String tableName = mvWork.getLoadTableWork().getTable().getTableName();
truncate = mvWork.getLoadTableWork().getReplace();
try {
table = Hive.get().getTable(SessionState.get().getCurrentDatabase(), tableName);
} catch (HiveException e) {
throw new RuntimeException("unexpected; table should be present already..: " + tableName, e);
}
} else if (mvWork.getLoadFileWork() != null) {
statsWork = new BasicStatsWork(mvWork.getLoadFileWork());
truncate = true;
if (mvWork.getLoadFileWork().getCtasCreateTableDesc() != null) {
try {
table = mvWork.getLoadFileWork().getCtasCreateTableDesc().toTable(hconf);
} catch (HiveException e) {
LOG.debug("can't pre-create table for CTAS", e);
table = null;
}
} else if (mvWork.getLoadFileWork().getCreateViewDesc() != null) {
// CREATE MATERIALIZED VIEW ...
try {
table = mvWork.getLoadFileWork().getCreateViewDesc().toTable(hconf);
} catch (HiveException e) {
LOG.debug("can't pre-create table for MV", e);
table = null;
}
} else {
throw new RuntimeException("unexpected; this should be a CTAS or a CREATE/REBUILD MV - however no desc present");
}
}
assert statsWork != null : "Error when generating StatsTask";
if (currTask.getWork() instanceof MapredWork) {
MapredWork mrWork = (MapredWork) currTask.getWork();
mrWork.getMapWork().setGatheringStats(true);
if (mrWork.getReduceWork() != null) {
mrWork.getReduceWork().setGatheringStats(true);
}
} else if (currTask.getWork() instanceof SparkWork) {
SparkWork work = (SparkWork) currTask.getWork();
for (BaseWork w : work.getAllWork()) {
w.setGatheringStats(true);
}
} else {
// must be TezWork
TezWork work = (TezWork) currTask.getWork();
for (BaseWork w : work.getAllWork()) {
w.setGatheringStats(true);
}
}
StatsWork columnStatsWork = new StatsWork(table, statsWork, hconf);
columnStatsWork.collectStatsFromAggregator(nd.getConf());
columnStatsWork.truncateExisting(truncate);
columnStatsWork.setSourceTask(currTask);
Task<? extends Serializable> statsTask = TaskFactory.get(columnStatsWork);
// subscribe feeds from the MoveTask so that MoveTask can forward the list
// of dynamic partition list to the StatsTask
mvTask.addDependentTask(statsTask);
statsTask.subscribeFeed(mvTask);
}
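For orientation, here is a hedged sketch of a call site, with the variable names (fsOp, mvTask, currTask, hconf) assumed from the surrounding plan-generation context. After the call, the task DAG reads currTask -> mvTask -> statsTask, every BaseWork in currTask's plan has gatheringStats set, and the StatsTask is subscribed to the MoveTask's dynamic-partition feed.

// Hypothetical helper; it merely delegates to the method above.
static void attachStatsTask(FileSinkOperator fsOp, MoveTask mvTask,
        Task<? extends Serializable> currTask, HiveConf hconf) {
    GenMapRedUtils.addStatsTask(fsOp, mvTask, currTask, hconf);
    // At this point mvTask has a dependent StatsTask, and the StatsTask will
    // receive the dynamic partition list from mvTask via the feed subscription.
}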
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
The class VectorReduceSinkCommonOperator, method initializeOp:
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
super.initializeOp(hconf);
VectorExpression.doTransientInit(reduceSinkKeyExpressions);
VectorExpression.doTransientInit(reduceSinkValueExpressions);
if (LOG.isDebugEnabled()) {
// Determine the name of our map or reduce task for debug tracing.
BaseWork work = Utilities.getMapWork(hconf);
if (work == null) {
work = Utilities.getReduceWork(hconf);
}
taskName = work.getName();
}
String context = hconf.get(Operator.CONTEXT_NAME_KEY, "");
if (context != null && !context.isEmpty()) {
context = "_" + context.replace(" ", "_");
}
reduceSkipTag = conf.getSkipTag();
reduceTagByte = (byte) conf.getTag();
if (LOG.isInfoEnabled()) {
LOG.info("Using tag = " + (int) reduceTagByte);
}
if (!isEmptyKey) {
TableDesc keyTableDesc = conf.getKeySerializeInfo();
boolean[] columnSortOrder = getColumnSortOrder(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length);
byte[] columnNullMarker = getColumnNullMarker(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length, columnSortOrder);
byte[] columnNotNullMarker = getColumnNotNullMarker(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length, columnSortOrder);
keyBinarySortableSerializeWrite = new BinarySortableSerializeWrite(columnSortOrder, columnNullMarker, columnNotNullMarker);
}
if (!isEmptyValue) {
valueLazyBinarySerializeWrite = new LazyBinarySerializeWrite(reduceSinkValueColumnMap.length);
valueVectorSerializeRow = new VectorSerializeRow<LazyBinarySerializeWrite>(valueLazyBinarySerializeWrite);
valueVectorSerializeRow.init(reduceSinkValueTypeInfos, reduceSinkValueColumnMap);
valueOutput = new Output();
valueVectorSerializeRow.setOutput(valueOutput);
}
keyWritable = new HiveKey();
valueBytesWritable = new BytesWritable();
int limit = conf.getTopN();
float memUsage = conf.getTopNMemoryUsage();
if (limit >= 0 && memUsage > 0) {
reducerHash = new TopNHash();
reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this, conf, hconf);
}
batchCounter = 0;
}
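The debug-only block at the top illustrates a common idiom: a running operator does not know whether its vertex is map-side or reduce-side, so it probes both cached plans. A minimal sketch of that lookup, assuming only the Utilities accessors used above (the "unknown" fallback string is an assumption):

// Returns the name of the enclosing map or reduce vertex, if a plan is
// cached in this Configuration.
static String currentVertexName(Configuration hconf) {
    BaseWork work = Utilities.getMapWork(hconf);
    if (work == null) {
        work = Utilities.getReduceWork(hconf);
    }
    return work == null ? "unknown" : work.getName();
}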
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
The class TestSparkTask, method isEmptySparkWork:
private boolean isEmptySparkWork(SparkWork sparkWork) {
List<BaseWork> allWorks = sparkWork.getAllWork();
boolean allWorksAreEmpty = true;
for (BaseWork work : allWorks) {
if (work.getAllOperators().size() > 0) {
allWorksAreEmpty = false;
break;
}
}
return allWorksAreEmpty;
}
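A hedged usage sketch for this helper: a SparkWork whose only vertex carries no operators should be reported as empty. The SparkWork constructor argument and the vertex name are illustrative assumptions.

@Test
public void testEmptySparkWorkDetection() {
    SparkWork sparkWork = new SparkWork("test-query-id"); // ctor argument is an assumption
    sparkWork.add(new MapWork("Map 1")); // vertex with no operators attached
    Assert.assertTrue(isEmptySparkWork(sparkWork));
}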
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
The class TestDagUtils, method testCredentialsNotOverwritten:
@Test
public void testCredentialsNotOverwritten() throws Exception {
final UserGroupInformation testUser = UserGroupInformation.createUserForTesting("test_user", new String[0]);
final DagUtils dagUtils = DagUtils.getInstance();
Credentials originalCredentials = new Credentials();
final Text testTokenAlias = new Text("my_test_token");
@SuppressWarnings("unchecked") Token<? extends TokenIdentifier> testToken = mock(Token.class);
originalCredentials.addToken(testTokenAlias, testToken);
Credentials testUserCredentials = new Credentials();
testUser.addCredentials(testUserCredentials);
final BaseWork work = mock(BaseWork.class);
final DAG dag = DAG.create("test_credentials_dag");
dag.setCredentials(originalCredentials);
testUser.doAs(new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws Exception {
dagUtils.addCredentials(work, dag);
return null;
}
});
Token<? extends TokenIdentifier> actualToken = dag.getCredentials().getToken(testTokenAlias);
assertEquals(testToken, actualToken);
}
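The assertion holds because adding credentials for a BaseWork must merge into the DAG's existing Credentials rather than replace them. A reduced sketch of that merge semantic using only the Hadoop security API (independent of DagUtils internals): Credentials.mergeAll keeps existing tokens, whereas addAll would overwrite them.

// existingToken stands in for the mocked token in the test above.
static void mergeKeepsExistingTokens(Token<? extends TokenIdentifier> existingToken) {
    Credentials dagCredentials = new Credentials();
    Text alias = new Text("my_test_token");
    dagCredentials.addToken(alias, existingToken);
    // Merging additional (here empty) credentials must not drop the token.
    dagCredentials.mergeAll(new Credentials());
    assert dagCredentials.getToken(alias) == existingToken;
}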