use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class GenericUDTFGetSplits method getSplits.
public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
DAG dag = DAG.create(work.getName());
dag.setCredentials(job.getCredentials());
DagUtils utils = DagUtils.getInstance();
Context ctx = new Context(job);
MapWork mapWork = (MapWork) work.getAllWork().get(0);
// bunch of things get setup in the context based on conf but we need only the MR tmp directory
// for the following method.
JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
// TODO: should we also whitelist input formats here? from mapred.input.format.class
Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
FileSystem fs = scratchDir.getFileSystem(job);
try {
LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr, new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
String vertexName = wx.getName();
dag.addVertex(wx);
utils.addCredentials(mapWork, dag);
// we have the dag now proceed to get the splits:
Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
List<Event> eventList = splitGenerator.initialize();
InputSplit[] result = new InputSplit[eventList.size() - 1];
InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
Preconditions.checkState(hints.size() == eventList.size() - 1);
if (LOG.isDebugEnabled()) {
LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
}
LlapCoordinator coordinator = LlapCoordinator.getInstance();
if (coordinator == null) {
throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
}
// See the discussion in the implementation as to why we generate app ID.
ApplicationId applicationId = coordinator.createExtClientAppId();
// This assumes LLAP cluster owner is always the HS2 user.
String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
String queryUser = null;
byte[] tokenBytes = null;
LlapSigner signer = null;
if (UserGroupInformation.isSecurityEnabled()) {
signer = coordinator.getLlapSigner(job);
// 1. Generate the token for query user (applies to all splits).
queryUser = SessionState.getUserFromAuthenticator();
if (queryUser == null) {
queryUser = UserGroupInformation.getCurrentUser().getUserName();
LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
}
LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
// We put the query user, not LLAP user, into the message and token.
Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
LOG.info("Created the token for remote user: {}", token);
bos.reset();
token.write(dos);
tokenBytes = bos.toByteArray();
} else {
queryUser = UserGroupInformation.getCurrentUser().getUserName();
}
LOG.info("Number of splits: " + (eventList.size() - 1));
SignedMessage signedSvs = null;
for (int i = 0; i < eventList.size() - 1; i++) {
TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
// 2. Generate the vertex/submit information for all events.
if (i == 0) {
// The queryId could either be picked up from the current request being processed, or
// generated. The current request isn't exactly correct since the query is 'done' once we
// return the results. Generating a new one has the added benefit of working once this
// is moved out of a UDTF into a proper API.
// Setting this to the generated AppId which is unique.
// Despite the differences in TaskSpec, the vertex spec should be the same.
signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
}
SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
// 3. Generate input event.
SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
// 4. Make location hints.
SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
}
return result;
} catch (Exception e) {
throw new IOException(e);
}
}
use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class TestTezTask method setUp.
@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
utils = mock(DagUtils.class);
fs = mock(FileSystem.class);
path = mock(Path.class);
when(path.getFileSystem(any(Configuration.class))).thenReturn(fs);
when(utils.getTezDir(any(Path.class))).thenReturn(path);
when(utils.createVertex(any(JobConf.class), any(BaseWork.class), any(Path.class), any(LocalResource.class), any(List.class), any(FileSystem.class), any(Context.class), anyBoolean(), any(TezWork.class), any(VertexType.class))).thenAnswer(new Answer<Vertex>() {
@Override
public Vertex answer(InvocationOnMock invocation) throws Throwable {
Object[] args = invocation.getArguments();
return Vertex.create(((BaseWork) args[1]).getName(), mock(ProcessorDescriptor.class), 0, mock(Resource.class));
}
});
when(utils.createEdge(any(JobConf.class), any(Vertex.class), any(Vertex.class), any(TezEdgeProperty.class), any(VertexType.class))).thenAnswer(new Answer<Edge>() {
@Override
public Edge answer(InvocationOnMock invocation) throws Throwable {
Object[] args = invocation.getArguments();
return Edge.create((Vertex) args[1], (Vertex) args[2], mock(EdgeProperty.class));
}
});
work = new TezWork("", null);
mws = new MapWork[] { new MapWork(), new MapWork() };
rws = new ReduceWork[] { new ReduceWork(), new ReduceWork() };
work.addAll(mws);
work.addAll(rws);
int i = 0;
for (BaseWork w : work.getAllWork()) {
w.setName("Work " + (++i));
}
op = mock(Operator.class);
LinkedHashMap<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
map.put("foo", op);
mws[0].setAliasToWork(map);
mws[1].setAliasToWork(map);
LinkedHashMap<Path, ArrayList<String>> pathMap = new LinkedHashMap<>();
ArrayList<String> aliasList = new ArrayList<String>();
aliasList.add("foo");
pathMap.put(new Path("foo"), aliasList);
mws[0].setPathToAliases(pathMap);
mws[1].setPathToAliases(pathMap);
rws[0].setReducer(op);
rws[1].setReducer(op);
TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.SIMPLE_EDGE);
work.connect(mws[0], rws[0], edgeProp);
work.connect(mws[1], rws[0], edgeProp);
work.connect(rws[0], rws[1], edgeProp);
task = new TezTask(utils);
task.setWork(work);
task.setConsole(mock(LogHelper.class));
QueryPlan mockQueryPlan = mock(QueryPlan.class);
doReturn(UUID.randomUUID().toString()).when(mockQueryPlan).getQueryId();
task.setQueryPlan(mockQueryPlan);
conf = new JobConf();
appLr = mock(LocalResource.class);
HiveConf hiveConf = new HiveConf();
hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
SessionState.start(hiveConf);
session = mock(TezClient.class);
sessionState = mock(TezSessionState.class);
when(sessionState.getSession()).thenReturn(session);
when(session.submitDAG(any(DAG.class))).thenThrow(new SessionNotRunning("")).thenReturn(mock(DAGClient.class));
}
use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class DAGSummary method hiveInputRecordsFromOtherVertices.
private long hiveInputRecordsFromOtherVertices(String vertexName) {
List<Vertex> inputVerticesList = dag.getVertex(vertexName).getInputVertices();
long result = 0;
for (Vertex inputVertex : inputVerticesList) {
String intermediateRecordsCounterName = formattedName(ReduceSinkOperator.Counter.RECORDS_OUT_INTERMEDIATE.toString(), inputVertex.getName());
String recordsOutCounterName = formattedName(FileSinkOperator.Counter.RECORDS_OUT.toString(), inputVertex.getName());
result += (hiveCounterValue(intermediateRecordsCounterName) + hiveCounterValue(recordsOutCounterName));
}
return result;
}
use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class DagUtils method createVertex.
/*
* Helper function to create Vertex for given ReduceWork.
*/
private Vertex createVertex(JobConf conf, ReduceWork reduceWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx) throws Exception {
// set up operator plan
conf.set(Utilities.INPUT_NAME, reduceWork.getName());
Utilities.setReduceWork(conf, reduceWork, mrScratchDir, false);
// create the directories FileSinkOperators need
Utilities.createTmpDirs(conf, reduceWork);
VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(reduceWork);
// create the vertex
Vertex reducer = Vertex.create(reduceWork.getName(), ProcessorDescriptor.create(ReduceTezProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf)), reduceWork.isAutoReduceParallelism() ? reduceWork.getMaxReduceTasks() : reduceWork.getNumReduceTasks(), getContainerResource(conf));
reducer.setTaskEnvironment(getContainerEnvironment(conf, false));
reducer.setExecutionContext(vertexExecutionContext);
reducer.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
localResources.put(getBaseName(appJarLr), appJarLr);
for (LocalResource lr : additionalLr) {
localResources.put(getBaseName(lr), lr);
}
reducer.addTaskLocalFiles(localResources);
return reducer;
}
use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class TezTask method build.
DAG build(JobConf conf, TezWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, Context ctx) throws Exception {
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
// getAllWork returns a topologically sorted list, which we use to make
// sure that vertices are created before they are used in edges.
List<BaseWork> ws = work.getAllWork();
Collections.reverse(ws);
FileSystem fs = scratchDir.getFileSystem(conf);
// the name of the dag is what is displayed in the AM/Job UI
String dagName = utils.createDagName(conf, queryPlan);
LOG.info("Dag name: " + dagName);
DAG dag = DAG.create(dagName);
// set some info for the query
JSONObject json = new JSONObject(new LinkedHashMap()).put("context", "Hive").put("description", ctx.getCmd());
String dagInfo = json.toString();
if (LOG.isDebugEnabled()) {
LOG.debug("DagInfo: " + dagInfo);
}
dag.setDAGInfo(dagInfo);
dag.setCredentials(conf.getCredentials());
setAccessControlsForCurrentUser(dag, queryPlan.getQueryId(), conf);
for (BaseWork w : ws) {
boolean isFinal = work.getLeaves().contains(w);
// translate work to vertex
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
if (w instanceof UnionWork) {
// Special case for unions. These items translate to VertexGroups
List<BaseWork> unionWorkItems = new LinkedList<BaseWork>();
List<BaseWork> children = new LinkedList<BaseWork>();
// proper children of the union
for (BaseWork v : work.getChildren(w)) {
EdgeType type = work.getEdgeProperty(w, v).getEdgeType();
if (type == EdgeType.CONTAINS) {
unionWorkItems.add(v);
} else {
children.add(v);
}
}
// create VertexGroup
Vertex[] vertexArray = new Vertex[unionWorkItems.size()];
int i = 0;
for (BaseWork v : unionWorkItems) {
vertexArray[i++] = workToVertex.get(v);
}
VertexGroup group = dag.createVertexGroup(w.getName(), vertexArray);
// For a vertex group, all Outputs use the same Key-class, Val-class and partitioner.
// Pick any one source vertex to figure out the Edge configuration.
JobConf parentConf = workToConf.get(unionWorkItems.get(0));
// now hook up the children
for (BaseWork v : children) {
// finally we can create the grouped edge
GroupInputEdge e = utils.createEdge(group, parentConf, workToVertex.get(v), work.getEdgeProperty(w, v), work.getVertexType(v));
dag.addEdge(e);
}
} else {
// Regular vertices
JobConf wxConf = utils.initializeVertexConf(conf, ctx, w);
Vertex wx = utils.createVertex(wxConf, w, scratchDir, appJarLr, additionalLr, fs, ctx, !isFinal, work, work.getVertexType(w));
if (w.getReservedMemoryMB() > 0) {
// If reversedMemoryMB is set, make memory allocation fraction adjustment as needed
double frac = DagUtils.adjustMemoryReserveFraction(w.getReservedMemoryMB(), super.conf);
LOG.info("Setting " + TEZ_MEMORY_RESERVE_FRACTION + " to " + frac);
wx.setConf(TEZ_MEMORY_RESERVE_FRACTION, Double.toString(frac));
}
// Otherwise just leave it up to Tez to decide how much memory to allocate
dag.addVertex(wx);
utils.addCredentials(w, dag);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
workToVertex.put(w, wx);
workToConf.put(w, wxConf);
// add all dependencies (i.e.: edges) to the graph
for (BaseWork v : work.getChildren(w)) {
assert workToVertex.containsKey(v);
Edge e = null;
TezEdgeProperty edgeProp = work.getEdgeProperty(w, v);
e = utils.createEdge(wxConf, wx, workToVertex.get(v), edgeProp, work.getVertexType(v));
dag.addEdge(e);
}
}
}
// Clear the work map after build. TODO: remove caching instead?
Utilities.clearWorkMap(conf);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
return dag;
}
Aggregations