use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
the class GenericUDTFGetSplits method getSplits.
public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
DAG dag = DAG.create(work.getName());
dag.setCredentials(job.getCredentials());
DagUtils utils = DagUtils.getInstance();
Context ctx = new Context(job);
MapWork mapWork = (MapWork) work.getAllWork().get(0);
// bunch of things get setup in the context based on conf but we need only the MR tmp directory
// for the following method.
JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
// TODO: should we also whitelist input formats here? from mapred.input.format.class
Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
FileSystem fs = scratchDir.getFileSystem(job);
try {
LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr, new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
String vertexName = wx.getName();
dag.addVertex(wx);
utils.addCredentials(mapWork, dag);
// we have the dag now proceed to get the splits:
Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
List<Event> eventList = splitGenerator.initialize();
InputSplit[] result = new InputSplit[eventList.size() - 1];
InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
Preconditions.checkState(hints.size() == eventList.size() - 1);
if (LOG.isDebugEnabled()) {
LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
}
LlapCoordinator coordinator = LlapCoordinator.getInstance();
if (coordinator == null) {
throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
}
// See the discussion in the implementation as to why we generate app ID.
ApplicationId applicationId = coordinator.createExtClientAppId();
// This assumes LLAP cluster owner is always the HS2 user.
String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
String queryUser = null;
byte[] tokenBytes = null;
LlapSigner signer = null;
if (UserGroupInformation.isSecurityEnabled()) {
signer = coordinator.getLlapSigner(job);
// 1. Generate the token for query user (applies to all splits).
queryUser = SessionState.getUserFromAuthenticator();
if (queryUser == null) {
queryUser = UserGroupInformation.getCurrentUser().getUserName();
LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
}
LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
// We put the query user, not LLAP user, into the message and token.
Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
LOG.info("Created the token for remote user: {}", token);
bos.reset();
token.write(dos);
tokenBytes = bos.toByteArray();
} else {
queryUser = UserGroupInformation.getCurrentUser().getUserName();
}
LOG.info("Number of splits: " + (eventList.size() - 1));
SignedMessage signedSvs = null;
for (int i = 0; i < eventList.size() - 1; i++) {
TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
// 2. Generate the vertex/submit information for all events.
if (i == 0) {
// The queryId could either be picked up from the current request being processed, or
// generated. The current request isn't exactly correct since the query is 'done' once we
// return the results. Generating a new one has the added benefit of working once this
// is moved out of a UDTF into a proper API.
// Setting this to the generated AppId which is unique.
// Despite the differences in TaskSpec, the vertex spec should be the same.
signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
}
SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
// 3. Generate input event.
SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
// 4. Make location hints.
SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
}
return result;
} catch (Exception e) {
throw new IOException(e);
}
}
use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
the class TestTezTask method setUp.
@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
utils = mock(DagUtils.class);
fs = mock(FileSystem.class);
path = mock(Path.class);
when(path.getFileSystem(any(Configuration.class))).thenReturn(fs);
when(utils.getTezDir(any(Path.class))).thenReturn(path);
when(utils.createVertex(any(JobConf.class), any(BaseWork.class), any(Path.class), any(LocalResource.class), any(List.class), any(FileSystem.class), any(Context.class), anyBoolean(), any(TezWork.class), any(VertexType.class))).thenAnswer(new Answer<Vertex>() {
@Override
public Vertex answer(InvocationOnMock invocation) throws Throwable {
Object[] args = invocation.getArguments();
return Vertex.create(((BaseWork) args[1]).getName(), mock(ProcessorDescriptor.class), 0, mock(Resource.class));
}
});
when(utils.createEdge(any(JobConf.class), any(Vertex.class), any(Vertex.class), any(TezEdgeProperty.class), any(VertexType.class))).thenAnswer(new Answer<Edge>() {
@Override
public Edge answer(InvocationOnMock invocation) throws Throwable {
Object[] args = invocation.getArguments();
return Edge.create((Vertex) args[1], (Vertex) args[2], mock(EdgeProperty.class));
}
});
work = new TezWork("", null);
mws = new MapWork[] { new MapWork(), new MapWork() };
rws = new ReduceWork[] { new ReduceWork(), new ReduceWork() };
work.addAll(mws);
work.addAll(rws);
int i = 0;
for (BaseWork w : work.getAllWork()) {
w.setName("Work " + (++i));
}
op = mock(Operator.class);
LinkedHashMap<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
map.put("foo", op);
mws[0].setAliasToWork(map);
mws[1].setAliasToWork(map);
LinkedHashMap<Path, ArrayList<String>> pathMap = new LinkedHashMap<>();
ArrayList<String> aliasList = new ArrayList<String>();
aliasList.add("foo");
pathMap.put(new Path("foo"), aliasList);
mws[0].setPathToAliases(pathMap);
mws[1].setPathToAliases(pathMap);
rws[0].setReducer(op);
rws[1].setReducer(op);
TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.SIMPLE_EDGE);
work.connect(mws[0], rws[0], edgeProp);
work.connect(mws[1], rws[0], edgeProp);
work.connect(rws[0], rws[1], edgeProp);
task = new TezTask(utils);
task.setWork(work);
task.setConsole(mock(LogHelper.class));
QueryPlan mockQueryPlan = mock(QueryPlan.class);
doReturn(UUID.randomUUID().toString()).when(mockQueryPlan).getQueryId();
task.setQueryPlan(mockQueryPlan);
conf = new JobConf();
appLr = mock(LocalResource.class);
HiveConf hiveConf = new HiveConf();
hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
SessionState.start(hiveConf);
session = mock(TezClient.class);
sessionState = mock(TezSessionState.class);
when(sessionState.getSession()).thenReturn(session);
when(session.submitDAG(any(DAG.class))).thenThrow(new SessionNotRunning("")).thenReturn(mock(DAGClient.class));
}
use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
the class TestUtilities method runTestGetInputSummary.
private ContentSummary runTestGetInputSummary(JobConf jobConf, Properties properties, int numOfPartitions, int bytesPerFile, Class<? extends InputFormat> inputFormatClass) throws IOException {
// creates scratch directories needed by the Context object
SessionState.start(new HiveConf());
MapWork mapWork = new MapWork();
Context context = new Context(jobConf);
LinkedHashMap<Path, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();
LinkedHashMap<Path, ArrayList<String>> pathToAliasTable = new LinkedHashMap<>();
TableScanOperator scanOp = new TableScanOperator();
PartitionDesc partitionDesc = new PartitionDesc(new TableDesc(inputFormatClass, null, properties), null);
String testTableName = "testTable";
Path testTablePath = new Path(testTableName);
Path[] testPartitionsPaths = new Path[numOfPartitions];
for (int i = 0; i < numOfPartitions; i++) {
String testPartitionName = "p=" + 1;
testPartitionsPaths[i] = new Path(testTablePath, "p=" + i);
pathToPartitionInfo.put(testPartitionsPaths[i], partitionDesc);
pathToAliasTable.put(testPartitionsPaths[i], Lists.newArrayList(testPartitionName));
mapWork.getAliasToWork().put(testPartitionName, scanOp);
}
mapWork.setPathToAliases(pathToAliasTable);
mapWork.setPathToPartitionInfo(pathToPartitionInfo);
FileSystem fs = FileSystem.getLocal(jobConf);
try {
fs.mkdirs(testTablePath);
byte[] data = new byte[bytesPerFile];
for (int i = 0; i < numOfPartitions; i++) {
fs.mkdirs(testPartitionsPaths[i]);
FSDataOutputStream out = fs.create(new Path(testPartitionsPaths[i], "test1.txt"));
out.write(data);
out.close();
}
return Utilities.getInputSummary(context, mapWork, null);
} finally {
if (fs.exists(testTablePath)) {
fs.delete(testTablePath, true);
}
}
}
use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
the class TestUtilities method testGetInputPathsWithEmptyTables.
/**
* Check that calling {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)}
* can process two different empty tables without throwing any exceptions.
*/
@Test
public void testGetInputPathsWithEmptyTables() throws Exception {
String alias1Name = "alias1";
String alias2Name = "alias2";
MapWork mapWork1 = new MapWork();
MapWork mapWork2 = new MapWork();
JobConf jobConf = new JobConf();
Path nonExistentPath1 = new Path(UUID.randomUUID().toString());
Path nonExistentPath2 = new Path(UUID.randomUUID().toString());
PartitionDesc mockPartitionDesc = mock(PartitionDesc.class);
TableDesc mockTableDesc = mock(TableDesc.class);
when(mockTableDesc.isNonNative()).thenReturn(false);
when(mockTableDesc.getProperties()).thenReturn(new Properties());
when(mockPartitionDesc.getProperties()).thenReturn(new Properties());
when(mockPartitionDesc.getTableDesc()).thenReturn(mockTableDesc);
doReturn(HiveSequenceFileOutputFormat.class).when(mockPartitionDesc).getOutputFileFormatClass();
mapWork1.setPathToAliases(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath1, Lists.newArrayList(alias1Name))));
mapWork1.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(ImmutableMap.of(alias1Name, (Operator<?>) mock(Operator.class))));
mapWork1.setPathToPartitionInfo(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath1, mockPartitionDesc)));
mapWork2.setPathToAliases(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath2, Lists.newArrayList(alias2Name))));
mapWork2.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(ImmutableMap.of(alias2Name, (Operator<?>) mock(Operator.class))));
mapWork2.setPathToPartitionInfo(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath2, mockPartitionDesc)));
List<Path> inputPaths = new ArrayList<>();
try {
Path scratchDir = new Path(HiveConf.getVar(jobConf, HiveConf.ConfVars.LOCALSCRATCHDIR));
inputPaths.addAll(Utilities.getInputPaths(jobConf, mapWork1, scratchDir, mock(Context.class), false));
inputPaths.addAll(Utilities.getInputPaths(jobConf, mapWork2, scratchDir, mock(Context.class), false));
assertEquals(inputPaths.size(), 2);
} finally {
File file;
for (Path path : inputPaths) {
file = new File(path.toString());
if (file.exists()) {
file.delete();
}
}
}
}
use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
the class TestDynamicPartitionPruner method testSingleSourceMultipleFiltersOrdering2.
@Test(timeout = 5000)
public void testSingleSourceMultipleFiltersOrdering2() throws InterruptedException, SerDeException {
InputInitializerContext mockInitContext = mock(InputInitializerContext.class);
doReturn(2).when(mockInitContext).getVertexNumTasks("v1");
MapWork mapWork = createMockMapWork(new TestSource("v1", 2));
DynamicPartitionPruner pruner = new DynamicPartitionPrunerForEventTesting(mockInitContext, mapWork);
PruneRunnable pruneRunnable = new PruneRunnable(pruner);
Thread t = new Thread(pruneRunnable);
t.start();
try {
pruneRunnable.start();
InputInitializerEvent event = InputInitializerEvent.create("FakeTarget", "TargetInput", ByteBuffer.allocate(0));
event.setSourceVertexName("v1");
pruner.processVertex("v1");
pruner.addEvent(event);
pruner.addEvent(event);
pruner.addEvent(event);
pruner.addEvent(event);
pruneRunnable.awaitEnd();
assertFalse(pruneRunnable.inError.get());
} finally {
t.interrupt();
t.join();
}
}
Aggregations