use of org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent in project hive by apache.
the class GenericUDTFGetSplits method getSplits.
public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
  DAG dag = DAG.create(work.getName());
  dag.setCredentials(job.getCredentials());
  DagUtils utils = DagUtils.getInstance();
  Context ctx = new Context(job);
  MapWork mapWork = (MapWork) work.getAllWork().get(0);
  // A bunch of things get set up in the context based on conf, but we need only the MR tmp
  // directory for the following method.
  JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
  // TODO: should we also whitelist input formats here? from mapred.input.format.class
  Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
  FileSystem fs = scratchDir.getFileSystem(job);
  try {
    LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
    Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr,
        new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
    String vertexName = wx.getName();
    dag.addVertex(wx);
    utils.addCredentials(mapWork, dag);
    // We have the DAG; now proceed to get the splits.
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
    HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
    List<Event> eventList = splitGenerator.initialize();
    // The first event configures the vertex; each remaining event describes one split.
    InputSplit[] result = new InputSplit[eventList.size() - 1];
    InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
    List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
    Preconditions.checkState(hints.size() == eventList.size() - 1);
    if (LOG.isDebugEnabled()) {
      LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
    }
    LlapCoordinator coordinator = LlapCoordinator.getInstance();
    if (coordinator == null) {
      throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with "
          + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
    }
    // See the discussion in the implementation as to why we generate the app ID.
    ApplicationId applicationId = coordinator.createExtClientAppId();
    // This assumes the LLAP cluster owner is always the HS2 user.
    String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
    String queryUser = null;
    byte[] tokenBytes = null;
    LlapSigner signer = null;
    if (UserGroupInformation.isSecurityEnabled()) {
      signer = coordinator.getLlapSigner(job);
      // 1. Generate the token for the query user (applies to all splits).
      queryUser = SessionState.getUserFromAuthenticator();
      if (queryUser == null) {
        queryUser = UserGroupInformation.getCurrentUser().getUserName();
        LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
      }
      LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
      // We put the query user, not the LLAP user, into the message and token.
      Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
      LOG.info("Created the token for remote user: {}", token);
      // bos/dos are instance fields: a ByteArrayOutputStream and the DataOutputStream wrapping it.
      bos.reset();
      token.write(dos);
      tokenBytes = bos.toByteArray();
    } else {
      queryUser = UserGroupInformation.getCurrentUser().getUserName();
    }
    LOG.info("Number of splits: " + (eventList.size() - 1));
    SignedMessage signedSvs = null;
    for (int i = 0; i < eventList.size() - 1; i++) {
      TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
      // 2. Generate the vertex/submit information for all events.
      if (i == 0) {
        // The queryId could either be picked up from the current request being processed, or
        // generated. The current request isn't exactly correct since the query is 'done' once we
        // return the results. Generating a new one has the added benefit of working once this
        // is moved out of a UDTF into a proper API.
        // Setting this to the generated AppId, which is unique.
        // Despite the differences in TaskSpec, the vertex spec should be the same.
        signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
      }
      SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(),
          taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
      byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
      // 3. Generate the input event.
      SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
      // 4. Make location hints.
      SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
      result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature,
          locations, schema, llapUser, tokenBytes);
    }
    return result;
  } catch (Exception e) {
    throw new IOException(e);
  }
}
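For context, this UDTF is what LLAP's external clients use for split generation; it is registered under the name get_splits and is normally reached through HiveServer2 SQL rather than as a Java API. A minimal JDBC sketch, assuming a local HS2 endpoint (the URL, credentials, query text, and single-binary-column result shape are assumptions for illustration):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class GetSplitsClient {
  public static void main(String[] args) throws Exception {
    // Placeholder URL and credentials; a real deployment needs host, port and auth settings.
    String url = "jdbc:hive2://localhost:10000/default";
    try (Connection conn = DriverManager.getConnection(url, "hive", "");
         Statement stmt = conn.createStatement();
         // get_splits(query, numSplits): the UDTF emits one row per generated split.
         ResultSet rs = stmt.executeQuery("SELECT get_splits('SELECT * FROM src', 4)")) {
      int i = 0;
      while (rs.next()) {
        byte[] serialized = rs.getBytes(1); // an opaque, serialized LlapInputSplit
        System.out.println("split #" + (i++) + ": " + serialized.length + " bytes");
      }
    }
  }
}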
use of org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent in project hive by apache.
the class HiveSplitGenerator method createEventList.
private List<Event> createEventList(boolean sendSerializedEvents, InputSplitInfoMem inputSplitInfo) {
  List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo.getNumTasks() + 1);
  InputConfigureVertexTasksEvent configureVertexEvent = InputConfigureVertexTasksEvent.create(
      inputSplitInfo.getNumTasks(),
      VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()),
      InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate());
  events.add(configureVertexEvent);
  if (sendSerializedEvents) {
    MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
    int count = 0;
    for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
      InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
          count++, mrSplit.toByteString().asReadOnlyByteBuffer());
      events.add(diEvent);
    }
  } else {
    int count = 0;
    for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
      InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(count++, split);
      events.add(diEvent);
    }
  }
  return events;
}
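Note the contract this method establishes: element 0 is the InputConfigureVertexTasksEvent carrying parallelism and location hints, and elements 1..n are per-task InputDataInformationEvents. getSplits above relies on exactly this layout. A minimal consumer sketch under that assumption (the class and method names here are hypothetical):

import java.util.List;
import org.apache.tez.dag.api.TaskLocationHint;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent;
import org.apache.tez.runtime.api.events.InputDataInformationEvent;

final class EventListContract {
  static void consume(List<Event> eventList) {
    // Element 0 configures the vertex: parallelism plus location hints.
    InputConfigureVertexTasksEvent configureEvent =
        (InputConfigureVertexTasksEvent) eventList.get(0);
    List<TaskLocationHint> hints =
        configureEvent.getLocationHint().getTaskLocationHints();
    if (hints.size() != eventList.size() - 1) {
      throw new IllegalStateException("expected one location hint per task");
    }
    // Elements 1..n describe the input for tasks 0..n-1.
    for (int i = 1; i < eventList.size(); i++) {
      InputDataInformationEvent diEvent = (InputDataInformationEvent) eventList.get(i);
      // Route diEvent to task (i - 1), ideally placed per hints.get(i - 1).
      System.out.println("task " + (i - 1) + " -> " + diEvent);
    }
  }
}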
use of org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent in project hive by apache.
the class CustomPartitionVertex method onRootVertexInitialized.
// One call per root input.
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
  numInputsSeenSoFar++;
  LOG.info("On root vertex initialized " + inputName);
  try {
    // This is using the payload from the RootVertexInitializer corresponding to inputName.
    // Ideally it should be using its own configuration class, but that means serializing
    // another instance.
    MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
    this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
    /*
     * Currently in Tez, the flow of events is "Generate Splits -> Initialize Vertex" (with
     * parallelism info obtained from the generate-splits phase). The generate-splits phase
     * groups splits using the TezGroupedSplitsInputFormat. However, for bucket map joins the
     * grouping done by this input format produces incorrect results, as the grouper has no
     * knowledge of buckets. So we initially set the input format to HiveInputFormat (in
     * DagUtils) for the bucket-map-join case so as to obtain un-grouped splits. We then group
     * the splits corresponding to buckets using the Tez grouper, which returns
     * TezGroupedSplits.
     */
    // This assumes that grouping will always be used; enable it on the payload.
    MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
    inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
  } catch (IOException e) {
    e.printStackTrace();
    throw new RuntimeException(e);
  }
  boolean dataInformationEventSeen = false;
  Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
  for (Event event : events) {
    if (event instanceof InputConfigureVertexTasksEvent) {
      // No tasks should have been started yet; checked by the initial state check.
      LOG.info("Got an input configure vertex event for input: " + inputName);
      Preconditions.checkState(dataInformationEventSeen == false);
      InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
      // The vertex cannot be configured until all DataEvents are seen - to build the routing table.
      configureVertexTaskEvent = cEvent;
      LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
    }
    if (event instanceof InputUpdatePayloadEvent) {
      // This event can never occur. If it does, fail.
      Preconditions.checkState(false);
    } else if (event instanceof InputDataInformationEvent) {
      dataInformationEventSeen = true;
      InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
      FileSplit fileSplit;
      try {
        fileSplit = getFileSplitFromEvent(diEvent);
      } catch (IOException e) {
        throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
      }
      Set<FileSplit> fsList = pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
      if (fsList == null) {
        fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
        pathFileSplitsMap.put(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
      }
      fsList.add(fileSplit);
    }
  }
  LOG.info("Path file splits map for input name: " + inputName + " is " + pathFileSplitsMap);
  Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(pathFileSplitsMap);
  try {
    int totalResource = context.getTotalAvailableResource().getMemory();
    int taskResource = context.getVertexTaskResource().getMemory();
    float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
    int availableSlots = totalResource / taskResource;
    LOG.info("Grouping splits. " + availableSlots + " available slots, " + waves + " waves. Bucket initial splits map: " + bucketToInitialSplitMap);
    JobConf jobConf = new JobConf(conf);
    ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
    Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
    boolean secondLevelGroupingDone = false;
    if ((mainWorkName.isEmpty()) || (inputName.compareTo(mainWorkName) == 0)) {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf,
            inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
        if (mainWorkName.isEmpty() == false) {
          Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
          singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
          groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots,
              HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
          secondLevelGroupingDone = true;
        }
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
    } else {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      // Group the splits for all the bucket files.
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf,
            inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      /*
       * This is the small-table side. In case of an SMB join, we need to send each split to the
       * corresponding bucket-based task on the other side. In case a split needs to go to
       * multiple downstream tasks, we need to clone the event and send it to the right
       * destination.
       */
      LOG.info("This is the side work - multi-mr work.");
      processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
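For intuition on the slot/wave arithmetic in the grouping step above: the number of concurrent slots is total cluster memory divided by per-task memory, and the grouper targets roughly waves times that many grouped splits. A small self-contained illustration with assumed numbers (the 1.7 wave factor mirrors Tez's usual default, but it is configuration-driven, as the conf.getFloat call above shows):

public class GroupingMath {
  public static void main(String[] args) {
    int totalResource = 102400; // assumed: total cluster memory in MB
    int taskResource = 1024;    // assumed: per-task container memory in MB
    float waves = 1.7f;         // assumed wave factor

    int availableSlots = totalResource / taskResource; // 100 concurrent tasks
    // The grouper aims for about availableSlots * waves grouped splits, so each
    // slot works through roughly 1.7 "waves" of tasks: 170 grouped splits here.
    int desiredNumSplits = (int) (availableSlots * waves);
    System.out.println("slots=" + availableSlots + ", target splits=" + desiredNumSplits);
  }
}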