Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.
From the class GenericUDTFGetSplits, method getSplits.
public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
  DAG dag = DAG.create(work.getName());
  dag.setCredentials(job.getCredentials());
  DagUtils utils = DagUtils.getInstance();
  Context ctx = new Context(job);
  MapWork mapWork = (MapWork) work.getAllWork().get(0);
  // A number of things get set up in the Context based on conf, but we only need the MR tmp
  // directory for the following method.
  JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
  // TODO: should we also whitelist input formats here? from mapred.input.format.class
  Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
  FileSystem fs = scratchDir.getFileSystem(job);
  try {
    LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
    Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr,
        new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
    String vertexName = wx.getName();
    dag.addVertex(wx);
    utils.addCredentials(mapWork, dag);
    // We have the DAG; now proceed to get the splits. Both consistent-split settings must be
    // enabled, as the checks below enforce.
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
    HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
    List<Event> eventList = splitGenerator.initialize();
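    // The first event is an InputConfigureVertexTasksEvent; each remaining event corresponds
    // to exactly one split, hence the (size - 1) sizing below.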
    InputSplit[] result = new InputSplit[eventList.size() - 1];
    InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
    List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
    Preconditions.checkState(hints.size() == eventList.size() - 1);
    if (LOG.isDebugEnabled()) {
      LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
    }
    LlapCoordinator coordinator = LlapCoordinator.getInstance();
    if (coordinator == null) {
      throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with "
          + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
    }
    // See the discussion in the implementation as to why we generate the app ID.
    ApplicationId applicationId = coordinator.createExtClientAppId();
    // This assumes the LLAP cluster owner is always the HS2 user.
    String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
    String queryUser = null;
    byte[] tokenBytes = null;
    LlapSigner signer = null;
    if (UserGroupInformation.isSecurityEnabled()) {
      signer = coordinator.getLlapSigner(job);
      // 1. Generate the token for the query user (applies to all splits).
      queryUser = SessionState.getUserFromAuthenticator();
      if (queryUser == null) {
        queryUser = UserGroupInformation.getCurrentUser().getUserName();
        LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
      }
      LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
      // We put the query user, not the LLAP user, into the message and token.
      Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
      LOG.info("Created the token for remote user: {}", token);
      bos.reset();
      token.write(dos);
      tokenBytes = bos.toByteArray();
    } else {
      queryUser = UserGroupInformation.getCurrentUser().getUserName();
    }
    LOG.info("Number of splits: " + (eventList.size() - 1));
    SignedMessage signedSvs = null;
    for (int i = 0; i < eventList.size() - 1; i++) {
      TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName,
          eventList.size() - 1, applicationId, i);
      // 2. Generate the vertex/submit information for all events.
      if (i == 0) {
        // The queryId could either be picked up from the current request being processed, or
        // generated. The current request isn't exactly correct, since the query is 'done' once
        // we return the results. Generating a new one has the added benefit of working once
        // this is moved out of a UDTF into a proper API.
        // Set this to the generated AppId, which is unique.
        // Despite the differences in TaskSpec, the vertex spec should be the same.
        signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
      }
      SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(),
          taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
      byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
      // 3. Generate the input event.
      SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
      // 4. Make location hints.
      SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
      result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature,
          locations, schema, llapUser, tokenBytes);
    }
    return result;
  } catch (Exception e) {
    throw new IOException(e);
  }
}
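For context, a hedged sketch of how this method is typically reached: GenericUDTFGetSplits is registered in Hive as the get_splits UDTF, so an external client would obtain the serialized splits by running a query against HiveServer2. The connection URL, table name, and the assumption that each result row carries one serialized split are illustrative, not taken from this code.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class GetSplitsClientSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical HS2 endpoint; HS2 must run with the LLAP coordinator enabled,
    // as the IOException thrown above indicates.
    try (Connection conn = DriverManager.getConnection("jdbc:hive2://hs2-host:10000/default");
         Statement stmt = conn.createStatement();
         // get_splits(query, numSplits) wraps the getSplits method shown above.
         ResultSet rs = stmt.executeQuery("SELECT get_splits('select * from some_table', 5)")) {
      while (rs.next()) {
        byte[] serializedSplit = rs.getBytes(1); // assumed: one serialized LlapInputSplit per row
        // Hand the bytes to an LLAP-aware reader on the client side.
      }
    }
  }
}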
Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.
From the class GenericUDTFGetSplits, method makeLocationHints.
private SplitLocationInfo[] makeLocationHints(TaskLocationHint hint) {
  Set<String> hosts = hint.getHosts();
  if (hosts.size() != 1) {
    LOG.warn("Bad # of locations: " + hosts.size());
  }
  SplitLocationInfo[] locations = new SplitLocationInfo[hosts.size()];
  int j = 0;
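  // The second SplitLocationInfo argument is inMemory; false indicates the data is
  // host-local on disk rather than cached in memory.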
  for (String host : hosts) {
    locations[j++] = new SplitLocationInfo(host, false);
  }
  return locations;
}
Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.
From the class HiveSplitGenerator, method initialize.
@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
  // Set up the map work for this thread. Pruning may have modified the work instance to
  // remove partitions; the same work instance must be used when generating splits.
  Utilities.setMapWork(jobConf, work);
  try {
    boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
    // Perform dynamic partition pruning.
    if (pruner != null) {
      pruner.prune();
    }
    InputSplitInfoMem inputSplitInfo = null;
    boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
    LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
    String realInputFormatName = conf.get("mapred.input.format.class");
    boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
    if (groupingEnabled) {
      // Need to instantiate the realInputFormat.
      InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(
          JavaUtils.loadClass(realInputFormatName), jobConf);
      int totalResource = 0;
      int taskResource = 0;
      int availableSlots = 0;
      // FIXME. Do the right thing Luke.
      if (getContext() == null) {
        // For now, totalResource = taskResource for LLAP.
        availableSlots = 1;
      } else {
        totalResource = getContext().getTotalAvailableResource().getMemory();
        taskResource = getContext().getVertexTaskResource().getMemory();
        availableSlots = totalResource / taskResource;
      }
      if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
        // Broken configuration from mapred-default.xml.
        final long blockSize = conf.getLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
        final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE,
            TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
        final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
        HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
        LOG.info("The preferred split size is " + preferredSplitSize);
      }
      // Create the un-grouped splits.
      float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES,
          TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
      // Raw splits.
      InputSplit[] splits = inputFormat.getSplits(jobConf, (int) (availableSlots * waves));
      // Sort the splits, so that subsequent grouping is consistent.
      Arrays.sort(splits, new InputSplitComparator());
      LOG.info("Number of input splits: " + splits.length + ". " + availableSlots
          + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
      if (work.getIncludedBuckets() != null) {
        splits = pruneBuckets(work, splits);
      }
      Multimap<Integer, InputSplit> groupedSplits =
          splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
      // And finally return them in a flat array.
      InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
      LOG.info("Number of split groups: " + flatSplits.length);
      List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
      inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
    } else {
      // If this is used in the future - make sure to disable grouping in the payload, if it
      // isn't already disabled.
      throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: "
          + realInputFormatName);
      // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
    }
    return createEventList(sendSerializedEvents, inputSplitInfo);
  } finally {
    Utilities.clearWork(jobConf);
  }
}
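To make the sizing above concrete, here is a small self-contained sketch; the memory figures and the 1.7 waves value are assumptions for illustration, not taken from this code. Available slots are derived from total vertex memory divided by per-task memory, and the requested split count scales the slots by a "waves" factor so each slot processes roughly that many tasks.
public class WaveSizingSketch {
  public static void main(String[] args) {
    int totalResourceMb = 16 * 1024; // assumed: 16 GB available to the vertex
    int taskResourceMb = 4 * 1024;   // assumed: 4 GB per task
    float waves = 1.7f;              // assumed: the Tez grouping waves default
    int availableSlots = totalResourceMb / taskResourceMb; // 4 concurrent tasks
    int requestedSplits = (int) (availableSlots * waves);  // 6 splits requested
    System.out.println(availableSlots + " slots -> " + requestedSplits + " splits requested");
  }
}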
Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.
From the class SplitGrouper, method createTaskLocationHints.
/**
 * Create task location hints from a set of input splits.
 * @param splits the actual splits
 * @param consistentLocations whether to re-order locations for each split, if it's a file split
 * @return taskLocationHints - 1 per input split specified
 * @throws IOException
 */
public List<TaskLocationHint> createTaskLocationHints(InputSplit[] splits, boolean consistentLocations) throws IOException {
  List<TaskLocationHint> locationHints = Lists.newArrayListWithCapacity(splits.length);
  for (InputSplit split : splits) {
    String rack = (split instanceof TezGroupedSplit) ? ((TezGroupedSplit) split).getRack() : null;
    if (rack == null) {
      String[] locations = split.getLocations();
      if (locations != null && locations.length > 0) {
        // Worthwhile only if there is more than one location, consistent grouping is enabled,
        // and the split is a FileSplit.
        if (consistentLocations && locations.length > 1 && split instanceof FileSplit) {
          Arrays.sort(locations);
          FileSplit fileSplit = (FileSplit) split;
          Path path = fileSplit.getPath();
          long startLocation = fileSplit.getStart();
          int hashCode = Objects.hash(path, startLocation);
          int startIndex = hashCode % locations.length;
          LinkedHashSet<String> locationSet = new LinkedHashSet<>(locations.length);
          // Set up the locations starting from startIndex, and wrapping around the sorted array.
          for (int i = 0; i < locations.length; i++) {
            int index = (startIndex + i) % locations.length;
            locationSet.add(locations[index]);
          }
          locationHints.add(TaskLocationHint.createTaskLocationHint(locationSet, null));
        } else {
          locationHints.add(TaskLocationHint.createTaskLocationHint(
              new LinkedHashSet<String>(Arrays.asList(locations)), null));
        }
      } else {
        locationHints.add(TaskLocationHint.createTaskLocationHint(null, null));
      }
    } else {
      locationHints.add(TaskLocationHint.createTaskLocationHint(null, Collections.singleton(rack)));
    }
  }
  return locationHints;
}
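A minimal standalone sketch of the consistent-ordering idea above, with illustrative names that are not Hive code: hashing a split's (path, start offset) picks a deterministic start index into the sorted location array, and the loop wraps around it, so the same split always yields the same host order across split generations. The floorMod guard is this sketch's own addition, since Objects.hash can return a negative value.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;

public class ConsistentLocationOrderSketch {
  static List<String> orderLocations(String path, long start, String[] locations) {
    Arrays.sort(locations); // sort first so the input order never matters
    // floorMod keeps the index non-negative even when Objects.hash is negative.
    int startIndex = Math.floorMod(Objects.hash(path, start), locations.length);
    List<String> ordered = new ArrayList<>(locations.length);
    for (int i = 0; i < locations.length; i++) {
      ordered.add(locations[(startIndex + i) % locations.length]);
    }
    return ordered;
  }

  public static void main(String[] args) {
    String[] hosts = {"node3", "node1", "node2"};
    // The same (path, offset) pair always produces the same ordering:
    System.out.println(orderLocations("/warehouse/t/part-0", 0L, hosts.clone()));
    System.out.println(orderLocations("/warehouse/t/part-0", 0L, hosts.clone()));
  }
}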