use of org.apache.hadoop.mapred.split.SplitLocationProvider in project hive by apache.
the class Utils method getSplitLocationProvider.
public static SplitLocationProvider getSplitLocationProvider(Configuration conf, Logger LOG) throws IOException {
boolean useCustomLocations = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_MODE).equals("llap") && HiveConf.getBoolVar(conf, HiveConf.ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS);
SplitLocationProvider splitLocationProvider;
LOG.info("SplitGenerator using llap affinitized locations: " + useCustomLocations);
if (useCustomLocations) {
LlapRegistryService serviceRegistry = LlapRegistryService.getClient(conf);
LOG.info("Using LLAP instance " + serviceRegistry.getApplicationId());
Collection<ServiceInstance> serviceInstances = serviceRegistry.getInstances().getAllInstancesOrdered(true);
Preconditions.checkArgument(!serviceInstances.isEmpty(), "No running LLAP daemons! Please check LLAP service status and zookeeper configuration");
ArrayList<String> locations = new ArrayList<>(serviceInstances.size());
for (ServiceInstance serviceInstance : serviceInstances) {
if (LOG.isDebugEnabled()) {
LOG.debug("Adding " + serviceInstance.getWorkerIdentity() + " with hostname=" + serviceInstance.getHost() + " to list for split locations");
}
locations.add(serviceInstance.getHost());
}
splitLocationProvider = new HostAffinitySplitLocationProvider(locations);
} else {
splitLocationProvider = new SplitLocationProvider() {
@Override
public String[] getLocations(InputSplit split) throws IOException {
if (split == null) {
return null;
}
String[] locations = split.getLocations();
if (locations != null && locations.length == 1) {
if ("localhost".equals(locations[0])) {
return ArrayUtils.EMPTY_STRING_ARRAY;
}
}
return locations;
}
};
}
return splitLocationProvider;
}
use of org.apache.hadoop.mapred.split.SplitLocationProvider in project hive by apache.
the class CustomPartitionVertex method onRootVertexInitialized.
// One call per root Input
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
numInputsSeenSoFar++;
LOG.info("On root vertex initialized " + inputName);
try {
// This is using the payload from the RootVertexInitializer corresponding
// to InputName. Ideally it should be using it's own configuration class -
// but that
// means serializing another instance.
MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
/*
* Currently in tez, the flow of events is thus:
* "Generate Splits -> Initialize Vertex" (with parallelism info obtained
* from the generate splits phase). The generate splits phase groups
* splits using the TezGroupedSplitsInputFormat. However, for bucket map
* joins the grouping done by this input format results in incorrect
* results as the grouper has no knowledge of buckets. So, we initially
* set the input format to be HiveInputFormat (in DagUtils) for the case
* of bucket map joins so as to obtain un-grouped splits. We then group
* the splits corresponding to buckets using the tez grouper which returns
* TezGroupedSplits.
*/
// This assumes that Grouping will always be used.
// Enabling grouping on the payload.
MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
boolean dataInformationEventSeen = false;
Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
for (Event event : events) {
if (event instanceof InputConfigureVertexTasksEvent) {
// No tasks should have been started yet. Checked by initial state
// check.
LOG.info("Got a input configure vertex event for input: " + inputName);
Preconditions.checkState(dataInformationEventSeen == false);
InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
// The vertex cannot be configured until all DataEvents are seen - to
// build the routing table.
configureVertexTaskEvent = cEvent;
LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
}
if (event instanceof InputUpdatePayloadEvent) {
// this event can never occur. If it does, fail.
Preconditions.checkState(false);
} else if (event instanceof InputDataInformationEvent) {
dataInformationEventSeen = true;
InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
FileSplit fileSplit;
try {
fileSplit = getFileSplitFromEvent(diEvent);
} catch (IOException e) {
throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
}
Set<FileSplit> fsList = pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
if (fsList == null) {
fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
pathFileSplitsMap.put(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
}
fsList.add(fileSplit);
}
}
LOG.info("Path file splits map for input name: " + inputName + " is " + pathFileSplitsMap);
Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(pathFileSplitsMap);
try {
int totalResource = context.getTotalAvailableResource().getMemory();
int taskResource = context.getVertexTaskResource().getMemory();
float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
int availableSlots = totalResource / taskResource;
LOG.info("Grouping splits. " + availableSlots + " available slots, " + waves + " waves. Bucket initial splits map: " + bucketToInitialSplitMap);
JobConf jobConf = new JobConf(conf);
ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
boolean secondLevelGroupingDone = false;
if ((mainWorkName.isEmpty()) || (inputName.compareTo(mainWorkName) == 0)) {
SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
for (Integer key : bucketToInitialSplitMap.keySet()) {
InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
if (mainWorkName.isEmpty() == false) {
Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots, HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
secondLevelGroupingDone = true;
}
bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
}
processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
} else {
SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
// all the bucket files.
for (Integer key : bucketToInitialSplitMap.keySet()) {
InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
}
/*
* this is the small table side. In case of SMB join, we need to send each split to the
* corresponding bucket-based task on the other side. In case a split needs to go to
* multiple downstream tasks, we need to clone the event and send it to the right
* destination.
*/
LOG.info("This is the side work - multi-mr work.");
processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
Aggregations