Use of org.apache.hadoop.mapred.split.TezGroupedSplit in project hive by apache.
The class CustomPartitionVertex, method processAllEvents.
private void processAllEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap,
    boolean secondLevelGroupingDone) throws IOException {
  int totalInputsCount = 0;
  List<Integer> numSplitsForTask = new ArrayList<Integer>();
  for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
    int bucketNum = entry.getKey();
    Collection<InputSplit> initialSplits = entry.getValue();
    finalSplits.addAll(initialSplits);
    for (InputSplit inputSplit : initialSplits) {
      bucketToTaskMap.put(bucketNum, taskCount);
      if (secondLevelGroupingDone) {
        // With second-level grouping, each task consumes every sub-split of its grouped split.
        TezGroupedSplit groupedSplit = (TezGroupedSplit) inputSplit;
        numSplitsForTask.add(groupedSplit.getGroupedSplits().size());
        totalInputsCount += groupedSplit.getGroupedSplits().size();
      } else {
        numSplitsForTask.add(1);
        totalInputsCount += 1;
      }
      taskCount++;
    }
  }
  inputNameInputSpecMap.put(inputName, InputSpecUpdate.createPerTaskInputSpecUpdate(numSplitsForTask));

  // Construct the EdgeManager descriptor to be used by all edges which need
  // the routing table.
  EdgeManagerPluginDescriptor hiveEdgeManagerDesc = null;
  if ((vertexType == VertexType.MULTI_INPUT_INITIALIZED_EDGES)
      || (vertexType == VertexType.INITIALIZED_EDGES)) {
    hiveEdgeManagerDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
    UserPayload payload = getBytePayload(bucketToTaskMap);
    hiveEdgeManagerDesc.setUserPayload(payload);
  }

  // Replace the edge manager for all vertices which have routing type custom.
  for (Entry<String, EdgeProperty> edgeEntry : context.getInputVertexEdgeProperties().entrySet()) {
    if (edgeEntry.getValue().getDataMovementType() == DataMovementType.CUSTOM
        && edgeEntry.getValue().getEdgeManagerDescriptor().getClassName()
            .equals(CustomPartitionEdge.class.getName())) {
      emMap.put(edgeEntry.getKey(), hiveEdgeManagerDesc);
    }
  }

  LOG.info("Task count is " + taskCount + " for input name: " + inputName);

  List<InputDataInformationEvent> taskEvents = Lists.newArrayListWithCapacity(totalInputsCount);
  // Re-serialize the splits after grouping.
  int count = 0;
  for (InputSplit inputSplit : finalSplits) {
    if (secondLevelGroupingDone) {
      TezGroupedSplit tezGroupedSplit = (TezGroupedSplit) inputSplit;
      for (InputSplit subSplit : tezGroupedSplit.getGroupedSplits()) {
        if (!(subSplit instanceof TezGroupedSplit)) {
          throw new IOException("Unexpected split type found: "
              + subSplit.getClass().getCanonicalName());
        }
        MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(subSplit);
        InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
            count, serializedSplit.toByteString().asReadOnlyByteBuffer());
        diEvent.setTargetIndex(count);
        taskEvents.add(diEvent);
      }
    } else {
      MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(inputSplit);
      InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
          count, serializedSplit.toByteString().asReadOnlyByteBuffer());
      diEvent.setTargetIndex(count);
      taskEvents.add(diEvent);
    }
    count++;
  }

  // Set the actual events for the tasks.
  LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
  context.addRootInputEvents(inputName, taskEvents);
  if (!inputToGroupedSplitMap.isEmpty()) {
    for (Entry<String, Multimap<Integer, InputSplit>> entry : inputToGroupedSplitMap.entrySet()) {
      processAllSideEvents(entry.getKey(), entry.getValue());
    }
    setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
    inputToGroupedSplitMap.clear();
  }

  // Only done when it is a bucket map join only, no SMB.
  if (numInputsAffectingRootInputSpecUpdate == 1) {
    setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
    // Send the bucket IDs associated with the tasks; must happen after parallelism is set.
    sendBucketIdsToProcessor();
  }
}
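To make the bookkeeping above concrete, here is a minimal, self-contained sketch (hypothetical code, not part of Hive) of the counting logic alone: one task is created per grouped split, and when a second level of grouping has been done, numSplitsForTask records how many sub-splits each task will consume.

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import java.util.ArrayList;
import java.util.List;

public class SplitCountSketch {
  public static void main(String[] args) {
    // Stand-in for bucketToGroupedSplitMap: bucket -> sub-split count of each grouped split.
    Multimap<Integer, Integer> bucketToGroupedSplitSizes = ArrayListMultimap.create();
    bucketToGroupedSplitSizes.putAll(0, List.of(3, 2)); // bucket 0: two grouped splits -> two tasks
    bucketToGroupedSplitSizes.put(1, 4);                // bucket 1: one grouped split -> one task

    List<Integer> numSplitsForTask = new ArrayList<>();
    int totalInputsCount = 0;
    int taskCount = 0;
    for (Integer size : bucketToGroupedSplitSizes.values()) {
      numSplitsForTask.add(size); // mirrors the secondLevelGroupingDone branch
      totalInputsCount += size;
      taskCount++;
    }
    // Prints: tasks=3 numSplitsForTask=[3, 2, 4] totalInputs=9
    System.out.println("tasks=" + taskCount + " numSplitsForTask=" + numSplitsForTask
        + " totalInputs=" + totalInputsCount);
  }
}

totalInputsCount then sizes the list of InputDataInformationEvents, one event per (sub-)split, while numSplitsForTask feeds the per-task InputSpecUpdate.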
Use of org.apache.hadoop.mapred.split.TezGroupedSplit in project hive by apache.
The class SplitGrouper, method createTaskLocationHints.
/**
 * Create task location hints from a set of input splits.
 * @param splits the actual splits
 * @param consistentLocations whether to re-order locations for each split, if it's a file split
 * @return taskLocationHints - 1 per input split specified
 * @throws IOException
 */
public List<TaskLocationHint> createTaskLocationHints(InputSplit[] splits,
    boolean consistentLocations) throws IOException {
  List<TaskLocationHint> locationHints = Lists.newArrayListWithCapacity(splits.length);
  for (InputSplit split : splits) {
    String rack = (split instanceof TezGroupedSplit) ? ((TezGroupedSplit) split).getRack() : null;
    if (rack == null) {
      String[] locations = split.getLocations();
      if (locations != null && locations.length > 0) {
        // Worthwhile only if there is more than 1 location, consistent grouping is enabled,
        // and the split is a FileSplit.
        if (consistentLocations && locations.length > 1 && split instanceof FileSplit) {
          Arrays.sort(locations);
          FileSplit fileSplit = (FileSplit) split;
          Path path = fileSplit.getPath();
          long startLocation = fileSplit.getStart();
          int hashCode = Objects.hash(path, startLocation);
          // Mask the hash so the start index stays non-negative.
          int startIndex = (hashCode & Integer.MAX_VALUE) % locations.length;
          LinkedHashSet<String> locationSet = new LinkedHashSet<>(locations.length);
          // Set up the locations starting from startIndex, and wrapping around the sorted array.
          for (int i = 0; i < locations.length; i++) {
            int index = (startIndex + i) % locations.length;
            locationSet.add(locations[index]);
          }
          locationHints.add(TaskLocationHint.createTaskLocationHint(locationSet, null));
        } else {
          locationHints.add(TaskLocationHint.createTaskLocationHint(
              new LinkedHashSet<String>(Arrays.asList(locations)), null));
        }
      } else {
        locationHints.add(TaskLocationHint.createTaskLocationHint(null, null));
      }
    } else {
      locationHints.add(TaskLocationHint.createTaskLocationHint(null, Collections.singleton(rack)));
    }
  }
  return locationHints;
}
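The consistent-location trick can be exercised on its own. The following standalone sketch (hypothetical, not Hive code) reproduces the rotation: sort the hosts, derive a start index from a hash of the split's path and start offset, and emit the hosts in wrapped order, so the same split always prefers the same first host across runs.

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Objects;

public class ConsistentLocationSketch {
  // Returns the hosts for a split in a deterministic, split-specific order.
  static LinkedHashSet<String> orderLocations(String[] locations, String path, long start) {
    String[] sorted = locations.clone();
    Arrays.sort(sorted);
    // Mask keeps the derived index non-negative even for negative hash values.
    int startIndex = (Objects.hash(path, start) & Integer.MAX_VALUE) % sorted.length;
    LinkedHashSet<String> ordered = new LinkedHashSet<>(sorted.length);
    for (int i = 0; i < sorted.length; i++) {
      ordered.add(sorted[(startIndex + i) % sorted.length]);
    }
    return ordered;
  }

  public static void main(String[] args) {
    String[] hosts = {"node3", "node1", "node2"};
    // The same (path, start) pair always yields the same rotation of the sorted hosts;
    // a different start offset may rotate to a different first host.
    System.out.println(orderLocations(hosts, "/warehouse/t/part-0", 0L));
    System.out.println(orderLocations(hosts, "/warehouse/t/part-0", 134217728L));
  }
}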