
Example 1 with TezGroupedSplit

Use of org.apache.hadoop.mapred.split.TezGroupedSplit in project hive by apache.

From the class CustomPartitionVertex, the method processAllEvents:

private void processAllEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap, boolean secondLevelGroupingDone) throws IOException {
    int totalInputsCount = 0;
    List<Integer> numSplitsForTask = new ArrayList<Integer>();
    for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
        int bucketNum = entry.getKey();
        Collection<InputSplit> initialSplits = entry.getValue();
        finalSplits.addAll(initialSplits);
        for (InputSplit inputSplit : initialSplits) {
            bucketToTaskMap.put(bucketNum, taskCount);
            if (secondLevelGroupingDone) {
                TezGroupedSplit groupedSplit = (TezGroupedSplit) inputSplit;
                numSplitsForTask.add(groupedSplit.getGroupedSplits().size());
                totalInputsCount += groupedSplit.getGroupedSplits().size();
            } else {
                numSplitsForTask.add(1);
                totalInputsCount += 1;
            }
            taskCount++;
        }
    }
    inputNameInputSpecMap.put(inputName, InputSpecUpdate.createPerTaskInputSpecUpdate(numSplitsForTask));
    // Construct the EdgeManager descriptor to be used by all edges which need
    // the routing table.
    EdgeManagerPluginDescriptor hiveEdgeManagerDesc = null;
    if ((vertexType == VertexType.MULTI_INPUT_INITIALIZED_EDGES) || (vertexType == VertexType.INITIALIZED_EDGES)) {
        hiveEdgeManagerDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
        UserPayload payload = getBytePayload(bucketToTaskMap);
        hiveEdgeManagerDesc.setUserPayload(payload);
    }
    // Replace the edge manager for all vertices which have routing type custom.
    for (Entry<String, EdgeProperty> edgeEntry : context.getInputVertexEdgeProperties().entrySet()) {
        if (edgeEntry.getValue().getDataMovementType() == DataMovementType.CUSTOM && edgeEntry.getValue().getEdgeManagerDescriptor().getClassName().equals(CustomPartitionEdge.class.getName())) {
            emMap.put(edgeEntry.getKey(), hiveEdgeManagerDesc);
        }
    }
    LOG.info("Task count is " + taskCount + " for input name: " + inputName);
    List<InputDataInformationEvent> taskEvents = Lists.newArrayListWithCapacity(totalInputsCount);
    // Re-serialize the splits after grouping.
    int count = 0;
    for (InputSplit inputSplit : finalSplits) {
        if (secondLevelGroupingDone) {
            TezGroupedSplit tezGroupedSplit = (TezGroupedSplit) inputSplit;
            for (InputSplit subSplit : tezGroupedSplit.getGroupedSplits()) {
                if (!(subSplit instanceof TezGroupedSplit)) {
                    throw new IOException("Unexpected split type found: " + subSplit.getClass().getCanonicalName());
                }
                MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(subSplit);
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
                diEvent.setTargetIndex(count);
                taskEvents.add(diEvent);
            }
        } else {
            MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(inputSplit);
            InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
            diEvent.setTargetIndex(count);
            taskEvents.add(diEvent);
        }
        count++;
    }
    // Set the actual events for the tasks.
    LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
    context.addRootInputEvents(inputName, taskEvents);
    if (!inputToGroupedSplitMap.isEmpty()) {
        for (Entry<String, Multimap<Integer, InputSplit>> entry : inputToGroupedSplitMap.entrySet()) {
            processAllSideEvents(entry.getKey(), entry.getValue());
        }
        setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
        inputToGroupedSplitMap.clear();
    }
    // Only done for a bucket map join; not for SMB join.
    if (numInputsAffectingRootInputSpecUpdate == 1) {
        setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
        // Send the bucket IDs associated with the tasks, must happen after parallelism is set.
        sendBucketIdsToProcessor();
    }
}
Also used: UserPayload (org.apache.tez.dag.api.UserPayload), TezGroupedSplit (org.apache.hadoop.mapred.split.TezGroupedSplit), ByteString (com.google.protobuf.ByteString), IOException (java.io.IOException), VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint), ArrayListMultimap (com.google.common.collect.ArrayListMultimap), Multimap (com.google.common.collect.Multimap), HashMultimap (com.google.common.collect.HashMultimap), LinkedListMultimap (com.google.common.collect.LinkedListMultimap), EdgeManagerPluginDescriptor (org.apache.tez.dag.api.EdgeManagerPluginDescriptor), EdgeProperty (org.apache.tez.dag.api.EdgeProperty), InputSplit (org.apache.hadoop.mapred.InputSplit), MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)
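To make the counting logic above easier to follow, here is a minimal sketch (not part of Hive) of how the per-task split counts can be derived from a collection of grouped splits. It relies only on TezGroupedSplit.getGroupedSplits(), which processAllEvents also uses; the class and method names (SplitCountSketch, countSubSplitsPerTask) are illustrative assumptions.

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.split.TezGroupedSplit;

public class SplitCountSketch {

    /**
     * Illustrative helper mirroring the counting in processAllEvents: each grouped
     * split becomes one task; with second-level grouping, a task consumes as many
     * inputs as its group contains, otherwise exactly one.
     */
    public static List<Integer> countSubSplitsPerTask(Collection<InputSplit> groupedSplits, boolean secondLevelGroupingDone) {
        List<Integer> numSplitsForTask = new ArrayList<>();
        for (InputSplit split : groupedSplits) {
            if (secondLevelGroupingDone && split instanceof TezGroupedSplit) {
                // A second-level group wraps several first-level TezGroupedSplits.
                numSplitsForTask.add(((TezGroupedSplit) split).getGroupedSplits().size());
            } else {
                numSplitsForTask.add(1);
            }
        }
        return numSplitsForTask;
    }
}

In processAllEvents, the resulting per-task list is what feeds InputSpecUpdate.createPerTaskInputSpecUpdate, so each task knows how many physical inputs to expect.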

Example 2 with TezGroupedSplit

Use of org.apache.hadoop.mapred.split.TezGroupedSplit in project hive by apache.

From the class SplitGrouper, the method createTaskLocationHints:

/**
 * Create task location hints from a set of input splits
 * @param splits the actual splits
 * @param consistentLocations whether to re-order locations for each split, if it's a file split
 * @return a list of TaskLocationHints, one per input split
 * @throws IOException if the locations of a split cannot be retrieved
 */
public List<TaskLocationHint> createTaskLocationHints(InputSplit[] splits, boolean consistentLocations) throws IOException {
    List<TaskLocationHint> locationHints = Lists.newArrayListWithCapacity(splits.length);
    for (InputSplit split : splits) {
        String rack = (split instanceof TezGroupedSplit) ? ((TezGroupedSplit) split).getRack() : null;
        if (rack == null) {
            String[] locations = split.getLocations();
            if (locations != null && locations.length > 0) {
                // Worthwhile only if more than 1 split, consistentGroupingEnabled and is a FileSplit
                if (consistentLocations && locations.length > 1 && split instanceof FileSplit) {
                    Arrays.sort(locations);
                    FileSplit fileSplit = (FileSplit) split;
                    Path path = fileSplit.getPath();
                    long startLocation = fileSplit.getStart();
                    int hashCode = Objects.hash(path, startLocation);
                    int startIndex = hashCode % locations.length;
                    LinkedHashSet<String> locationSet = new LinkedHashSet<>(locations.length);
                    // Set up the locations starting from startIndex, and wrapping around the sorted array.
                    for (int i = 0; i < locations.length; i++) {
                        int index = (startIndex + i) % locations.length;
                        locationSet.add(locations[index]);
                    }
                    locationHints.add(TaskLocationHint.createTaskLocationHint(locationSet, null));
                } else {
                    locationHints.add(TaskLocationHint.createTaskLocationHint(new LinkedHashSet<String>(Arrays.asList(split.getLocations())), null));
                }
            } else {
                locationHints.add(TaskLocationHint.createTaskLocationHint(null, null));
            }
        } else {
            locationHints.add(TaskLocationHint.createTaskLocationHint(null, Collections.singleton(rack)));
        }
    }
    return locationHints;
}
Also used: TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint), Path (org.apache.hadoop.fs.Path), LinkedHashSet (java.util.LinkedHashSet), TezGroupedSplit (org.apache.hadoop.mapred.split.TezGroupedSplit), FileSplit (org.apache.hadoop.mapred.FileSplit), InputSplit (org.apache.hadoop.mapred.InputSplit)
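The notable piece of this method is the consistent re-ordering of replica locations: the hosts are sorted, and iteration starts from an index derived from the split's (path, start offset), so the same file split gets the same preferred host on every run of the query. Below is a minimal, self-contained sketch of just that rotation; the class and method names (ConsistentLocationSketch, orderLocations) are assumptions for illustration, and Math.floorMod is used here to keep the index non-negative, whereas the Hive code above uses a plain modulo.

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Objects;

public class ConsistentLocationSketch {

    /**
     * Illustrative helper: orders candidate hosts deterministically for a given
     * (path, startOffset) pair, mirroring the wrap-around used in createTaskLocationHints.
     */
    public static LinkedHashSet<String> orderLocations(String path, long startOffset, String[] locations) {
        String[] sorted = locations.clone();
        Arrays.sort(sorted);
        // floorMod keeps the start index in range even if the hash is negative.
        int startIndex = Math.floorMod(Objects.hash(path, startOffset), sorted.length);
        LinkedHashSet<String> ordered = new LinkedHashSet<>(sorted.length);
        for (int i = 0; i < sorted.length; i++) {
            ordered.add(sorted[(startIndex + i) % sorted.length]);
        }
        return ordered;
    }

    public static void main(String[] args) {
        // The same split produces the same host ordering on every run.
        System.out.println(orderLocations("/warehouse/t1/part-0", 0L, new String[] { "node3", "node1", "node2" }));
    }
}

TaskLocationHint.createTaskLocationHint then takes that ordered set as the hosts argument, with null racks, exactly as in the method above.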

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 2 uses
TezGroupedSplit (org.apache.hadoop.mapred.split.TezGroupedSplit): 2 uses
ArrayListMultimap (com.google.common.collect.ArrayListMultimap): 1 use
HashMultimap (com.google.common.collect.HashMultimap): 1 use
LinkedListMultimap (com.google.common.collect.LinkedListMultimap): 1 use
Multimap (com.google.common.collect.Multimap): 1 use
ByteString (com.google.protobuf.ByteString): 1 use
IOException (java.io.IOException): 1 use
LinkedHashSet (java.util.LinkedHashSet): 1 use
Path (org.apache.hadoop.fs.Path): 1 use
FileSplit (org.apache.hadoop.mapred.FileSplit): 1 use
EdgeManagerPluginDescriptor (org.apache.tez.dag.api.EdgeManagerPluginDescriptor): 1 use
EdgeProperty (org.apache.tez.dag.api.EdgeProperty): 1 use
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 1 use
UserPayload (org.apache.tez.dag.api.UserPayload): 1 use
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint): 1 use
MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto): 1 use