Search in sources :

Example 1 with MRSplitProto

use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.

From the class CustomPartitionVertex, method processAllEvents.

/**
 * Processes the grouped splits for one named input: records the bucket-to-task routing,
 * re-serializes every split into an InputDataInformationEvent, and publishes the events
 * (plus a per-task input-spec update) to the vertex manager context.
 *
 * Mutates instance state: {@code finalSplits}, {@code bucketToTaskMap}, {@code taskCount},
 * {@code inputNameInputSpecMap}, {@code emMap}, and drains {@code inputToGroupedSplitMap}.
 *
 * @param inputName name of the root input whose events are being processed
 * @param bucketToGroupedSplitMap grouped splits keyed by bucket number
 * @param secondLevelGroupingDone true if splits were grouped a second time, i.e. each
 *        top-level split is a TezGroupedSplit wrapping several grouped splits
 * @throws IOException if a split cannot be serialized into its proto form
 */
private void processAllEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap, boolean secondLevelGroupingDone) throws IOException {
    int totalInputsCount = 0;
    List<Integer> numSplitsForTask = new ArrayList<Integer>();
    for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
        int bucketNum = entry.getKey();
        Collection<InputSplit> initialSplits = entry.getValue();
        finalSplits.addAll(initialSplits);
        for (InputSplit inputSplit : initialSplits) {
            // One task per top-level split; remember which bucket routes to this task.
            bucketToTaskMap.put(bucketNum, taskCount);
            if (secondLevelGroupingDone) {
                // After second-level grouping each split is a wrapper; the task receives
                // one physical input per wrapped (inner) split.
                TezGroupedSplit groupedSplit = (TezGroupedSplit) inputSplit;
                numSplitsForTask.add(groupedSplit.getGroupedSplits().size());
                totalInputsCount += groupedSplit.getGroupedSplits().size();
            } else {
                numSplitsForTask.add(1);
                totalInputsCount += 1;
            }
            taskCount++;
        }
    }
    inputNameInputSpecMap.put(inputName, InputSpecUpdate.createPerTaskInputSpecUpdate(numSplitsForTask));
    // Construct the EdgeManager descriptor to be used by all edges which need
    // the routing table.
    EdgeManagerPluginDescriptor hiveEdgeManagerDesc = null;
    if ((vertexType == VertexType.MULTI_INPUT_INITIALIZED_EDGES) || (vertexType == VertexType.INITIALIZED_EDGES)) {
        hiveEdgeManagerDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
        // The routing table (bucket -> tasks) travels to the edge manager as its payload.
        UserPayload payload = getBytePayload(bucketToTaskMap);
        hiveEdgeManagerDesc.setUserPayload(payload);
    }
    // Replace the edge manager for all vertices which have routing type custom.
    for (Entry<String, EdgeProperty> edgeEntry : context.getInputVertexEdgeProperties().entrySet()) {
        if (edgeEntry.getValue().getDataMovementType() == DataMovementType.CUSTOM && edgeEntry.getValue().getEdgeManagerDescriptor().getClassName().equals(CustomPartitionEdge.class.getName())) {
            emMap.put(edgeEntry.getKey(), hiveEdgeManagerDesc);
        }
    }
    LOG.info("Task count is " + taskCount + " for input name: " + inputName);
    List<InputDataInformationEvent> taskEvents = Lists.newArrayListWithCapacity(totalInputsCount);
    // Re-serialize the splits after grouping.
    int count = 0;
    for (InputSplit inputSplit : finalSplits) {
        if (secondLevelGroupingDone) {
            TezGroupedSplit tezGroupedSplit = (TezGroupedSplit) inputSplit;
            for (InputSplit subSplit : tezGroupedSplit.getGroupedSplits()) {
                // Only first-level TezGroupedSplits are expected inside a second-level group.
                if ((subSplit instanceof TezGroupedSplit) == false) {
                    throw new IOException("Unexpected split type found: " + subSplit.getClass().getCanonicalName());
                }
                MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(subSplit);
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
                // NOTE(review): count is shared across all sub-splits of all groups here,
                // so target indices are globally sequential — confirm that matches the
                // per-task input spec registered above.
                diEvent.setTargetIndex(count);
                taskEvents.add(diEvent);
            }
        } else {
            MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(inputSplit);
            InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
            diEvent.setTargetIndex(count);
            taskEvents.add(diEvent);
        }
        count++;
    }
    // Set the actual events for the tasks.
    LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
    context.addRootInputEvents(inputName, taskEvents);
    // Flush any side-table (small table) splits that were buffered while waiting for
    // this main input to be processed.
    if (inputToGroupedSplitMap.isEmpty() == false) {
        for (Entry<String, Multimap<Integer, InputSplit>> entry : inputToGroupedSplitMap.entrySet()) {
            processAllSideEvents(entry.getKey(), entry.getValue());
        }
        setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
        inputToGroupedSplitMap.clear();
    }
    // Only done when it is a bucket map join only no SMB.
    if (numInputsAffectingRootInputSpecUpdate == 1) {
        setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
    }
}
Also used : UserPayload(org.apache.tez.dag.api.UserPayload) ArrayList(java.util.ArrayList) TezGroupedSplit(org.apache.hadoop.mapred.split.TezGroupedSplit) ByteString(com.google.protobuf.ByteString) IOException(java.io.IOException) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) ArrayListMultimap(com.google.common.collect.ArrayListMultimap) HashMultimap(com.google.common.collect.HashMultimap) LinkedListMultimap(com.google.common.collect.LinkedListMultimap) Multimap(com.google.common.collect.Multimap) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) Collection(java.util.Collection) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) InputSplit(org.apache.hadoop.mapred.InputSplit) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)

Example 2 with MRSplitProto

use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.

From the class HiveSplitGenerator, method createEventList.

/**
 * Builds the event list for a generated set of input splits: one
 * InputConfigureVertexTasksEvent describing parallelism/locations, followed by one
 * InputDataInformationEvent per split.
 *
 * @param sendSerializedEvents when true, emit serialized MRSplitProto payloads;
 *        otherwise emit object payloads carrying the raw InputSplit instances
 * @param inputSplitInfo in-memory split info produced by split generation
 * @return the configure event followed by one data-information event per split
 */
private List<Event> createEventList(boolean sendSerializedEvents, InputSplitInfoMem inputSplitInfo) {
    // Capacity: the configure event plus one event per task.
    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo.getNumTasks() + 1);
    events.add(InputConfigureVertexTasksEvent.create(inputSplitInfo.getNumTasks(), VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()), InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate()));
    if (sendSerializedEvents) {
        MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
        int taskIndex = 0;
        for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
            // Ship the proto bytes directly; the task deserializes them on its side.
            events.add(InputDataInformationEvent.createWithSerializedPayload(taskIndex++, mrSplit.toByteString().asReadOnlyByteBuffer()));
        }
    } else {
        int taskIndex = 0;
        for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
            // Same-JVM path: hand over the split object itself, no serialization.
            events.add(InputDataInformationEvent.createWithObjectPayload(taskIndex++, split));
        }
    }
    return events;
}
Also used : MRSplitsProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitsProto) InputSplit(org.apache.hadoop.mapred.InputSplit) InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) Event(org.apache.tez.runtime.api.Event) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent)

Example 3 with MRSplitProto

use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.

From the class CustomPartitionVertex, method getFileSplitFromEvent.

/**
 * Recovers the FileSplit carried by an InputDataInformationEvent, whether the payload
 * arrived already deserialized or as serialized MRSplitProto bytes.
 *
 * @param event the data-information event delivered to this vertex
 * @return the event's payload as an old-API FileSplit
 * @throws IOException if deserializing the proto payload fails
 * @throws UnsupportedOperationException if the payload is a non-FileSplit split type
 */
private FileSplit getFileSplitFromEvent(InputDataInformationEvent event) throws IOException {
    final InputSplit inputSplit;
    Object deserializedPayload = event.getDeserializedUserPayload();
    if (deserializedPayload != null) {
        // Fast path: the split object was shipped as-is within the same JVM.
        inputSplit = (InputSplit) deserializedPayload;
    } else {
        // Slow path: rebuild the old-format split from its serialized proto form.
        MRSplitProto splitProto = MRSplitProto.parseFrom(ByteString.copyFrom(event.getUserPayload()));
        SerializationFactory serializationFactory = new SerializationFactory(new Configuration());
        inputSplit = MRInputHelpers.createOldFormatSplitFromUserPayload(splitProto, serializationFactory);
    }
    if (inputSplit instanceof FileSplit) {
        return (FileSplit) inputSplit;
    }
    throw new UnsupportedOperationException("Cannot handle splits other than FileSplit for the moment. Current input split type: " + inputSplit.getClass().getSimpleName());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) SerializationFactory(org.apache.hadoop.io.serializer.SerializationFactory) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)

Example 4 with MRSplitProto

use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.

From the class CustomPartitionVertex, method processAllSideEvents.

/**
 * Routes the grouped splits of a side (small/broadcast) input to every destination task
 * recorded in {@code bucketToTaskMap}: each split is serialized once per bucket and an
 * InputDataInformationEvent is created for every (split, destination task) pair.
 *
 * Mutates {@code inputNameInputSpecMap} and publishes events via {@code context}.
 *
 * @param inputName name of the side input being routed
 * @param bucketToGroupedSplitMap the side input's grouped splits keyed by bucket number
 * @throws IOException if a split cannot be serialized into its proto form
 */
private void processAllSideEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap) throws IOException {
    List<InputDataInformationEvent> taskEvents = new ArrayList<InputDataInformationEvent>();
    LOG.info("We have a routing table and we are going to set the destination tasks for the" + " multi mr inputs. " + bucketToTaskMap);
    // One slot per task already created by the main input; defaults to 0 splits.
    Integer[] numSplitsForTask = new Integer[taskCount];
    Arrays.fill(numSplitsForTask, 0);
    Multimap<Integer, ByteBuffer> bucketToSerializedSplitMap = LinkedListMultimap.create();
    // Create the list of serialized splits for each bucket.
    // Serializing once per bucket (not per destination task) avoids redundant proto work.
    for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
        for (InputSplit split : entry.getValue()) {
            MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(split);
            ByteBuffer bs = serializedSplit.toByteString().asReadOnlyByteBuffer();
            bucketToSerializedSplitMap.put(entry.getKey(), bs);
        }
    }
    // Fan the serialized splits out to every task that serves this bucket.
    for (Entry<Integer, Collection<ByteBuffer>> entry : bucketToSerializedSplitMap.asMap().entrySet()) {
        Collection<Integer> destTasks = bucketToTaskMap.get(entry.getKey());
        if ((destTasks == null) || (destTasks.isEmpty())) {
            // Bucket has no consuming task (e.g. pruned); skip its splits.
            continue;
        }
        for (Integer task : destTasks) {
            int count = 0;
            for (ByteBuffer buf : entry.getValue()) {
                count++;
                // NOTE(review): count is incremented before use, so source indices here
                // start at 1 (unlike the 0-based indices in processAllEvents) — confirm
                // this is intentional on the consuming side.
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, buf);
                diEvent.setTargetIndex(task);
                taskEvents.add(diEvent);
            }
            numSplitsForTask[task] = count;
        }
    }
    inputNameInputSpecMap.put(inputName, InputSpecUpdate.createPerTaskInputSpecUpdate(Arrays.asList(numSplitsForTask)));
    LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
    context.addRootInputEvents(inputName, taskEvents);
}
Also used : ArrayList(java.util.ArrayList) ByteBuffer(java.nio.ByteBuffer) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) Collection(java.util.Collection) InputSplit(org.apache.hadoop.mapred.InputSplit) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit)4 MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)4 VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint)3 InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent)3 ArrayList (java.util.ArrayList)2 Collection (java.util.Collection)2 ArrayListMultimap (com.google.common.collect.ArrayListMultimap)1 HashMultimap (com.google.common.collect.HashMultimap)1 LinkedListMultimap (com.google.common.collect.LinkedListMultimap)1 Multimap (com.google.common.collect.Multimap)1 ByteString (com.google.protobuf.ByteString)1 IOException (java.io.IOException)1 ByteBuffer (java.nio.ByteBuffer)1 Configuration (org.apache.hadoop.conf.Configuration)1 SerializationFactory (org.apache.hadoop.io.serializer.SerializationFactory)1 FileSplit (org.apache.hadoop.mapred.FileSplit)1 TezGroupedSplit (org.apache.hadoop.mapred.split.TezGroupedSplit)1 EdgeManagerPluginDescriptor (org.apache.tez.dag.api.EdgeManagerPluginDescriptor)1 EdgeProperty (org.apache.tez.dag.api.EdgeProperty)1 TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint)1