Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.
The class CustomPartitionVertex, method processAllEvents:
private void processAllEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap,
    boolean secondLevelGroupingDone) throws IOException {
  int totalInputsCount = 0;
  List<Integer> numSplitsForTask = new ArrayList<Integer>();
  for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
    int bucketNum = entry.getKey();
    Collection<InputSplit> initialSplits = entry.getValue();
    finalSplits.addAll(initialSplits);
    for (InputSplit inputSplit : initialSplits) {
      bucketToTaskMap.put(bucketNum, taskCount);
      if (secondLevelGroupingDone) {
        TezGroupedSplit groupedSplit = (TezGroupedSplit) inputSplit;
        numSplitsForTask.add(groupedSplit.getGroupedSplits().size());
        totalInputsCount += groupedSplit.getGroupedSplits().size();
      } else {
        numSplitsForTask.add(1);
        totalInputsCount += 1;
      }
      taskCount++;
    }
  }
  inputNameInputSpecMap.put(inputName, InputSpecUpdate.createPerTaskInputSpecUpdate(numSplitsForTask));
  // Construct the EdgeManager descriptor to be used by all edges which need the routing table.
  EdgeManagerPluginDescriptor hiveEdgeManagerDesc = null;
  if ((vertexType == VertexType.MULTI_INPUT_INITIALIZED_EDGES)
      || (vertexType == VertexType.INITIALIZED_EDGES)) {
    hiveEdgeManagerDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
    UserPayload payload = getBytePayload(bucketToTaskMap);
    hiveEdgeManagerDesc.setUserPayload(payload);
  }
  // Replace the edge manager for all vertices which have routing type custom.
  for (Entry<String, EdgeProperty> edgeEntry : context.getInputVertexEdgeProperties().entrySet()) {
    if (edgeEntry.getValue().getDataMovementType() == DataMovementType.CUSTOM
        && edgeEntry.getValue().getEdgeManagerDescriptor().getClassName()
            .equals(CustomPartitionEdge.class.getName())) {
      emMap.put(edgeEntry.getKey(), hiveEdgeManagerDesc);
    }
  }
  LOG.info("Task count is " + taskCount + " for input name: " + inputName);
  List<InputDataInformationEvent> taskEvents = Lists.newArrayListWithCapacity(totalInputsCount);
  // Re-serialize the splits after grouping.
  int count = 0;
  for (InputSplit inputSplit : finalSplits) {
    if (secondLevelGroupingDone) {
      TezGroupedSplit tezGroupedSplit = (TezGroupedSplit) inputSplit;
      for (InputSplit subSplit : tezGroupedSplit.getGroupedSplits()) {
        if (!(subSplit instanceof TezGroupedSplit)) {
          throw new IOException("Unexpected split type found: "
              + subSplit.getClass().getCanonicalName());
        }
        MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(subSplit);
        InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
            count, serializedSplit.toByteString().asReadOnlyByteBuffer());
        diEvent.setTargetIndex(count);
        taskEvents.add(diEvent);
      }
    } else {
      MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(inputSplit);
      InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
          count, serializedSplit.toByteString().asReadOnlyByteBuffer());
      diEvent.setTargetIndex(count);
      taskEvents.add(diEvent);
    }
    count++;
  }
  // Set the actual events for the tasks.
  LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
  context.addRootInputEvents(inputName, taskEvents);
  if (!inputToGroupedSplitMap.isEmpty()) {
    for (Entry<String, Multimap<Integer, InputSplit>> entry : inputToGroupedSplitMap.entrySet()) {
      processAllSideEvents(entry.getKey(), entry.getValue());
    }
    setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
    inputToGroupedSplitMap.clear();
  }
  // Only done when it is a bucket map join only, no SMB.
  if (numInputsAffectingRootInputSpecUpdate == 1) {
    setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
  }
}
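For reference, the per-split re-serialization above reduces to a small round trip through the Tez API. The helper below is a minimal sketch of that step (the helper itself is hypothetical, not part of Hive); MRInputHelpers.createSplitProto and InputDataInformationEvent.createWithSerializedPayload are the same Tez calls used in the loop over finalSplits:

// Hypothetical helper: serialize one old-format (mapred) split into an
// MRSplitProto and wrap it in an event addressed to the given task index,
// mirroring the re-serialization loop above.
static InputDataInformationEvent toSplitEvent(org.apache.hadoop.mapred.InputSplit split, int taskIndex)
    throws IOException {
  MRSplitProto serialized = MRInputHelpers.createSplitProto(split);
  InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(
      taskIndex, serialized.toByteString().asReadOnlyByteBuffer());
  event.setTargetIndex(taskIndex);
  return event;
}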
Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.
The class HiveSplitGenerator, method createEventList:
private List<Event> createEventList(boolean sendSerializedEvents, InputSplitInfoMem inputSplitInfo) {
  List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo.getNumTasks() + 1);
  InputConfigureVertexTasksEvent configureVertexEvent = InputConfigureVertexTasksEvent.create(
      inputSplitInfo.getNumTasks(),
      VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()),
      InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate());
  events.add(configureVertexEvent);
  if (sendSerializedEvents) {
    MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
    int count = 0;
    for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
      InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
          count++, mrSplit.toByteString().asReadOnlyByteBuffer());
      events.add(diEvent);
    }
  } else {
    int count = 0;
    for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
      InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(count++, split);
      events.add(diEvent);
    }
  }
  return events;
}
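The first event in the list is what sizes the vertex; the split events that follow only carry payloads. As a minimal sketch with hypothetical values (two tasks, no locality preferences), the configure event can be built like this:

// Hypothetical values: two tasks, no location hints, default single physical input.
// VertexLocationHint and TaskLocationHint are from org.apache.tez.dag.api.
InputConfigureVertexTasksEvent configure = InputConfigureVertexTasksEvent.create(
    2,                                                                    // number of tasks for the vertex
    VertexLocationHint.create(Collections.<TaskLocationHint>emptyList()), // no locality preference
    InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate());           // one physical input per task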
Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.
The class CustomPartitionVertex, method getFileSplitFromEvent:
private FileSplit getFileSplitFromEvent(InputDataInformationEvent event) throws IOException {
  InputSplit inputSplit = null;
  if (event.getDeserializedUserPayload() != null) {
    inputSplit = (InputSplit) event.getDeserializedUserPayload();
  } else {
    MRSplitProto splitProto = MRSplitProto.parseFrom(ByteString.copyFrom(event.getUserPayload()));
    SerializationFactory serializationFactory = new SerializationFactory(new Configuration());
    inputSplit = MRInputHelpers.createOldFormatSplitFromUserPayload(splitProto, serializationFactory);
  }
  if (!(inputSplit instanceof FileSplit)) {
    throw new UnsupportedOperationException("Cannot handle splits other than FileSplit for the moment."
        + " Current input split type: " + inputSplit.getClass().getSimpleName());
  }
  return (FileSplit) inputSplit;
}
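Putting this together with the serialization shown earlier, the payload round trip can be checked end to end. A test-style sketch, with hypothetical path and size values:

// Serialize a FileSplit to an MRSplitProto, then recover it the same way
// getFileSplitFromEvent does for serialized payloads. Path and sizes are
// placeholders for illustration; assumes a default Configuration.
FileSplit original = new FileSplit(new Path("/warehouse/t/bucket_00000"), 0L, 4096L, new String[0]);
MRSplitProto proto = MRInputHelpers.createSplitProto(original);
SerializationFactory factory = new SerializationFactory(new Configuration());
InputSplit recovered = MRInputHelpers.createOldFormatSplitFromUserPayload(
    MRSplitProto.parseFrom(proto.toByteString()), factory);
assert ((FileSplit) recovered).getPath().equals(original.getPath());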
Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto in project hive by apache.
The class CustomPartitionVertex, method processAllSideEvents:
private void processAllSideEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap)
    throws IOException {
  List<InputDataInformationEvent> taskEvents = new ArrayList<InputDataInformationEvent>();
  LOG.info("We have a routing table and we are going to set the destination tasks for the"
      + " multi mr inputs. " + bucketToTaskMap);
  Integer[] numSplitsForTask = new Integer[taskCount];
  Arrays.fill(numSplitsForTask, 0);
  Multimap<Integer, ByteBuffer> bucketToSerializedSplitMap = LinkedListMultimap.create();
  // Create the list of serialized splits for each bucket.
  for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
    for (InputSplit split : entry.getValue()) {
      MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(split);
      ByteBuffer bs = serializedSplit.toByteString().asReadOnlyByteBuffer();
      bucketToSerializedSplitMap.put(entry.getKey(), bs);
    }
  }
  for (Entry<Integer, Collection<ByteBuffer>> entry : bucketToSerializedSplitMap.asMap().entrySet()) {
    Collection<Integer> destTasks = bucketToTaskMap.get(entry.getKey());
    if ((destTasks == null) || (destTasks.isEmpty())) {
      continue;
    }
    for (Integer task : destTasks) {
      int count = 0;
      for (ByteBuffer buf : entry.getValue()) {
        count++;
        InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, buf);
        diEvent.setTargetIndex(task);
        taskEvents.add(diEvent);
      }
      numSplitsForTask[task] = count;
    }
  }
  inputNameInputSpecMap.put(inputName,
      InputSpecUpdate.createPerTaskInputSpecUpdate(Arrays.asList(numSplitsForTask)));
  LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
  context.addRootInputEvents(inputName, taskEvents);
}
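The routing here hinges on bucketToTaskMap, a Guava multimap from bucket number to the task indices that consume that bucket: every serialized split of a bucket is fanned out to every such task, and the event's target index selects the task. A minimal sketch with hypothetical bucket and task numbers:

// Hypothetical routing table: bucket 0 is read by tasks 0 and 2.
Multimap<Integer, Integer> bucketToTaskMap = HashMultimap.create();
bucketToTaskMap.put(0, 0);
bucketToTaskMap.put(0, 2);
// Placeholder payload for illustration; real code uses an MRSplitProto
// payload built with MRInputHelpers.createSplitProto, as above.
ByteBuffer serializedSplit = ByteBuffer.wrap(new byte[0]);
List<InputDataInformationEvent> events = new ArrayList<InputDataInformationEvent>();
for (Integer task : bucketToTaskMap.get(0)) {
  InputDataInformationEvent ev = InputDataInformationEvent.createWithSerializedPayload(1, serializedSplit);
  ev.setTargetIndex(task); // route this split to the consuming task
  events.add(ev);
}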