Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto in the Apache Tez project.
Class MRInputAMSplitGenerator, method initialize.
/**
 * Generates the input splits in memory and converts them into initializer events.
 *
 * <p>The input's user payload is parsed to recover the configuration, a task count is derived
 * from the total available memory, the per-task memory and the configured number of waves, and
 * the splits are generated via {@code MRInputHelpers.generateInputSplitsToMem}. The returned
 * list starts with one {@code InputConfigureVertexTasksEvent}; it is followed by one
 * {@code InputDataInformationEvent} per split, carrying either the serialized split or the
 * split object itself depending on {@code sendSerializedEvents}.
 *
 * @return the configure-vertex event followed by the per-split data-information events
 * @throws Exception if any of the helper calls made while building the events fails
 */
@Override
public List<Event> initialize() throws Exception {
  // Step 1: decode the user payload attached to this input.
  StopWatch timer = new StopWatch().start();
  MRInputUserPayloadProto payload =
      MRInputHelpers.parseMRInputPayload(getContext().getInputUserPayload());
  timer.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Time to parse MRInput payload into prot: " + timer.now(TimeUnit.MILLISECONDS));
  }

  // Step 2: rebuild the Configuration and read the event-serialization switch from it.
  timer.reset().start();
  Configuration conf = TezUtils.createConfFromByteString(payload.getConfigurationBytes());
  sendSerializedEvents = conf.getBoolean(
      MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD,
      MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD_DEFAULT);
  timer.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Emitting serialized splits: " + sendSerializedEvents + " for input " + getContext().getInputName());
    LOG.debug("Time converting ByteString to configuration: " + timer.now(TimeUnit.MILLISECONDS));
  }

  // Step 3: work out how many tasks to ask for and generate the splits (timed as one unit).
  timer.reset().start();
  int headroomMemory = getContext().getTotalAvailableResource().getMemory();
  int perTaskMemory = getContext().getVertexTaskResource().getMemory();
  float waves = conf.getFloat(
      TezSplitGrouper.TEZ_GROUPING_SPLIT_WAVES,
      TezSplitGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
  // Float arithmetic first, truncated to int afterwards.
  int desiredTasks = (int) ((headroomMemory * waves) / perTaskMemory);
  boolean groupingEnabled = payload.getGroupingEnabled();
  boolean sortingEnabled = payload.getSortSplitsEnabled();
  LOG.info("Input " + getContext().getInputName() + " asking for " + desiredTasks + " tasks. Headroom: " + headroomMemory + ". Task Resource: " + perTaskMemory + ". waves: " + waves + ". groupingEnabled: " + groupingEnabled + ". SortSplitsEnabled: " + sortingEnabled);

  // Merge the current user's credentials into the credentials held by the JobConf.
  JobConf jobConf = new JobConf(conf);
  jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());
  // The computed task count is only passed on when grouping is enabled; otherwise 0 is used.
  int targetTasks = groupingEnabled ? desiredTasks : 0;
  InputSplitInfoMem splitInfo =
      MRInputHelpers.generateInputSplitsToMem(jobConf, groupingEnabled, sortingEnabled, targetTasks);
  timer.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Time to create splits to mem: " + timer.now(TimeUnit.MILLISECONDS));
  }

  // Step 4: one configure-vertex event up front, then one data-information event per split.
  List<Event> events = Lists.newArrayListWithCapacity(splitInfo.getNumTasks() + 1);
  events.add(InputConfigureVertexTasksEvent.create(
      splitInfo.getNumTasks(),
      VertexLocationHint.create(splitInfo.getTaskLocationHints()),
      InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate()));

  int splitIndex = 0;
  if (sendSerializedEvents) {
    // Ship each split in its serialized protobuf form, exposed as a read-only ByteBuffer.
    // NOTE(review): the original author flagged a possibly avoidable array copy here -
    // confirm whether that still applies.
    for (MRSplitProto splitProto : splitInfo.getSplitsProto().getSplitsList()) {
      events.add(InputDataInformationEvent.createWithSerializedPayload(
          splitIndex++, splitProto.toByteString().asReadOnlyByteBuffer()));
    }
  } else if (splitInfo.holdsNewFormatSplits()) {
    for (org.apache.hadoop.mapreduce.InputSplit newFormatSplit : splitInfo.getNewFormatSplits()) {
      events.add(InputDataInformationEvent.createWithObjectPayload(splitIndex++, newFormatSplit));
    }
  } else {
    for (org.apache.hadoop.mapred.InputSplit oldFormatSplit : splitInfo.getOldFormatSplits()) {
      events.add(InputDataInformationEvent.createWithObjectPayload(splitIndex++, oldFormatSplit));
    }
  }
  return events;
}
Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto in the Apache Tez project.
Class MRInputSplitDistributor, method initialize.
/**
 * Distributes the splits that are already embedded in the input's user payload.
 *
 * <p>The payload is parsed, its splits are remembered in {@code this.splitsProto}, and a copy
 * of the payload with the splits cleared is sent back as an {@code InputUpdatePayloadEvent}.
 * After that event the returned list holds one {@code InputDataInformationEvent} per split:
 * the serialized split when {@code sendSerializedEvents} is set, otherwise the split object
 * rebuilt through {@code MRInputUtils} for the new or old API as reported by
 * {@code JobConf.getUseNewMapper()}.
 *
 * @return the update-payload event followed by the per-split data-information events
 * @throws IOException if one of the helper calls declared to throw it fails
 */
@Override
public List<Event> initialize() throws IOException {
  StopWatch timer = new StopWatch().start();
  MRInputUserPayloadProto payload =
      MRInputHelpers.parseMRInputPayload(getContext().getInputUserPayload());
  timer.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Time to parse MRInput payload into prot: " + timer.now(TimeUnit.MILLISECONDS));
  }

  Configuration conf = TezUtils.createConfFromByteString(payload.getConfigurationBytes());
  JobConf jobConf = new JobConf(conf);
  boolean useNewApi = jobConf.getUseNewMapper();
  sendSerializedEvents = conf.getBoolean(
      MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD,
      MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD_DEFAULT);
  LOG.info("Emitting serialized splits: " + sendSerializedEvents);

  // Keep the splits locally and send the tasks a payload that no longer contains them.
  this.splitsProto = payload.getSplits();
  MRInputUserPayloadProto payloadWithoutSplits =
      MRInputUserPayloadProto.newBuilder(payload).clearSplits().build();

  List<Event> events = Lists.newArrayListWithCapacity(this.splitsProto.getSplitsCount() + 1);
  events.add(InputUpdatePayloadEvent.create(
      payloadWithoutSplits.toByteString().asReadOnlyByteBuffer()));

  int splitIndex = 0;
  for (MRSplitProto splitProto : this.splitsProto.getSplitsList()) {
    final InputDataInformationEvent dataEvent;
    if (sendSerializedEvents) {
      // Serialized protobuf form, exposed as a read-only ByteBuffer.
      // NOTE(review): the original author flagged a possibly avoidable array copy here -
      // confirm whether that still applies.
      dataEvent = InputDataInformationEvent.createWithSerializedPayload(
          splitIndex, splitProto.toByteString().asReadOnlyByteBuffer());
    } else if (useNewApi) {
      dataEvent = InputDataInformationEvent.createWithObjectPayload(
          splitIndex, MRInputUtils.getNewSplitDetailsFromEvent(splitProto, conf));
    } else {
      dataEvent = InputDataInformationEvent.createWithObjectPayload(
          splitIndex, MRInputUtils.getOldSplitDetailsFromEvent(splitProto, conf));
    }
    events.add(dataEvent);
    splitIndex++;
  }
  return events;
}
Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto in the Apache Hive project.
Class CustomPartitionVertex, method onRootVertexInitialized.
/**
 * Handles the initializer events of one root input; called once per root input.
 *
 * <p>The method re-enables grouping on the input's payload, collects the file splits carried
 * by the {@code InputDataInformationEvent}s keyed by bucket file name, maps them to buckets,
 * groups the splits of every bucket and finally hands the grouped splits to either
 * {@code processAllEvents} (main work) or {@code processAllSideEventsSetParallelism}
 * (side work).
 *
 * @param inputName name of the root input these events belong to
 * @param inputDescriptor descriptor whose user payload is parsed and then replaced
 * @param events events produced by the input's initializer
 * @throws RuntimeException wrapping any exception raised while parsing the payload,
 *     extracting a file split, or grouping and processing the splits
 */
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
  numInputsSeenSoFar++;
  LOG.info("On root vertex initialized " + inputName);
  try {
    // This uses the payload from the RootVertexInitializer corresponding to inputName.
    // Ideally it would use its own configuration class, but that would mean serializing
    // another instance.
    MRInputUserPayloadProto originalPayload =
        MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
    this.conf = TezUtils.createConfFromByteString(originalPayload.getConfigurationBytes());
    /*
     * Currently in tez, the flow of events is thus:
     * "Generate Splits -> Initialize Vertex" (with parallelism info obtained
     * from the generate splits phase). The generate splits phase groups
     * splits using the TezGroupedSplitsInputFormat. However, for bucket map
     * joins the grouping done by this input format results in incorrect
     * results as the grouper has no knowledge of buckets. So, we initially
     * set the input format to be HiveInputFormat (in DagUtils) for the case
     * of bucket map joins so as to obtain un-grouped splits. We then group
     * the splits corresponding to buckets using the tez grouper which returns
     * TezGroupedSplits.
     */
    // Grouping is assumed to always be used, so it is switched on in the payload here.
    MRInputUserPayloadProto groupingPayload =
        MRInputUserPayloadProto.newBuilder(originalPayload).setGroupingEnabled(true).build();
    inputDescriptor.setUserPayload(
        UserPayload.create(groupingPayload.toByteString().asReadOnlyByteBuffer()));
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  boolean sawDataInformationEvent = false;
  Map<String, Set<FileSplit>> splitsByBucketFile = new TreeMap<String, Set<FileSplit>>();
  for (Event event : events) {
    if (event instanceof InputConfigureVertexTasksEvent) {
      // No tasks should have been started yet; the configure event must precede every
      // data-information event.
      LOG.info("Got a input configure vertex event for input: " + inputName);
      Preconditions.checkState(!sawDataInformationEvent);
      // The vertex cannot be configured until all data events are seen (the routing table
      // is built from them), so the event is only remembered here.
      configureVertexTaskEvent = (InputConfigureVertexTasksEvent) event;
      LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
    }

    // An update-payload event can never occur here. If it does, fail.
    Preconditions.checkState(!(event instanceof InputUpdatePayloadEvent));
    if (!(event instanceof InputDataInformationEvent)) {
      continue;
    }

    sawDataInformationEvent = true;
    InputDataInformationEvent dataEvent = (InputDataInformationEvent) event;
    FileSplit fileSplit;
    try {
      fileSplit = getFileSplitFromEvent(dataEvent);
    } catch (IOException e) {
      throw new RuntimeException("Failed to get file split for event: " + dataEvent, e);
    }
    Set<FileSplit> splitsOfBucketFile = splitsByBucketFile.get(
        Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
    if (splitsOfBucketFile == null) {
      splitsOfBucketFile = new TreeSet<FileSplit>(new PathComparatorForSplit());
      splitsByBucketFile.put(
          Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()),
          splitsOfBucketFile);
    }
    splitsOfBucketFile.add(fileSplit);
  }
  LOG.debug("Path file splits map for input name: {} is {}", inputName, splitsByBucketFile);

  Multimap<Integer, InputSplit> initialSplitsByBucket =
      getBucketSplitMapForPath(inputName, splitsByBucketFile);
  try {
    int totalMemory = context.getTotalAvailableResource().getMemory();
    int perTaskMemory = context.getVertexTaskResource().getMemory();
    float waves = conf.getFloat(
        TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES,
        TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
    int availableSlots = totalMemory / perTaskMemory;
    LOG.debug("Grouping splits. {} available slots, {} waves. Bucket initial splits map: {}", availableSlots, waves, initialSplitsByBucket);

    JobConf jobConf = new JobConf(conf);
    ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
    Multimap<Integer, InputSplit> groupedSplitsByBucket = HashMultimap.create();
    boolean secondLevelGroupingDone = false;

    // This input is treated as the main work when no main work name is set or when the names
    // match. When it is NOT the main work, the main work name is necessarily non-empty, so
    // noMainWork is false there - the same value the side path passes to the grouper.
    boolean noMainWork = mainWorkName.isEmpty();
    boolean isMainWork = noMainWork || inputName.compareTo(mainWorkName) == 0;
    // A second grouping pass only runs for the main work when a main work name is set.
    boolean needsSecondLevelGrouping = isMainWork && !noMainWork;

    SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
    for (Integer bucket : initialSplitsByBucket.keySet()) {
      InputSplit[] bucketSplits = initialSplitsByBucket.get(bucket).toArray(new InputSplit[0]);
      Multimap<Integer, InputSplit> grouped = grouper.generateGroupedSplits(
          jobConf, conf, bucketSplits, waves, availableSlots, inputName, noMainWork,
          splitLocationProvider);
      if (needsSecondLevelGrouping) {
        Multimap<Integer, InputSplit> singleBucketSplits = HashMultimap.create();
        singleBucketSplits.putAll(bucket, grouped.values());
        grouped = grouper.group(jobConf, singleBucketSplits, availableSlots,
            HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
        secondLevelGroupingDone = true;
      }
      groupedSplitsByBucket.putAll(bucket, grouped.values());
    }

    if (isMainWork) {
      processAllEvents(inputName, groupedSplitsByBucket, secondLevelGroupingDone);
    } else {
      /*
       * this is the small table side. In case of SMB join, we need to send each split to the
       * corresponding bucket-based task on the other side. In case a split needs to go to
       * multiple downstream tasks, we need to clone the event and send it to the right
       * destination.
       */
      LOG.info("This is the side work - multi-mr work.");
      processAllSideEventsSetParallelism(inputName, groupedSplitsByBucket);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
Aggregations