Example 1 with MRInputUserPayloadProto

Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto in project tez by apache.

The class MRInputAMSplitGenerator, method initialize:

@Override
public List<Event> initialize() throws Exception {
    StopWatch sw = new StopWatch().start();
    MRInputUserPayloadProto userPayloadProto = MRInputHelpers.parseMRInputPayload(getContext().getInputUserPayload());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time to parse MRInput payload into prot: " + sw.now(TimeUnit.MILLISECONDS));
    }
    sw.reset().start();
    Configuration conf = TezUtils.createConfFromByteString(userPayloadProto.getConfigurationBytes());
    sendSerializedEvents = conf.getBoolean(MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD, MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD_DEFAULT);
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Emitting serialized splits: " + sendSerializedEvents + " for input " + getContext().getInputName());
        LOG.debug("Time converting ByteString to configuration: " + sw.now(TimeUnit.MILLISECONDS));
    }
    sw.reset().start();
    int totalResource = getContext().getTotalAvailableResource().getMemory();
    int taskResource = getContext().getVertexTaskResource().getMemory();
    float waves = conf.getFloat(TezSplitGrouper.TEZ_GROUPING_SPLIT_WAVES, TezSplitGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
    int numTasks = (int) ((totalResource * waves) / taskResource);
    boolean groupSplits = userPayloadProto.getGroupingEnabled();
    boolean sortSplits = userPayloadProto.getSortSplitsEnabled();
    LOG.info("Input " + getContext().getInputName() + " asking for " + numTasks + " tasks. Headroom: " + totalResource + ". Task Resource: " + taskResource + ". waves: " + waves + ". groupingEnabled: " + groupSplits + ". SortSplitsEnabled: " + sortSplits);
    // Read all credentials into the credentials instance stored in JobConf.
    JobConf jobConf = new JobConf(conf);
    jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());
    InputSplitInfoMem inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, groupSplits, sortSplits, groupSplits ? numTasks : 0);
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time to create splits to mem: " + sw.now(TimeUnit.MILLISECONDS));
    }
    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo.getNumTasks() + 1);
    InputConfigureVertexTasksEvent configureVertexEvent = InputConfigureVertexTasksEvent.create(inputSplitInfo.getNumTasks(), VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()), InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate());
    events.add(configureVertexEvent);
    if (sendSerializedEvents) {
        MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
        int count = 0;
        for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
            // Unnecessary array copy, can be avoided by using ByteBuffer instead of a raw array.
            InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count++, mrSplit.toByteString().asReadOnlyByteBuffer());
            events.add(diEvent);
        }
    } else {
        int count = 0;
        if (inputSplitInfo.holdsNewFormatSplits()) {
            for (org.apache.hadoop.mapreduce.InputSplit split : inputSplitInfo.getNewFormatSplits()) {
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(count++, split);
                events.add(diEvent);
            }
        } else {
            for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(count++, split);
                events.add(diEvent);
            }
        }
    }
    return events;
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) StopWatch(org.apache.tez.util.StopWatch) MRSplitsProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitsProto) InputSplitInfoMem(org.apache.tez.mapreduce.hadoop.InputSplitInfoMem) MRInputUserPayloadProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto) InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) Event(org.apache.tez.runtime.api.Event) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) JobConf(org.apache.hadoop.mapred.JobConf) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)
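
The parallelism hint in this example comes from a simple waves formula: desired tasks = (total available memory * waves) / per-task memory, truncated to an integer. A minimal standalone sketch of that arithmetic (the resource numbers and the 1.7f waves value are hypothetical, picked only to make the math concrete; memory is in MB, as YARN Resource.getMemory() reports):

// Standalone sketch of the task-count arithmetic used in Example 1.
// All inputs are hypothetical; memory values are in MB.
static int desiredTasks(int totalResourceMb, int taskResourceMb, float waves) {
    return (int) ((totalResourceMb * waves) / taskResourceMb);
}
// e.g. desiredTasks(102400, 4096, 1.7f) == 42: with ~100 GB of headroom and
// 4 GB tasks, the initializer asks the grouper for 42 grouped splits.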

Example 2 with MRInputUserPayloadProto

use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto in project tez by apache.

The class MRInputSplitDistributor, method initialize:

@Override
public List<Event> initialize() throws IOException {
    StopWatch sw = new StopWatch().start();
    MRInputUserPayloadProto userPayloadProto = MRInputHelpers.parseMRInputPayload(getContext().getInputUserPayload());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time to parse MRInput payload into prot: " + sw.now(TimeUnit.MILLISECONDS));
    }
    Configuration conf = TezUtils.createConfFromByteString(userPayloadProto.getConfigurationBytes());
    JobConf jobConf = new JobConf(conf);
    boolean useNewApi = jobConf.getUseNewMapper();
    sendSerializedEvents = conf.getBoolean(MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD, MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD_DEFAULT);
    LOG.info("Emitting serialized splits: " + sendSerializedEvents);
    this.splitsProto = userPayloadProto.getSplits();
    MRInputUserPayloadProto.Builder updatedPayloadBuilder = MRInputUserPayloadProto.newBuilder(userPayloadProto);
    updatedPayloadBuilder.clearSplits();
    List<Event> events = Lists.newArrayListWithCapacity(this.splitsProto.getSplitsCount() + 1);
    InputUpdatePayloadEvent updatePayloadEvent = InputUpdatePayloadEvent.create(updatedPayloadBuilder.build().toByteString().asReadOnlyByteBuffer());
    events.add(updatePayloadEvent);
    int count = 0;
    for (MRSplitProto mrSplit : this.splitsProto.getSplitsList()) {
        InputDataInformationEvent diEvent;
        if (sendSerializedEvents) {
            // Unnecessary array copy, can be avoided by using ByteBuffer instead of a raw array.
            diEvent = InputDataInformationEvent.createWithSerializedPayload(count++, mrSplit.toByteString().asReadOnlyByteBuffer());
        } else {
            if (useNewApi) {
                org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils.getNewSplitDetailsFromEvent(mrSplit, conf);
                diEvent = InputDataInformationEvent.createWithObjectPayload(count++, newInputSplit);
            } else {
                org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils.getOldSplitDetailsFromEvent(mrSplit, conf);
                diEvent = InputDataInformationEvent.createWithObjectPayload(count++, oldInputSplit);
            }
        }
        events.add(diEvent);
    }
    return events;
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) StopWatch(org.apache.tez.util.StopWatch) InputUpdatePayloadEvent(org.apache.tez.runtime.api.events.InputUpdatePayloadEvent) MRInputUserPayloadProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto) InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) Event(org.apache.tez.runtime.api.Event) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) JobConf(org.apache.hadoop.mapred.JobConf) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)
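
For context, the payload that both initializers decode can be assembled with the same generated proto builder Example 2 uses to rewrite it. Below is a minimal sketch, assuming org.apache.tez.common.TezUtils.createByteStringFromConf is the inverse of the createConfFromByteString call shown above, and using only setters already visible in these examples (imports assumed: java.io.IOException, org.apache.hadoop.conf.Configuration, org.apache.tez.common.TezUtils, org.apache.tez.dag.api.UserPayload):

// Hedged sketch: serialize a Configuration into an MRInputUserPayloadProto
// and wrap it as the UserPayload that getContext().getInputUserPayload()
// later hands back to the input initializer.
static UserPayload buildMRInputPayload(Configuration conf) throws IOException {
    MRInputUserPayloadProto proto = MRInputUserPayloadProto.newBuilder()
        .setConfigurationBytes(TezUtils.createByteStringFromConf(conf))
        // grouping enabled, mirroring the groupingEnabled flag read in Example 1
        .setGroupingEnabled(true)
        .build();
    return UserPayload.create(proto.toByteString().asReadOnlyByteBuffer());
}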

Example 3 with MRInputUserPayloadProto

Use of org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto in project hive by apache.

The class CustomPartitionVertex, method onRootVertexInitialized:

// One call per root Input
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
    numInputsSeenSoFar++;
    LOG.info("On root vertex initialized " + inputName);
    try {
        // This is using the payload from the RootVertexInitializer corresponding to inputName.
        // Ideally it should be using its own configuration class, but that would mean
        // serializing another instance.
        MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
        this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
        /*
         * Currently in tez, the flow of events is thus:
         * "Generate Splits -> Initialize Vertex" (with parallelism info obtained
         * from the generate splits phase). The generate splits phase groups
         * splits using the TezGroupedSplitsInputFormat. However, for bucket map
         * joins the grouping done by this input format results in incorrect
         * results as the grouper has no knowledge of buckets. So, we initially
         * set the input format to be HiveInputFormat (in DagUtils) for the case
         * of bucket map joins so as to obtain un-grouped splits. We then group
         * the splits corresponding to buckets using the tez grouper which returns
         * TezGroupedSplits.
         */
        // This assumes that Grouping will always be used.
        // Enabling grouping on the payload.
        MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
        inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    boolean dataInformationEventSeen = false;
    Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
    for (Event event : events) {
        if (event instanceof InputConfigureVertexTasksEvent) {
            // No tasks should have been started yet. Checked by initial state check.
            LOG.info("Got an input configure vertex event for input: " + inputName);
            Preconditions.checkState(!dataInformationEventSeen);
            InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
            // The vertex cannot be configured until all DataEvents are seen, so that the routing table can be built.
            configureVertexTaskEvent = cEvent;
            LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
        }
        if (event instanceof InputUpdatePayloadEvent) {
            // This event can never occur. If it does, fail.
            Preconditions.checkState(false);
        } else if (event instanceof InputDataInformationEvent) {
            dataInformationEventSeen = true;
            InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
            FileSplit fileSplit;
            try {
                fileSplit = getFileSplitFromEvent(diEvent);
            } catch (IOException e) {
                throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
            }
            Set<FileSplit> fsList = pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
            if (fsList == null) {
                fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
                pathFileSplitsMap.put(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
            }
            fsList.add(fileSplit);
        }
    }
    LOG.debug("Path file splits map for input name: {} is {}", inputName, pathFileSplitsMap);
    Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(inputName, pathFileSplitsMap);
    try {
        int totalResource = context.getTotalAvailableResource().getMemory();
        int taskResource = context.getVertexTaskResource().getMemory();
        float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
        int availableSlots = totalResource / taskResource;
        LOG.debug("Grouping splits. {} available slots, {} waves. Bucket initial splits map: {}", availableSlots, waves, bucketToInitialSplitMap);
        JobConf jobConf = new JobConf(conf);
        ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
        Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
        boolean secondLevelGroupingDone = false;
        if (mainWorkName.isEmpty() || inputName.equals(mainWorkName)) {
            SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
            for (Integer key : bucketToInitialSplitMap.keySet()) {
                InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
                Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
                if (!mainWorkName.isEmpty()) {
                    Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
                    singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
                    groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots, HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
                    secondLevelGroupingDone = true;
                }
                bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
            }
            processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
        } else {
            SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
            // all the bucket files.
            for (Integer key : bucketToInitialSplitMap.keySet()) {
                InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
                Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
                bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
            }
            /*
         * this is the small table side. In case of SMB join, we need to send each split to the
         * corresponding bucket-based task on the other side. In case a split needs to go to
         * multiple downstream tasks, we need to clone the event and send it to the right
         * destination.
         */
            LOG.info("This is the side work - multi-mr work.");
            processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: IOException(java.io.IOException) ByteString(com.google.protobuf.ByteString) FileSplit(org.apache.hadoop.mapred.FileSplit) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) MRInputUserPayloadProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto) Event(org.apache.tez.runtime.api.Event) SplitLocationProvider(org.apache.hadoop.mapred.split.SplitLocationProvider) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf)
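
The pathFileSplitsMap bookkeeping is the crux of the bucket handling above: each split is keyed by the bucket-file name component of its path, so splits of the same bucket arriving from different directories collapse into one ordered set. A condensed, hypothetical sketch of that shape (bucketNameOf stands in for Utilities.getBucketFileNameFromPathSubString, the path-based comparator approximates PathComparatorForSplit, and 'splits' is a placeholder input; imports assumed: java.util.Comparator, java.util.Map, java.util.Set, java.util.TreeMap, java.util.TreeSet, org.apache.hadoop.mapred.FileSplit):

// Hypothetical sketch of the per-bucket split keying in Example 3.
Comparator<FileSplit> byPath = Comparator.comparing(fs -> fs.getPath().toString());
Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<>();
for (FileSplit split : splits) { // 'splits' is a placeholder collection of FileSplit
    String bucket = bucketNameOf(split.getPath().getName()); // hypothetical helper
    pathFileSplitsMap.computeIfAbsent(bucket, k -> new TreeSet<>(byPath)).add(split);
}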

Aggregations

JobConf (org.apache.hadoop.mapred.JobConf): 3
MRInputUserPayloadProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto): 3
Event (org.apache.tez.runtime.api.Event): 3
Configuration (org.apache.hadoop.conf.Configuration): 2
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint): 2
MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto): 2
InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent): 2
InputInitializerEvent (org.apache.tez.runtime.api.events.InputInitializerEvent): 2
StopWatch (org.apache.tez.util.StopWatch): 2
ByteString (com.google.protobuf.ByteString): 1
IOException (java.io.IOException): 1
FileSplit (org.apache.hadoop.mapred.FileSplit): 1
InputSplit (org.apache.hadoop.mapred.InputSplit): 1
SplitLocationProvider (org.apache.hadoop.mapred.split.SplitLocationProvider): 1
InputSplitInfoMem (org.apache.tez.mapreduce.hadoop.InputSplitInfoMem): 1
MRSplitsProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitsProto): 1
InputConfigureVertexTasksEvent (org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent): 1
InputUpdatePayloadEvent (org.apache.tez.runtime.api.events.InputUpdatePayloadEvent): 1