Example 1 with SplitLocationProvider

Use of org.apache.hadoop.mapred.split.SplitLocationProvider in project hive by apache.

In class Utils, the method getSplitLocationProvider:

public static SplitLocationProvider getSplitLocationProvider(Configuration conf, Logger LOG) throws IOException {
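    // Use LLAP host-affinitized locations only when running in llap execution mode with consistent splits enabled.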
    boolean useCustomLocations = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_MODE).equals("llap") && HiveConf.getBoolVar(conf, HiveConf.ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS);
    SplitLocationProvider splitLocationProvider;
    LOG.info("SplitGenerator using llap affinitized locations: " + useCustomLocations);
    if (useCustomLocations) {
        LlapRegistryService serviceRegistry = LlapRegistryService.getClient(conf);
        LOG.info("Using LLAP instance " + serviceRegistry.getApplicationId());
        Collection<ServiceInstance> serviceInstances = serviceRegistry.getInstances().getAllInstancesOrdered(true);
        Preconditions.checkArgument(!serviceInstances.isEmpty(), "No running LLAP daemons! Please check LLAP service status and zookeeper configuration");
        ArrayList<String> locations = new ArrayList<>(serviceInstances.size());
        for (ServiceInstance serviceInstance : serviceInstances) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Adding " + serviceInstance.getWorkerIdentity() + " with hostname=" + serviceInstance.getHost() + " to list for split locations");
            }
            locations.add(serviceInstance.getHost());
        }
        splitLocationProvider = new HostAffinitySplitLocationProvider(locations);
    } else {
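        // Non-LLAP fallback: pass through the split's own locations, but treat a lone "localhost" as no locality preference.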
        splitLocationProvider = new SplitLocationProvider() {

            @Override
            public String[] getLocations(InputSplit split) throws IOException {
                if (split == null) {
                    return null;
                }
                String[] locations = split.getLocations();
                if (locations != null && locations.length == 1) {
                    if ("localhost".equals(locations[0])) {
                        return ArrayUtils.EMPTY_STRING_ARRAY;
                    }
                }
                return locations;
            }
        };
    }
    return splitLocationProvider;
}
Also used: ArrayList (java.util.ArrayList), ServiceInstance (org.apache.hadoop.hive.llap.registry.ServiceInstance), SplitLocationProvider (org.apache.hadoop.mapred.split.SplitLocationProvider), LlapRegistryService (org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapred.InputSplit)
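
For orientation, here is a short usage sketch showing how a caller might resolve and apply the provider built above. Only Utils.getSplitLocationProvider and SplitLocationProvider.getLocations come from the example; the SplitLocationDemo class, the resolveLocations method, and the slf4j Logger wiring are illustrative assumptions, and the sketch assumes it lives alongside the Utils class shown above.

// Hypothetical usage sketch; not part of the Hive sources above.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.split.SplitLocationProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SplitLocationDemo {

    private static final Logger LOG = LoggerFactory.getLogger(SplitLocationDemo.class);

    // Resolves the preferred hosts for every split via the configured provider.
    static String[][] resolveLocations(Configuration conf, InputSplit[] splits) throws IOException {
        SplitLocationProvider provider = Utils.getSplitLocationProvider(conf, LOG);
        String[][] locations = new String[splits.length][];
        for (int i = 0; i < splits.length; i++) {
            // With LLAP consistent splits these are daemon hostnames; otherwise
            // they fall back to the split's own storage locations.
            locations[i] = provider.getLocations(splits[i]);
        }
        return locations;
    }
}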

Example 2 with SplitLocationProvider

Use of org.apache.hadoop.mapred.split.SplitLocationProvider in project hive by apache.

In class CustomPartitionVertex, the method onRootVertexInitialized:

// One call per root Input
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
    numInputsSeenSoFar++;
    LOG.info("On root vertex initialized " + inputName);
    try {
        // This uses the payload from the RootVertexInitializer corresponding
        // to inputName. Ideally it would use its own configuration class, but
        // that would mean serializing another instance.
        MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
        this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
        /*
         * Currently in Tez, the flow of events is:
         * "Generate Splits -> Initialize Vertex" (with parallelism info obtained
         * from the generate-splits phase). The generate-splits phase groups
         * splits using the TezGroupedSplitsInputFormat. However, for bucket map
         * joins the grouping done by this input format produces incorrect
         * results because the grouper has no knowledge of buckets. So, we
         * initially set the input format to HiveInputFormat (in DagUtils) for
         * the bucket map join case so as to obtain un-grouped splits. We then
         * group the splits corresponding to buckets using the Tez grouper,
         * which returns TezGroupedSplits.
         */
        // This assumes that Grouping will always be used.
        // Enabling grouping on the payload.
        MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
        inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
    } catch (IOException e) {
        // The payload is unusable; rethrow as unchecked rather than print and swallow.
        throw new RuntimeException(e);
    }
    boolean dataInformationEventSeen = false;
    Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
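    // First pass over the events: record the configure event and bucket each file split by its bucket file name.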
    for (Event event : events) {
        if (event instanceof InputConfigureVertexTasksEvent) {
            // No tasks should have been started yet. Checked by initial state
            // check.
            LOG.info("Got an input configure vertex event for input: " + inputName);
            Preconditions.checkState(!dataInformationEventSeen);
            InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
            // The vertex cannot be configured until all DataEvents are seen - to
            // build the routing table.
            configureVertexTaskEvent = cEvent;
            LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
        }
        if (event instanceof InputUpdatePayloadEvent) {
            // This event should never occur; if it does, fail fast.
            Preconditions.checkState(false);
        } else if (event instanceof InputDataInformationEvent) {
            dataInformationEventSeen = true;
            InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
            FileSplit fileSplit;
            try {
                fileSplit = getFileSplitFromEvent(diEvent);
            } catch (IOException e) {
                throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
            }
            String bucketFileName = Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName());
            Set<FileSplit> fsList = pathFileSplitsMap.computeIfAbsent(bucketFileName, k -> new TreeSet<FileSplit>(new PathComparatorForSplit()));
            fsList.add(fileSplit);
        }
    }
    LOG.info("Path file splits map for input name: " + inputName + " is " + pathFileSplitsMap);
    Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(pathFileSplitsMap);
    try {
        int totalResource = context.getTotalAvailableResource().getMemory();
        int taskResource = context.getVertexTaskResource().getMemory();
        float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
        int availableSlots = totalResource / taskResource;
        LOG.info("Grouping splits. " + availableSlots + " available slots, " + waves + " waves. Bucket initial splits map: " + bucketToInitialSplitMap);
        JobConf jobConf = new JobConf(conf);
        ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
        Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
        boolean secondLevelGroupingDone = false;
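        // Either there is no side work (mainWorkName is empty) or this input is the main work: group splits per bucket here.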
        if (mainWorkName.isEmpty() || inputName.equals(mainWorkName)) {
            SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
            for (Integer key : bucketToInitialSplitMap.keySet()) {
                InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
                Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
                if (!mainWorkName.isEmpty()) {
                    Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
                    singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
                    groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots, HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
                    secondLevelGroupingDone = true;
                }
                bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
            }
            processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
        } else {
            SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
            // all the bucket files.
            for (Integer key : bucketToInitialSplitMap.keySet()) {
                InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
                Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
                bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
            }
            /*
             * This is the small-table side. In the SMB join case, we need to send each
             * split to the corresponding bucket-based task on the other side. When a
             * split needs to go to multiple downstream tasks, we clone the event and
             * send it to the right destination.
             */
            LOG.info("This is the side work - multi-mr work.");
            processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: Set (java.util.Set), TreeSet (java.util.TreeSet), IOException (java.io.IOException), ByteString (com.google.protobuf.ByteString), TreeMap (java.util.TreeMap), FileSplit (org.apache.hadoop.mapred.FileSplit), VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint), InputUpdatePayloadEvent (org.apache.tez.runtime.api.events.InputUpdatePayloadEvent), MRInputUserPayloadProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto), Event (org.apache.tez.runtime.api.Event), VertexManagerEvent (org.apache.tez.runtime.api.events.VertexManagerEvent), InputConfigureVertexTasksEvent (org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent), InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent), SplitLocationProvider (org.apache.hadoop.mapred.split.SplitLocationProvider), InputSplit (org.apache.hadoop.mapred.InputSplit), JobConf (org.apache.hadoop.mapred.JobConf)
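
To make the SplitLocationProvider contract concrete, the following is a simplified, hypothetical implementation that deterministically pins each file split to one host from a fixed, non-empty list by hashing the split path. It only illustrates the interface; Hive's actual HostAffinitySplitLocationProvider from Example 1 applies a more careful deterministic hash over the live LLAP daemon list.

// Simplified, hypothetical provider; illustrates the contract only.
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.split.SplitLocationProvider;

public class FixedHostSplitLocationProvider implements SplitLocationProvider {

    private final List<String> hosts;

    // Assumes a non-empty host list.
    public FixedHostSplitLocationProvider(List<String> hosts) {
        this.hosts = hosts;
    }

    @Override
    public String[] getLocations(InputSplit split) throws IOException {
        if (split instanceof FileSplit) {
            // Derive a stable index from the split path so the same split is
            // always routed to the same host across queries.
            String path = ((FileSplit) split).getPath().toString();
            int idx = Math.floorMod(path.hashCode(), hosts.size());
            return new String[] { hosts.get(idx) };
        }
        // Non-file splits: defer to the locations the split itself reports.
        return split == null ? null : split.getLocations();
    }
}

Such a provider could then be passed to grouper.generateGroupedSplits in place of the one returned by Utils.getSplitLocationProvider.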

Aggregations

IOException (java.io.IOException): 2
InputSplit (org.apache.hadoop.mapred.InputSplit): 2
SplitLocationProvider (org.apache.hadoop.mapred.split.SplitLocationProvider): 2
ByteString (com.google.protobuf.ByteString): 1
ArrayList (java.util.ArrayList): 1
Set (java.util.Set): 1
TreeMap (java.util.TreeMap): 1
TreeSet (java.util.TreeSet): 1
ServiceInstance (org.apache.hadoop.hive.llap.registry.ServiceInstance): 1
LlapRegistryService (org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService): 1
FileSplit (org.apache.hadoop.mapred.FileSplit): 1
JobConf (org.apache.hadoop.mapred.JobConf): 1
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint): 1
MRInputUserPayloadProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto): 1
Event (org.apache.tez.runtime.api.Event): 1
InputConfigureVertexTasksEvent (org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent): 1
InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent): 1
InputUpdatePayloadEvent (org.apache.tez.runtime.api.events.InputUpdatePayloadEvent): 1
VertexManagerEvent (org.apache.tez.runtime.api.events.VertexManagerEvent): 1