Search in sources:

Example 1 with InactiveServiceInstance

Use of org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance in project hive by apache.

In the class LlapTaskSchedulerService, method selectHost.

/**
 * Chooses a node on which to run the given task, preferring the hosts the task asked for.
 *
 * <p>The whole selection runs under {@code readLock}. The steps are:
 * <ol>
 *   <li>If the task names preferred hosts, return the first node on any of them (in the order
 *       the hosts were requested) that can accept a task.</li>
 *   <li>If none can, and the task should still delay for locality, return
 *       {@code SELECT_HOST_RESULT_DELAYED_LOCALITY} when at least one requested node is judged
 *       worth waiting for.</li>
 *   <li>Otherwise fall back to the ordered list of all instances: pick randomly among nodes with
 *       free slots when no locality was requested or the first requested host is not in that
 *       list, else walk the ordered list starting just after the first requested host and take
 *       the first node that can accept a task.</li>
 * </ol>
 *
 * @param request the task being scheduled; its {@code requestedHosts} holds the preferred hosts,
 *                where null or an empty array implies any host
 * @return a result wrapping the chosen node, the result of {@code randomSelection}, or one of
 *         {@code SELECT_HOST_RESULT_DELAYED_LOCALITY} / {@code SELECT_HOST_RESULT_DELAYED_RESOURCES}
 *         when no node is assigned on this attempt
 */
private SelectHostResult selectHost(TaskInfo request) {
    String[] requestedHosts = request.requestedHosts;
    // Computed once up front because it is reused in several log statements below.
    String requestedHostsDebugStr = Arrays.toString(requestedHosts);
    if (LOG.isDebugEnabled()) {
        LOG.debug("selectingHost for task={} on hosts={}", request.task, requestedHostsDebugStr);
    }
    long schedulerAttemptTime = clock.getTime();
    // Read-lock. Not updating any stats at the moment.
    readLock.lock();
    try {
        boolean shouldDelayForLocality = request.shouldDelayForLocality(schedulerAttemptTime);
        LOG.debug("ShouldDelayForLocality={} for task={} on hosts={}", shouldDelayForLocality, request.task, requestedHostsDebugStr);
        if (requestedHosts != null && requestedHosts.length > 0) {
            // Zero-based index of the host currently being examined; only used for logging.
            int prefHostCount = -1;
            // Set when some requested node is busy now but is judged worth waiting for.
            boolean requestedHostsWillBecomeAvailable = false;
            for (String host : requestedHosts) {
                prefHostCount++;
                // Pick the first host always. Weak attempt at cache affinity.
                Set<LlapServiceInstance> instances = activeInstances.getByHost(host);
                if (!instances.isEmpty()) {
                    for (LlapServiceInstance inst : instances) {
                        NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity());
                        if (nodeInfo != null) {
                            if (nodeInfo.canAcceptTask()) {
                                // Successfully scheduled.
                                LOG.info("Assigning {} when looking for {}." + " local=true FirstRequestedHost={}, #prefLocations={}", nodeInfo.toShortString(), host, (prefHostCount == 0), requestedHosts.length);
                                return new SelectHostResult(nodeInfo);
                            } else {
                                // The node cannot accept a task at the moment.
                                if (shouldDelayForLocality) {
                                    // Perform some checks on whether the node will become available or not.
                                    if (request.shouldForceLocality()) {
                                        // Locality is forced: always wait for a requested node.
                                        requestedHostsWillBecomeAvailable = true;
                                    } else {
                                        if (nodeInfo.getEnableTime() > request.getLocalityDelayTimeout() && nodeInfo.isDisabled() && nodeInfo.hadCommFailure()) {
                                            LOG.debug("Host={} will not become available within requested timeout", nodeInfo);
                                            // This node will likely be activated after the task timeout expires.
                                        } else {
                                            // Worth waiting for the timeout.
                                            requestedHostsWillBecomeAvailable = true;
                                        }
                                    }
                                }
                            }
                        } else {
                            LOG.warn("Null NodeInfo when attempting to get host with worker {}, and host {}", inst, host);
                            // Leave requestedHostsWillBecomeAvailable as is. If some other host is found - delay,
                            // else ends up allocating to a random host immediately.
                        }
                    }
                }
            }
            // Check if forcing the location is required.
            if (shouldDelayForLocality) {
                if (requestedHostsWillBecomeAvailable) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Delaying local allocation for [" + request.task + "] when trying to allocate on [" + requestedHostsDebugStr + "]" + ". ScheduleAttemptTime=" + schedulerAttemptTime + ", taskDelayTimeout=" + request.getLocalityDelayTimeout());
                    }
                    return SELECT_HOST_RESULT_DELAYED_LOCALITY;
                } else {
                    // NOTE(review): this branch is also reached when requested hosts are known but
                    // none was judged worth waiting for, so the log wording is narrower than the condition.
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skipping local allocation for [" + request.task + "] when trying to allocate on [" + requestedHostsDebugStr + "] since none of these hosts are part of the known list");
                    }
                }
            }
        }
        /* fall through - miss in locality or no locality-requested */
        Collection<LlapServiceInstance> instances = activeInstances.getAllInstancesOrdered(true);
        // allNodes mirrors the ordered instance list one-to-one; a null entry is a placeholder for an
        // inactive instance or one without a NodeInfo, so indices stay aligned with the ordering.
        List<NodeInfo> allNodes = new ArrayList<>(instances.size());
        List<NodeInfo> activeNodesWithFreeSlots = new ArrayList<>();
        for (LlapServiceInstance inst : instances) {
            if (inst instanceof InactiveServiceInstance) {
                allNodes.add(null);
            } else {
                NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity());
                if (nodeInfo == null) {
                    allNodes.add(null);
                } else {
                    allNodes.add(nodeInfo);
                    if (nodeInfo.canAcceptTask()) {
                        activeNodesWithFreeSlots.add(nodeInfo);
                    }
                }
            }
        }
        // No instances at all (not even placeholders): nothing to schedule on yet.
        if (allNodes.isEmpty()) {
            return SELECT_HOST_RESULT_DELAYED_RESOURCES;
        }
        // no locality-requested, randomly pick a node containing free slots
        if (requestedHosts == null || requestedHosts.length == 0) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("No-locality requested. Selecting a random host for task={}", request.task);
            }
            return randomSelection(activeNodesWithFreeSlots);
        }
        // miss in locality request, try picking consistent location with fallback to random selection
        final String firstRequestedHost = requestedHosts[0];
        // Position of the first requested host in the ordered list, or -1 if it is not present.
        int requestedHostIdx = -1;
        for (int i = 0; i < allNodes.size(); i++) {
            NodeInfo nodeInfo = allNodes.get(i);
            if (nodeInfo != null) {
                if (nodeInfo.getHost().equals(firstRequestedHost)) {
                    requestedHostIdx = i;
                    break;
                }
            }
        }
        // TODO: At this point we don't know the slot number of the requested host, so can't rollover to next available
        if (requestedHostIdx == -1) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Requested node [{}] in consistent order does not exist. Falling back to random selection for " + "request {}", firstRequestedHost, request);
            }
            return randomSelection(activeNodesWithFreeSlots);
        }
        // requested host is still alive but cannot accept task, pick the next available host in consistent order
        // The scan starts at the slot after the requested host and wraps around the list, so the
        // requested host's own slot is visited last.
        for (int i = 0; i < allNodes.size(); i++) {
            NodeInfo nodeInfo = allNodes.get((i + requestedHostIdx + 1) % allNodes.size());
            // next node in consistent order died or does not have free slots, rollover to next
            if (nodeInfo == null || !nodeInfo.canAcceptTask()) {
                continue;
            } else {
                if (LOG.isDebugEnabled()) {
                    // requestedHosts is known to be non-null and non-empty here (the no-locality case
                    // returned above), so the "null" arm of the ternary below is never taken.
                    LOG.debug("Assigning {} in consistent order when looking for first requested host, from #hosts={}," + " requestedHosts={}", nodeInfo.toShortString(), allNodes.size(), ((requestedHosts == null || requestedHosts.length == 0) ? "null" : requestedHostsDebugStr));
                }
                return new SelectHostResult(nodeInfo);
            }
        }
        // Every slot in the ordered list was either a placeholder or had no free capacity.
        return SELECT_HOST_RESULT_DELAYED_RESOURCES;
    } finally {
        readLock.unlock();
    }
}
Also used : ArrayList(java.util.ArrayList) LlapServiceInstance(org.apache.hadoop.hive.llap.registry.LlapServiceInstance) InactiveServiceInstance(org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance)

Example 2 with InactiveServiceInstance

Use of org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance in project hive by apache.

In the class LlapClusterStateForCompile, method initClusterInfo.

/**
 * Refreshes the cached cluster information (executor count and number of nodes without a usable
 * executor configuration) from the LLAP registry, if {@code isUpdateNeeded()} says so.
 *
 * <p>The refresh is best-effort: failure to create the registry client or to fetch the instances
 * is logged and reported through the return value instead of being thrown.
 *
 * @return {@code true} if no update was needed or the cached values were refreshed;
 *         {@code false} if the registry client could not be created or the instances could not
 *         be fetched
 */
public boolean initClusterInfo() {
    // Cheap check outside the lock so callers with fresh data do not contend.
    if (!isUpdateNeeded()) {
        return true;
    }
    synchronized (updateInfoLock) {
        // Check again now that the lock is held; another caller may have refreshed in the meantime.
        if (!isUpdateNeeded()) {
            return true;
        }
        if (svc == null) {
            try {
                svc = LlapRegistryService.getClient(conf);
            } catch (Throwable t) {
                // Best-effort: report failure to the caller rather than propagating.
                LOG.info("Cannot create the client; ignoring", t);
                return false;
            }
        }
        ServiceInstanceSet<LlapServiceInstance> registered;
        try {
            // NOTE(review): the argument 10 is presumably a wait/timeout value - confirm against getInstances.
            registered = svc.getInstances(10);
        } catch (IOException e) {
            // Best-effort: do not wait for a cluster that has not started.
            LOG.info("Cannot update cluster information; ignoring", e);
            return false;
        }
        int totalExecutors = 0;
        int unconfiguredNodes = 0;
        for (LlapServiceInstance instance : registered.getAll()) {
            // Inactive placeholders are not expected from getAll, but skip them defensively.
            if (instance instanceof InactiveServiceInstance) {
                continue;
            }
            Map<String, String> properties = instance.getProperties();
            if (properties == null) {
                unconfiguredNodes++;
                continue;
            }
            try {
                String rawExecutors = properties.get(ConfVars.LLAP_DAEMON_NUM_EXECUTORS.varname);
                int executorsOnNode = Integer.parseInt(rawExecutors);
                totalExecutors += executorsOnNode;
                // The per-node values are only filled in while they still hold the -1 sentinel.
                if (numExecutorsPerNode == -1) {
                    numExecutorsPerNode = executorsOnNode;
                }
                if (memoryPerInstance == -1) {
                    // presumably converts megabytes to bytes - confirm the unit of getMemorySize
                    memoryPerInstance = instance.getResource().getMemorySize() * 1024L * 1024L;
                }
            } catch (NumberFormatException e) {
                // A missing or malformed executor count means the node has no usable configuration.
                unconfiguredNodes++;
            }
        }
        noConfigNodeCount = unconfiguredNodes;
        executorCount = totalExecutors;
        lastClusterUpdateNs = System.nanoTime();
        return true;
    }
}
Also used : InactiveServiceInstance(org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance) LlapServiceInstance(org.apache.hadoop.hive.llap.registry.LlapServiceInstance) IOException(java.io.IOException)

Example 3 with InactiveServiceInstance

Use of org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance in project hive by apache.

In the class LlapTaskSchedulerService, method getResourceAvailability.

/**
 * Summarizes the resources of the known LLAP instances and which nodes can currently take work.
 *
 * <p>Runs under {@code readLock}. Memory and vcores are summed over every instance that has a
 * {@code NodeInfo} and is not an {@code InactiveServiceInstance}. Every host with a
 * {@code NodeInfo} gets an entry in the returned map, even if its list ends up empty; only nodes
 * that can accept a task are added to the lists. If no node can accept a task,
 * {@code isClusterCapacityFull} is set to {@code true}.
 *
 * @return a pair of the total resource and an insertion-ordered map from host name to the nodes
 *         on that host that can accept a task
 */
private Pair<Resource, Map<String, List<NodeInfo>>> getResourceAvailability() {
    int totalMemory = 0;
    int totalVcores = 0;
    int countedInstances = 0;
    Map<String, List<NodeInfo>> hostToFreeNodes;
    readLock.lock();
    try {
        // maintain insertion order (needed for Next slot in locality miss)
        hostToFreeNodes = new LinkedHashMap<>(instanceToNodeMap.size());
        Collection<LlapServiceInstance> candidates;
        if (consistentSplits) {
            // might also include Inactive instances
            candidates = activeInstances.getAllInstancesOrdered(true);
        } else {
            // if consistent splits are NOT used we don't need the ordering as there will be no cache benefit anyways
            candidates = activeInstances.getAll();
        }
        boolean anyFreeSlot = false;
        for (LlapServiceInstance candidate : candidates) {
            NodeInfo node = instanceToNodeMap.get(candidate.getWorkerIdentity());
            if (node == null) {
                LOG.warn("Null NodeInfo when attempting to get available resources for " + candidate.getWorkerIdentity());
                continue;
            }
            // Hosts exist in the map even for nodes that do not currently have resources.
            List<NodeInfo> nodesOnHost = hostToFreeNodes.get(node.getHost());
            if (nodesOnHost == null) {
                nodesOnHost = new ArrayList<>();
                hostToFreeNodes.put(node.getHost(), nodesOnHost);
            }
            if (candidate instanceof InactiveServiceInstance) {
                // Inactive instances keep their host entry but contribute no resources.
                continue;
            }
            Resource resource = candidate.getResource();
            totalMemory += resource.getMemory();
            totalVcores += resource.getVirtualCores();
            countedInstances++;
            if (node.canAcceptTask()) {
                anyFreeSlot = true;
                nodesOnHost.add(node);
            }
        }
        // Mark the cluster as full when no node can accept a task, so callers can bail out early.
        if (!anyFreeSlot) {
            isClusterCapacityFull.set(true);
        }
    } finally {
        readLock.unlock();
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Available resources: numInstancesFound={}, totalMem={}, totalVcores={} availableHosts: {}", countedInstances, totalMemory, totalVcores, hostToFreeNodes.size());
    }
    return new ImmutablePair<>(Resource.newInstance(totalMemory, totalVcores), hostToFreeNodes);
}
Also used : Resource(org.apache.hadoop.yarn.api.records.Resource) LlapServiceInstance(org.apache.hadoop.hive.llap.registry.LlapServiceInstance) InactiveServiceInstance(org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList)

Example 4 with InactiveServiceInstance

Use of org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance in project hive by apache.

In the class TestUtils, method testGetSplitLocationProvider.

/**
 * Verifies that {@code Utils.getCustomSplitLocationProvider} copes with a mix of instance types
 * (inactive, dynamic with executors, dynamic with zero executors, fixed) and that the resulting
 * provider reports only the expected location in each scenario.
 */
@Test
public void testGetSplitLocationProvider() throws IOException, URISyntaxException {
    // The registry is mocked to return this list; it is mutated between the two scenarios below.
    List<LlapServiceInstance> serviceInstances = new ArrayList<>(3);

    // One inactive instance, which must not cause problems.
    LlapServiceInstance inactiveInstance = new InactiveServiceInstance(INACTIVE);
    serviceInstances.add(inactiveInstance);

    HiveConf zkConf = new HiveConf();
    zkConf.set(HiveConf.ConfVars.HIVE_ZOOKEEPER_QUORUM.varname, "localhost");
    LlapZookeeperRegistryImpl zkRegistry = new LlapZookeeperRegistryImpl("dyn", zkConf);

    // Endpoints shared by both dynamic service records.
    Endpoint llapRpc = RegistryTypeUtils.ipcEndpoint("llap", new InetSocketAddress(ACTIVE, 4000));
    Endpoint shuffleEp = RegistryTypeUtils.ipcEndpoint("shuffle", new InetSocketAddress(ACTIVE, 4000));
    Endpoint managementEp = RegistryTypeUtils.ipcEndpoint("llapmng", new InetSocketAddress(ACTIVE, 4000));
    Endpoint outputFormatEp = RegistryTypeUtils.ipcEndpoint("llapoutputformat", new InetSocketAddress(ACTIVE, 4000));
    Endpoint servicesEp = RegistryTypeUtils.webEndpoint("services", new URI(ACTIVE + ":4000"));

    // One active dynamic instance with enabled executors.
    ServiceRecord activeRecord = new ServiceRecord();
    activeRecord.addInternalEndpoint(llapRpc);
    activeRecord.addInternalEndpoint(shuffleEp);
    activeRecord.addInternalEndpoint(managementEp);
    activeRecord.addInternalEndpoint(outputFormatEp);
    activeRecord.addExternalEndpoint(servicesEp);
    activeRecord.set(LlapRegistryService.LLAP_DAEMON_NUM_ENABLED_EXECUTORS, 10);
    activeRecord.set(HiveConf.ConfVars.LLAP_DAEMON_MEMORY_PER_INSTANCE_MB.varname, 100);
    LlapZookeeperRegistryImpl.DynamicServiceInstance activeInstance = zkRegistry.new DynamicServiceInstance(activeRecord);
    serviceInstances.add(activeInstance);

    // One dynamic instance copied from the active record but with 0 enabled executors.
    ServiceRecord zeroExecutorRecord = new ServiceRecord(activeRecord);
    zeroExecutorRecord.set(LlapRegistryService.LLAP_DAEMON_NUM_ENABLED_EXECUTORS, 0);
    LlapZookeeperRegistryImpl.DynamicServiceInstance zeroExecutorInstance = zkRegistry.new DynamicServiceInstance(zeroExecutorRecord);
    zeroExecutorInstance.setHost(DISABLED);
    serviceInstances.add(zeroExecutorInstance);

    when(mockRegistry.getInstances()).thenReturn(mockInstanceSet);
    when(mockInstanceSet.getAllInstancesOrdered(anyBoolean())).thenReturn(serviceInstances);

    // Scenario 1: only the active dynamic instance should be reported as a location.
    SplitLocationProvider locationProvider = Utils.getCustomSplitLocationProvider(mockRegistry, LOG);
    assertLocations((HostAffinitySplitLocationProvider) locationProvider, new String[] { ACTIVE });

    // Scenario 2: swap the active dynamic instance for a fixed-registry instance.
    LlapFixedRegistryImpl fixedRegistry = new LlapFixedRegistryImpl("llap", new HiveConf());
    LlapServiceInstance fixedInstance = fixedRegistry.new FixedServiceInstance(FIXED);
    serviceInstances.remove(activeInstance);
    serviceInstances.add(fixedInstance);
    locationProvider = Utils.getCustomSplitLocationProvider(mockRegistry, LOG);
    assertLocations((HostAffinitySplitLocationProvider) locationProvider, new String[] { FIXED });
}
Also used : InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) LlapServiceInstance(org.apache.hadoop.hive.llap.registry.LlapServiceInstance) LlapFixedRegistryImpl(org.apache.hadoop.hive.llap.registry.impl.LlapFixedRegistryImpl) URI(java.net.URI) ServiceRecord(org.apache.hadoop.registry.client.types.ServiceRecord) InactiveServiceInstance(org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance) Endpoint(org.apache.hadoop.registry.client.types.Endpoint) LlapZookeeperRegistryImpl(org.apache.hadoop.hive.llap.registry.impl.LlapZookeeperRegistryImpl) HiveConf(org.apache.hadoop.hive.conf.HiveConf) SplitLocationProvider(org.apache.hadoop.mapred.split.SplitLocationProvider) Test(org.junit.Test)

Aggregations

LlapServiceInstance (org.apache.hadoop.hive.llap.registry.LlapServiceInstance)4 InactiveServiceInstance (org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance)4 ArrayList (java.util.ArrayList)3 IOException (java.io.IOException)1 InetSocketAddress (java.net.InetSocketAddress)1 URI (java.net.URI)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 ImmutablePair (org.apache.commons.lang3.tuple.ImmutablePair)1 HiveConf (org.apache.hadoop.hive.conf.HiveConf)1 LlapFixedRegistryImpl (org.apache.hadoop.hive.llap.registry.impl.LlapFixedRegistryImpl)1 LlapZookeeperRegistryImpl (org.apache.hadoop.hive.llap.registry.impl.LlapZookeeperRegistryImpl)1 SplitLocationProvider (org.apache.hadoop.mapred.split.SplitLocationProvider)1 Endpoint (org.apache.hadoop.registry.client.types.Endpoint)1 ServiceRecord (org.apache.hadoop.registry.client.types.ServiceRecord)1 Resource (org.apache.hadoop.yarn.api.records.Resource)1 Test (org.junit.Test)1