Search in sources :

Example 6 with DrillbitEndpoint

use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.

the class SplittingParallelizer method generateWorkUnits.

  /**
   * Split plan into multiple plans based on parallelization.
   * Ideally it is applicable only to plans with two major fragments: Screen and UnionExchange.
   * But there could be cases where we can remove even multiple exchanges, as in the case of "order by".
   * End goal is to get a single major fragment: Screen with a chain that ends up with a single minor fragment
   * from the Leaf Exchange. This way each plan can run independently without any exchange involvement.
   * @param options
   * @param foremanNode - not really applicable
   * @param queryId
   * @param reader
   * @param rootNode
   * @param planningSet
   * @param session
   * @param queryContextInfo
   * @return
   * @throws ExecutionSetupException
   */
// Produces one QueryWorkUnit per split plan: the width of the leaf Exchange fragment decides how many
// plans are generated, and each plan is materialized through ExchangeRemoverMaterializer.
// NOTE(review): this listing looks truncated by extraction - some statements and most closing braces are
// missing (e.g. the right-hand side of "Wrapper wrapper =;"), so it does not compile as shown.
private List<QueryWorkUnit> generateWorkUnits(OptionList options, DrillbitEndpoint foremanNode, QueryId queryId, PhysicalPlanReader reader, Fragment rootNode, PlanningSet planningSet, UserSession session, QueryContextInformation queryContextInfo) throws ExecutionSetupException {
    // now we generate all the individual plan fragments and associated assignments. Note, we need all endpoints
    // assigned before we can materialize, so we start a new loop here rather than utilizing the previous one.
    List<QueryWorkUnit> workUnits = Lists.newArrayList();
    // Stays 0 unless a leaf fragment rooted at an Exchange is found in the first pass below.
    int plansCount = 0;
    DrillbitEndpoint[] endPoints = null;
    long initialAllocation = 0;
    long maxAllocation = 0;
    // First pass over the planning set: collect width, per-plan allocation and endpoints from the leaf fragment.
    final Iterator<Wrapper> iter = planningSet.iterator();
    while (iter.hasNext()) {
        Wrapper wrapper =;
        Fragment node = wrapper.getNode();
        // A fragment with no receiving exchange pairs is treated as a leaf.
        boolean isLeafFragment = node.getReceivingExchangePairs().size() == 0;
        final PhysicalOperator physicalOperatorRoot = node.getRoot();
        // get all the needed info from leaf fragment
        if ((physicalOperatorRoot instanceof Exchange) && isLeafFragment) {
            // need to get info about
            // number of minor fragments
            // assignedEndPoints
            // allocation
            plansCount = wrapper.getWidth();
            // Split the wrapper's allocations evenly across the plans (integer division); a zero allocation stays zero.
            initialAllocation = (wrapper.getInitialAllocation() != 0) ? wrapper.getInitialAllocation() / plansCount : 0;
            maxAllocation = (wrapper.getMaxAllocation() != 0) ? wrapper.getMaxAllocation() / plansCount : 0;
            // One endpoint per minor fragment of the leaf, indexed by minor fragment id.
            endPoints = new DrillbitEndpoint[plansCount];
            for (int mfId = 0; mfId < plansCount; mfId++) {
                endPoints[mfId] = wrapper.getAssignedEndpoint(mfId);
    // plansCount is still 0 when no matching leaf fragment was seen (or its width was 0): delegate to the
    // single-plan generateWorkUnit and return just that one unit.
    if (plansCount == 0) {
        // no exchange, return list of single QueryWorkUnit
        workUnits.add(generateWorkUnit(options, foremanNode, queryId, reader, rootNode, planningSet, session, queryContextInfo));
        return workUnits;
    // Second pass: build the split plans from the fragments in the planning set.
    for (Wrapper wrapper : planningSet) {
        Fragment node = wrapper.getNode();
        final PhysicalOperator physicalOperatorRoot = node.getRoot();
        if (physicalOperatorRoot instanceof Exchange) {
            // get to 0 MajorFragment
        // Identity comparison: the root is the very Fragment instance passed in as rootNode.
        boolean isRootNode = rootNode == node;
        if (isRootNode && wrapper.getWidth() != 1) {
            throw new ForemanSetupException(String.format("Failure while trying to setup fragment. " + "The root fragment must always have parallelization one. In the current case, the width was set to %d.", wrapper.getWidth()));
        // this fragment is always leaf, as we are removing all the exchanges
        boolean isLeafFragment = true;
        // NOTE(review): the FragmentHandle builder chain below is incomplete in this listing.
        FragmentHandle handle = //
        wrapper.getMajorFragmentId()).setMinorFragmentId(// minor fragment ID is going to be always 0, as plan will be split
        // Create a minorFragment for each major fragment.
        for (int minorFragmentId = 0; minorFragmentId < plansCount; minorFragmentId++) {
            // those fragments should be empty
            List<PlanFragment> fragments = Lists.newArrayList();
            PlanFragment rootFragment = null;
            FragmentRoot rootOperator = null;
            IndexedFragmentNode iNode = new IndexedFragmentNode(minorFragmentId, wrapper);
            // two visitors here
            // 1. To remove exchange
            // 2. To reset operator IDs as exchanges were removed
            PhysicalOperator op = physicalOperatorRoot.accept(ExchangeRemoverMaterializer.INSTANCE, iNode).accept(OperatorIdVisitor.INSTANCE, 0);
            // The visitors must hand back a FragmentRoot; checkArgument fails otherwise, before the cast.
            Preconditions.checkArgument(op instanceof FragmentRoot);
            FragmentRoot root = (FragmentRoot) op;
            // get plan as JSON
            String plan;
            String optionsData;
            try {
                plan = reader.writeJson(root);
                optionsData = reader.writeJson(options);
            } catch (JsonProcessingException e) {
                // Wrap with the original exception as cause so the serialization failure is not lost.
                throw new ForemanSetupException("Failure while trying to convert fragment into json.", e);
            // NOTE(review): the PlanFragment builder chain below is incomplete in this listing.
            PlanFragment fragment = //
            initialAllocation).setMemMax(// TODO - for some reason OOM is using leaf fragment max allocation divided by width
            if (isRootNode) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Root fragment:\n {}", DrillStringUtils.unescapeJava(fragment.toString()));
                rootFragment = fragment;
                rootOperator = root;
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("Remote fragment:\n {}", DrillStringUtils.unescapeJava(fragment.toString()));
                // NOTE(review): the format string below has no %s placeholder, so String.format ignores the
                // extra argument and the fragment text never appears in the exception message.
                throw new ForemanSetupException(String.format("There should not be non-root/remote fragment present in plan split, but there is:", DrillStringUtils.unescapeJava(fragment.toString())));
            // fragments should be always empty here
            workUnits.add(new QueryWorkUnit(rootOperator, rootFragment, fragments));
    return workUnits;
Also used : Wrapper(org.apache.drill.exec.planner.fragment.Wrapper) QueryWorkUnit(org.apache.drill.exec.work.QueryWorkUnit) FragmentRoot(org.apache.drill.exec.physical.base.FragmentRoot) FragmentHandle(org.apache.drill.exec.proto.ExecProtos.FragmentHandle) PlanFragment(org.apache.drill.exec.proto.BitControl.PlanFragment) Fragment(org.apache.drill.exec.planner.fragment.Fragment) IndexedFragmentNode(org.apache.drill.exec.planner.fragment.Materializer.IndexedFragmentNode) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) PlanFragment(org.apache.drill.exec.proto.BitControl.PlanFragment) Exchange(org.apache.drill.exec.physical.base.Exchange) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) PhysicalOperator(org.apache.drill.exec.physical.base.PhysicalOperator) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) ForemanSetupException(org.apache.drill.exec.work.foreman.ForemanSetupException)

Example 7 with DrillbitEndpoint

use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.

the class SoftAffinityFragmentParallelizer method findEndpoints.

// Assign endpoints based on the given endpoint list, affinity map and width.
// Endpoints with affinity are considered first (up to "affinedSlots" of them); any remaining slots up to
// "width" are filled from a cycling iterator over the shuffled non-affinity endpoints.
// Throws PhysicalOperatorSetupException when width is smaller than the number of mandatory nodes.
// NOTE(review): this listing looks truncated by extraction - loop bodies and closing braces are missing
// (e.g. "EndpointAffinity ea =;"), so it does not compile as shown.
private List<DrillbitEndpoint> findEndpoints(final Collection<DrillbitEndpoint> activeEndpoints, final Map<DrillbitEndpoint, EndpointAffinity> endpointAffinityMap, final int width, final ParallelizationParameters parameters) throws PhysicalOperatorSetupException {
    final List<DrillbitEndpoint> endpoints = Lists.newArrayList();
    if (endpointAffinityMap.size() > 0) {
        // Get EndpointAffinity list sorted in descending order of affinity values
        List<EndpointAffinity> sortedAffinityList = ENDPOINT_AFFINITY_ORDERING.immutableSortedCopy(endpointAffinityMap.values());
        // Find the number of mandatory nodes (nodes with +infinity affinity).
        // NOTE(review): the statements that update numRequiredNodes are missing from this listing; as shown
        // the counter is never incremented.
        int numRequiredNodes = 0;
        for (EndpointAffinity ep : sortedAffinityList) {
            if (ep.isAssignmentRequired()) {
            } else {
                // of non-mandatory node
        if (width < numRequiredNodes) {
            throw new PhysicalOperatorSetupException("Can not parallelize the fragment as the parallelization width (" + width + ") is " + "less than the number of mandatory nodes (" + numRequiredNodes + " nodes with +INFINITE affinity).");
        // Find the maximum number of slots which should go to endpoints with affinity (See DRILL-825 for details)
        // Computed as ceil(affinityFactor * width / #activeEndpoints) * #affinityEntries, with a floor of 1.
        int affinedSlots = Math.max(1, (int) (Math.ceil((double) parameters.getAffinityFactor() * width / activeEndpoints.size()) * sortedAffinityList.size()));
        // Make sure affined slots is at least the number of mandatory nodes
        affinedSlots = Math.max(affinedSlots, numRequiredNodes);
        // Cap the affined slots to max parallelization width
        affinedSlots = Math.min(affinedSlots, width);
        // Cycling iterator over the sorted list, so endpoints can be revisited until enough are selected.
        Iterator<EndpointAffinity> affinedEPItr = Iterators.cycle(sortedAffinityList);
        // Keep adding until we have selected "affinedSlots" number of endpoints.
        while (endpoints.size() < affinedSlots) {
            EndpointAffinity ea =;
    // add remaining endpoints if required
    if (endpoints.size() < width) {
        // Get a list of endpoints that are not part of the affinity endpoint list
        List<DrillbitEndpoint> endpointsWithNoAffinity;
        final Set<DrillbitEndpoint> endpointsWithAffinity = endpointAffinityMap.keySet();
        if (endpointAffinityMap.size() > 0) {
            endpointsWithNoAffinity = Lists.newArrayList();
            // NOTE(review): the statement that adds "ep" to endpointsWithNoAffinity is missing from this listing.
            for (DrillbitEndpoint ep : activeEndpoints) {
                if (!endpointsWithAffinity.contains(ep)) {
        } else {
            // Need to create a copy instead of an
            endpointsWithNoAffinity = Lists.newArrayList(activeEndpoints);
        // immutable copy, because we need to shuffle the list (next statement) and Collections.shuffle() doesn't
        // support immutable copy as input.
        // round robin with random start.
        Collections.shuffle(endpointsWithNoAffinity, ThreadLocalRandom.current());
        // When every active endpoint has affinity, fall back to cycling over the affinity endpoints instead.
        Iterator<DrillbitEndpoint> otherEPItr = Iterators.cycle(endpointsWithNoAffinity.size() > 0 ? endpointsWithNoAffinity : endpointsWithAffinity);
        // NOTE(review): the body that pulls from otherEPItr is missing from this listing.
        while (endpoints.size() < width) {
    return endpoints;
Also used : DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) PhysicalOperatorSetupException(org.apache.drill.exec.physical.PhysicalOperatorSetupException) EndpointAffinity(org.apache.drill.exec.physical.EndpointAffinity) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint)

Example 8 with DrillbitEndpoint

use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.

the class HardAffinityFragmentParallelizer method parallelizeFragment.

// Chooses a parallelization width for the fragment and distributes that many slots over the endpoints whose
// affinity entry requires assignment. Throws PhysicalOperatorSetupException when the slots cannot be placed.
// NOTE(review): this listing looks truncated by extraction - closing braces, the inner assignment loop body
// and the tail of the method are missing, so it does not compile as shown.
public void parallelizeFragment(final Wrapper fragmentWrapper, final ParallelizationParameters parameters, final Collection<DrillbitEndpoint> activeEndpoints) throws PhysicalOperatorSetupException {
    final Stats stats = fragmentWrapper.getStats();
    final ParallelizationInfo pInfo = stats.getParallelizationInfo();
    // Sum of min(maxWidthPerNode, endpoint max width) over the mandatory endpoints; used as a cap in step 1.6.
    int totalMaxWidth = 0;
    // Go through the affinity map and extract the endpoints that have mandatory assignment requirement
    final Map<DrillbitEndpoint, EndpointAffinity> endpointPool = Maps.newHashMap();
    for (Entry<DrillbitEndpoint, EndpointAffinity> entry : pInfo.getEndpointAffinityMap().entrySet()) {
        if (entry.getValue().isAssignmentRequired()) {
            endpointPool.put(entry.getKey(), entry.getValue());
            // Limit the max width of the endpoint to allowed max width.
            totalMaxWidth += Math.min(parameters.getMaxWidthPerNode(), entry.getValue().getMaxWidth());
            // int addition wraps around silently in Java, so a negative sum is used to detect overflow.
            if (totalMaxWidth < 0) {
                // If the totalWidth overflows, just keep it at the max value.
                totalMaxWidth = Integer.MAX_VALUE;
    // Step 1: Find the width taking into various parameters
    // 1.1. Find the parallelization based on cost. Use max cost of all operators in this fragment; this is consistent
    //      with the calculation that ExcessiveExchangeRemover uses.
    int width = (int) Math.ceil(stats.getMaxCost() / parameters.getSliceTarget());
    // 1.2. Make sure the width is at least the number of endpoints that require an assignment
    width = Math.max(endpointPool.size(), width);
    // 1.3. Cap the parallelization width by fragment level width limit and system level per query width limit
    width = Math.max(1, Math.min(width, pInfo.getMaxWidth()));
    // After the cap, the mandatory endpoints must still fit into the width.
    checkOrThrow(endpointPool.size() <= width, logger, "Number of mandatory endpoints ({}) that require an assignment is more than the allowed fragment max " + "width ({}).", endpointPool.size(), pInfo.getMaxWidth());
    // 1.4 Cap the parallelization width by global max query width
    width = Math.max(1, Math.min(width, parameters.getMaxGlobalWidth()));
    checkOrThrow(endpointPool.size() <= width, logger, "Number of mandatory endpoints ({}) that require an assignment is more than the allowed global query " + "width ({}).", endpointPool.size(), parameters.getMaxGlobalWidth());
    // 1.5 Cap the parallelization width by max allowed parallelization per node
    width = Math.max(1, Math.min(width, endpointPool.size() * parameters.getMaxWidthPerNode()));
    // 1.6 Cap the parallelization width by total of max allowed width per node. The reason is if we the width is more,
    // we end up allocating more work units to one or more endpoints that don't have those many work units.
    // NOTE(review): unlike the earlier caps there is no Math.max(1, ...) here; as far as this listing shows,
    // an empty endpointPool leaves totalMaxWidth at 0 and therefore width at 0 - confirm this is intended.
    width = Math.min(totalMaxWidth, width);
    // Step 2: Select the endpoints
    // Maps each selected endpoint to the number of slots assigned to it so far.
    final Map<DrillbitEndpoint, Integer> endpoints = Maps.newHashMap();
    // 2.1 First add each endpoint from the pool once so that the mandatory assignment requirement is fulfilled.
    for (Entry<DrillbitEndpoint, EndpointAffinity> entry : endpointPool.entrySet()) {
        endpoints.put(entry.getKey(), 1);
    int totalAssigned = endpoints.size();
    // 2.2 Assign the remaining slots to endpoints proportional to the affinity of each endpoint
    int remainingSlots = width - endpoints.size();
    while (remainingSlots > 0) {
        for (EndpointAffinity epAf : endpointPool.values()) {
            // Extra slots offered to this endpoint in this round: ceil(affinity * remainingSlots).
            final int moreAllocation = (int) Math.ceil(epAf.getAffinity() * remainingSlots);
            int currentAssignments = endpoints.get(epAf.getEndpoint());
            // Stops early once the width is reached or the endpoint hits either per-node limit.
            // NOTE(review): the loop body is missing from this listing; presumably it increments
            // totalAssigned and currentAssignments - verify against the full source.
            for (int i = 0; i < moreAllocation && totalAssigned < width && currentAssignments < parameters.getMaxWidthPerNode() && currentAssignments < epAf.getMaxWidth(); i++) {
            endpoints.put(epAf.getEndpoint(), currentAssignments);
        final int previousRemainingSlots = remainingSlots;
        remainingSlots = width - totalAssigned;
        // A full pass that assigned nothing means no endpoint can take more work; fail instead of looping forever.
        if (previousRemainingSlots == remainingSlots) {
            logger.error("Can't parallelize fragment: " + "Every mandatory node has exhausted the maximum width per node limit." + EOL + "Endpoint pool: {}" + EOL + "Assignment so far: {}" + EOL + "Width: {}", endpointPool, endpoints, width);
            throw new PhysicalOperatorSetupException("Can not parallelize fragment.");
    // Iterates each endpoint once per assigned slot; the loop body and the rest of the method are missing
    // from this listing.
    final List<DrillbitEndpoint> assignedEndpoints = Lists.newArrayList();
    for (Entry<DrillbitEndpoint, Integer> entry : endpoints.entrySet()) {
        for (int i = 0; i < entry.getValue(); i++) {
Also used : DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) PhysicalOperatorSetupException(org.apache.drill.exec.physical.PhysicalOperatorSetupException) EndpointAffinity(org.apache.drill.exec.physical.EndpointAffinity) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint)

Example 9 with DrillbitEndpoint

use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.

the class ServiceEngine method start.

// Binds the user server, builds a partial DrillbitEndpoint (address, user port, version) and passes it
// through controller.start and dataPool.start; the endpoint returned by dataPool.start is the result.
// Throws DrillbitStartupException when distributed mode is on and hostName resolves to a loopback address.
// NOTE(review): closing braces are missing from this listing, so it does not compile as shown.
public DrillbitEndpoint start() throws DrillbitStartupException, UnknownHostException {
    // loopback address check
    if (isDistributedMode && InetAddress.getByName(hostName).isLoopbackAddress()) {
        throw new DrillbitStartupException("Drillbit is disallowed to bind to loopback address in distributed mode.");
    // bind() returns the port that is then advertised in the endpoint as the user port.
    final int userPort = userServer.bind(intialUserPort, allowPortHunting);
    DrillbitEndpoint partialEndpoint = DrillbitEndpoint.newBuilder().setAddress(hostName).setUserPort(userPort).setVersion(DrillVersionInfo.getVersion()).build();
    // Each start() call takes the endpoint built so far and returns a new one.
    partialEndpoint = controller.start(partialEndpoint, allowPortHunting);
    return dataPool.start(partialEndpoint, allowPortHunting);
Also used : DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) DrillbitStartupException(org.apache.drill.exec.exception.DrillbitStartupException) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint)

Example 10 with DrillbitEndpoint

use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.

the class Foreman method setupNonRootFragments.

  /**
   * Set up the non-root fragments for execution. Some may be local, and some may be remote.
   * Messages are sent immediately, so they may start returning data even before we complete this.
   * @param fragments the fragments
   * @throws ForemanException
   */
// Groups the given fragments by target endpoint into leaf and intermediate sets, sends the intermediate
// ones first and waits for their responses, then sends the leaf ones without waiting.
// NOTE(review): this listing looks truncated by extraction - block-comment delimiters, closing braces and
// some statements are missing, so it does not compile as shown.
private void setupNonRootFragments(final Collection<PlanFragment> fragments) throws ForemanException {
    if (fragments.isEmpty()) {
        // nothing to do here
     * We will send a single message to each endpoint, regardless of how many fragments will be
     * executed there. We need to start up the intermediate fragments first so that they will be
     * ready once the leaf fragments start producing data. To satisfy both of these, we will
     * make a pass through the fragments and put them into these two maps according to their
     * leaf/intermediate state, as well as their target drillbit.
    final Multimap<DrillbitEndpoint, PlanFragment> leafFragmentMap = ArrayListMultimap.create();
    final Multimap<DrillbitEndpoint, PlanFragment> intFragmentMap = ArrayListMultimap.create();
    // record all fragments for status purposes.
    for (final PlanFragment planFragment : fragments) {
        // NOTE(review): this trace line says "intermediate" but runs for every fragment, leaf ones included.
        logger.trace("Tracking intermediate remote node {} with data {}", planFragment.getAssignment(), planFragment.getFragmentJson());
        queryManager.addFragmentStatusTracker(planFragment, false);
        if (planFragment.getLeafFragment()) {
            leafFragmentMap.put(planFragment.getAssignment(), planFragment);
        } else {
            intFragmentMap.put(planFragment.getAssignment(), planFragment);
     * We need to wait for the intermediates to be sent so that they'll be set up by the time
     * the leaves start producing data. We'll use this latch to wait for the responses.
     * However, in order not to hang the process if any of the RPC requests fails, we always
     * count down (see FragmentSubmitFailures), but we count the number of failures so that we'll
     * know if any submissions did fail.
    // Despite the name, this is the number of distinct endpoints that receive intermediate fragments
    // (size of the multimap key set), not the number of fragments.
    final int numIntFragments = intFragmentMap.keySet().size();
    final ExtendedLatch endpointLatch = new ExtendedLatch(numIntFragments);
    final FragmentSubmitFailures fragmentSubmitFailures = new FragmentSubmitFailures();
    // send remote intermediate fragments
    for (final DrillbitEndpoint ep : intFragmentMap.keySet()) {
        sendRemoteFragments(ep, intFragmentMap.get(ep), endpointLatch, fragmentSubmitFailures);
    // The wait budget grows linearly with the number of endpoints contacted.
    final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * numIntFragments;
    if (numIntFragments > 0 && !endpointLatch.awaitUninterruptibly(timeout)) {
        // The remaining latch count is the number of endpoints that have not responded yet.
        long numberRemaining = endpointLatch.getCount();
        throw UserException.connectionError().message("Exceeded timeout (%d) while waiting send intermediate work fragments to remote nodes. " + "Sent %d and only heard response back from %d nodes.", timeout, numIntFragments, numIntFragments - numberRemaining).build(logger);
    // if any of the intermediate fragment submissions failed, fail the query
    final List<FragmentSubmitFailures.SubmissionException> submissionExceptions = fragmentSubmitFailures.submissionExceptions;
    if (submissionExceptions.size() > 0) {
        // The set de-duplicates endpoints so that each failing node is handled once in the loop below.
        Set<DrillbitEndpoint> endpoints = Sets.newHashSet();
        StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (FragmentSubmitFailures.SubmissionException e : fragmentSubmitFailures.submissionExceptions) {
            DrillbitEndpoint endpoint = e.drillbitEndpoint;
            if (endpoints.add(endpoint)) {
                // ", " is appended before every entry except the first.
                // NOTE(review): the statement that appends the endpoint itself to sb is missing from this listing.
                if (first) {
                    first = false;
                } else {
                    sb.append(", ");
        // Only the first submission's rpcException is passed to connectionError; the node list goes in as context.
        throw UserException.connectionError(submissionExceptions.get(0).rpcException).message("Error setting up remote intermediate fragment execution").addContext("Nodes with failures", sb.toString()).build(logger);
    injector.injectChecked(queryContext.getExecutionControls(), "send-fragments", ForemanException.class);
     * Send the remote (leaf) fragments; we don't wait for these. Any problems will come in through
     * the regular sendListener event delivery.
    for (final DrillbitEndpoint ep : leafFragmentMap.keySet()) {
        // null is passed for both the latch and the failure collector on the leaf sends.
        sendRemoteFragments(ep, leafFragmentMap.get(ep), null, null);
Also used : PlanFragment(org.apache.drill.exec.proto.BitControl.PlanFragment) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) ExtendedLatch(org.apache.drill.common.concurrent.ExtendedLatch) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint)


DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint)77 Test (org.junit.Test)23 EndpointAffinity (org.apache.drill.exec.physical.EndpointAffinity)14 IOException ( Stopwatch ( ArrayList (java.util.ArrayList)7 PlanFragment (org.apache.drill.exec.proto.BitControl.PlanFragment)7 ServerName (org.apache.hadoop.hbase.ServerName)7 HRegionInfo (org.apache.hadoop.hbase.HRegionInfo)6 Entry (java.util.Map.Entry)5 DrillConfig (org.apache.drill.common.config.DrillConfig)5 FragmentHandle (org.apache.drill.exec.proto.ExecProtos.FragmentHandle)5 DrillbitContext (org.apache.drill.exec.server.DrillbitContext)5 HBaseGroupScan ( HBaseScanSpec ( QueryWorkUnit ( JsonProcessingException (com.fasterxml.jackson.core.JsonProcessingException)4 HashMap (java.util.HashMap)4 AtomicLong (java.util.concurrent.atomic.AtomicLong)4 NonStrictExpectations (mockit.NonStrictExpectations)4