Use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.
In class SplittingParallelizer, method generateWorkUnits:
/**
* Split plan into multiple plans based on parallelization
* Ideally it is applicable only to plans with two major fragments: Screen and UnionExchange,
* but in some cases multiple exchanges can be removed, for example in "order by" plans.
* The end goal is a single major fragment: a Screen whose operator chain ends in a single minor fragment
* of the leaf Exchange. This way each plan can run independently without any exchange involvement.
* @param options
* @param foremanNode - not really applicable
* @param queryId
* @param reader
* @param rootNode
* @param planningSet
* @param session
* @param queryContextInfo
* @return list of QueryWorkUnit objects, one per split plan
* @throws ExecutionSetupException
*/
private List<QueryWorkUnit> generateWorkUnits(OptionList options, DrillbitEndpoint foremanNode, QueryId queryId, PhysicalPlanReader reader, Fragment rootNode, PlanningSet planningSet, UserSession session, QueryContextInformation queryContextInfo) throws ExecutionSetupException {
// now we generate all the individual plan fragments and associated assignments. Note, we need all endpoints
// assigned before we can materialize, so we start a new loop here rather than utilizing the previous one.
List<QueryWorkUnit> workUnits = Lists.newArrayList();
int plansCount = 0;
DrillbitEndpoint[] endPoints = null;
long initialAllocation = 0;
long maxAllocation = 0;
final Iterator<Wrapper> iter = planningSet.iterator();
while (iter.hasNext()) {
Wrapper wrapper = iter.next();
Fragment node = wrapper.getNode();
boolean isLeafFragment = node.getReceivingExchangePairs().size() == 0;
final PhysicalOperator physicalOperatorRoot = node.getRoot();
// get all the needed info from leaf fragment
if ((physicalOperatorRoot instanceof Exchange) && isLeafFragment) {
// need to get info about
// number of minor fragments
// assignedEndPoints
// allocation
plansCount = wrapper.getWidth();
initialAllocation = (wrapper.getInitialAllocation() != 0) ? wrapper.getInitialAllocation() / plansCount : 0;
maxAllocation = (wrapper.getMaxAllocation() != 0) ? wrapper.getMaxAllocation() / plansCount : 0;
endPoints = new DrillbitEndpoint[plansCount];
for (int mfId = 0; mfId < plansCount; mfId++) {
endPoints[mfId] = wrapper.getAssignedEndpoint(mfId);
}
}
}
if (plansCount == 0) {
// no exchange: return a list containing a single QueryWorkUnit
workUnits.add(generateWorkUnit(options, foremanNode, queryId, reader, rootNode, planningSet, session, queryContextInfo));
return workUnits;
}
for (Wrapper wrapper : planningSet) {
Fragment node = wrapper.getNode();
final PhysicalOperator physicalOperatorRoot = node.getRoot();
if (physicalOperatorRoot instanceof Exchange) {
// skip fragments rooted at an Exchange; only major fragment 0 is processed
continue;
}
boolean isRootNode = rootNode == node;
if (isRootNode && wrapper.getWidth() != 1) {
throw new ForemanSetupException(String.format("Failure while trying to setup fragment. " + "The root fragment must always have parallelization one. In the current case, the width was set to %d.", wrapper.getWidth()));
}
// this fragment is always a leaf, as we are removing all the exchanges
boolean isLeafFragment = true;
FragmentHandle handle = FragmentHandle.newBuilder()
    .setMajorFragmentId(wrapper.getMajorFragmentId())
    // minor fragment ID is always 0, as the plan will be split
    .setMinorFragmentId(0)
    .setQueryId(queryId)
    .build();
// Create a minorFragment for each major fragment.
for (int minorFragmentId = 0; minorFragmentId < plansCount; minorFragmentId++) {
// those fragments should be empty
List<PlanFragment> fragments = Lists.newArrayList();
PlanFragment rootFragment = null;
FragmentRoot rootOperator = null;
IndexedFragmentNode iNode = new IndexedFragmentNode(minorFragmentId, wrapper);
wrapper.resetAllocation();
// two visitors here
// 1. To remove exchange
// 2. To reset operator IDs as exchanges were removed
PhysicalOperator op = physicalOperatorRoot.accept(ExchangeRemoverMaterializer.INSTANCE, iNode).accept(OperatorIdVisitor.INSTANCE, 0);
Preconditions.checkArgument(op instanceof FragmentRoot);
FragmentRoot root = (FragmentRoot) op;
// get plan as JSON
String plan;
String optionsData;
try {
plan = reader.writeJson(root);
optionsData = reader.writeJson(options);
} catch (JsonProcessingException e) {
throw new ForemanSetupException("Failure while trying to convert fragment into json.", e);
}
PlanFragment fragment = PlanFragment.newBuilder()
    .setForeman(endPoints[minorFragmentId])
    .setFragmentJson(plan)
    .setHandle(handle)
    .setAssignment(endPoints[minorFragmentId])
    .setLeafFragment(isLeafFragment)
    .setContext(queryContextInfo)
    .setMemInitial(initialAllocation)
    // TODO - for some reason OOM is using leaf fragment max allocation divided by width
    .setMemMax(wrapper.getMaxAllocation())
    .setOptionsJson(optionsData)
    .setCredentials(session.getCredentials())
    .addAllCollector(CountRequiredFragments.getCollectors(root))
    .build();
if (isRootNode) {
if (logger.isDebugEnabled()) {
logger.debug("Root fragment:\n {}", DrillStringUtils.unescapeJava(fragment.toString()));
}
rootFragment = fragment;
rootOperator = root;
} else {
if (logger.isDebugEnabled()) {
logger.debug("Remote fragment:\n {}", DrillStringUtils.unescapeJava(fragment.toString()));
}
throw new ForemanSetupException(String.format("There should not be non-root/remote fragment present in plan split, but there is:", DrillStringUtils.unescapeJava(fragment.toString())));
}
// fragments should always be empty here
workUnits.add(new QueryWorkUnit(rootOperator, rootFragment, fragments));
}
}
return workUnits;
}
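To make the per-split bookkeeping above concrete, here is a minimal standalone sketch (plain Java with made-up numbers, not Drill's actual API) of how the leaf Exchange's width drives the number of independent plans and how the wrapper's memory allocations are divided evenly across them:

// Standalone illustration only: plansCount, allocations and endpoint names are hypothetical values.
public class SplitAllocationSketch {
  public static void main(String[] args) {
    int plansCount = 4;                 // width of the leaf Exchange (one split plan per minor fragment)
    long wrapperInitialAllocation = 8_000_000L;
    long wrapperMaxAllocation = 40_000_000L;
    String[] assignedEndpoints = {"node1:31010", "node2:31010", "node3:31010", "node4:31010"};

    // Each split plan gets an equal share of the leaf fragment's memory budget.
    long initialAllocation = wrapperInitialAllocation != 0 ? wrapperInitialAllocation / plansCount : 0;
    long maxAllocation = wrapperMaxAllocation != 0 ? wrapperMaxAllocation / plansCount : 0;

    for (int minorFragmentId = 0; minorFragmentId < plansCount; minorFragmentId++) {
      // In the real code each iteration produces one QueryWorkUnit whose single fragment
      // is assigned to endPoints[minorFragmentId] and whose minor fragment id is 0.
      System.out.printf("plan %d -> endpoint=%s, memInitial=%d, memMax=%d%n",
          minorFragmentId, assignedEndpoints[minorFragmentId], initialAllocation, maxAllocation);
    }
  }
}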
Use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.
In class SoftAffinityFragmentParallelizer, method findEndpoints:
// Assign endpoints based on the given endpoint list, affinity map and width.
private List<DrillbitEndpoint> findEndpoints(final Collection<DrillbitEndpoint> activeEndpoints, final Map<DrillbitEndpoint, EndpointAffinity> endpointAffinityMap, final int width, final ParallelizationParameters parameters) throws PhysicalOperatorSetupException {
final List<DrillbitEndpoint> endpoints = Lists.newArrayList();
if (endpointAffinityMap.size() > 0) {
// Get EndpointAffinity list sorted in descending order of affinity values
List<EndpointAffinity> sortedAffinityList = ENDPOINT_AFFINITY_ORDERING.immutableSortedCopy(endpointAffinityMap.values());
// Find the number of mandatory nodes (nodes with +infinity affinity).
int numRequiredNodes = 0;
for (EndpointAffinity ep : sortedAffinityList) {
if (ep.isAssignmentRequired()) {
numRequiredNodes++;
} else {
// the list is sorted by affinity in descending order, so stop at the first non-mandatory node
break;
}
}
if (width < numRequiredNodes) {
throw new PhysicalOperatorSetupException("Can not parallelize the fragment as the parallelization width (" + width + ") is " + "less than the number of mandatory nodes (" + numRequiredNodes + " nodes with +INFINITE affinity).");
}
// Find the maximum number of slots which should go to endpoints with affinity (See DRILL-825 for details)
int affinedSlots = Math.max(1, (int) (Math.ceil((double) parameters.getAffinityFactor() * width / activeEndpoints.size()) * sortedAffinityList.size()));
// Make sure affined slots is at least the number of mandatory nodes
affinedSlots = Math.max(affinedSlots, numRequiredNodes);
// Cap the affined slots to max parallelization width
affinedSlots = Math.min(affinedSlots, width);
Iterator<EndpointAffinity> affinedEPItr = Iterators.cycle(sortedAffinityList);
// Keep adding until we have selected "affinedSlots" number of endpoints.
while (endpoints.size() < affinedSlots) {
EndpointAffinity ea = affinedEPItr.next();
endpoints.add(ea.getEndpoint());
}
}
// add remaining endpoints if required
if (endpoints.size() < width) {
// Get a list of endpoints that are not part of the affinity endpoint list
List<DrillbitEndpoint> endpointsWithNoAffinity;
final Set<DrillbitEndpoint> endpointsWithAffinity = endpointAffinityMap.keySet();
if (endpointAffinityMap.size() > 0) {
endpointsWithNoAffinity = Lists.newArrayList();
for (DrillbitEndpoint ep : activeEndpoints) {
if (!endpointsWithAffinity.contains(ep)) {
endpointsWithNoAffinity.add(ep);
}
}
} else {
// Need to create a mutable copy rather than an immutable one, because the list is shuffled below
// and Collections.shuffle() doesn't accept an immutable list as input.
endpointsWithNoAffinity = Lists.newArrayList(activeEndpoints);
}
// round robin with random start.
Collections.shuffle(endpointsWithNoAffinity, ThreadLocalRandom.current());
Iterator<DrillbitEndpoint> otherEPItr = Iterators.cycle(endpointsWithNoAffinity.size() > 0 ? endpointsWithNoAffinity : endpointsWithAffinity);
while (endpoints.size() < width) {
endpoints.add(otherEPItr.next());
}
}
return endpoints;
}
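The DRILL-825 slot calculation above is just arithmetic; the following standalone sketch (hypothetical inputs, not Drill code) shows how affinedSlots is derived and then clamped between the number of mandatory nodes and the requested width:

// Standalone illustration of the affined-slots formula; all inputs are made-up example values.
public class AffinedSlotsSketch {
  public static void main(String[] args) {
    double affinityFactor = 1.2;   // analogous to parameters.getAffinityFactor()
    int width = 10;                // requested parallelization width
    int activeEndpoints = 5;       // analogous to activeEndpoints.size()
    int endpointsWithAffinity = 3; // analogous to sortedAffinityList.size()
    int numRequiredNodes = 1;      // endpoints with +infinity affinity

    int affinedSlots = Math.max(1,
        (int) (Math.ceil(affinityFactor * width / activeEndpoints) * endpointsWithAffinity));
    affinedSlots = Math.max(affinedSlots, numRequiredNodes); // at least the mandatory nodes
    affinedSlots = Math.min(affinedSlots, width);            // never more than the total width

    System.out.println("affinedSlots = " + affinedSlots);    // prints 9 for these inputs
  }
}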
Use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.
In class HardAffinityFragmentParallelizer, method parallelizeFragment:
@Override
public void parallelizeFragment(final Wrapper fragmentWrapper, final ParallelizationParameters parameters, final Collection<DrillbitEndpoint> activeEndpoints) throws PhysicalOperatorSetupException {
final Stats stats = fragmentWrapper.getStats();
final ParallelizationInfo pInfo = stats.getParallelizationInfo();
int totalMaxWidth = 0;
// Go through the affinity map and extract the endpoints that have mandatory assignment requirement
final Map<DrillbitEndpoint, EndpointAffinity> endpointPool = Maps.newHashMap();
for (Entry<DrillbitEndpoint, EndpointAffinity> entry : pInfo.getEndpointAffinityMap().entrySet()) {
if (entry.getValue().isAssignmentRequired()) {
endpointPool.put(entry.getKey(), entry.getValue());
// Limit the max width of the endpoint to allowed max width.
totalMaxWidth += Math.min(parameters.getMaxWidthPerNode(), entry.getValue().getMaxWidth());
if (totalMaxWidth < 0) {
// If totalMaxWidth overflows, keep it at the maximum value.
totalMaxWidth = Integer.MAX_VALUE;
}
}
}
// Step 1: Find the width, taking various parameters into account
// 1.1. Find the parallelization based on cost. Use max cost of all operators in this fragment; this is consistent
// with the calculation that ExcessiveExchangeRemover uses.
int width = (int) Math.ceil(stats.getMaxCost() / parameters.getSliceTarget());
// 1.2. Make sure the width is at least the number of endpoints that require an assignment
width = Math.max(endpointPool.size(), width);
// 1.3. Cap the parallelization width by fragment level width limit and system level per query width limit
width = Math.max(1, Math.min(width, pInfo.getMaxWidth()));
checkOrThrow(endpointPool.size() <= width, logger, "Number of mandatory endpoints ({}) that require an assignment is more than the allowed fragment max " + "width ({}).", endpointPool.size(), pInfo.getMaxWidth());
// 1.4 Cap the parallelization width by global max query width
width = Math.max(1, Math.min(width, parameters.getMaxGlobalWidth()));
checkOrThrow(endpointPool.size() <= width, logger, "Number of mandatory endpoints ({}) that require an assignment is more than the allowed global query " + "width ({}).", endpointPool.size(), parameters.getMaxGlobalWidth());
// 1.5 Cap the parallelization width by max allowed parallelization per node
width = Math.max(1, Math.min(width, endpointPool.size() * parameters.getMaxWidthPerNode()));
// 1.6 Cap the parallelization width by the total of the max allowed widths per node. If the width were larger,
// we would end up assigning more work units to one or more endpoints than they can accept.
width = Math.min(totalMaxWidth, width);
// Step 2: Select the endpoints
final Map<DrillbitEndpoint, Integer> endpoints = Maps.newHashMap();
// 2.1 First add each endpoint from the pool once so that the mandatory assignment requirement is fulfilled.
for (Entry<DrillbitEndpoint, EndpointAffinity> entry : endpointPool.entrySet()) {
endpoints.put(entry.getKey(), 1);
}
int totalAssigned = endpoints.size();
// 2.2 Assign the remaining slots to endpoints proportional to the affinity of each endpoint
int remainingSlots = width - endpoints.size();
while (remainingSlots > 0) {
for (EndpointAffinity epAf : endpointPool.values()) {
final int moreAllocation = (int) Math.ceil(epAf.getAffinity() * remainingSlots);
int currentAssignments = endpoints.get(epAf.getEndpoint());
for (int i = 0; i < moreAllocation && totalAssigned < width && currentAssignments < parameters.getMaxWidthPerNode() && currentAssignments < epAf.getMaxWidth(); i++) {
totalAssigned++;
currentAssignments++;
}
endpoints.put(epAf.getEndpoint(), currentAssignments);
}
final int previousRemainingSlots = remainingSlots;
remainingSlots = width - totalAssigned;
if (previousRemainingSlots == remainingSlots) {
logger.error("Can't parallelize fragment: " + "Every mandatory node has exhausted the maximum width per node limit." + EOL + "Endpoint pool: {}" + EOL + "Assignment so far: {}" + EOL + "Width: {}", endpointPool, endpoints, width);
throw new PhysicalOperatorSetupException("Can not parallelize fragment.");
}
}
final List<DrillbitEndpoint> assignedEndpoints = Lists.newArrayList();
for (Entry<DrillbitEndpoint, Integer> entry : endpoints.entrySet()) {
for (int i = 0; i < entry.getValue(); i++) {
assignedEndpoints.add(entry.getKey());
}
}
fragmentWrapper.setWidth(width);
fragmentWrapper.assignEndpoints(assignedEndpoints);
}
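The width computation in steps 1.1 through 1.6 is a chain of max/min clamps; a minimal standalone sketch with invented parameter values might look like this:

// Standalone illustration of the width-capping sequence; every input below is hypothetical.
public class HardAffinityWidthSketch {
  public static void main(String[] args) {
    double maxCost = 500_000;        // analogous to stats.getMaxCost()
    long sliceTarget = 100_000;      // analogous to parameters.getSliceTarget()
    int mandatoryEndpoints = 2;      // analogous to endpointPool.size()
    int fragmentMaxWidth = 20;       // analogous to pInfo.getMaxWidth()
    int maxGlobalWidth = 100;        // analogous to parameters.getMaxGlobalWidth()
    int maxWidthPerNode = 4;         // analogous to parameters.getMaxWidthPerNode()
    int totalMaxWidth = 8;           // sum of per-endpoint max widths, capped per node

    int width = (int) Math.ceil(maxCost / sliceTarget);                          // 1.1 cost-based width (5)
    width = Math.max(mandatoryEndpoints, width);                                 // 1.2 at least the mandatory endpoints
    width = Math.max(1, Math.min(width, fragmentMaxWidth));                      // 1.3 cap by fragment max width
    width = Math.max(1, Math.min(width, maxGlobalWidth));                        // 1.4 cap by global query width
    width = Math.max(1, Math.min(width, mandatoryEndpoints * maxWidthPerNode));  // 1.5 cap by per-node limit
    width = Math.min(totalMaxWidth, width);                                      // 1.6 cap by total endpoint capacity

    System.out.println("final width = " + width);                                // prints 5 for these inputs
  }
}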
Use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.
In class ServiceEngine, method start:
public DrillbitEndpoint start() throws DrillbitStartupException, UnknownHostException {
// loopback address check
if (isDistributedMode && InetAddress.getByName(hostName).isLoopbackAddress()) {
throw new DrillbitStartupException("Drillbit is disallowed to bind to loopback address in distributed mode.");
}
final int userPort = userServer.bind(intialUserPort, allowPortHunting);
DrillbitEndpoint partialEndpoint = DrillbitEndpoint.newBuilder().setAddress(hostName).setUserPort(userPort).setVersion(DrillVersionInfo.getVersion()).build();
partialEndpoint = controller.start(partialEndpoint, allowPortHunting);
return dataPool.start(partialEndpoint, allowPortHunting);
}
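The loopback guard at the top of start() relies only on the JDK; a self-contained sketch of the same check (with a hypothetical host name) is:

import java.net.InetAddress;
import java.net.UnknownHostException;

// Standalone illustration of the loopback check; "localhost" is just an example host name.
public class LoopbackCheckSketch {
  public static void main(String[] args) throws UnknownHostException {
    String hostName = "localhost";
    boolean isDistributedMode = true;
    if (isDistributedMode && InetAddress.getByName(hostName).isLoopbackAddress()) {
      // In ServiceEngine.start() this condition triggers a DrillbitStartupException.
      System.out.println("Refusing to bind to a loopback address in distributed mode: " + hostName);
    }
  }
}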
Use of org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint in project drill by apache.
In class Foreman, method setupNonRootFragments:
/**
* Set up the non-root fragments for execution. Some may be local, and some may be remote.
* Messages are sent immediately, so they may start returning data even before we complete this.
*
* @param fragments the fragments
* @throws ForemanException
*/
private void setupNonRootFragments(final Collection<PlanFragment> fragments) throws ForemanException {
if (fragments.isEmpty()) {
// nothing to do here
return;
}
/*
* We will send a single message to each endpoint, regardless of how many fragments will be
* executed there. We need to start up the intermediate fragments first so that they will be
* ready once the leaf fragments start producing data. To satisfy both of these, we will
* make a pass through the fragments and put them into these two maps according to their
* leaf/intermediate state, as well as their target drillbit.
*/
final Multimap<DrillbitEndpoint, PlanFragment> leafFragmentMap = ArrayListMultimap.create();
final Multimap<DrillbitEndpoint, PlanFragment> intFragmentMap = ArrayListMultimap.create();
// record all fragments for status purposes.
for (final PlanFragment planFragment : fragments) {
logger.trace("Tracking intermediate remote node {} with data {}", planFragment.getAssignment(), planFragment.getFragmentJson());
queryManager.addFragmentStatusTracker(planFragment, false);
if (planFragment.getLeafFragment()) {
leafFragmentMap.put(planFragment.getAssignment(), planFragment);
} else {
intFragmentMap.put(planFragment.getAssignment(), planFragment);
}
}
/*
* We need to wait for the intermediates to be sent so that they'll be set up by the time
* the leaves start producing data. We'll use this latch to wait for the responses.
*
* However, in order not to hang the process if any of the RPC requests fails, we always
* count down (see FragmentSubmitFailures), but we count the number of failures so that we'll
* know if any submissions did fail.
*/
final int numIntFragments = intFragmentMap.keySet().size();
final ExtendedLatch endpointLatch = new ExtendedLatch(numIntFragments);
final FragmentSubmitFailures fragmentSubmitFailures = new FragmentSubmitFailures();
// send remote intermediate fragments
for (final DrillbitEndpoint ep : intFragmentMap.keySet()) {
sendRemoteFragments(ep, intFragmentMap.get(ep), endpointLatch, fragmentSubmitFailures);
}
final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * numIntFragments;
if (numIntFragments > 0 && !endpointLatch.awaitUninterruptibly(timeout)) {
long numberRemaining = endpointLatch.getCount();
throw UserException.connectionError().message("Exceeded timeout (%d) while waiting to send intermediate work fragments to remote nodes. " + "Sent %d and only heard responses back from %d nodes.", timeout, numIntFragments, numIntFragments - numberRemaining).build(logger);
}
// if any of the intermediate fragment submissions failed, fail the query
final List<FragmentSubmitFailures.SubmissionException> submissionExceptions = fragmentSubmitFailures.submissionExceptions;
if (submissionExceptions.size() > 0) {
Set<DrillbitEndpoint> endpoints = Sets.newHashSet();
StringBuilder sb = new StringBuilder();
boolean first = true;
for (FragmentSubmitFailures.SubmissionException e : fragmentSubmitFailures.submissionExceptions) {
DrillbitEndpoint endpoint = e.drillbitEndpoint;
if (endpoints.add(endpoint)) {
if (first) {
first = false;
} else {
sb.append(", ");
}
sb.append(endpoint.getAddress());
}
}
throw UserException.connectionError(submissionExceptions.get(0).rpcException).message("Error setting up remote intermediate fragment execution").addContext("Nodes with failures", sb.toString()).build(logger);
}
injector.injectChecked(queryContext.getExecutionControls(), "send-fragments", ForemanException.class);
/*
* Send the remote (leaf) fragments; we don't wait for these. Any problems will come in through
* the regular sendListener event delivery.
*/
for (final DrillbitEndpoint ep : leafFragmentMap.keySet()) {
sendRemoteFragments(ep, leafFragmentMap.get(ep), null, null);
}
}
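The wait-for-intermediates pattern (always count the latch down, but record failures separately so the latch can never hang) can be sketched with plain java.util.concurrent; the endpoint names and timeout below are invented for illustration and are not Drill's actual RPC machinery:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

// Standalone sketch of the "count down on success AND failure" pattern used for intermediate fragments.
public class FragmentSubmitLatchSketch {
  public static void main(String[] args) throws InterruptedException {
    List<String> intermediateEndpoints = List.of("node1:31011", "node2:31011"); // hypothetical endpoints
    CountDownLatch endpointLatch = new CountDownLatch(intermediateEndpoints.size());
    List<String> submissionFailures = new ArrayList<>();

    for (String endpoint : intermediateEndpoints) {
      // Simulated RPC callback: count down whether the submission succeeded or failed,
      // so a failed submission never leaves the latch waiting forever.
      new Thread(() -> {
        try {
          boolean ok = !endpoint.startsWith("badnode");
          if (!ok) {
            synchronized (submissionFailures) {
              submissionFailures.add(endpoint);
            }
          }
        } finally {
          endpointLatch.countDown();
        }
      }).start();
    }

    long timeoutMs = 5_000L * intermediateEndpoints.size(); // analogous to RPC_WAIT_IN_MSECS_PER_FRAGMENT * n
    if (!endpointLatch.await(timeoutMs, TimeUnit.MILLISECONDS)) {
      System.out.println("Timed out waiting for intermediate fragment submissions");
    } else if (!submissionFailures.isEmpty()) {
      System.out.println("Failed submissions: " + submissionFailures);
    } else {
      System.out.println("All intermediate fragments submitted");
    }
  }
}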