Search in sources :

Example 41 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class Execution method scheduleOrUpdateConsumers.

/**
 * Schedules, or sends partition updates to, the consumers of a result partition.
 *
 * <p>Only the first consumer group is processed. For each edge in that group the action
 * depends on the state of the consumer's current execution attempt:
 * <ul>
 *   <li>{@code CREATED}: the partition info is cached on the consumer vertex and the vertex
 *       is scheduled asynchronously on {@code executor};</li>
 *   <li>{@code RUNNING}: an {@code InputChannelDeploymentDescriptor} (local or remote) is
 *       built and sent to the consumer via an update RPC;</li>
 *   <li>{@code SCHEDULED} or {@code DEPLOYING}: the partition info is cached on the
 *       consumer vertex;</li>
 *   <li>any other state: nothing is done for that edge.</li>
 * </ul>
 *
 * @param allConsumers the consumer groups, each a list of edges to consumers. An empty list
 *                     is a no-op; more than one group triggers {@code fail(...)} with an
 *                     {@link IllegalStateException}.
 */
void scheduleOrUpdateConsumers(List<List<ExecutionEdge>> allConsumers) {
    final int numConsumers = allConsumers.size();
    if (numConsumers > 1) {
        // NOTE(review): there is no return after fail(...). Unless fail(...) throws, the loop
        // below still processes the first consumer group -- confirm this is intended.
        fail(new IllegalStateException("Currently, only a single consumer group per partition is supported."));
    } else if (numConsumers == 0) {
        return;
    }
    for (ExecutionEdge edge : allConsumers.get(0)) {
        final ExecutionVertex consumerVertex = edge.getTarget();
        final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
        // Snapshot of the consumer state; the state may change concurrently, hence the
        // "double check" re-reads further below.
        final ExecutionState consumerState = consumer.getState();
        final IntermediateResultPartition partition = edge.getSource();
        // ----------------------------------------------------------------
        // Consumer is created => cache the partition info and schedule the consumer
        // ----------------------------------------------------------------
        if (consumerState == CREATED) {
            final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
            consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
            // When deploying a consuming task, its task deployment descriptor will contain all
            // deployment information available at the respective time. It is possible that some
            // of the partitions to be consumed have not been created yet. These are updated at
            // runtime via the update messages.
            //
            // TODO The current approach may send many update messages even though the consuming
            // task has already been deployed with all necessary information. We have to check
            // whether this is a problem and fix it, if it is.
            FlinkFuture.supplyAsync(new Callable<Void>() {

                @Override
                public Void call() throws Exception {
                    try {
                        consumerVertex.scheduleForExecution(consumerVertex.getExecutionGraph().getSlotProvider(), consumerVertex.getExecutionGraph().isQueuedSchedulingAllowed());
                    } catch (Throwable t) {
                        // Any scheduling problem fails the consumer vertex; the original
                        // throwable is kept as the cause.
                        consumerVertex.fail(new IllegalStateException("Could not schedule consumer " + "vertex " + consumerVertex, t));
                    }
                    return null;
                }
            }, executor);
            // double check to resolve race conditions: the vertex may already be RUNNING by
            // now, in which case the partition infos are sent right away
            // (presumably the ones cached above -- verify against sendPartitionInfos()).
            if (consumerVertex.getExecutionState() == RUNNING) {
                consumerVertex.sendPartitionInfos();
            }
        } else // ----------------------------------------------------------------
        // Consumer is running => send update message now
        // ----------------------------------------------------------------
        {
            if (consumerState == RUNNING) {
                final SimpleSlot consumerSlot = consumer.getAssignedResource();
                if (consumerSlot == null) {
                    // The consumer has been reset concurrently
                    continue;
                }
                // NOTE(review): the producer's assigned resource is dereferenced without a
                // null check, unlike the consumer slot above -- confirm it cannot be null here.
                final TaskManagerLocation partitionTaskManagerLocation = partition.getProducer().getCurrentAssignedResource().getTaskManagerLocation();
                final ResourceID partitionTaskManager = partitionTaskManagerLocation.getResourceID();
                final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
                // The partition is identified by its partition id plus this execution's attemptId.
                final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), attemptId);
                final ResultPartitionLocation partitionLocation;
                if (consumerTaskManager.equals(partitionTaskManager)) {
                    // Consuming task is deployed to the same instance as the partition => local
                    partitionLocation = ResultPartitionLocation.createLocal();
                } else {
                    // Different instances => remote
                    final ConnectionID connectionId = new ConnectionID(partitionTaskManagerLocation, partition.getIntermediateResult().getConnectionIndex());
                    partitionLocation = ResultPartitionLocation.createRemote(connectionId);
                }
                final InputChannelDeploymentDescriptor descriptor = new InputChannelDeploymentDescriptor(partitionId, partitionLocation);
                consumer.sendUpdatePartitionInfoRpcCall(Collections.singleton(new PartitionInfo(partition.getIntermediateResult().getId(), descriptor)));
            } else // ----------------------------------------------------------------
            // Consumer is scheduled or deploying => cache the partition info
            // ----------------------------------------------------------------
            if (consumerState == SCHEDULED || consumerState == DEPLOYING) {
                final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
                consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
                // double check to resolve race conditions
                if (consumerVertex.getExecutionState() == RUNNING) {
                    consumerVertex.sendPartitionInfos();
                }
            }
            // Any other consumer state: nothing is done for this edge.
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) CoLocationConstraint(org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint) TimeoutException(java.util.concurrent.TimeoutException) JobException(org.apache.flink.runtime.JobException) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) ResultPartitionLocation(org.apache.flink.runtime.deployment.ResultPartitionLocation) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) PartialInputChannelDeploymentDescriptor(org.apache.flink.runtime.deployment.PartialInputChannelDeploymentDescriptor) InputChannelDeploymentDescriptor(org.apache.flink.runtime.deployment.InputChannelDeploymentDescriptor) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID)

Example 42 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class SlotPool method offerSlot.

/**
 * Handles a slot offered by a TaskManager, identified by the slot's AllocationID.
 *
 * <p>The offer is rejected when the offering TaskManager is not registered with this pool.
 * An offer for a slot that is already tracked (allocated or available) is acknowledged
 * without further action. Otherwise the slot either fulfills the pending request stored
 * under the same AllocationID, or is added to the available slots.
 *
 * @param slot The offered slot
 * @return {@code true} if the offer is accepted (including repeated offers), {@code false}
 *         if it comes from an unregistered TaskManager
 */
@RpcMethod
public boolean offerSlot(final AllocatedSlot slot) {
    validateRunsInMainThread();

    final ResourceID offeringTaskManager = slot.getTaskManagerId();
    final AllocationID offeredAllocation = slot.getSlotAllocationId();

    // reject offers from TaskManagers that are not registered here
    final boolean taskManagerKnown = registeredTaskManagers.contains(offeringTaskManager);
    if (!taskManagerKnown) {
        LOG.debug("Received outdated slot offering [{}] from unregistered TaskManager: {}", offeredAllocation, slot);
        return false;
    }

    // a slot that is already tracked is a repeated offer; acknowledge it again
    final boolean alreadyTracked = allocatedSlots.contains(offeredAllocation) || availableSlots.contains(offeredAllocation);
    if (alreadyTracked) {
        LOG.debug("Received repeated offer for slot [{}]. Ignoring.", offeredAllocation);
        return true;
    }

    final PendingRequest waitingRequest = pendingRequests.remove(offeredAllocation);
    if (waitingRequest == null) {
        // nobody is waiting for this slot:
        //   - the request may have been fulfilled by another slot in the meantime
        //   - or TaskManagers are offering their slots after this pool became leader
        // The slot is stored with the current relative time
        // (presumably so that idle slots can be timed out later -- verify against availableSlots).
        availableSlots.add(slot, clock.relativeTimeMillis());
        return true;
    }

    // a request was waiting for exactly this allocation: complete it and track the slot
    final SimpleSlot fulfilledSlot = createSimpleSlot(slot, Locality.UNKNOWN);
    waitingRequest.future().complete(fulfilledSlot);
    allocatedSlots.add(fulfilledSlot);
    return true;
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 43 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class SlotSharingGroupAssignment method releaseSimpleSlot.

// ------------------------------------------------------------------------
//  Slot releasing
// ------------------------------------------------------------------------
/**
 * Releases the given simple slot from this assignment group.
 *
 * <p>The work is done under {@code lock} and only if both {@code markCancelled()} and
 * {@code markReleased()} on the slot succeed. The slot is then removed from its parent
 * shared slot. If the parent still has children and the slot carried a group ID, the parent
 * is put back into the available slots of that group. If the parent has no children left,
 * it is cancelled and disposed.
 *
 * @param simpleSlot The SimpleSlot to be released
 * @throws IllegalStateException if the slot is still alive after being marked cancelled, or
 *         if no availability map exists for the slot's group
 * @throws IllegalArgumentException if the slot has a group ID but its parent is not tracked
 *         by this assignment group
 */
void releaseSimpleSlot(SimpleSlot simpleSlot) {
    synchronized (lock) {
        // proceed only if this call manages to mark the slot as cancelled
        if (!simpleSlot.markCancelled()) {
            return;
        }

        // sanity check: a cancelled slot must not report itself as alive
        if (simpleSlot.isAlive()) {
            throw new IllegalStateException("slot is still alive");
        }

        // nothing to do if the slot had already been released
        if (!simpleSlot.markReleased()) {
            return;
        }

        LOG.debug("Release simple slot {}.", simpleSlot);

        final AbstractID owningGroup = simpleSlot.getGroupID();
        final SharedSlot sharedParent = simpleSlot.getParent();
        final boolean hasGroup = owningGroup != null;

        // with a group ID, the parent shared slot has to be tracked by this assignment
        if (hasGroup && !allSlots.contains(sharedParent)) {
            throw new IllegalArgumentException("Slot was not associated with this SlotSharingGroup before.");
        }

        final int childrenLeft = sharedParent.removeDisposedChildSlot(simpleSlot);
        if (childrenLeft <= 0) {
            // the parent shared slot is empty now and can be released as well
            sharedParent.markCancelled();
            internalDisposeEmptySharedSlot(sharedParent);
            return;
        }

        // without a group ID the slot belongs to a co-location group and nothing
        // becomes immediately available
        if (!hasGroup) {
            return;
        }

        // the parent becomes available for the slot's group again
        final Map<ResourceID, List<SharedSlot>> groupAvailability = availableSlotsPerJid.get(owningGroup);
        if (groupAvailability == null) {
            throw new IllegalStateException("Trying to return a slot for group " + owningGroup + " when available slots indicated that all slots were available.");
        }
        putIntoMultiMap(groupAvailability, sharedParent.getTaskManagerID(), sharedParent);
    }
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ArrayList(java.util.ArrayList) List(java.util.List) AbstractID(org.apache.flink.util.AbstractID) CoLocationConstraint(org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint)

Example 44 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class SlotSharingGroupAssignment method getSlotForTaskInternal.

/**
 * Picks a shared slot for a task of the given group, preferring the given locations.
 *
 * <p>If the group has no availability map yet, one is created and filled with all slots of
 * this assignment. Preferred locations are tried first; a live slot found there is returned
 * with {@code Locality.LOCAL}. Otherwise any live available slot is returned, with
 * {@code Locality.NON_LOCAL} if preferred locations were given but missed, or
 * {@code Locality.UNCONSTRAINED} if there were none.
 *
 * @param groupId            the group for which a slot is requested
 * @param preferredLocations preferred TaskManager locations, may be {@code null}
 * @param localOnly          if {@code true} and a preferred location was missed, no other
 *                           slot is considered
 * @return the chosen slot with its locality, or {@code null} if no suitable slot is found
 */
private Tuple2<SharedSlot, Locality> getSlotForTaskInternal(AbstractID groupId, Iterable<TaskManagerLocation> preferredLocations, boolean localOnly) {
    // without any slot in this assignment there is nothing to hand out
    if (allSlots.isEmpty()) {
        return null;
    }

    Map<ResourceID, List<SharedSlot>> candidates = availableSlotsPerJid.get(groupId);
    if (candidates != null) {
        // known group with nothing left for it
        if (candidates.isEmpty()) {
            return null;
        }
    } else {
        // first request for this group: every slot is available to it
        candidates = new LinkedHashMap<>();
        availableSlotsPerJid.put(groupId, candidates);
        for (SharedSlot knownSlot : allSlots) {
            putIntoMultiMap(candidates, knownSlot.getTaskManagerID(), knownSlot);
        }
    }

    // try the preferred locations first
    boolean missedPreferred = false;
    if (preferredLocations != null) {
        for (TaskManagerLocation preferred : preferredLocations) {
            // set on every iteration; it only matters if no preferred slot is returned below
            missedPreferred = true;
            final SharedSlot localSlot = removeFromMultiMap(candidates, preferred.getResourceID());
            if (localSlot != null && localSlot.isAlive()) {
                return new Tuple2<>(localSlot, Locality.LOCAL);
            }
        }
    }

    // a local-only request that missed its preferred locations ends here
    if (missedPreferred && localOnly) {
        return null;
    }

    final Locality fallbackLocality;
    if (missedPreferred) {
        fallbackLocality = Locality.NON_LOCAL;
    } else {
        fallbackLocality = Locality.UNCONSTRAINED;
    }

    // take any available slot, skipping (and thereby dropping) slots that are not alive
    for (SharedSlot polled = pollFromMultiMap(candidates); polled != null; polled = pollFromMultiMap(candidates)) {
        if (polled.isAlive()) {
            return new Tuple2<>(polled, fallbackLocality);
        }
    }

    // every remaining slot was dead
    return null;
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Locality(org.apache.flink.runtime.jobmanager.scheduler.Locality) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) Tuple2(org.apache.flink.api.java.tuple.Tuple2) ArrayList(java.util.ArrayList) List(java.util.List)

Example 45 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class SlotSharingGroupAssignment method addSharedSlotAndAllocateSubSlot.

/**
 * Adds a shared slot to this assignment group and allocates a sub slot from it.
 *
 * <p>Without a co-location constraint, the sub slot is allocated directly from the shared
 * slot for {@code groupId}. With a constraint, a nested shared slot is allocated for the
 * constraint's group first and the sub slot is taken from that nested slot. On success the
 * shared slot is announced as available to every other group already known to this
 * assignment, and an (empty) availability entry is ensured for the group itself.
 *
 * @param sharedSlot the slot to add; must be an empty root slot
 * @param locality   the locality to set on the allocated sub slot
 * @param groupId    the group for which the sub slot is allocated when no constraint is given
 * @param constraint the co-location constraint, or {@code null} if there is none
 * @return the allocated sub slot, or {@code null} if the shared slot is no longer alive or
 *         no sub slot could be allocated
 * @throws IllegalArgumentException if the slot is not an empty root slot or is already part
 *         of this assignment group
 * @throws IllegalStateException if the constraint already has a live slot assigned
 */
private SimpleSlot addSharedSlotAndAllocateSubSlot(SharedSlot sharedSlot, Locality locality, JobVertexID groupId, CoLocationConstraint constraint) {
    // sanity check before taking the lock
    if (!sharedSlot.isRootAndEmpty()) {
        throw new IllegalArgumentException("The given slot is not an empty root slot.");
    }
    final ResourceID location = sharedSlot.getTaskManagerID();

    synchronized (lock) {
        // the slot may have died in the meantime (instance disappeared)
        if (!sharedSlot.isAlive()) {
            return null;
        }

        // register the slot in the overall bookkeeping; it must not be known already
        if (!allSlots.add(sharedSlot)) {
            throw new IllegalArgumentException("Slot was already contained in the assignment group");
        }

        final SimpleSlot allocated;
        final AbstractID availabilityKey;

        if (constraint != null) {
            // sanity check
            if (constraint.isAssignedAndAlive()) {
                throw new IllegalStateException("Trying to add a shared slot to a co-location constraint that has a life slot.");
            }

            // co-location needs a nested shared slot that hosts the co-located tasks
            final SharedSlot coLocationSlot = sharedSlot.allocateSharedSlot(constraint.getGroupId());
            availabilityKey = constraint.getGroupId();

            if (coLocationSlot == null) {
                // should not happen, since the lock also guards slot disposals;
                // handled defensively anyway
                allocated = null;
            } else {
                // sub slots inside the co-location slot carry no group ID of their own
                allocated = coLocationSlot.allocateSubSlot(null);
                if (allocated == null) {
                    // no sub slot could be created: release the co-location slot. This also
                    // implicitly releases the slot just added, because its last child is
                    // released. That is expected and desired.
                    coLocationSlot.releaseSlot();
                } else {
                    // success: hand the slot to the constraint.
                    // NOTE: the location constraint is not locked here, because it is not
                    // yet known whether the slot will actually be taken.
                    constraint.setSharedSlot(coLocationSlot);
                }
            }
        } else {
            // plain case: take a sub slot directly from the shared slot
            allocated = sharedSlot.allocateSubSlot(groupId);
            availabilityKey = groupId;
        }

        if (allocated == null) {
            // expected to be rare, since this method is called with a fresh slot
            return null;
        }

        // preserve the locality information
        allocated.setLocality(locality);

        // announce the new shared slot to all other groups, so they can place tasks into it
        boolean ownEntryPresent = false;
        for (Map.Entry<AbstractID, Map<ResourceID, List<SharedSlot>>> groupEntry : availableSlotsPerJid.entrySet()) {
            if (groupEntry.getKey().equals(availabilityKey)) {
                // the group itself already has an entry; the slot is not available to it
                ownEntryPresent = true;
            } else {
                putIntoMultiMap(groupEntry.getValue(), location, sharedSlot);
            }
        }

        // make sure an (empty) entry exists for the group itself
        if (!ownEntryPresent) {
            availableSlotsPerJid.put(availabilityKey, new LinkedHashMap<ResourceID, List<SharedSlot>>());
        }
        return allocated;
    }
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ArrayList(java.util.ArrayList) List(java.util.List) AbstractID(org.apache.flink.util.AbstractID) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Aggregations

ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)74 Test (org.junit.Test)48 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)25 Time (org.apache.flink.api.common.time.Time)18 UUID (java.util.UUID)16 JobID (org.apache.flink.api.common.JobID)16 Configuration (org.apache.flink.configuration.Configuration)14 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)13 JavaTestKit (akka.testkit.JavaTestKit)12 MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry)12 InetAddress (java.net.InetAddress)11 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)10 HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)10 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)10 SlotRequest (org.apache.flink.runtime.resourcemanager.SlotRequest)10 IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager)9 NetworkEnvironment (org.apache.flink.runtime.io.network.NetworkEnvironment)9 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)9 MemoryManager (org.apache.flink.runtime.memory.MemoryManager)9 TestingSerialRpcService (org.apache.flink.runtime.rpc.TestingSerialRpcService)9