Search in sources :

Example 16 with JobVertexID

use of org.apache.flink.runtime.jobgraph.JobVertexID in project flink by apache.

the class ScheduleWithCoLocationHintTest method testSlotReleasedInBetween.

@Test
public void testSlotReleasedInBetween() {
    try {
        JobVertexID jid1 = new JobVertexID();
        JobVertexID jid2 = new JobVertexID();
        Scheduler scheduler = new Scheduler(TestingUtils.directExecutionContext());
        Instance i1 = getRandomInstance(1);
        Instance i2 = getRandomInstance(1);
        TaskManagerLocation loc1 = i1.getTaskManagerLocation();
        TaskManagerLocation loc2 = i2.getTaskManagerLocation();
        scheduler.newInstanceAvailable(i2);
        scheduler.newInstanceAvailable(i1);
        assertEquals(2, scheduler.getNumberOfAvailableSlots());
        SlotSharingGroup sharingGroup = new SlotSharingGroup();
        CoLocationGroup ccg = new CoLocationGroup();
        CoLocationConstraint cc1 = new CoLocationConstraint(ccg);
        CoLocationConstraint cc2 = new CoLocationConstraint(ccg);
        SimpleSlot s1 = scheduler.allocateSlot(new ScheduledUnit(getTestVertexWithLocation(jid1, 0, 2, loc1), sharingGroup, cc1), false).get();
        SimpleSlot s2 = scheduler.allocateSlot(new ScheduledUnit(getTestVertexWithLocation(jid1, 1, 2, loc2), sharingGroup, cc2), false).get();
        s1.releaseSlot();
        s2.releaseSlot();
        assertEquals(2, scheduler.getNumberOfAvailableSlots());
        assertEquals(0, sharingGroup.getTaskAssignment().getNumberOfSlots());
        SimpleSlot s3 = scheduler.allocateSlot(new ScheduledUnit(getTestVertexWithLocation(jid2, 0, 2, loc2), sharingGroup, cc1), false).get();
        SimpleSlot s4 = scheduler.allocateSlot(new ScheduledUnit(getTestVertexWithLocation(jid2, 1, 2, loc1), sharingGroup, cc2), false).get();
        // still preserves the previous instance mapping)
        assertEquals(i1.getTaskManagerID(), s3.getTaskManagerID());
        assertEquals(i2.getTaskManagerID(), s4.getTaskManagerID());
        s3.releaseSlot();
        s4.releaseSlot();
        assertEquals(2, scheduler.getNumberOfAvailableSlots());
        assertEquals(4, scheduler.getNumberOfLocalizedAssignments());
        assertEquals(0, scheduler.getNumberOfNonLocalizedAssignments());
        assertEquals(0, scheduler.getNumberOfUnconstrainedAssignments());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) SchedulerTestUtils.getRandomInstance(org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Example 17 with JobVertexID

use of org.apache.flink.runtime.jobgraph.JobVertexID in project flink by apache.

the class SchedulerSlotSharingTest method testSequentialAllocateAndRelease.

@Test
public void testSequentialAllocateAndRelease() {
    try {
        final JobVertexID jid1 = new JobVertexID();
        final JobVertexID jid2 = new JobVertexID();
        final JobVertexID jid3 = new JobVertexID();
        final JobVertexID jid4 = new JobVertexID();
        final SlotSharingGroup sharingGroup = new SlotSharingGroup(jid1, jid2, jid3, jid4);
        final Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        scheduler.newInstanceAvailable(getRandomInstance(4));
        // allocate something from group 1 and 2 interleaved with schedule for group 3
        SimpleSlot slot_1_1 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid1, 0, 4), sharingGroup), false).get();
        SimpleSlot slot_1_2 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid1, 1, 4), sharingGroup), false).get();
        SimpleSlot slot_2_1 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid2, 0, 4), sharingGroup), false).get();
        SimpleSlot slot_2_2 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid2, 1, 4), sharingGroup), false).get();
        SimpleSlot slot_3 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid3, 0, 1), sharingGroup), false).get();
        SimpleSlot slot_1_3 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid1, 2, 4), sharingGroup), false).get();
        SimpleSlot slot_1_4 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid1, 3, 4), sharingGroup), false).get();
        SimpleSlot slot_2_3 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid2, 2, 4), sharingGroup), false).get();
        SimpleSlot slot_2_4 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid2, 3, 4), sharingGroup), false).get();
        // release groups 1 and 2
        slot_1_1.releaseSlot();
        slot_1_2.releaseSlot();
        slot_1_3.releaseSlot();
        slot_1_4.releaseSlot();
        slot_2_1.releaseSlot();
        slot_2_2.releaseSlot();
        slot_2_3.releaseSlot();
        slot_2_4.releaseSlot();
        // allocate group 4
        SimpleSlot slot_4_1 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid4, 0, 4), sharingGroup), false).get();
        SimpleSlot slot_4_2 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid4, 1, 4), sharingGroup), false).get();
        SimpleSlot slot_4_3 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid4, 2, 4), sharingGroup), false).get();
        SimpleSlot slot_4_4 = scheduler.allocateSlot(new ScheduledUnit(getTestVertex(jid4, 3, 4), sharingGroup), false).get();
        // release groups 3 and 4
        slot_3.releaseSlot();
        slot_4_1.releaseSlot();
        slot_4_2.releaseSlot();
        slot_4_3.releaseSlot();
        slot_4_4.releaseSlot();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Example 18 with JobVertexID

use of org.apache.flink.runtime.jobgraph.JobVertexID in project flink by apache.

the class StreamTaskTest method createTask.

public static Task createTask(Class<? extends AbstractInvokable> invokable, StreamConfig taskConfig, Configuration taskManagerConfig) throws Exception {
    LibraryCacheManager libCache = mock(LibraryCacheManager.class);
    when(libCache.getClassLoader(any(JobID.class))).thenReturn(StreamTaskTest.class.getClassLoader());
    ResultPartitionManager partitionManager = mock(ResultPartitionManager.class);
    ResultPartitionConsumableNotifier consumableNotifier = mock(ResultPartitionConsumableNotifier.class);
    PartitionProducerStateChecker partitionProducerStateChecker = mock(PartitionProducerStateChecker.class);
    Executor executor = mock(Executor.class);
    NetworkEnvironment network = mock(NetworkEnvironment.class);
    when(network.getResultPartitionManager()).thenReturn(partitionManager);
    when(network.getDefaultIOMode()).thenReturn(IOManager.IOMode.SYNC);
    when(network.createKvStateTaskRegistry(any(JobID.class), any(JobVertexID.class))).thenReturn(mock(TaskKvStateRegistry.class));
    JobInformation jobInformation = new JobInformation(new JobID(), "Job Name", new SerializedValue<>(new ExecutionConfig()), new Configuration(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList());
    TaskInformation taskInformation = new TaskInformation(new JobVertexID(), "Test Task", 1, 1, invokable.getName(), taskConfig.getConfiguration());
    return new Task(jobInformation, taskInformation, new ExecutionAttemptID(), new AllocationID(), 0, 0, Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList(), 0, new TaskStateHandles(), mock(MemoryManager.class), mock(IOManager.class), network, mock(BroadcastVariableManager.class), mock(TaskManagerActions.class), mock(InputSplitProvider.class), mock(CheckpointResponder.class), libCache, mock(FileCache.class), new TestingTaskManagerRuntimeInfo(taskManagerConfig, new String[] { System.getProperty("java.io.tmpdir") }), new UnregisteredTaskMetricsGroup(), consumableNotifier, partitionProducerStateChecker, executor);
}
Also used : Task(org.apache.flink.runtime.taskmanager.Task) Configuration(org.apache.flink.configuration.Configuration) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TaskKvStateRegistry(org.apache.flink.runtime.query.TaskKvStateRegistry) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Matchers.anyString(org.mockito.Matchers.anyString) TaskManagerActions(org.apache.flink.runtime.taskmanager.TaskManagerActions) Executor(java.util.concurrent.Executor) TestingTaskManagerRuntimeInfo(org.apache.flink.runtime.util.TestingTaskManagerRuntimeInfo) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) PartitionProducerStateChecker(org.apache.flink.runtime.io.network.netty.PartitionProducerStateChecker) ResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier) InputSplitProvider(org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider) UnregisteredTaskMetricsGroup(org.apache.flink.runtime.operators.testutils.UnregisteredTaskMetricsGroup) JobInformation(org.apache.flink.runtime.executiongraph.JobInformation) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) ResultPartitionManager(org.apache.flink.runtime.io.network.partition.ResultPartitionManager) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) TaskStateHandles(org.apache.flink.runtime.state.TaskStateHandles) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID)

Example 19 with JobVertexID

use of org.apache.flink.runtime.jobgraph.JobVertexID in project flink by apache.

the class SavepointLoader method loadAndValidateSavepoint.

/**
	 * Loads a savepoint back as a {@link CompletedCheckpoint}.
	 *
	 * <p>This method verifies that tasks and parallelism still match the savepoint parameters.
	 *
	 * @param jobId          The JobID of the job to load the savepoint for.
	 * @param tasks          Tasks that will possibly be reset
	 * @param savepointPath  The path of the savepoint to rollback to
	 * @param classLoader    The class loader to resolve serialized classes in legacy savepoint versions.
	 * @param allowNonRestoredState Allow to skip checkpoint state that cannot be mapped
	 * to any job vertex in tasks.
	 *
	 * @throws IllegalStateException If mismatch between program and savepoint state
	 * @throws IOException             If savepoint store failure
	 */
public static CompletedCheckpoint loadAndValidateSavepoint(JobID jobId, Map<JobVertexID, ExecutionJobVertex> tasks, String savepointPath, ClassLoader classLoader, boolean allowNonRestoredState) throws IOException {
    // (1) load the savepoint
    final Tuple2<Savepoint, StreamStateHandle> savepointAndHandle = SavepointStore.loadSavepointWithHandle(savepointPath, classLoader);
    final Savepoint savepoint = savepointAndHandle.f0;
    final StreamStateHandle metadataHandle = savepointAndHandle.f1;
    final Map<JobVertexID, TaskState> taskStates = new HashMap<>(savepoint.getTaskStates().size());
    boolean expandedToLegacyIds = false;
    // (2) validate it (parallelism, etc)
    for (TaskState taskState : savepoint.getTaskStates()) {
        ExecutionJobVertex executionJobVertex = tasks.get(taskState.getJobVertexID());
        // for example as generated from older flink versions, to provide backwards compatibility.
        if (executionJobVertex == null && !expandedToLegacyIds) {
            tasks = ExecutionJobVertex.includeLegacyJobVertexIDs(tasks);
            executionJobVertex = tasks.get(taskState.getJobVertexID());
            expandedToLegacyIds = true;
            LOG.info("Could not find ExecutionJobVertex. Including legacy JobVertexIDs in search.");
        }
        if (executionJobVertex != null) {
            if (executionJobVertex.getMaxParallelism() == taskState.getMaxParallelism() || !executionJobVertex.isMaxParallelismConfigured()) {
                taskStates.put(taskState.getJobVertexID(), taskState);
            } else {
                String msg = String.format("Failed to rollback to savepoint %s. " + "Max parallelism mismatch between savepoint state and new program. " + "Cannot map operator %s with max parallelism %d to new program with " + "max parallelism %d. This indicates that the program has been changed " + "in a non-compatible way after the savepoint.", savepoint, taskState.getJobVertexID(), taskState.getMaxParallelism(), executionJobVertex.getMaxParallelism());
                throw new IllegalStateException(msg);
            }
        } else if (allowNonRestoredState) {
            LOG.info("Skipping savepoint state for operator {}.", taskState.getJobVertexID());
        } else {
            String msg = String.format("Failed to rollback to savepoint %s. " + "Cannot map savepoint state for operator %s to the new program, " + "because the operator is not available in the new program. If " + "you want to allow to skip this, you can set the --allowNonRestoredState " + "option on the CLI.", savepointPath, taskState.getJobVertexID());
            throw new IllegalStateException(msg);
        }
    }
    // (3) convert to checkpoint so the system can fall back to it
    CheckpointProperties props = CheckpointProperties.forStandardSavepoint();
    return new CompletedCheckpoint(jobId, savepoint.getCheckpointId(), 0L, 0L, taskStates, props, metadataHandle, savepointPath);
}
Also used : CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) HashMap(java.util.HashMap) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) TaskState(org.apache.flink.runtime.checkpoint.TaskState) CheckpointProperties(org.apache.flink.runtime.checkpoint.CheckpointProperties)

Example 20 with JobVertexID

use of org.apache.flink.runtime.jobgraph.JobVertexID in project flink by apache.

the class PendingCheckpoint method acknowledgeTask.

/**
	 * Acknowledges the task with the given execution attempt id and the given subtask state.
	 *
	 * @param executionAttemptId of the acknowledged task
	 * @param subtaskState of the acknowledged task
	 * @param metrics Checkpoint metrics for the stats
	 * @return TaskAcknowledgeResult of the operation
	 */
public TaskAcknowledgeResult acknowledgeTask(ExecutionAttemptID executionAttemptId, SubtaskState subtaskState, CheckpointMetrics metrics) {
    synchronized (lock) {
        if (discarded) {
            return TaskAcknowledgeResult.DISCARDED;
        }
        final ExecutionVertex vertex = notYetAcknowledgedTasks.remove(executionAttemptId);
        if (vertex == null) {
            if (acknowledgedTasks.contains(executionAttemptId)) {
                return TaskAcknowledgeResult.DUPLICATE;
            } else {
                return TaskAcknowledgeResult.UNKNOWN;
            }
        } else {
            acknowledgedTasks.add(executionAttemptId);
        }
        JobVertexID jobVertexID = vertex.getJobvertexId();
        int subtaskIndex = vertex.getParallelSubtaskIndex();
        long ackTimestamp = System.currentTimeMillis();
        long stateSize = 0;
        if (null != subtaskState) {
            TaskState taskState = taskStates.get(jobVertexID);
            if (null == taskState) {
                @SuppressWarnings("deprecation") ChainedStateHandle<StreamStateHandle> nonPartitionedState = subtaskState.getLegacyOperatorState();
                ChainedStateHandle<OperatorStateHandle> partitioneableState = subtaskState.getManagedOperatorState();
                //TODO this should go away when we remove chained state, assigning state to operators directly instead
                int chainLength;
                if (nonPartitionedState != null) {
                    chainLength = nonPartitionedState.getLength();
                } else if (partitioneableState != null) {
                    chainLength = partitioneableState.getLength();
                } else {
                    chainLength = 1;
                }
                taskState = new TaskState(jobVertexID, vertex.getTotalNumberOfParallelSubtasks(), vertex.getMaxParallelism(), chainLength);
                taskStates.put(jobVertexID, taskState);
            }
            taskState.putState(subtaskIndex, subtaskState);
            stateSize = subtaskState.getStateSize();
        }
        ++numAcknowledgedTasks;
        // publish the checkpoint statistics
        // to prevent null-pointers from concurrent modification, copy reference onto stack
        final PendingCheckpointStats statsCallback = this.statsCallback;
        if (statsCallback != null) {
            // Do this in millis because the web frontend works with them
            long alignmentDurationMillis = metrics.getAlignmentDurationNanos() / 1_000_000;
            SubtaskStateStats subtaskStateStats = new SubtaskStateStats(subtaskIndex, ackTimestamp, stateSize, metrics.getSyncDurationMillis(), metrics.getAsyncDurationMillis(), metrics.getBytesBufferedInAlignment(), alignmentDurationMillis);
            statsCallback.reportSubtaskStats(jobVertexID, subtaskStateStats);
        }
        return TaskAcknowledgeResult.SUCCESS;
    }
}
Also used : StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Savepoint(org.apache.flink.runtime.checkpoint.savepoint.Savepoint)

Aggregations

JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)191 Test (org.junit.Test)145 JobID (org.apache.flink.api.common.JobID)88 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)46 HashMap (java.util.HashMap)38 Configuration (org.apache.flink.configuration.Configuration)33 Instance (org.apache.flink.runtime.instance.Instance)33 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)30 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)30 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)28 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)27 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)25 IOException (java.io.IOException)24 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)24 ExecutionException (java.util.concurrent.ExecutionException)23 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)22 ArrayList (java.util.ArrayList)20 ActorRef (akka.actor.ActorRef)18 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)18 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)15