Use of org.apache.flink.kubernetes.operator.exception.DeploymentFailedException in the flink-kubernetes-operator project by Apache.
Class FlinkServiceTest, method testGetLastCheckpoint.
/**
 * Verifies how {@code flinkService.getLastCheckpoint} resolves a checkpoint location from the
 * checkpoint statistics returned by the (stubbed) cluster REST client.
 *
 * <p>Three canned responses are served one after another:
 * <ol>
 *   <li>a response with a populated history and latest completed checkpoint 96: the returned
 *       location must be the {@code chk-96} path;</li>
 *   <li>a response with no completed checkpoint, no savepoint and an empty history: the returned
 *       location must be the external path of the restored savepoint;</li>
 *   <li>the same as (2) but with the restored path set to
 *       {@code <checkpoint-not-externally-addressable>}: the call must throw
 *       {@link DeploymentFailedException}.</li>
 * </ol>
 */
@Test
public void testGetLastCheckpoint() throws Exception {
ObjectMapper objectMapper = RestMapperUtils.getStrictObjectMapper();
var testingClusterClient = new TestingClusterClient<>(configuration, TestUtils.TEST_DEPLOYMENT_NAME);
// Fixture 1: latest completed checkpoint 96, latest savepoint 51, restored savepoint 27 and a
// ten-entry history (checkpoints 96 down to 87).
String responseWithHistory = "{\"counts\":{\"restored\":1,\"total\":79,\"in_progress\":0,\"completed\":69,\"failed\":10},\"summary\":{\"checkpointed_size\":{\"min\":23928,\"max\":53670,\"avg\":28551,\"p50\":28239,\"p90\":28563,\"p95\":28635,\"p99\":53670,\"p999\":53670},\"state_size\":{\"min\":23928,\"max\":53670,\"avg\":28551,\"p50\":28239,\"p90\":28563,\"p95\":28635,\"p99\":53670,\"p999\":53670},\"end_to_end_duration\":{\"min\":14,\"max\":117,\"avg\":24,\"p50\":22,\"p90\":32,\"p95\":40.5,\"p99\":117,\"p999\":117},\"alignment_buffered\":{\"min\":0,\"max\":0,\"avg\":0,\"p50\":0,\"p90\":0,\"p95\":0,\"p99\":0,\"p999\":0},\"processed_data\":{\"min\":0,\"max\":1274,\"avg\":280,\"p50\":112,\"p90\":840,\"p95\":1071,\"p99\":1274,\"p999\":1274},\"persisted_data\":{\"min\":0,\"max\":0,\"avg\":0,\"p50\":0,\"p90\":0,\"p95\":0,\"p99\":0,\"p999\":0}},\"latest\":{\"completed\":{\"className\":\"completed\",\"id\":96,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212837604,\"latest_ack_timestamp\":1653212837621,\"checkpointed_size\":28437,\"state_size\":28437,\"end_to_end_duration\":17,\"alignment_buffered\":0,\"processed_data\":560,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-96\",\"discarded\":false},\"savepoint\":{\"className\":\"completed\",\"id\":51,\"status\":\"COMPLETED\",\"is_savepoint\":true,\"trigger_timestamp\":1653212748176,\"latest_ack_timestamp\":1653212748233,\"checkpointed_size\":53670,\"state_size\":53670,\"end_to_end_duration\":57,\"alignment_buffered\":0,\"processed_data\":483,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"SAVEPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/savepoints/savepoint-000000-e8ea2482ce4f\",\"discarded\":false},\"failed\":null,\"restored\":{\"id\":27,\"restore_timestamp\":1653212683022,\"is_savepo
int\":true,\"external_path\":\"file:/flink-data/savepoints/savepoint-000000-5930e5326ca7\"}},\"history\":[{\"className\":\"completed\",\"id\":96,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212837604,\"latest_ack_timestamp\":1653212837621,\"checkpointed_size\":28437,\"state_size\":28437,\"end_to_end_duration\":17,\"alignment_buffered\":0,\"processed_data\":560,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-96\",\"discarded\":false},{\"className\":\"completed\",\"id\":95,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212835603,\"latest_ack_timestamp\":1653212835622,\"checkpointed_size\":28473,\"state_size\":28473,\"end_to_end_duration\":19,\"alignment_buffered\":0,\"processed_data\":42,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-95\",\"discarded\":true},{\"className\":\"completed\",\"id\":94,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212833603,\"latest_ack_timestamp\":1653212833623,\"checkpointed_size\":27969,\"state_size\":27969,\"end_to_end_duration\":20,\"alignment_buffered\":0,\"processed_data\":28,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-94\",\"discarded\":true},{\"className\":\"completed\",\"id\":93,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212831603,\"latest_ack_timestamp\":1653212831621,\"checkpointed_size\":28113,\"state_size\":28113,\"end_to_end_duration\":18,\"alignment_buffered\":0,\"processed_data\":138,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged
_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-93\",\"discarded\":true},{\"className\":\"completed\",\"id\":92,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212829603,\"latest_ack_timestamp\":1653212829621,\"checkpointed_size\":28293,\"state_size\":28293,\"end_to_end_duration\":18,\"alignment_buffered\":0,\"processed_data\":196,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-92\",\"discarded\":true},{\"className\":\"completed\",\"id\":91,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212827603,\"latest_ack_timestamp\":1653212827629,\"checkpointed_size\":27969,\"state_size\":27969,\"end_to_end_duration\":26,\"alignment_buffered\":0,\"processed_data\":0,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-91\",\"discarded\":true},{\"className\":\"completed\",\"id\":90,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212825603,\"latest_ack_timestamp\":1653212825641,\"checkpointed_size\":27735,\"state_size\":27735,\"end_to_end_duration\":38,\"alignment_buffered\":0,\"processed_data\":0,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-90\",\"discarded\":true},{\"className\":\"completed\",\"id\":89,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212823603,\"latest_ack_timestamp\":1653212823618,\"checkpointed_size\":28545,\"state_size\":28545,\"end_to_end_duration\":15,\"alignment_buffered\":0,\"processed_data\
":364,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-89\",\"discarded\":true},{\"className\":\"completed\",\"id\":88,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212821603,\"latest_ack_timestamp\":1653212821619,\"checkpointed_size\":28275,\"state_size\":28275,\"end_to_end_duration\":16,\"alignment_buffered\":0,\"processed_data\":0,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-88\",\"discarded\":true},{\"className\":\"completed\",\"id\":87,\"status\":\"COMPLETED\",\"is_savepoint\":false,\"trigger_timestamp\":1653212819604,\"latest_ack_timestamp\":1653212819622,\"checkpointed_size\":28518,\"state_size\":28518,\"end_to_end_duration\":18,\"alignment_buffered\":0,\"processed_data\":0,\"persisted_data\":0,\"num_subtasks\":4,\"num_acknowledged_subtasks\":4,\"checkpoint_type\":\"CHECKPOINT\",\"tasks\":{},\"external_path\":\"file:/flink-data/checkpoints/00000000000000000000000000000000/chk-87\",\"discarded\":true}]}";
// Fixture 2: no completed checkpoint, no savepoint, empty history; only the restored savepoint
// carries an external path.
String responseWithoutHistory = "{\"counts\":{\"restored\":1,\"total\":79,\"in_progress\":0,\"completed\":69,\"failed\":10},\"summary\":{\"checkpointed_size\":{\"min\":23928,\"max\":53670,\"avg\":28551,\"p50\":28239,\"p90\":28563,\"p95\":28635,\"p99\":53670,\"p999\":53670},\"state_size\":{\"min\":23928,\"max\":53670,\"avg\":28551,\"p50\":28239,\"p90\":28563,\"p95\":28635,\"p99\":53670,\"p999\":53670},\"end_to_end_duration\":{\"min\":14,\"max\":117,\"avg\":24,\"p50\":22,\"p90\":32,\"p95\":40.5,\"p99\":117,\"p999\":117},\"alignment_buffered\":{\"min\":0,\"max\":0,\"avg\":0,\"p50\":0,\"p90\":0,\"p95\":0,\"p99\":0,\"p999\":0},\"processed_data\":{\"min\":0,\"max\":1274,\"avg\":280,\"p50\":112,\"p90\":840,\"p95\":1071,\"p99\":1274,\"p999\":1274},\"persisted_data\":{\"min\":0,\"max\":0,\"avg\":0,\"p50\":0,\"p90\":0,\"p95\":0,\"p99\":0,\"p999\":0}},\"latest\":{\"completed\":null,\"savepoint\":null,\"failed\":null,\"restored\":{\"id\":27,\"restore_timestamp\":1653212683022,\"is_savepoint\":true,\"external_path\":\"file:/flink-data/savepoints/savepoint-000000-5930e5326ca7\"}},\"history\":[]}";
// Fixture 3: identical to fixture 2 except that the restored entry's external path is the
// "<checkpoint-not-externally-addressable>" placeholder.
String responseWithoutHistoryInternal = "{\"counts\":{\"restored\":1,\"total\":79,\"in_progress\":0,\"completed\":69,\"failed\":10},\"summary\":{\"checkpointed_size\":{\"min\":23928,\"max\":53670,\"avg\":28551,\"p50\":28239,\"p90\":28563,\"p95\":28635,\"p99\":53670,\"p999\":53670},\"state_size\":{\"min\":23928,\"max\":53670,\"avg\":28551,\"p50\":28239,\"p90\":28563,\"p95\":28635,\"p99\":53670,\"p999\":53670},\"end_to_end_duration\":{\"min\":14,\"max\":117,\"avg\":24,\"p50\":22,\"p90\":32,\"p95\":40.5,\"p99\":117,\"p999\":117},\"alignment_buffered\":{\"min\":0,\"max\":0,\"avg\":0,\"p50\":0,\"p90\":0,\"p95\":0,\"p99\":0,\"p999\":0},\"processed_data\":{\"min\":0,\"max\":1274,\"avg\":280,\"p50\":112,\"p90\":840,\"p95\":1071,\"p99\":1274,\"p999\":1274},\"persisted_data\":{\"min\":0,\"max\":0,\"avg\":0,\"p50\":0,\"p90\":0,\"p95\":0,\"p99\":0,\"p999\":0}},\"latest\":{\"completed\":null,\"savepoint\":null,\"failed\":null,\"restored\":{\"id\":27,\"restore_timestamp\":1653212683022,\"is_savepoint\":true,\"external_path\":\"<checkpoint-not-externally-addressable>\"}},\"history\":[]}";
// Single-slot holder so the lambda below can serve whichever response the test installs next.
var responseContainer = new ArrayList<CheckpointHistoryWrapper>();
// Stub the REST client: only the custom checkpoint-statistics request is answered; any other
// request fails the test.
testingClusterClient.setRequestProcessor((headers, parameters, requestBody) -> {
if (headers instanceof CustomCheckpointingStatisticsHeaders) {
return CompletableFuture.completedFuture(responseContainer.get(0));
}
fail("unknown request");
return null;
});
var flinkService = createFlinkService(testingClusterClient);
// Case 1: history present -> location of checkpoint 96.
responseContainer.add(objectMapper.readValue(responseWithHistory, CheckpointHistoryWrapper.class));
var checkpointOpt = flinkService.getLastCheckpoint(new JobID(), new Configuration());
assertEquals("file:/flink-data/checkpoints/00000000000000000000000000000000/chk-96", checkpointOpt.get().getLocation());
// Case 2: nothing but the restored savepoint -> its external path is returned.
responseContainer.set(0, objectMapper.readValue(responseWithoutHistory, CheckpointHistoryWrapper.class));
checkpointOpt = flinkService.getLastCheckpoint(new JobID(), new Configuration());
assertEquals("file:/flink-data/savepoints/savepoint-000000-5930e5326ca7", checkpointOpt.get().getLocation());
// Case 3: restored path is the non-addressable placeholder -> the call must fail.
responseContainer.set(0, objectMapper.readValue(responseWithoutHistoryInternal, CheckpointHistoryWrapper.class));
try {
flinkService.getLastCheckpoint(new JobID(), new Configuration());
fail();
} catch (DeploymentFailedException dpe) {
// Expected: reaching fail() above would mean no exception was thrown.
}
}
Use of org.apache.flink.kubernetes.operator.exception.DeploymentFailedException in the flink-kubernetes-operator project by Apache.
Class AbstractDeploymentObserver, method observeJmDeployment.
/**
 * Inspects the JobManager Kubernetes {@code Deployment} of the given application and records
 * the resulting {@link JobManagerDeploymentStatus} on the application's status.
 *
 * <p>Outcomes, in the order they are checked:
 * <ul>
 *   <li>suspended application: nothing is observed or changed;</li>
 *   <li>previous status {@code DEPLOYED_NOT_READY}: promoted to {@code READY};</li>
 *   <li>no deployment found: status becomes {@code MISSING}, the job state becomes
 *       {@code RECONCILING}, and {@code onMissingDeployment} is invoked unless the previous
 *       status was already {@code MISSING} or {@code ERROR};</li>
 *   <li>all replicas available and the JobManager port reachable: {@code DEPLOYED_NOT_READY};</li>
 *   <li>otherwise the failure checks run and, if they pass, the status is {@code DEPLOYING}.</li>
 * </ul>
 *
 * @param flinkApp the deployment resource whose status is updated in place
 * @param context reconciliation context used to look up the secondary {@code Deployment}
 * @param effectiveConfig configuration passed to the port check and the pod failure check
 * @throws DeploymentFailedException if a failure check fails while the recorded status is not
 *     already {@code ERROR}
 */
protected void observeJmDeployment(FlinkDeployment flinkApp, Context context, Configuration effectiveConfig) {
    FlinkDeploymentStatus appStatus = flinkApp.getStatus();
    JobManagerDeploymentStatus lastJmStatus = appStatus.getJobManagerDeploymentStatus();

    if (isSuspendedJob(flinkApp)) {
        logger.debug("Skipping observe step for suspended application deployments.");
        return;
    }

    // Cluster info is reset on every observation of a non-suspended application.
    flinkApp.getStatus().setClusterInfo(new HashMap<>());
    logger.info("Observing JobManager deployment. Previous status: {}", lastJmStatus.name());

    if (lastJmStatus == JobManagerDeploymentStatus.DEPLOYED_NOT_READY) {
        logger.info("JobManager deployment is ready");
        appStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.READY);
        return;
    }

    Optional<Deployment> jmDeployment = context.getSecondaryResource(Deployment.class);

    // Guard clause: the deployment does not exist (any more).
    if (!jmDeployment.isPresent()) {
        appStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
        appStatus.getJobStatus().setState(JobStatus.RECONCILING.name());
        boolean missingAlreadyHandled =
                lastJmStatus == JobManagerDeploymentStatus.MISSING
                        || lastJmStatus == JobManagerDeploymentStatus.ERROR;
        if (!missingAlreadyHandled) {
            onMissingDeployment(flinkApp);
        }
        return;
    }

    DeploymentStatus status = jmDeployment.get().getStatus();
    DeploymentSpec spec = jmDeployment.get().getSpec();

    boolean allReplicasAvailable =
            status != null
                    && status.getAvailableReplicas() != null
                    && spec.getReplicas().intValue() == status.getReplicas()
                    && spec.getReplicas().intValue() == status.getAvailableReplicas();
    // The port probe is only attempted once the replica counts line up.
    if (allReplicasAvailable && flinkService.isJobManagerPortReady(effectiveConfig)) {
        // typically it takes a few seconds for the REST server to be ready
        logger.info("JobManager deployment port is ready, waiting for the Flink REST API...");
        appStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.DEPLOYED_NOT_READY);
        return;
    }

    try {
        checkFailedCreate(status);
        // checking the pod is expensive; only do it when the deployment isn't ready
        checkCrashLoopBackoff(flinkApp, effectiveConfig);
    } catch (DeploymentFailedException dfe) {
        appStatus.getJobStatus().setState(JobStatus.RECONCILING.name());
        // Swallow the failure when already in ERROR so that a spec update can still go through;
        // otherwise propagate it.
        if (JobManagerDeploymentStatus.ERROR.equals(appStatus.getJobManagerDeploymentStatus())) {
            return;
        }
        throw dfe;
    }

    logger.info("JobManager is being deployed");
    appStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.DEPLOYING);
}
Use of org.apache.flink.kubernetes.operator.exception.DeploymentFailedException in the flink-kubernetes-operator project by Apache.
Class FlinkDeploymentController, method reconcile.
/**
 * Runs one reconciliation pass for the given deployment: observe, validate, then reconcile.
 *
 * <p>If validation fails, metrics and status are updated and the method returns early with the
 * last argument of {@code toUpdateControl} set to {@code false}. A
 * {@link DeploymentFailedException} raised during the pass is handed to
 * {@code handleDeploymentFailed} and the pass then completes normally; any other exception is
 * wrapped in a {@link ReconciliationException}.
 *
 * @param flinkApp the resource being reconciled; its status is updated in place
 * @param context reconciliation context forwarded to the observer and the reconciler
 * @return the update control computed from the resource and its pre-reconciliation copy
 * @throws Exception a {@link ReconciliationException} wrapping any failure other than a
 *     {@link DeploymentFailedException} raised inside the observe/validate/reconcile section
 */
@Override
public UpdateControl<FlinkDeployment> reconcile(FlinkDeployment flinkApp, Context context) throws Exception {
    LOG.info("Starting reconciliation");
    statusHelper.updateStatusFromCache(flinkApp);
    // Copy taken before anything mutates the resource; passed to toUpdateControl below.
    FlinkDeployment beforeReconcile = ReconciliationUtils.clone(flinkApp);

    try {
        observerFactory.getOrCreate(flinkApp).observe(flinkApp, context);

        if (validateDeployment(flinkApp)) {
            reconcilerFactory.getOrCreate(flinkApp).reconcile(flinkApp, context);
        } else {
            // Validation failed: publish metrics/status and stop without reconciling.
            metricManager.onUpdate(flinkApp);
            statusHelper.patchAndCacheStatus(flinkApp);
            return ReconciliationUtils.toUpdateControl(
                    configManager.getOperatorConfiguration(), flinkApp, beforeReconcile, false);
        }
    } catch (DeploymentFailedException deploymentFailure) {
        handleDeploymentFailed(flinkApp, deploymentFailure);
    } catch (Exception unexpected) {
        throw new ReconciliationException(unexpected);
    }

    LOG.info("End of reconciliation");
    metricManager.onUpdate(flinkApp);
    statusHelper.patchAndCacheStatus(flinkApp);
    return ReconciliationUtils.toUpdateControl(
            configManager.getOperatorConfiguration(), flinkApp, beforeReconcile, true);
}
Use of org.apache.flink.kubernetes.operator.exception.DeploymentFailedException in the flink-kubernetes-operator project by Apache.
Class ApplicationReconcilerTest, method testUpgrade.
/**
 * Exercises the upgrade paths of {@link ApplicationReconciler} for every {@link FlinkVersion}.
 *
 * <p>Phases, in order:
 * <ol>
 *   <li>initial deployment;</li>
 *   <li>stateless upgrade: the first reconcile leaves no running job, the second starts one
 *       without a savepoint;</li>
 *   <li>savepoint upgrade: the second reconcile starts the job from {@code savepoint_0} and the
 *       last savepoint is recorded with trigger type {@code UPGRADE};</li>
 *   <li>last-state upgrade without HA data while the JobManager deployment is {@code MISSING}
 *       or {@code ERROR}: reconcile must throw {@link DeploymentFailedException};</li>
 *   <li>last-state upgrade without HA data for a {@code FINISHED} job with a recorded
 *       savepoint: the job is restarted from that savepoint.</li>
 * </ol>
 *
 * @param flinkVersion the Flink version the test application cluster is built for
 */
@ParameterizedTest
@EnumSource(FlinkVersion.class)
public void testUpgrade(FlinkVersion flinkVersion) throws Exception {
    TestingFlinkService flinkService = new TestingFlinkService();
    Context context = flinkService.getContext();
    ApplicationReconciler reconciler = new ApplicationReconciler(kubernetesClient, flinkService, configManager);
    FlinkDeployment deployment = TestUtils.buildApplicationCluster(flinkVersion);

    reconciler.reconcile(deployment, context);
    List<Tuple2<String, JobStatusMessage>> runningJobs = flinkService.listJobs();
    verifyAndSetRunningJobsToStatus(deployment, runningJobs);

    // Test stateless upgrade
    FlinkDeployment statelessUpgrade = ReconciliationUtils.clone(deployment);
    statelessUpgrade.getSpec().getJob().setUpgradeMode(UpgradeMode.STATELESS);
    statelessUpgrade.getSpec().getFlinkConfiguration().put("new", "conf");
    reconciler.reconcile(statelessUpgrade, context);
    assertEquals(0, flinkService.getRunningCount());

    reconciler.reconcile(statelessUpgrade, context);
    runningJobs = flinkService.listJobs();
    assertEquals(1, flinkService.getRunningCount());
    assertNull(runningJobs.get(0).f0);

    deployment.getStatus().getJobStatus().setJobId(runningJobs.get(0).f1.getJobId().toHexString());

    // Test stateful upgrade
    FlinkDeployment statefulUpgrade = ReconciliationUtils.clone(deployment);
    statefulUpgrade.getSpec().getJob().setUpgradeMode(UpgradeMode.SAVEPOINT);
    statefulUpgrade.getSpec().getFlinkConfiguration().put("new", "conf2");
    reconciler.reconcile(statefulUpgrade, context);
    assertEquals(0, flinkService.getRunningCount());

    reconciler.reconcile(statefulUpgrade, context);
    runningJobs = flinkService.listJobs();
    assertEquals(1, flinkService.getRunningCount());
    assertEquals("savepoint_0", runningJobs.get(0).f0);
    assertEquals(SavepointTriggerType.UPGRADE, statefulUpgrade.getStatus().getJobStatus().getSavepointInfo().getLastSavepoint().getTriggerType());

    // Test last-state upgrade without HA data: must fail while the JobManager deployment is
    // MISSING or in ERROR.
    deployment.getSpec().getJob().setUpgradeMode(UpgradeMode.LAST_STATE);
    deployment.getSpec().setRestartNonce(100L);
    flinkService.setHaDataAvailable(false);
    deployment.getStatus().getJobStatus().setState("RECONCILING");

    // Only the reconcile call is expected to throw, so only it sits inside the try block.
    deployment.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
    try {
        reconciler.reconcile(deployment, context);
        fail();
    } catch (DeploymentFailedException expected) {
        // expected: no HA data and no JobManager deployment to recover from
    }

    deployment.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.ERROR);
    try {
        reconciler.reconcile(deployment, context);
        fail();
    } catch (DeploymentFailedException expected) {
        // expected: no HA data and the JobManager deployment is in error
    }

    // Test last-state upgrade of a FINISHED job: falls back to the recorded savepoint.
    flinkService.clear();
    deployment.getSpec().getJob().setUpgradeMode(UpgradeMode.LAST_STATE);
    deployment.getSpec().setRestartNonce(200L);
    flinkService.setHaDataAvailable(false);
    deployment.getStatus().getJobStatus().getSavepointInfo().setLastSavepoint(Savepoint.of("finished_sp", SavepointTriggerType.UPGRADE));
    deployment.getStatus().getJobStatus().setState("FINISHED");
    deployment.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.READY);
    reconciler.reconcile(deployment, context);
    reconciler.reconcile(deployment, context);

    // Re-read the job list after clear() and the reconciles above, so the assertion does not
    // depend on listJobs() having returned a live view of the service's internal list earlier.
    runningJobs = flinkService.listJobs();
    assertEquals(1, flinkService.getRunningCount());
    assertEquals("finished_sp", runningJobs.get(0).f0);
}
Use of org.apache.flink.kubernetes.operator.exception.DeploymentFailedException in the flink-kubernetes-operator project by Apache.
Class Observer, method observeJmDeployment.
/**
 * Inspects the JobManager Kubernetes {@code Deployment} of the given application and records
 * the resulting {@link JobManagerDeploymentStatus} on the application's status.
 *
 * <p>Outcomes, in the order they are checked:
 * <ul>
 *   <li>previous status {@code READY}: nothing to do;</li>
 *   <li>previous status {@code DEPLOYED_NOT_READY}: promoted to {@code READY};</li>
 *   <li>deployment present, all replicas available and the JobManager port reachable:
 *       {@code DEPLOYED_NOT_READY};</li>
 *   <li>deployment present with a {@code ReplicaFailure}/{@code FailedCreate} condition: a
 *       {@link DeploymentFailedException} is thrown unless the status is already
 *       {@code ERROR};</li>
 *   <li>deployment present otherwise (including when it has no status yet):
 *       {@code DEPLOYING};</li>
 *   <li>no deployment found: {@code MISSING}.</li>
 * </ul>
 *
 * @param flinkApp the deployment resource whose status is updated in place
 * @param context reconciliation context used to look up the secondary {@code Deployment}
 * @param effectiveConfig configuration passed to the JobManager port check
 * @throws DeploymentFailedException if replica creation failed and the recorded status is not
 *     already {@code ERROR}
 */
private void observeJmDeployment(FlinkDeployment flinkApp, Context context, Configuration effectiveConfig) {
    FlinkDeploymentStatus deploymentStatus = flinkApp.getStatus();
    JobManagerDeploymentStatus previousJmStatus = deploymentStatus.getJobManagerDeploymentStatus();
    if (JobManagerDeploymentStatus.READY == previousJmStatus) {
        return;
    }
    if (JobManagerDeploymentStatus.DEPLOYED_NOT_READY == previousJmStatus) {
        deploymentStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.READY);
        return;
    }
    Optional<Deployment> deployment = context.getSecondaryResource(Deployment.class);
    if (deployment.isPresent()) {
        DeploymentStatus status = deployment.get().getStatus();
        DeploymentSpec spec = deployment.get().getSpec();
        if (status != null
                && status.getAvailableReplicas() != null
                && spec.getReplicas().intValue() == status.getReplicas()
                && spec.getReplicas().intValue() == status.getAvailableReplicas()
                && flinkService.isJobManagerPortReady(effectiveConfig)) {
            // typically it takes a few seconds for the REST server to be ready
            LOG.info("JobManager deployment {} in namespace {} port ready, waiting for the REST API...", flinkApp.getMetadata().getName(), flinkApp.getMetadata().getNamespace());
            deploymentStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.DEPLOYED_NOT_READY);
            return;
        }
        LOG.info("JobManager deployment {} in namespace {} exists but not ready yet, status {}", flinkApp.getMetadata().getName(), flinkApp.getMetadata().getNamespace(), status);
        // The readiness check above treats status as nullable, so it must not be dereferenced
        // unguarded here; a deployment without status (or without conditions) has no failure
        // to report and is simply still deploying.
        List<DeploymentCondition> conditions = status == null ? null : status.getConditions();
        if (conditions != null) {
            for (DeploymentCondition dc : conditions) {
                if ("FailedCreate".equals(dc.getReason()) && "ReplicaFailure".equals(dc.getType())) {
                    // throw only when not already in error status to allow for spec update
                    if (!JobManagerDeploymentStatus.ERROR.equals(deploymentStatus.getJobManagerDeploymentStatus())) {
                        throw new DeploymentFailedException(DeploymentFailedException.COMPONENT_JOBMANAGER, dc);
                    }
                    return;
                }
            }
        }
        deploymentStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.DEPLOYING);
        return;
    }
    deploymentStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
}
Aggregations