Use of com.hartwig.pipeline.execution.vm.storage.PersistentStorageStrategy in project pipeline5 by hartwigmedical.
Example: the submit method of the GoogleComputeEngine class.
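This example builds and launches a Compute Engine VM for a pipeline stage, then waits for the job to report success or failure via flag files in the runtime bucket. PersistentStorageStrategy is used to render the startup command for instances backed by persistent disks rather than local SSDs.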
public PipelineStatus submit(final RuntimeBucket bucket, final VirtualMachineJobDefinition jobDefinition, final String discriminator) {
    String vmName = format("%s%s-%s", bucket.runId(), discriminator.isEmpty() ? "" : "-" + discriminator, jobDefinition.name());
    RuntimeFiles flags = RuntimeFiles.of(discriminator);
    PipelineStatus status = PipelineStatus.FAILED;
    try {
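        // Check whether a prior run of this job already completed: skip on success, clear old state on failure.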
        BucketCompletionWatcher.State currentState = bucketWatcher.currentState(bucket, flags);
        if (currentState == BucketCompletionWatcher.State.SUCCESS) {
            LOGGER.info("Compute engine job [{}] already exists, and succeeded. Skipping job.", vmName);
            return PipelineStatus.SKIPPED;
        } else if (currentState == BucketCompletionWatcher.State.FAILURE) {
            LOGGER.info("Compute engine job [{}] already exists, but failed. Deleting state and restarting.", vmName);
            bucket.delete(flags.failure());
            bucket.delete(jobDefinition.namespacedResults().path());
        }
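        // Reorder the candidate zones via the injected zoneRandomizer, then try them one at a time until the job finishes or fails fatally.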
        String project = arguments.project();
        List<Zone> zones = fetchZones();
        zoneRandomizer.accept(zones);
        int index = 0;
        boolean keepTrying = !zones.isEmpty();
        while (keepTrying) {
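            // Build the instance definition for the zone currently being tried.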
            Zone currentZone = zones.get(index % zones.size());
            Instance instance = lifecycleManager.newInstance();
            instance.setName(vmName);
            instance.setZone(currentZone.getName());
            instance.setTags(new Tags().setItems(arguments.tags()));
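            // Request a preemptible (spot) VM if the arguments ask for one.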
            if (arguments.usePreemptibleVms()) {
                instance.setScheduling(new Scheduling().setPreemptible(true));
            }
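            // Machine type is derived from the job's performance profile; labels always carry the job name.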
            instance.setMachineType(machineType(currentZone.getName(), jobDefinition.performanceProfile().uri(), project));
            final Map<String, String> labelMap = labels.asMap(List.of(Map.entry("job_name", jobDefinition.name())));
            instance.setLabels(labelMap);
            addServiceAccount(instance);
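            // Attach disks using either the image named explicitly in the arguments or the latest image from the job definition's image family.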
            Image image = attachDisks(compute, instance, jobDefinition, project, vmName, currentZone.getName(),
                    arguments.imageName().isPresent()
                            ? compute.images().get(arguments.imageProject().orElse(VirtualMachineJobDefinition.HMF_IMAGE_PROJECT), arguments.imageName().get()).execute()
                            : resolveLatestImage(compute, jobDefinition.imageFamily(), arguments.imageProject().orElse(project)),
                    labelMap);
LOGGER.info("Submitting compute engine job [{}] using image [{}] in zone [{}]", vmName, image.getName(), currentZone.getName());
            String startupScript = arguments.useLocalSsds()
                    ? jobDefinition.startupCommand().asUnixString(new LocalSsdStorageStrategy(jobDefinition.localSsdCount()))
                    : jobDefinition.startupCommand().asUnixString(new PersistentStorageStrategy());
            addStartupCommand(instance, bucket, flags, startupScript);
            addNetworkInterface(instance, project);
            Operation result = lifecycleManager.deleteOldInstancesAndStart(instance, currentZone.getName(), vmName);
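            // A null error means the instance started; otherwise zone-exhausted and unsupported-operation errors fall through to the next zone, while quota and other errors abort.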
            if (result.getError() == null) {
                LOGGER.debug("Successfully initialised [{}]", vmName);
                status = waitForCompletion(bucket, flags, currentZone, instance);
                if (status != PipelineStatus.PREEMPTED) {
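                    // Tear the VM down unless it was preempted: local-SSD instances are deleted outright; otherwise stop it, deleting on success or leaving it (startup script disabled) on failure.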
                    if (arguments.useLocalSsds()) {
                        // Instances with local SSDs cannot be stopped or restarted
                        lifecycleManager.delete(currentZone.getName(), vmName);
                    } else {
                        lifecycleManager.stop(currentZone.getName(), vmName);
                        if (status == PipelineStatus.SUCCESS) {
                            lifecycleManager.delete(currentZone.getName(), vmName);
                        } else {
                            lifecycleManager.disableStartupScript(currentZone.getName(), instance.getName());
                        }
                    }
                    LOGGER.info("Compute engine job [{}] is complete with status [{}]", vmName, status);
                    keepTrying = false;
                } else {
                    LOGGER.info("Instance [{}] in [{}] was pre-empted", vmName, currentZone.getName());
                }
            } else if (anyErrorMatch(result, ZONE_EXHAUSTED_ERROR_CODE)) {
                LOGGER.warn("Zone [{}] has insufficient resources to fulfill the request for [{}]. Trying next zone", currentZone.getName(), vmName);
            } else if (anyErrorMatch(result, UNSUPPORTED_OPERATION_ERROR_CODE)) {
                LOGGER.warn("Received unsupported operation from GCE for [{}], this likely means the instance was pre-empted before it could "
                        + "start, or another operation has yet to complete. Trying next zone.", vmName);
            } else if (anyErrorMatch(result, QUOTA_EXCEEDED)) {
                throw new RuntimeException(String.format("Quota exceeded for [%s], will keep trying until resources are available. Quota [%s]",
                        vmName,
                        result.getError().getErrors().get(0).getMessage()));
            } else {
                throw new RuntimeException(result.getError().toPrettyString());
            }
            index++;
        }
    } catch (IOException e) {
        String message = format("An error occurred running job on compute engine [%s]", vmName);
        LOGGER.error(message, e);
        return PipelineStatus.FAILED;
    }
    return status;
}