Search in sources :

Example 1 with PersistentStorageStrategy

Use of com.hartwig.pipeline.execution.vm.storage.PersistentStorageStrategy in project pipeline5 by hartwigmedical.

In the class GoogleComputeEngine, method submit.

/**
 * Runs the given job definition on a compute engine instance and reports how it ended.
 *
 * <p>Before starting anything, the bucket watcher is asked for the current state of the
 * completion flags. A prior success short-circuits with {@code SKIPPED}; a prior failure has its
 * failure flag and the job's namespaced results deleted so the job can run again.
 *
 * <p>The fetched zones are handed to {@code zoneRandomizer} and then visited in order, wrapping
 * around to the first zone when the end of the list is reached. A new instance is configured and
 * started in each visited zone until a run finishes with a status other than {@code PREEMPTED}.
 *
 * @param bucket runtime bucket whose run id prefixes the VM name and which holds the completion flags
 * @param jobDefinition definition supplying the job name, performance profile, image family and
 *         startup command
 * @param discriminator included in the VM name (preceded by a dash) when non-empty, and used to
 *         derive the runtime flag files
 * @return {@code SKIPPED} when the job already succeeded, the status returned by
 *         {@code waitForCompletion} for the final attempt, or {@code FAILED} when no zones were
 *         fetched or an {@link IOException} occurred
 * @throws RuntimeException when the start operation reports a quota-exceeded error or any error
 *         not otherwise recognised
 */
public PipelineStatus submit(final RuntimeBucket bucket, final VirtualMachineJobDefinition jobDefinition, final String discriminator) {
    final String vmName = format("%s%s-%s", bucket.runId(), discriminator.isEmpty() ? "" : "-" + discriminator, jobDefinition.name());
    final RuntimeFiles completionFlags = RuntimeFiles.of(discriminator);
    PipelineStatus outcome = PipelineStatus.FAILED;
    try {
        final BucketCompletionWatcher.State priorState = bucketWatcher.currentState(bucket, completionFlags);
        if (priorState == BucketCompletionWatcher.State.SUCCESS) {
            LOGGER.info("Compute engine job [{}] already exists, and succeeded. Skipping job.", vmName);
            return PipelineStatus.SKIPPED;
        }
        if (priorState == BucketCompletionWatcher.State.FAILURE) {
            LOGGER.info("Compute engine job [{}] already exists, but failed. Deleting state and restarting.", vmName);
            bucket.delete(completionFlags.failure());
            bucket.delete(jobDefinition.namespacedResults().path());
        }

        final String project = arguments.project();
        final List<Zone> candidateZones = fetchZones();
        zoneRandomizer.accept(candidateZones);

        // With no zones there is nothing to attempt; the initial FAILED outcome is returned.
        boolean finished = candidateZones.isEmpty();
        for (int attempt = 0; !finished; attempt++) {
            // Round-robin over the zones, wrapping back to the start of the list.
            final Zone zone = candidateZones.get(attempt % candidateZones.size());

            final Instance instance = lifecycleManager.newInstance();
            instance.setName(vmName);
            final String zoneName = zone.getName();
            instance.setZone(zoneName);
            instance.setTags(new Tags().setItems(arguments.tags()));
            if (arguments.usePreemptibleVms()) {
                instance.setScheduling(new Scheduling().setPreemptible(true));
            }
            instance.setMachineType(machineType(zoneName, jobDefinition.performanceProfile().uri(), project));
            final Map<String, String> labelMap = labels.asMap(List.of(Map.entry("job_name", jobDefinition.name())));
            instance.setLabels(labelMap);
            addServiceAccount(instance);

            // An explicitly named image wins; otherwise the latest image of the job's family is resolved.
            final Image sourceImage;
            if (arguments.imageName().isPresent()) {
                sourceImage = compute.images()
                        .get(arguments.imageProject().orElse(VirtualMachineJobDefinition.HMF_IMAGE_PROJECT), arguments.imageName().get())
                        .execute();
            } else {
                sourceImage = resolveLatestImage(compute, jobDefinition.imageFamily(), arguments.imageProject().orElse(project));
            }
            final Image image = attachDisks(compute, instance, jobDefinition, project, vmName, zoneName, sourceImage, labelMap);
            LOGGER.info("Submitting compute engine job [{}] using image [{}] in zone [{}]", vmName, image.getName(), zoneName);

            final String startupScript;
            if (arguments.useLocalSsds()) {
                startupScript = jobDefinition.startupCommand().asUnixString(new LocalSsdStorageStrategy(jobDefinition.localSsdCount()));
            } else {
                startupScript = jobDefinition.startupCommand().asUnixString(new PersistentStorageStrategy());
            }
            addStartupCommand(instance, bucket, completionFlags, startupScript);
            addNetworkInterface(instance, project);

            final Operation startOperation = lifecycleManager.deleteOldInstancesAndStart(instance, zoneName, vmName);
            if (startOperation.getError() != null) {
                if (anyErrorMatch(startOperation, ZONE_EXHAUSTED_ERROR_CODE)) {
                    LOGGER.warn("Zone [{}] has insufficient resources to fulfill the request for [{}]. Trying next zone", zoneName, vmName);
                } else if (anyErrorMatch(startOperation, UNSUPPORTED_OPERATION_ERROR_CODE)) {
                    LOGGER.warn("Received unsupported operation from GCE for [{}], this likely means the instance was pre-empted before it could "
                            + "start, or another operation has yet to complete. Trying next zone.", vmName);
                } else if (anyErrorMatch(startOperation, QUOTA_EXCEEDED)) {
                    throw new RuntimeException(String.format("Quota exceeded for [%s], will keep trying until resources are available. Quota [%s]",
                            vmName,
                            startOperation.getError().getErrors().get(0).getMessage()));
                } else {
                    throw new RuntimeException(startOperation.getError().toPrettyString());
                }
                // Recoverable start failure: move on to the next zone.
                continue;
            }

            LOGGER.debug("Successfully initialised [{}]", vmName);
            outcome = waitForCompletion(bucket, completionFlags, zone, instance);
            if (outcome == PipelineStatus.PREEMPTED) {
                LOGGER.info("Instance [{}] in [{}] was pre-empted", vmName, zoneName);
                continue;
            }

            if (arguments.useLocalSsds()) {
                // Instances with local SSDs cannot be stopped or restarted
                lifecycleManager.delete(zoneName, vmName);
            } else {
                lifecycleManager.stop(zoneName, vmName);
                if (outcome == PipelineStatus.SUCCESS) {
                    lifecycleManager.delete(zoneName, vmName);
                } else {
                    // The unsuccessful instance is kept (stopped) with its startup script disabled.
                    lifecycleManager.disableStartupScript(zoneName, instance.getName());
                }
            }
            LOGGER.info("Compute engine job [{}] is complete with status [{}]", vmName, outcome);
            finished = true;
        }
    } catch (IOException e) {
        final String message = format("An error occurred running job on compute engine [%s]", vmName);
        LOGGER.error(message, e);
        return PipelineStatus.FAILED;
    }
    return outcome;
}
Also used : PipelineStatus(com.hartwig.pipeline.execution.PipelineStatus) LocalSsdStorageStrategy(com.hartwig.pipeline.execution.vm.storage.LocalSsdStorageStrategy) PersistentStorageStrategy(com.hartwig.pipeline.execution.vm.storage.PersistentStorageStrategy) Instance(com.google.api.services.compute.model.Instance) Zone(com.google.api.services.compute.model.Zone) Scheduling(com.google.api.services.compute.model.Scheduling) Operation(com.google.api.services.compute.model.Operation) IOException(java.io.IOException) Image(com.google.api.services.compute.model.Image) Tags(com.google.api.services.compute.model.Tags)

Aggregations

Image (com.google.api.services.compute.model.Image)1 Instance (com.google.api.services.compute.model.Instance)1 Operation (com.google.api.services.compute.model.Operation)1 Scheduling (com.google.api.services.compute.model.Scheduling)1 Tags (com.google.api.services.compute.model.Tags)1 Zone (com.google.api.services.compute.model.Zone)1 PipelineStatus (com.hartwig.pipeline.execution.PipelineStatus)1 LocalSsdStorageStrategy (com.hartwig.pipeline.execution.vm.storage.LocalSsdStorageStrategy)1 PersistentStorageStrategy (com.hartwig.pipeline.execution.vm.storage.PersistentStorageStrategy)1 IOException (java.io.IOException)1