use of io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException in project cdap by caskdata.
the class ProvisioningTask method executeOnce.
/**
 * Runs a single step of this provisioning task. The task info is persisted before the subtask is looked up
 * and executed, so that this task can be re-created from the task info stored in the ProvisionerStore.
 *
 * @return {@code -1} when nothing more should run (the operation is cancelled, the end subtask is reached,
 *     the subtask produced no further task info, or the subtask failed); {@code 0} when the status changed
 *     and the next subtask should run right away; otherwise the non-negative value computed by the polling
 *     strategy for re-running the same subtask
 * @throws IllegalStateException if no subtask is registered for the persisted status
 * @throws InterruptedException if subtask execution is interrupted; it is propagated unchanged
 * @throws Exception if persisting the task info or recording a subtask failure fails
 */
@Override
public final long executeOnce() throws Exception {
  RetryStrategy retries = getRetryStrategy();
  Map<ProvisioningOp.Status, ProvisioningSubtask> subtasksByStatus = getSubTasks();
  ProvisioningTaskInfo persistedInfo = persistTaskInfo(taskInfo, retries);
  ProvisioningOp.Status currentStatus = persistedInfo.getProvisioningOp().getStatus();

  if (currentStatus == ProvisioningOp.Status.CANCELLED) {
    LOG.debug("Cancelled {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }

  // Look up the subtask that handles the current status
  ProvisioningSubtask subtask = subtasksByStatus.get(currentStatus);
  if (subtask == null) {
    // should never happen
    String message = String.format(
      "Invalid state '%s' in provisioning task for program run '%s'. "
        + "This means there is a bug in provisioning state machine. "
        + "Please reach out to the development team.",
      currentStatus, programRunId);
    throw new IllegalStateException(message);
  }
  if (subtask == EndSubtask.INSTANCE) {
    LOG.debug("Completed {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }

  // Remember when this subtask was first attempted; the value is reset once the status changes
  if (subTaskStartTime == 0L) {
    subTaskStartTime = System.currentTimeMillis();
  }

  try {
    PROGRESS_LOG.debug("Executing {} subtask {} for program run {}.",
                       persistedInfo.getProvisioningOp().getType(), currentStatus, programRunId);
    taskInfo = Retries.callWithInterruptibleRetries(() -> subtask.execute(persistedInfo), retries,
                                                    failure -> failure instanceof RetryableProvisionException)
      .orElse(null);
    PROGRESS_LOG.debug("Completed {} subtask {} for program run {}.",
                       persistedInfo.getProvisioningOp().getType(), currentStatus, programRunId);

    // Nothing more to execute
    if (taskInfo == null) {
      LOG.debug("No more {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
      return -1L;
    }

    ProvisioningOp.Status followingStatus = taskInfo.getProvisioningOp().getStatus();
    if (currentStatus != followingStatus) {
      // The status moved on: clear the per-subtask polling bookkeeping and run the next subtask immediately
      subTaskPollingStrategy = null;
      subTaskStartTime = 0L;
      subTaskExecNums = 0;
      return 0;
    }

    // Same status again: let the polling strategy decide how long to wait
    if (subTaskPollingStrategy == null) {
      subTaskPollingStrategy = provisioner.getPollingStrategy(provisionerContext, taskInfo.getCluster());
    }
    return Math.max(0, subTaskPollingStrategy.nextPoll(subTaskExecNums++, subTaskStartTime));
  } catch (InterruptedException e) {
    // Interruption is not a subtask failure; let the caller see it
    throw e;
  } catch (Exception e) {
    LOG.error("{} task failed in {} state for program run {} due to {}.",
              persistedInfo.getProvisioningOp().getType(), currentStatus, programRunId, e.getMessage(), e);
    handleSubtaskFailure(persistedInfo, e);
    // Persist a FAILED operation of the same type, keeping the cluster from the persisted task info
    ProvisioningOp failedOp = new ProvisioningOp(persistedInfo.getProvisioningOp().getType(),
                                                 ProvisioningOp.Status.FAILED);
    ProvisioningTaskInfo failedInfo = new ProvisioningTaskInfo(persistedInfo, failedOp, persistedInfo.getCluster());
    persistTaskInfo(failedInfo, retries);
    LOG.debug("Terminated {} task for program run {} due to exception.",
              initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }
}
use of io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException in project cdap by caskdata.
the class DataprocClient method getClusterStatus.
/**
 * Get the status of the specified cluster. A cluster that cannot be found is reported as
 * {@code NOT_EXISTS}. When the status is {@code FAILED}, the create operation for the cluster is looked up
 * on a best-effort basis and its error message, if any, is logged as a warning; a failure of that lookup
 * is logged and does not change the returned status.
 *
 * @param name the cluster name
 * @return the cluster status
 * @throws RetryableProvisionException if there was a non 4xx error code returned
 */
io.cdap.cdap.runtime.spi.provisioner.ClusterStatus getClusterStatus(String name) throws RetryableProvisionException {
  io.cdap.cdap.runtime.spi.provisioner.ClusterStatus status = getDataprocCluster(name)
    .map(cluster -> convertStatus(cluster.getStatus()))
    .orElse(io.cdap.cdap.runtime.spi.provisioner.ClusterStatus.NOT_EXISTS);
  // if it failed, try to get the create operation and log the error message
  try {
    if (status == io.cdap.cdap.runtime.spi.provisioner.ClusterStatus.FAILED) {
      String resourceName = String.format("projects/%s/regions/%s/operations", conf.getProjectId(), conf.getRegion());
      String filter = String.format("clusterName=%s AND operationType=CREATE", name);
      OperationsClient.ListOperationsPagedResponse operationsResponse =
        client.getOperationsClient().listOperations(resourceName, filter);
      OperationsClient.ListOperationsPage page = operationsResponse.getPage();
      if (page == null) {
        LOG.warn("Unable to get the cause of the cluster creation failure.");
        return status;
      }
      if (page.getPageElementCount() > 1) {
        // shouldn't be possible
        LOG.warn("Multiple create operations found for cluster {}, may not be able to find the failure message.", name);
      }
      if (page.getPageElementCount() > 0) {
        Operation operation = page.getValues().iterator().next();
        // Protobuf message getters never return null (an unset field yields the default instance), so
        // presence of the error has to be checked with hasError() rather than a null comparison.
        if (operation.hasError()) {
          Status operationError = operation.getError();
          LOG.warn("Failed to create cluster {}: {}", name, operationError.getMessage());
        }
      }
    }
  } catch (Exception e) {
    // if we failed to get the operations list, log an error and proceed with normal execution
    LOG.warn("Unable to get the cause of the cluster creation failure.", e);
  }
  return status;
}
use of io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException in project cdap by caskdata.
the class DataprocClient method updateClusterLabels.
/**
 * Updates the labels on the given Dataproc cluster: the labels in {@code labelsToRemove} are dropped from
 * the cluster's existing labels and the entries of {@code labelsToSet} are then applied on top. Nothing is
 * sent to Dataproc when both arguments are empty or when the cluster's labels already reflect the request.
 *
 * @param clusterName name of the cluster
 * @param labelsToSet Key/Value pairs to set on the Dataproc cluster.
 * @param labelsToRemove collection of labels to remove from the Dataproc cluster.
 * @throws DataprocRuntimeException if the cluster does not exist or is not in running state, or if the
 *     update fails with a cause that is not an {@code ApiException}
 * @throws RetryableProvisionException presumably when the cluster lookup or the API failure is considered
 *     retryable (decided by {@code getDataprocCluster} / {@code handleApiException}) - TODO confirm
 * @throws InterruptedException if interrupted while waiting for the update operation metadata
 */
void updateClusterLabels(String clusterName, Map<String, String> labelsToSet, Collection<String> labelsToRemove) throws RetryableProvisionException, InterruptedException {
  if (labelsToSet.isEmpty() && labelsToRemove.isEmpty()) {
    return;
  }
  try {
    Cluster cluster = getDataprocCluster(clusterName)
      .filter(c -> c.getStatus().getState() == ClusterStatus.State.RUNNING)
      .orElseThrow(() -> new DataprocRuntimeException("Dataproc cluster " + clusterName + " does not exist or not in running state"));
    Map<String, String> existingLabels = cluster.getLabelsMap();
    // No need to update the cluster when every label to set already has the requested value and none of
    // the labels to remove is present.
    if (labelsToSet.entrySet().stream().allMatch(e -> Objects.equals(e.getValue(), existingLabels.get(e.getKey())))
      && labelsToRemove.stream().noneMatch(existingLabels::containsKey)) {
      return;
    }
    // Removal happens before putAll, so a key listed in both arguments ends up set.
    Map<String, String> newLabels = new HashMap<>(existingLabels);
    newLabels.keySet().removeAll(labelsToRemove);
    newLabels.putAll(labelsToSet);
    // Restrict the update to the labels field
    FieldMask updateMask = FieldMask.newBuilder().addPaths("labels").build();
    OperationFuture<Cluster, ClusterOperationMetadata> operationFuture = client.updateClusterAsync(
      UpdateClusterRequest.newBuilder()
        .setProjectId(conf.getProjectId())
        .setRegion(conf.getRegion())
        .setClusterName(clusterName)
        .setCluster(cluster.toBuilder().clearLabels().putAllLabels(newLabels))
        .setUpdateMask(updateMask)
        .build());
    ClusterOperationMetadata metadata = operationFuture.getMetadata().get();
    int numWarnings = metadata.getWarningsCount();
    if (numWarnings > 0) {
      // The plural suffix placeholder is attached directly to the word so the message reads
      // "1 warning while" / "2 warnings while".
      LOG.warn("Encountered {} warning{} while setting labels on cluster:\n{}", numWarnings, numWarnings > 1 ? "s" : "", String.join("\n", metadata.getWarningsList()));
    }
  } catch (ExecutionException e) {
    Throwable cause = e.getCause();
    if (cause instanceof ApiException) {
      throw handleApiException((ApiException) cause);
    }
    throw new DataprocRuntimeException(cause);
  }
}
use of io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException in project cdap by caskdata.
the class DataprocClient method findNetwork.
/**
 * Chooses a network name for the given project: "default" when a network with that name is listed,
 * otherwise the name of the first listed network.
 *
 * @param project the project whose networks are listed
 * @param compute the Compute client used to list the networks
 * @return the name of the chosen network
 * @throws IOException if listing the networks fails
 * @throws RetryableProvisionException presumably raised by {@code handleRetryableExceptions} when the
 *     listing failure is considered retryable - TODO confirm
 * @throws IllegalArgumentException if the project has no networks
 */
private static String findNetwork(String project, Compute compute) throws IOException, RetryableProvisionException {
  List<Network> available;
  try {
    available = compute.networks().list(project).execute().getItems();
  } catch (Exception e) {
    // Gives the helper a chance to translate the failure before the original exception is rethrown
    handleRetryableExceptions(e);
    throw e;
  }

  if (available == null || available.isEmpty()) {
    throw new IllegalArgumentException(
      String.format("Unable to find any networks in project '%s'. " + "Please create a network in the project.",
                    project));
  }

  boolean hasDefault = available.stream().map(Network::getName).anyMatch("default"::equals);
  return hasDefault ? "default" : available.get(0).getName();
}
use of io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException in project cdap by caskdata.
the class DataprocClient method getFirewallTargetTags.
/**
 * Looks through the firewall rules of the configured network host project and collects target tags from
 * the ingress rules on the given network that allow the ports required by {@link FirewallPort}.
 *
 * @param network the network to match; a rule applies when the last path segment of its network URL equals
 *     this network's name
 * @param useInternalIP when {@code true}, a rule that declares source ranges is only considered if at least
 *     one of them overlaps a range in {@code PRIVATE_IP_RANGES}
 * @return a {@link List} of tags that need to be added to the VM to have those firewall rules applied; for
 *     each matching allow entry the first target tag of its rule is added, if the rule has any
 * @throws IOException If failed to discover those firewall rules
 * @throws RetryableProvisionException presumably raised by {@code handleRetryableExceptions} when the
 *     listing failure is considered retryable - TODO confirm
 * @throws IllegalArgumentException if some required port is not covered by any matching ingress rule
 */
private List<String> getFirewallTargetTags(Network network, boolean useInternalIP) throws IOException, RetryableProvisionException {
  FirewallList firewallList;
  try {
    firewallList = compute.firewalls().list(conf.getNetworkHostProjectID()).execute();
  } catch (Exception e) {
    handleRetryableExceptions(e);
    throw e;
  }

  List<Firewall> rules = firewallList.getItems();
  if (rules == null) {
    rules = Collections.emptyList();
  }

  List<String> targetTags = new ArrayList<>();
  // Ports still lacking an ingress rule; entries are removed as matching rules are found
  Set<FirewallPort> missingPorts = EnumSet.allOf(FirewallPort.class);

  for (Firewall rule : rules) {
    // network is a url like https://www.googleapis.com/compute/v1/projects/<project>/<region>/networks/<name>
    // keep only what follows the last '/' (the whole value when there is none) and compare it to the
    // configured network name
    String networkUrl = rule.getNetwork();
    String ruleNetworkName = networkUrl.substring(networkUrl.lastIndexOf('/') + 1);
    if (!ruleNetworkName.equals(network.getName())) {
      continue;
    }

    if (!"INGRESS".equals(rule.getDirection()) || rule.getAllowed() == null) {
      continue;
    }

    if (useInternalIP) {
      // With internal IP, a rule that restricts its source ranges must overlap one of the private IP
      // blocks in order to be able to communicate with Dataproc. A rule whose ranges cannot be parsed is
      // still considered.
      boolean outsidePrivateRanges = false;
      try {
        List<IPRange> ranges = Optional.ofNullable(rule.getSourceRanges())
          .map(DataprocUtils::parseIPRanges)
          .orElse(Collections.emptyList());
        outsidePrivateRanges = !ranges.isEmpty()
          && PRIVATE_IP_RANGES.stream().noneMatch(privateRange -> ranges.stream().anyMatch(privateRange::isOverlap));
      } catch (Exception e) {
        LOG.warn("Failed to parse source ranges from firewall rule {}", rule.getName(), e);
      }
      if (outsidePrivateRanges) {
        continue;
      }
    }

    for (Firewall.Allowed allowed : rule.getAllowed()) {
      String protocol = allowed.getIPProtocol();
      if ("all".equalsIgnoreCase(protocol)) {
        // A rule that allows every protocol satisfies all required ports
        missingPorts.clear();
      } else if ("tcp".equalsIgnoreCase(protocol) && isPortAllowed(allowed.getPorts(), FirewallPort.SSH.port)) {
        missingPorts.remove(FirewallPort.SSH);
      } else {
        // This allow entry does not help with any required port
        continue;
      }

      List<String> ruleTags = rule.getTargetTags();
      if (ruleTags != null && !ruleTags.isEmpty()) {
        targetTags.add(ruleTags.iterator().next());
      }
    }
  }

  if (!missingPorts.isEmpty()) {
    String portList = missingPorts.stream()
      .map(p -> String.valueOf(p.port))
      .collect(Collectors.joining(","));
    throw new IllegalArgumentException(String.format(
      "Could not find an ingress firewall rule for network '%s' in project '%s' for ports '%s'. "
        + "Please create a rule to allow incoming traffic on those ports for your IP range.",
      network.getName(), conf.getNetworkHostProjectID(), portList));
  }
  return targetTags;
}
Aggregations