use of io.cdap.cdap.runtime.spi.provisioner.ProvisionerContext in project cdap by caskdata.
the class ProvisioningService method createProvisionTask.
/**
 * Builds the {@link Runnable} that kicks off provisioning for the given task.
 *
 * <p>If the SSH context or the provisioner context cannot be created, the failure is logged in the
 * program's logging context, the run is reported as errored through the program state writer,
 * deprovisioning is notified, and a no-op {@link Runnable} is returned.
 *
 * @param taskInfo information about the provisioning task (program run, options, user, properties)
 * @param provisioner the provisioner that the created task will use
 * @return a runnable that submits the provision task to the task executor, or a no-op runnable if
 *         the context could not be created
 */
private Runnable createProvisionTask(ProvisioningTaskInfo taskInfo, Provisioner provisioner) {
  ProgramRunId programRunId = taskInfo.getProgramRunId();
  ProgramOptions programOptions = taskInfo.getProgramOptions();
  Map<String, String> systemArgs = programOptions.getArguments().asMap();
  ProvisionerContext context;
  try {
    SSHContext sshContext = new DefaultSSHContext(Networks.getAddress(cConf, Constants.NETWORK_PROXY_ADDRESS),
                                                  locationFactory.create(taskInfo.getSecureKeysDir()),
                                                  createSSHKeyPair(taskInfo));
    context = createContext(cConf, programOptions, programRunId, taskInfo.getUser(),
                            taskInfo.getProvisionerProperties(), sshContext);
  } catch (IOException e) {
    runWithProgramLogging(programRunId, systemArgs,
                          () -> LOG.error("Failed to load ssh key. The run will be marked as failed.", e));
    programStateWriter.error(programRunId, new IllegalStateException("Failed to load ssh key.", e));
    provisionerNotifier.deprovisioning(programRunId);
    // Nothing to provision; hand back a runnable that does nothing.
    return () -> {
    };
  } catch (InvalidMacroException e) {
    runWithProgramLogging(programRunId, systemArgs,
                          () -> LOG.error("Could not evaluate macros while provisioning. "
                                            + "The run will be marked as failed.", e));
    programStateWriter.error(programRunId,
                             new IllegalStateException("Could not evaluate macros while provisioning", e));
    provisionerNotifier.deprovisioning(programRunId);
    // Nothing to provision; hand back a runnable that does nothing.
    return () -> {
    };
  }
  // TODO: (CDAP-13246) pick up timeout from profile instead of hardcoding
  // NOTE(review): 300 is presumably the timeout referred to by the TODO above; its unit is not visible here.
  ProvisioningTask task = new ProvisionTask(taskInfo, transactionRunner, provisioner, context,
                                            provisionerNotifier, programStateWriter, 300);
  ProvisioningTaskKey taskKey = new ProvisioningTaskKey(programRunId, ProvisioningOp.Type.PROVISION);
  return () -> taskExecutor.submit(taskKey, () -> callWithProgramLogging(programRunId, systemArgs, () -> {
    try {
      return task.executeOnce();
    } catch (InterruptedException e) {
      // Interruption is logged at debug level only, then propagated unchanged.
      LOG.debug("Provision task for program run {} interrupted.", programRunId);
      throw e;
    } catch (Exception e) {
      LOG.info("Provision task for program run {} failed.", programRunId, e);
      throw e;
    }
  }));
}
use of io.cdap.cdap.runtime.spi.provisioner.ProvisionerContext in project cdap by caskdata.
the class ProvisioningService method getRuntimeJobManager.
/**
 * Returns runtime job manager implementation.
 *
 * <p>The provisioner is looked up by the profile provisioner name found in the program's system
 * arguments. A {@link ProvisionerContext} without an SSH context is created for the lookup.
 *
 * @param programRunId program run
 * @param programOptions program options
 * @return an object of runtime job manager as reported by the provisioner, or an empty
 *         {@link Optional} if no provisioner is registered under the profile's provisioner name
 */
public Optional<RuntimeJobManager> getRuntimeJobManager(ProgramRunId programRunId, ProgramOptions programOptions) {
  Map<String, String> systemArgs = programOptions.getArguments().asMap();
  String name = SystemArguments.getProfileProvisioner(systemArgs);
  Provisioner provisioner = provisionerInfo.get().provisioners.get(name);
  // Without a provisioner there is nothing to ask; avoid dereferencing null below.
  if (provisioner == null) {
    return Optional.empty();
  }
  String user = programOptions.getArguments().getOption(ProgramOptionConstants.USER_ID);
  Map<String, String> properties = SystemArguments.getProfileProperties(systemArgs);
  // No SSH context is supplied (null) for this lookup.
  ProvisionerContext context = createContext(cConf, programOptions, programRunId, user, properties, null);
  return provisioner.getRuntimeJobManager(context);
}
use of io.cdap.cdap.runtime.spi.provisioner.ProvisionerContext in project cdap by caskdata.
the class DataprocProvisioner method tryReuseCluster.
/**
 * If cluster reuse is enabled & possible tries to find a cluster to reuse.
 *
 * <p>First checks whether a cluster is already labelled with this run's key (for idempotency).
 * Otherwise, while holding the reuse lock, searches running clusters whose labels match the
 * profile, version and reuse key and whose reuse-until label has not passed, then labels the
 * chosen cluster with the run key and removes its reuse-until label.
 *
 * @param client data proc client
 * @param context provisioner context
 * @param conf dataproc configuration
 * @return a cluster ready to reuse or null if none available. Also returns null if the search
 *         for a reusable cluster fails with any exception (the error is logged as a warning).
 * @throws RetryableProvisionException declared by this method; presumably raised by the lookup of
 *         an already allocated cluster, which runs outside the catch-all block
 * @throws IOException declared by this method; presumably raised by the same lookup
 */
@Nullable
private Cluster tryReuseCluster(DataprocClient client, ProvisionerContext context, DataprocConf conf) throws RetryableProvisionException, IOException {
  if (!isReuseSupported(conf)) {
    LOG.debug("Not checking cluster reuse, enabled: {}, skip delete: {}, idle ttl: {}, reuse threshold: {}",
              conf.isClusterReuseEnabled(), conf.isSkipDelete(), conf.getIdleTTLMinutes(),
              conf.getClusterReuseThresholdMinutes());
    return null;
  }
  String clusterKey = getRunKey(context);
  // For idempotency, check if we already have the cluster allocated
  Optional<Cluster> clusterOptional = findCluster(clusterKey, client);
  if (clusterOptional.isPresent()) {
    Cluster cluster = clusterOptional.get();
    if (cluster.getStatus() == ClusterStatus.CREATING || cluster.getStatus() == ClusterStatus.RUNNING) {
      LOG.debug("Found allocated cluster {}", cluster.getName());
      return cluster;
    } else {
      LOG.debug("Preallocated cluster {} has expired, will find a new one", cluster.getName());
      // Let's remove the reuse label to ensure new cluster will be picked up by findCluster
      try {
        client.updateClusterLabels(cluster.getName(), Collections.emptyMap(), Collections.singleton(LABEL_RUN_KEY));
      } catch (Exception e) {
        // Best effort: the stack trace is only logged at trace level, a short message otherwise.
        LOG.trace("Unable to remove reuse label, cluster may have died already", e);
        if (!LOG.isTraceEnabled()) {
          LOG.debug("Unable to remove reuse label, cluster may have died already");
        }
      }
    }
  }
  // Searching for and claiming a cluster is done under the reuse lock.
  Lock reuseLock = getSystemContext().getLock(REUSE_LOCK);
  reuseLock.lock();
  try {
    Map<String, String> filter = new HashMap<>();
    String normalizedProfileName = getNormalizedProfileName(context);
    if (normalizedProfileName != null) {
      filter.put(LABEL_PROFILE, normalizedProfileName);
    }
    filter.put(LABEL_VERSON, getVersionLabel());
    filter.put(LABEL_REUSE_KEY, conf.getClusterReuseKey());
    filter.put(LABEL_REUSE_UNTIL, "*");
    Optional<Cluster> cluster = client.getClusters(ClusterStatus.RUNNING, filter, clientCluster -> {
      // Verify reuse label
      long reuseUntil = Long.parseLong(clientCluster.getLabelsOrDefault(LABEL_REUSE_UNTIL, "0"));
      long now = System.currentTimeMillis();
      if (reuseUntil < now) {
        LOG.debug("Skipping expired cluster {}, reuse until {} is before now {}",
                  clientCluster.getClusterName(), reuseUntil, now);
        return false;
      }
      return true;
    }).findAny();
    if (cluster.isPresent()) {
      String clusterName = cluster.get().getName();
      LOG.info("Found cluster to reuse: {}", clusterName);
      // Add cdap-reuse-for to find cluster later if needed
      // And remove reuseUntil to indicate the cluster is taken
      client.updateClusterLabels(clusterName, Collections.singletonMap(LABEL_RUN_KEY, clusterKey),
                                 Collections.singleton(LABEL_REUSE_UNTIL));
    } else {
      LOG.debug("Could not find any available cluster to reuse.");
    }
    return cluster.orElse(null);
  } catch (Exception e) {
    // Any failure while searching falls back to "no reusable cluster".
    LOG.warn("Error retrieving clusters to reuse, will create a new one", e);
    return null;
  } finally {
    reuseLock.unlock();
  }
}
use of io.cdap.cdap.runtime.spi.provisioner.ProvisionerContext in project cdap by caskdata.
the class ExistingDataprocProvisioner method createCluster.
/**
 * Resolves the already existing Dataproc cluster named in the context properties.
 *
 * <p>When the runtime monitor type is SSH, the SSH user and key from the properties are required
 * and are set on the SSH context (if one is present). The cluster's labels are then updated with
 * the system labels on a best-effort basis, and the cluster is returned only if it is running.
 *
 * @param context the provisioner context
 * @return the running cluster with the configured name
 * @throws Exception if SSH credentials are missing for SSH monitoring, or if the cluster does not
 *         exist or is not in running state
 */
@Override
public Cluster createCluster(ProvisionerContext context) throws Exception {
  Map<String, String> properties = createContextProperties(context);
  DataprocConf dataprocConf = DataprocConf.create(properties);

  if (RuntimeMonitorType.SSH == context.getRuntimeMonitorType()) {
    String sshUser = properties.get(SSH_USER);
    String sshKey = properties.get(SSH_KEY);
    boolean credentialsComplete = !Strings.isNullOrEmpty(sshUser) && !Strings.isNullOrEmpty(sshKey);
    if (!credentialsComplete) {
      throw new DataprocRuntimeException("SSH User and key are required for monitoring through SSH.");
    }
    SSHPublicKey publicKey = new SSHPublicKey(sshUser, "");
    SSHKeyPair keyPair = new SSHKeyPair(publicKey, () -> sshKey.getBytes(StandardCharsets.UTF_8));
    // The ssh context is expected to be non-null; guard anyway in case of a platform bug.
    Optional.ofNullable(context.getSSHContext()).ifPresent(ssh -> ssh.setSSHKeyPair(keyPair));
  }

  String clusterName = properties.get(CLUSTER_NAME);
  try (DataprocClient client = DataprocClient.fromConf(dataprocConf, false)) {
    try {
      client.updateClusterLabels(clusterName, getSystemLabels());
    } catch (DataprocRuntimeException e) {
      // Label update failure is tolerated; include the stacktrace only when trace logging is on.
      if (!LOG.isTraceEnabled()) {
        LOG.debug("Cannot update cluster labels due to {}", e.getMessage());
      } else {
        LOG.trace("Cannot update cluster labels due to {}", e.getMessage(), e);
      }
    }
    return client.getCluster(clusterName)
      .filter(candidate -> candidate.getStatus() == ClusterStatus.RUNNING)
      .orElseThrow(() -> new DataprocRuntimeException(
        "Dataproc cluster " + clusterName + " does not exist or not in running state."));
  }
}
use of io.cdap.cdap.runtime.spi.provisioner.ProvisionerContext in project cdap by caskdata.
the class ProvisioningService method getClusterStatus.
/**
 * Queries the {@link ClusterStatus} of the cluster that executes the given program run.
 *
 * <p>{@link ClusterStatus#NOT_EXISTS} is returned when the provisioner named by the run's profile
 * is not available, or when macros cannot be evaluated while building the provisioner context.
 * The status query itself is retried while it fails with {@link RetryableProvisionException}.
 *
 * @param programRunId the program run id for checking the cluster status
 * @param programOptions the program options for the given run
 * @param cluster the {@link Cluster} information for the given run
 * @param userId the user id to use for {@link SecureStore} operation.
 * @return the {@link ClusterStatus}
 * @throws Exception if non-retryable exception is encountered when querying cluster status
 */
public ClusterStatus getClusterStatus(ProgramRunId programRunId, ProgramOptions programOptions, Cluster cluster, String userId) throws Exception {
  Map<String, String> systemArgs = programOptions.getArguments().asMap();
  Provisioner provisioner =
    provisionerInfo.get().provisioners.get(SystemArguments.getProfileProvisioner(systemArgs));
  if (provisioner == null) {
    // Nothing can be queried without a provisioner.
    return ClusterStatus.NOT_EXISTS;
  }

  Map<String, String> profileProperties = SystemArguments.getProfileProperties(systemArgs);
  ProvisionerContext context;
  try {
    // An SSH context is only supplied when the provisioner offers no runtime job manager.
    boolean hasJobManager = getRuntimeJobManager(programRunId, programOptions).isPresent();
    DefaultSSHContext sshContext = hasJobManager
      ? null
      : new DefaultSSHContext(Networks.getAddress(cConf, Constants.NETWORK_PROXY_ADDRESS), null, null);
    context = createContext(cConf, programOptions, programRunId, userId, profileProperties, sshContext);
  } catch (InvalidMacroException e) {
    // Not expected to happen
    runWithProgramLogging(programRunId, systemArgs,
                          () -> LOG.error("Could not evaluate macros while checking cluster status.", e));
    return ClusterStatus.NOT_EXISTS;
  }

  return Retries.callWithRetries(
    () -> provisioner.getClusterStatus(context, cluster),
    RetryStrategies.exponentialDelay(1, 5, TimeUnit.SECONDS),
    failure -> failure instanceof RetryableProvisionException);
}
Aggregations