use of io.cdap.cdap.internal.app.store.RunRecordDetail in project cdap by caskdata.
the class DistributedProgramRuntimeService method list.
@Override
public Map<RunId, RuntimeInfo> list(ProgramType type) {
Map<RunId, RuntimeInfo> result = new HashMap<>(super.list(type));
// Table holds the Twill RunId and TwillController associated with the program matching the input type
Table<ProgramId, RunId, TwillController> twillProgramInfo = HashBasedTable.create();
List<RuntimeInfo> runtimeInfos = getRuntimeInfos();
// Goes through all live application and fill the twillProgramInfo table
for (TwillRunner.LiveInfo liveInfo : twillRunner.lookupLive()) {
String appName = liveInfo.getApplicationName();
ProgramId programId = TwillAppNames.fromTwillAppName(appName, false);
if (programId == null) {
continue;
}
if (!type.equals(programId.getType())) {
continue;
}
for (TwillController controller : liveInfo.getControllers()) {
RunId twillRunId = controller.getRunId();
// If it already in the runtime info, no need to lookup
if (runtimeInfos.stream().anyMatch(info -> twillRunId.equals(info.getTwillRunId()))) {
continue;
}
twillProgramInfo.put(programId, twillRunId, controller);
}
}
if (twillProgramInfo.isEmpty()) {
return ImmutableMap.copyOf(result);
}
final Set<RunId> twillRunIds = twillProgramInfo.columnKeySet();
Collection<RunRecordDetail> activeRunRecords;
synchronized (this) {
activeRunRecords = store.getRuns(ProgramRunStatus.RUNNING, record -> record.getTwillRunId() != null && twillRunIds.contains(org.apache.twill.internal.RunIds.fromString(record.getTwillRunId()))).values();
}
for (RunRecordDetail record : activeRunRecords) {
String twillRunId = record.getTwillRunId();
if (twillRunId == null) {
// This is unexpected. Just log and ignore the run record
LOG.warn("No twill runId for in run record {}.", record);
continue;
}
RunId twillRunIdFromRecord = org.apache.twill.internal.RunIds.fromString(twillRunId);
// Get the CDAP RunId from RunRecord
RunId runId = RunIds.fromString(record.getPid());
// Get the Program and TwillController for the current twillRunId
Map<ProgramId, TwillController> mapForTwillId = twillProgramInfo.columnMap().get(twillRunIdFromRecord);
Map.Entry<ProgramId, TwillController> entry = mapForTwillId.entrySet().iterator().next();
// Create RuntimeInfo for the current Twill RunId
if (result.computeIfAbsent(runId, rid -> createRuntimeInfo(entry.getKey(), rid, entry.getValue())) == null) {
LOG.warn("Unable to create runtime info for program {} with run id {}", entry.getKey(), runId);
}
}
return ImmutableMap.copyOf(result);
}
use of io.cdap.cdap.internal.app.store.RunRecordDetail in project cdap by caskdata.
the class RunRecordCorrectorService method doFixRunRecords.
/**
* Fix all the possible inconsistent states for RunRecords that shows it is in RUNNING state but actually not
* via check to {@link ProgramRuntimeService} for a type of CDAP program.
*
* @return the set of fixed {@link ProgramRunId}.
*/
private Set<ProgramRunId> doFixRunRecords() {
LOG.trace("Start getting run records not actually running ...");
// Get run records in STARTING, RUNNING and SUSPENDED states that are actually not running
// Do it in micro batches of transactions to avoid tx timeout
Set<ProgramRunId> fixedPrograms = new HashSet<>();
Predicate<RunRecordDetail> filter = createFilter(fixedPrograms);
for (ProgramRunStatus status : NOT_STOPPED_STATUSES) {
while (true) {
// runs are not guaranteed to come back in order of start time, so need to scan the entire time range
// each time. Should not be worse in performance than specifying a more restrictive time range
// because time range is just used as a read-time filter.
Map<ProgramRunId, RunRecordDetail> runs = store.getRuns(status, 0L, Long.MAX_VALUE, txBatchSize, filter);
LOG.trace("{} run records in {} state but are not actually running", runs.size(), status);
if (runs.isEmpty()) {
break;
}
for (RunRecordDetail record : runs.values()) {
ProgramRunId programRunId = record.getProgramRunId();
String msg = String.format("Fixed RunRecord for program run %s in %s state because it is actually not running", programRunId, record.getStatus());
programStateWriter.error(programRunId, new ProgramRunAbortedException(msg));
fixedPrograms.add(programRunId);
LOG.warn(msg);
}
}
}
if (fixedPrograms.isEmpty()) {
LOG.trace("No RunRecord found with status in {}, but the program are not actually running", NOT_STOPPED_STATUSES);
} else {
LOG.warn("Fixed {} RunRecords with status in {}, but the programs are not actually running", fixedPrograms.size(), NOT_STOPPED_STATUSES);
}
return fixedPrograms;
}
use of io.cdap.cdap.internal.app.store.RunRecordDetail in project cdap by caskdata.
the class DirectRuntimeRequestValidatorTest method testFetcher.
@Test
public void testFetcher() throws BadRequestException {
ArtifactId artifactId = new ArtifactId("test", new ArtifactVersion("1.0"), ArtifactScope.USER);
ProgramRunId programRunId = NamespaceId.DEFAULT.app("app").spark("spark").run(RunIds.generate());
ProgramRunStatus programRunStatus = ProgramRunStatus.RUNNING;
RunRecordDetail runRecord = RunRecordDetail.builder().setProgramRunId(programRunId).setStartTime(System.currentTimeMillis()).setArtifactId(artifactId).setStatus(programRunStatus).setSystemArgs(ImmutableMap.of(SystemArguments.PROFILE_NAME, "default", SystemArguments.PROFILE_PROVISIONER, "native")).setProfileId(NamespaceId.DEFAULT.profile("native")).setSourceId(new byte[MessageId.RAW_ID_SIZE]).build();
MockProgramRunRecordFetcher runRecordFetcher = new MockProgramRunRecordFetcher().setRunRecord(runRecord);
RuntimeRequestValidator validator = new DirectRuntimeRequestValidator(cConf, txRunner, runRecordFetcher, accessEnforcer, authenticationContext);
// The first call should be hitting the run record fetching to fetch the run record.
ProgramRunInfo programRunInfo = validator.getProgramRunStatus(programRunId, new DefaultHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/"));
Assert.assertEquals(programRunStatus, programRunInfo.getProgramRunStatus());
// The second call will hit the runtime store, so it shouldn't matter what the run record fetch returns
runRecordFetcher.setRunRecord(null);
programRunInfo = validator.getProgramRunStatus(programRunId, new DefaultHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/"));
Assert.assertEquals(programRunStatus, programRunInfo.getProgramRunStatus());
}
use of io.cdap.cdap.internal.app.store.RunRecordDetail in project cdap by caskdata.
the class ProgramNotificationSubscriberService method handleClusterEvent.
/**
* Handles a notification related to cluster operations.
*
* @param programRunId program run id from the event
* @param clusterStatus cluster status from the event
* @param notification the notification to process
* @param messageIdBytes the unique ID for the notification message
* @param appMetadataStore the data table to use
* @param context the table context for performing table operations
* @return an {@link Optional} of {@link Runnable} to carry a task to execute after handling of this event completed.
* See {@link #postProcess()} for details.
* @throws IOException if failed to read/write to the app metadata store.
*/
private Optional<Runnable> handleClusterEvent(ProgramRunId programRunId, ProgramRunClusterStatus clusterStatus, Notification notification, byte[] messageIdBytes, AppMetadataStore appMetadataStore, StructuredTableContext context) throws IOException {
Map<String, String> properties = notification.getProperties();
ProgramOptions programOptions = ProgramOptions.fromNotification(notification, GSON);
String userId = properties.get(ProgramOptionConstants.USER_ID);
long endTs = getTimeSeconds(properties, ProgramOptionConstants.CLUSTER_END_TIME);
ProgramDescriptor programDescriptor = GSON.fromJson(properties.get(ProgramOptionConstants.PROGRAM_DESCRIPTOR), ProgramDescriptor.class);
switch(clusterStatus) {
case PROVISIONING:
appMetadataStore.recordProgramProvisioning(programRunId, programOptions.getUserArguments().asMap(), programOptions.getArguments().asMap(), messageIdBytes, programDescriptor.getArtifactId().toApiArtifactId());
ProvisionRequest provisionRequest = new ProvisionRequest(programRunId, programOptions, programDescriptor, userId);
return Optional.of(provisioningService.provision(provisionRequest, context));
case PROVISIONED:
Cluster cluster = GSON.fromJson(properties.get(ProgramOptionConstants.CLUSTER), Cluster.class);
appMetadataStore.recordProgramProvisioned(programRunId, cluster.getNodes().size(), messageIdBytes);
// Update the ProgramOptions system arguments to include information needed for program execution
Map<String, String> systemArgs = new HashMap<>(programOptions.getArguments().asMap());
systemArgs.put(ProgramOptionConstants.USER_ID, properties.get(ProgramOptionConstants.USER_ID));
systemArgs.put(ProgramOptionConstants.CLUSTER, properties.get(ProgramOptionConstants.CLUSTER));
systemArgs.put(ProgramOptionConstants.SECURE_KEYS_DIR, properties.get(ProgramOptionConstants.SECURE_KEYS_DIR));
ProgramOptions newProgramOptions = new SimpleProgramOptions(programOptions.getProgramId(), new BasicArguments(systemArgs), programOptions.getUserArguments());
// Publish the program STARTING state before starting the program
programStateWriter.start(programRunId, newProgramOptions, null, programDescriptor);
// emit provisioning time metric
long provisioningTime = System.currentTimeMillis() / 1000 - RunIds.getTime(programRunId.getRun(), TimeUnit.SECONDS);
SystemArguments.getProfileIdFromArgs(programRunId.getNamespaceId(), systemArgs).ifPresent(profileId -> emitProvisioningTimeMetric(programRunId, profileId, programOptions, provisioningTime));
break;
case DEPROVISIONING:
RunRecordDetail recordedMeta = appMetadataStore.recordProgramDeprovisioning(programRunId, messageIdBytes);
// or an invalid state transition. In both cases, we should not try to deprovision the cluster.
if (recordedMeta != null) {
return Optional.of(provisioningService.deprovision(programRunId, context));
}
break;
case DEPROVISIONED:
appMetadataStore.recordProgramDeprovisioned(programRunId, endTs, messageIdBytes);
break;
case ORPHANED:
appMetadataStore.recordProgramOrphaned(programRunId, endTs, messageIdBytes);
break;
}
return Optional.empty();
}
use of io.cdap.cdap.internal.app.store.RunRecordDetail in project cdap by caskdata.
the class ProgramNotificationSubscriberService method doStartUp.
@Override
protected void doStartUp() throws Exception {
super.doStartUp();
int batchSize = cConf.getInt(Constants.RuntimeMonitor.INIT_BATCH_SIZE);
RetryStrategy retryStrategy = RetryStrategies.fromConfiguration(cConf, "system.runtime.monitor.");
long startTs = System.currentTimeMillis();
Retries.runWithRetries(() -> store.scanActiveRuns(batchSize, (runRecordDetail) -> {
if (runRecordDetail.getStartTs() > startTs) {
return;
}
try {
if (runRecordDetail.getStatus() == ProgramRunStatus.PENDING) {
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
} else if (runRecordDetail.getStatus() == ProgramRunStatus.STARTING) {
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
// It is unknown what is the state of program runs in STARTING state.
// A STARTING message is published again to retry STARTING logic.
ProgramOptions programOptions = new SimpleProgramOptions(runRecordDetail.getProgramRunId().getParent(), new BasicArguments(runRecordDetail.getSystemArgs()), new BasicArguments(runRecordDetail.getUserArgs()));
LOG.debug("Retrying to start run {}.", runRecordDetail.getProgramRunId());
programStateWriter.start(runRecordDetail.getProgramRunId(), programOptions, null, this.store.loadProgram(runRecordDetail.getProgramRunId().getParent()));
}
} catch (Exception e) {
ProgramRunId programRunId = runRecordDetail.getProgramRunId();
LOG.warn("Retrying to start run {} failed. Marking it as failed.", programRunId, e);
programStateWriter.error(programRunId, e);
}
}), retryStrategy, e -> true);
}
Aggregations