Search in sources :

Example 1 with ProgramRunCluster

use of io.cdap.cdap.proto.ProgramRunCluster in project cdap by caskdata.

the class DefaultStoreTest method testLogProgramRunHistory.

/**
 * Records a mix of workflow runs in the store (failed, completed, suspended, still running, plus runs of a
 * different program and of a different account) and verifies that {@code store.getRuns} and {@code store.getRun}
 * return the expected records for each status and time range.
 */
@Test
public void testLogProgramRunHistory() {
    // Properties expected on run records that were started with no runtime arguments
    Map<String, String> noRuntimeArgsProps = ImmutableMap.of("runtimeArgs", GSON.toJson(ImmutableMap.<String, String>of()));
    // record finished Workflow: started 20 seconds ago, stopped 10 seconds ago with the ERROR run status
    ProgramId programId = new ProgramId("account1", "application1", ProgramType.WORKFLOW, "wf1");
    long now = System.currentTimeMillis();
    long startTimeSecs = TimeUnit.MILLISECONDS.toSeconds(now);
    RunId run1 = RunIds.generate(now - 20000);
    ArtifactId artifactId = programId.getNamespaceId().artifact("testArtifact", "1.0").toApiArtifactId();
    setStartAndRunning(programId.run(run1.getId()), artifactId);
    store.setStop(programId.run(run1.getId()), startTimeSecs - 10, ProgramController.State.ERROR.getRunStatus(), AppFabricTestHelper.createSourceId(++sourceId));
    // record another finished Workflow: started 10 seconds ago, stopped 5 seconds ago with the COMPLETED run status
    RunId run2 = RunIds.generate(now - 10000);
    setStartAndRunning(programId.run(run2.getId()), artifactId);
    store.setStop(programId.run(run2.getId()), startTimeSecs - 5, ProgramController.State.COMPLETED.getRunStatus(), AppFabricTestHelper.createSourceId(++sourceId));
    // record a suspended Workflow, started 7.5 seconds ago
    // NOTE(review): the meaning of the trailing -1 argument to setSuspend is not visible here - confirm
    RunId run21 = RunIds.generate(now - 7500);
    setStartAndRunning(programId.run(run21.getId()), artifactId);
    store.setSuspend(programId.run(run21.getId()), AppFabricTestHelper.createSourceId(++sourceId), -1);
    // record not finished Workflow, started now and never stopped
    RunId run3 = RunIds.generate(now);
    setStartAndRunning(programId.run(run3.getId()), artifactId);
    // For a RunRecordDetail that has not yet been completed, getStopTs should return null
    RunRecordDetail runRecord = store.getRun(programId.run(run3.getId()));
    Assert.assertNotNull(runRecord);
    Assert.assertNull(runRecord.getStopTs());
    // record run of different program (wf2); it must not show up in the queries for programId below
    ProgramId programId2 = new ProgramId("account1", "application1", ProgramType.WORKFLOW, "wf2");
    RunId run4 = RunIds.generate(now - 5000);
    setStartAndRunning(programId2.run(run4.getId()), artifactId);
    store.setStop(programId2.run(run4.getId()), startTimeSecs - 4, ProgramController.State.COMPLETED.getRunStatus(), AppFabricTestHelper.createSourceId(++sourceId));
    // record for different account, reusing the run id of run3
    setStartAndRunning(new ProgramId("account2", "application1", ProgramType.WORKFLOW, "wf1").run(run3.getId()), artifactId);
    // Ideally the store would expose a dedicated "get" method for this, but there is none,
    // so getRuns is queried with an explicit status and time range
    Map<ProgramRunId, RunRecordDetail> successHistorymap = store.getRuns(programId, ProgramRunStatus.COMPLETED, 0, Long.MAX_VALUE, Integer.MAX_VALUE);
    Map<ProgramRunId, RunRecordDetail> failureHistorymap = store.getRuns(programId, ProgramRunStatus.FAILED, startTimeSecs - 20, startTimeSecs - 10, Integer.MAX_VALUE);
    // The narrow time range and the unbounded time range must yield the same failed runs
    Assert.assertEquals(failureHistorymap, store.getRuns(programId, ProgramRunStatus.FAILED, 0, Long.MAX_VALUE, Integer.MAX_VALUE));
    Map<ProgramRunId, RunRecordDetail> suspendedHistorymap = store.getRuns(programId, ProgramRunStatus.SUSPENDED, startTimeSecs - 20, startTimeSecs, Integer.MAX_VALUE);
    // only finished + succeeded runs should be returned
    Assert.assertEquals(1, successHistorymap.size());
    // only finished + failed runs should be returned
    Assert.assertEquals(1, failureHistorymap.size());
    // only suspended runs should be returned
    Assert.assertEquals(1, suspendedHistorymap.size());
    // records are expected to be sorted by start time latest to earliest; each map holds a single record here,
    // so only that record's timestamps and status are verified
    RunRecordDetail run = successHistorymap.values().iterator().next();
    Assert.assertEquals(startTimeSecs - 10, run.getStartTs());
    Assert.assertEquals(Long.valueOf(startTimeSecs - 5), run.getStopTs());
    Assert.assertEquals(ProgramController.State.COMPLETED.getRunStatus(), run.getStatus());
    run = failureHistorymap.values().iterator().next();
    Assert.assertEquals(startTimeSecs - 20, run.getStartTs());
    Assert.assertEquals(Long.valueOf(startTimeSecs - 10), run.getStopTs());
    Assert.assertEquals(ProgramController.State.ERROR.getRunStatus(), run.getStatus());
    run = suspendedHistorymap.values().iterator().next();
    Assert.assertEquals(run21.getId(), run.getPid());
    Assert.assertEquals(ProgramController.State.SUSPENDED.getRunStatus(), run.getStatus());
    // Assert all history: run1, run2, run21 and run3 of programId
    Map<ProgramRunId, RunRecordDetail> allHistorymap = store.getRuns(programId, ProgramRunStatus.ALL, startTimeSecs - 20, startTimeSecs + 1, Integer.MAX_VALUE);
    Assert.assertEquals(allHistorymap.toString(), 4, allHistorymap.size());
    // Assert running programs: only run3 is still running
    Map<ProgramRunId, RunRecordDetail> runningHistorymap = store.getRuns(programId, ProgramRunStatus.RUNNING, startTimeSecs, startTimeSecs + 1, 100);
    Assert.assertEquals(1, runningHistorymap.size());
    Assert.assertEquals(runningHistorymap, store.getRuns(programId, ProgramRunStatus.RUNNING, 0, Long.MAX_VALUE, 100));
    // Get a run record for running program; getRun must agree with the record returned by getRuns
    RunRecordDetail expectedRunning = runningHistorymap.values().iterator().next();
    Assert.assertNotNull(expectedRunning);
    RunRecordDetail actualRunning = store.getRun(programId.run(expectedRunning.getPid()));
    Assert.assertEquals(expectedRunning, actualRunning);
    // Get a run record for completed run
    RunRecordDetail expectedCompleted = successHistorymap.values().iterator().next();
    Assert.assertNotNull(expectedCompleted);
    RunRecordDetail actualCompleted = store.getRun(programId.run(expectedCompleted.getPid()));
    Assert.assertEquals(expectedCompleted, actualCompleted);
    // Get a run record for suspended run
    RunRecordDetail expectedSuspended = suspendedHistorymap.values().iterator().next();
    Assert.assertNotNull(expectedSuspended);
    RunRecordDetail actualSuspended = store.getRun(programId.run(expectedSuspended.getPid()));
    Assert.assertEquals(expectedSuspended, actualSuspended);
    // Cluster expected on the records built below
    // NOTE(review): presumably the setStart/setStartAndRunning helpers record a PROVISIONED cluster with 0 nodes - confirm
    ProgramRunCluster emptyCluster = new ProgramRunCluster(ProgramRunClusterStatus.PROVISIONED, null, 0);
    // Record workflow that starts but encounters error before it runs
    RunId run7 = RunIds.generate(now);
    Map<String, String> emptyArgs = ImmutableMap.of();
    setStart(programId.run(run7.getId()), emptyArgs, emptyArgs, artifactId);
    store.setStop(programId.run(run7.getId()), startTimeSecs + 1, ProgramController.State.ERROR.getRunStatus(), AppFabricTestHelper.createSourceId(++sourceId));
    // The expected record carries the source id of the setStop call above (sourceId was already incremented there)
    RunRecordDetail expectedRunRecord7 = RunRecordDetail.builder().setProgramRunId(programId.run(run7)).setStartTime(startTimeSecs).setStopTime(startTimeSecs + 1).setStatus(ProgramRunStatus.FAILED).setProperties(noRuntimeArgsProps).setCluster(emptyCluster).setArtifactId(artifactId).setSourceId(AppFabricTestHelper.createSourceId(sourceId)).build();
    RunRecordDetail actualRecord7 = store.getRun(programId.run(run7.getId()));
    Assert.assertEquals(expectedRunRecord7, actualRecord7);
    // Record workflow that starts and suspends before it runs
    RunId run8 = RunIds.generate(now);
    setStart(programId.run(run8.getId()), emptyArgs, emptyArgs, artifactId);
    store.setSuspend(programId.run(run8.getId()), AppFabricTestHelper.createSourceId(++sourceId), -1);
    // No stop time is expected because the run was suspended, not stopped
    RunRecordDetail expectedRunRecord8 = RunRecordDetail.builder().setProgramRunId(programId.run(run8)).setStartTime(startTimeSecs).setStatus(ProgramRunStatus.SUSPENDED).setProperties(noRuntimeArgsProps).setCluster(emptyCluster).setArtifactId(artifactId).setSourceId(AppFabricTestHelper.createSourceId(sourceId)).build();
    RunRecordDetail actualRecord8 = store.getRun(programId.run(run8.getId()));
    Assert.assertEquals(expectedRunRecord8, actualRecord8);
    // Record workflow that is killed while suspended
    RunId run9 = RunIds.generate(now);
    setStartAndRunning(programId.run(run9.getId()), artifactId);
    store.setSuspend(programId.run(run9.getId()), AppFabricTestHelper.createSourceId(++sourceId), -1);
    store.setStop(programId.run(run9.getId()), startTimeSecs + 5, ProgramRunStatus.KILLED, AppFabricTestHelper.createSourceId(++sourceId));
    // NOTE(review): the expected run time of startTimeSecs + 1 presumably mirrors what setStartAndRunning records - confirm
    RunRecordDetail expectedRunRecord9 = RunRecordDetail.builder().setProgramRunId(programId.run(run9)).setStartTime(startTimeSecs).setRunTime(startTimeSecs + 1).setStopTime(startTimeSecs + 5).setStatus(ProgramRunStatus.KILLED).setProperties(noRuntimeArgsProps).setCluster(emptyCluster).setArtifactId(artifactId).setSourceId(AppFabricTestHelper.createSourceId(sourceId)).build();
    RunRecordDetail actualRecord9 = store.getRun(programId.run(run9.getId()));
    Assert.assertEquals(expectedRunRecord9, actualRecord9);
    // Non-existent run record should give null
    Assert.assertNull(store.getRun(programId.run(UUID.randomUUID().toString())));
    // Searching for history in wrong time range should give us no results
    Assert.assertTrue(store.getRuns(programId, ProgramRunStatus.COMPLETED, startTimeSecs - 5000, startTimeSecs - 2000, Integer.MAX_VALUE).isEmpty());
    Assert.assertTrue(store.getRuns(programId, ProgramRunStatus.ALL, startTimeSecs - 5000, startTimeSecs - 2000, Integer.MAX_VALUE).isEmpty());
}
Also used : ProgramRunCluster(io.cdap.cdap.proto.ProgramRunCluster) ArtifactId(io.cdap.cdap.api.artifact.ArtifactId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramId(io.cdap.cdap.proto.id.ProgramId) RunId(org.apache.twill.api.RunId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) Test(org.junit.Test)

Example 2 with ProgramRunCluster

use of io.cdap.cdap.proto.ProgramRunCluster in project cdap by caskdata.

the class AppMetadataStore method recordProgramProvisioning.

/**
 * Writes the initial run record for a program run that is provisioning compute resources. The record is
 * created with run status {@link ProgramRunStatus#PENDING} and cluster status
 * {@link ProgramRunClusterStatus#PROVISIONING}.
 *
 * <p>The request is logged and ignored when the run id carries no timestamp, when a run record already exists
 * for the run, or when no profile can be resolved from the system arguments.
 *
 * @param programRunId program run
 * @param runtimeArgs runtime arguments
 * @param systemArgs system arguments
 * @param sourceId unique id representing the source of program run status, such as the message id of the program
 *                 run status notification in TMS. The source id must increase as the recording time of the program
 *                 run status increases, so that the attempt to persist program run status older than the existing
 *                 program run status will be ignored
 * @param artifactId artifact id of the program's application -
 *                   it is null only for older messages that were not processed before upgrading to 5.0
 * @return the {@link RunRecordDetail} that was persisted, or {@code null} if the request was ignored
 * @throws IOException if thrown by the underlying store operations
 */
@Nullable
public RunRecordDetail recordProgramProvisioning(ProgramRunId programRunId,
                                                 Map<String, String> runtimeArgs,
                                                 Map<String, String> systemArgs,
                                                 byte[] sourceId,
                                                 @Nullable ArtifactId artifactId) throws IOException {
    // The start time is derived from the run id itself; -1 signals that the run id has no timestamp.
    long startSecs = RunIds.getTime(programRunId.getRun(), TimeUnit.SECONDS);
    if (startSecs == -1L) {
        LOG.error("Ignoring unexpected request to record provisioning state for program run {} that does not have "
                  + "a timestamp in the run id.", programRunId);
        return null;
    }

    // Provisioning is the first state of a run, so a run record must not exist yet.
    RunRecordDetail priorRecord = getRun(programRunId);
    if (priorRecord != null) {
        LOG.error("Ignoring unexpected request to record provisioning state for program run {} that has an existing "
                  + "run record in run state {} and cluster state {}.",
                  programRunId, priorRecord.getStatus(), priorRecord.getCluster().getStatus());
        return null;
    }

    // A run cannot be provisioned without a profile.
    Optional<ProfileId> resolvedProfile =
        SystemArguments.getProfileIdFromArgs(programRunId.getNamespaceId(), systemArgs);
    if (!resolvedProfile.isPresent()) {
        LOG.error("Ignoring unexpected request to record provisioning state for program run {} that does not have "
                  + "a profile assigned to it.", programRunId);
        return null;
    }

    // End time and node count are not known yet while the cluster is still being provisioned.
    ProgramRunCluster provisioningCluster =
        new ProgramRunCluster(ProgramRunClusterStatus.PROVISIONING, null, null);
    RunRecordDetail newRecord = RunRecordDetail.builder()
        .setProgramRunId(programRunId)
        .setStartTime(startSecs)
        .setStatus(ProgramRunStatus.PENDING)
        .setProperties(getRecordProperties(systemArgs, runtimeArgs))
        .setSystemArgs(systemArgs)
        .setCluster(provisioningCluster)
        .setProfileId(resolvedProfile.get())
        .setPeerName(systemArgs.get(ProgramOptionConstants.PEER_NAME))
        .setSourceId(sourceId)
        .setArtifactId(artifactId)
        .setPrincipal(systemArgs.get(ProgramOptionConstants.PRINCIPAL))
        .build();

    writeNewRunRecord(newRecord, TYPE_RUN_RECORD_ACTIVE);
    LOG.trace("Recorded {} for program {}", ProgramRunClusterStatus.PROVISIONING, programRunId);
    return newRecord;
}
Also used : ProfileId(io.cdap.cdap.proto.id.ProfileId) ProgramRunCluster(io.cdap.cdap.proto.ProgramRunCluster) Nullable(javax.annotation.Nullable)

Example 3 with ProgramRunCluster

use of io.cdap.cdap.proto.ProgramRunCluster in project cdap by caskdata.

the class AppMetadataStore method recordProgramDeprovisioning.

/**
 * Moves the run record of a program run into cluster state {@link ProgramRunClusterStatus#DEPROVISIONING}.
 * The existing record is deleted and a copy carrying the new cluster state and source id is written under the
 * completed run record type. If the current status has a higher source id, this call will be ignored.
 *
 * @param programRunId program run
 * @param sourceId unique id representing the source of program run status, such as the message id of the program
 *                 run status notification in TMS. The source id must increase as the recording time of the program
 *                 run status increases, so that the attempt to persist program run status older than the existing
 *                 program run status will be ignored
 * @return {@link RunRecordDetail} that was persisted, or {@code null} if the update was ignored.
 * @throws IOException if thrown by the underlying store operations
 */
@Nullable
public RunRecordDetail recordProgramDeprovisioning(ProgramRunId programRunId,
                                                   byte[] sourceId) throws IOException {
    RunRecordDetail current = getRun(programRunId);
    if (current == null) {
        LOG.debug("Ignoring unexpected transition of program run {} to cluster state {} with no existing run record.",
                  programRunId, ProgramRunClusterStatus.DEPROVISIONING);
        return null;
    }

    boolean transitionAllowed =
        isValid(current, current.getStatus(), ProgramRunClusterStatus.DEPROVISIONING, sourceId);
    if (!transitionAllowed) {
        return null;
    }

    // Replace the stored record: remove the old row, then write the updated copy under the completed-type key.
    delete(current);
    List<Field<?>> completedKey =
        getProgramRunInvertedTimeKey(TYPE_RUN_RECORD_COMPLETED, programRunId, current.getStartTs());
    // The node count is carried over from the existing cluster; no end time is set while deprovisioning.
    ProgramRunCluster deprovisioningCluster = new ProgramRunCluster(
        ProgramRunClusterStatus.DEPROVISIONING, null, current.getCluster().getNumNodes());
    RunRecordDetail updated = RunRecordDetail.builder(current)
        .setCluster(deprovisioningCluster)
        .setSourceId(sourceId)
        .build();
    writeToRunRecordTableWithPrimaryKeys(completedKey, updated);

    LOG.trace("Recorded {} for program {}", ProgramRunClusterStatus.DEPROVISIONING, programRunId);
    return updated;
}
Also used : ProgramRunCluster(io.cdap.cdap.proto.ProgramRunCluster) Field(io.cdap.cdap.spi.data.table.field.Field) Nullable(javax.annotation.Nullable)

Example 4 with ProgramRunCluster

use of io.cdap.cdap.proto.ProgramRunCluster in project cdap by caskdata.

the class AppMetadataStore method recordProgramProvisioned.

/**
 * Moves the run record of a program run into cluster state {@link ProgramRunClusterStatus#PROVISIONED},
 * storing the number of nodes that were provisioned. The existing record is deleted and a copy carrying the
 * new cluster state and source id is written under the active run record type. If the current status has a
 * higher source id, this call will be ignored.
 *
 * @param programRunId program run
 * @param numNodes number of cluster nodes provisioned
 * @param sourceId unique id representing the source of program run status, such as the message id of the program
 *                 run status notification in TMS. The source id must increase as the recording time of the program
 *                 run status increases, so that the attempt to persist program run status older than the existing
 *                 program run status will be ignored
 * @return {@link RunRecordDetail} that was persisted, or {@code null} if the update was ignored.
 * @throws IOException if thrown by the underlying store operations
 */
@Nullable
public RunRecordDetail recordProgramProvisioned(ProgramRunId programRunId,
                                                int numNodes,
                                                byte[] sourceId) throws IOException {
    RunRecordDetail current = getRun(programRunId);
    if (current == null) {
        LOG.warn("Ignoring unexpected request to transition program run {} from non-existent state to cluster state {}.",
                 programRunId, ProgramRunClusterStatus.PROVISIONED);
        return null;
    }

    boolean transitionAllowed =
        isValid(current, current.getStatus(), ProgramRunClusterStatus.PROVISIONED, sourceId);
    if (!transitionAllowed) {
        return null;
    }

    // Replace the stored record: remove the old row, then write the updated copy under the active-type key.
    delete(current);
    List<Field<?>> activeKey =
        getProgramRunInvertedTimeKey(TYPE_RUN_RECORD_ACTIVE, programRunId, current.getStartTs());
    // No end time is set for a freshly provisioned cluster.
    ProgramRunCluster provisionedCluster =
        new ProgramRunCluster(ProgramRunClusterStatus.PROVISIONED, null, numNodes);
    RunRecordDetail updated = RunRecordDetail.builder(current)
        .setCluster(provisionedCluster)
        .setSourceId(sourceId)
        .build();
    writeToRunRecordTableWithPrimaryKeys(activeKey, updated);

    LOG.trace("Recorded {} for program {}", ProgramRunClusterStatus.PROVISIONED, programRunId);
    return updated;
}
Also used : ProgramRunCluster(io.cdap.cdap.proto.ProgramRunCluster) Field(io.cdap.cdap.spi.data.table.field.Field) Nullable(javax.annotation.Nullable)

Example 5 with ProgramRunCluster

use of io.cdap.cdap.proto.ProgramRunCluster in project cdap by caskdata.

the class AppMetadataStore method recordProgramOrphaned.

/**
 * Record that the program run has been orphaned. The existing run record is deleted and a copy carrying
 * cluster state {@link ProgramRunClusterStatus#ORPHANED}, the given end time and the new source id is written
 * under the completed run record type. If the current status has a higher source id, this call will be ignored.
 *
 * @param programRunId program run
 * @param endTs timestamp in seconds for when the cluster was orphaned
 * @param sourceId unique id representing the source of program run status, such as the message id of the program
 *                 run status notification in TMS. The source id must increase as the recording time of the program
 *                 run status increases, so that the attempt to persist program run status older than the existing
 *                 program run status will be ignored
 * @return {@link RunRecordDetail} that was persisted, or {@code null} if the update was ignored.
 * @throws IOException if thrown by the underlying store operations
 */
@Nullable
public RunRecordDetail recordProgramOrphaned(ProgramRunId programRunId, long endTs, byte[] sourceId) throws IOException {
    RunRecordDetail existing = getRun(programRunId);
    if (existing == null) {
        // Report the state this method actually transitions to (ORPHANED), matching the isValid check
        // and the trace log below.
        LOG.debug("Ignoring unexpected transition of program run {} to cluster state {} with no existing run record.", programRunId, ProgramRunClusterStatus.ORPHANED);
        return null;
    }
    if (!isValid(existing, existing.getStatus(), ProgramRunClusterStatus.ORPHANED, sourceId)) {
        return null;
    }
    // Replace the stored record: remove the old row, then write the updated copy under the completed-type key.
    delete(existing);
    List<Field<?>> key = getProgramRunInvertedTimeKey(TYPE_RUN_RECORD_COMPLETED, programRunId, existing.getStartTs());
    // The node count is carried over from the existing cluster; endTs marks when the cluster was orphaned.
    ProgramRunCluster cluster = new ProgramRunCluster(ProgramRunClusterStatus.ORPHANED, endTs, existing.getCluster().getNumNodes());
    RunRecordDetail meta = RunRecordDetail.builder(existing).setCluster(cluster).setSourceId(sourceId).build();
    writeToRunRecordTableWithPrimaryKeys(key, meta);
    LOG.trace("Recorded {} for program {}", ProgramRunClusterStatus.ORPHANED, programRunId);
    return meta;
}
Also used : ProgramRunCluster(io.cdap.cdap.proto.ProgramRunCluster) Field(io.cdap.cdap.spi.data.table.field.Field) Nullable(javax.annotation.Nullable)

Aggregations

ProgramRunCluster (io.cdap.cdap.proto.ProgramRunCluster)7 Nullable (javax.annotation.Nullable)5 Field (io.cdap.cdap.spi.data.table.field.Field)4 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)2 RunId (org.apache.twill.api.RunId)2 LoggingEvent (ch.qos.logback.classic.spi.LoggingEvent)1 ArtifactId (io.cdap.cdap.api.artifact.ArtifactId)1 LogEvent (io.cdap.cdap.logging.read.LogEvent)1 LogOffset (io.cdap.cdap.logging.read.LogOffset)1 ProfileId (io.cdap.cdap.proto.id.ProfileId)1 ProgramId (io.cdap.cdap.proto.id.ProgramId)1 Test (org.junit.Test)1