Use of org.apache.helix.task.JobContext in project helix by apache.
The class TestJobFailureTaskNotStarted, method testTaskNotStarted.
@Test
public void testTaskNotStarted() throws InterruptedException {
setupUnbalancedDB();
final String BLOCK_WORKFLOW_NAME = "blockWorkflow";
final String FAIL_WORKFLOW_NAME = "failWorkflow";
final String FAIL_JOB_NAME = "failJob";
ConfigAccessor configAccessor = new ConfigAccessor(_gZkClient);
final int numTask = configAccessor.getClusterConfig(CLUSTER_NAME).getMaxConcurrentTaskPerInstance();
// Tasks target the unbalanced DB; the blocked instance is set up to get stuck on the INIT->RUNNING transition,
// so these tasks occupy all task threads on that instance.
JobConfig.Builder blockJobBuilder = new JobConfig.Builder().setWorkflow(BLOCK_WORKFLOW_NAME)
    .setTargetResource(UNBALANCED_DB_NAME)
    .setTargetPartitionStates(Sets.newHashSet(MasterSlaveSMD.States.MASTER.name()))
    .setCommand(MockTask.TASK_COMMAND)
    .setNumConcurrentTasksPerInstance(numTask);
Workflow.Builder blockWorkflowBuilder = new Workflow.Builder(BLOCK_WORKFLOW_NAME).addJob("blockJob", blockJobBuilder);
_driver.start(blockWorkflowBuilder.build());
Assert.assertTrue(TaskTestUtil.pollForAllTasksBlock(_manager.getHelixDataAccessor(),
    _blockedParticipant.getInstanceName(), numTask, 10000));
// Now all HelixTask threads on that instance are stuck in the INIT->RUNNING state transition (user tasks can't be submitted),
// so new tasks assigned to the instance won't start the INIT->RUNNING transition at all.
// Submit a job that is expected to fail: 2 tasks, 1 stuck and 1 failing, which makes the job fail.
JobConfig.Builder failJobBuilder = new JobConfig.Builder().setWorkflow(FAIL_WORKFLOW_NAME)
    .setTargetResource(DB_NAME)
    .setTargetPartitionStates(Sets.newHashSet(MasterSlaveSMD.States.MASTER.name()))
    .setCommand(MockTask.TASK_COMMAND)
    .setJobCommandConfigMap(ImmutableMap.of(MockTask.TASK_RESULT_STATUS, TaskResult.Status.FAILED.name()));
Workflow.Builder failWorkflowBuilder = new Workflow.Builder(FAIL_WORKFLOW_NAME).addJob(FAIL_JOB_NAME, failJobBuilder);
_driver.start(failWorkflowBuilder.build());
_driver.pollForJobState(FAIL_WORKFLOW_NAME, TaskUtil.getNamespacedJobName(FAIL_WORKFLOW_NAME, FAIL_JOB_NAME), TaskState.FAILED);
_driver.pollForWorkflowState(FAIL_WORKFLOW_NAME, TaskState.FAILED);
JobContext jobContext = _driver.getJobContext(TaskUtil.getNamespacedJobName(FAIL_WORKFLOW_NAME, FAIL_JOB_NAME));
for (int pId : jobContext.getPartitionSet()) {
if (jobContext.getAssignedParticipant(pId).equals(_blockedParticipant.getInstanceName())) {
Assert.assertEquals(jobContext.getPartitionState(pId), TaskPartitionState.TASK_ABORTED);
} else if (jobContext.getAssignedParticipant(pId).equals(_normalParticipant.getInstanceName())) {
Assert.assertEquals(jobContext.getPartitionState(pId), TaskPartitionState.TASK_ERROR);
} else {
throw new HelixException("There should be only 2 instances, 1 blocked, 1 normal.");
}
}
}
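For readers outside the test harness, a minimal sketch of the JobContext lookup pattern used above; it assumes the org.apache.helix and org.apache.helix.task imports, a caller-supplied connected HelixManager, and an already-submitted workflow. The helper name and the workflow/job names are illustrative, not part of the test.
static void printTaskStates(HelixManager manager, String workflowName, String jobName) {
  // Obtain the driver and the per-job context; getJobContext returns null if the job has not been scheduled yet.
  TaskDriver driver = new TaskDriver(manager);
  JobContext jobContext = driver.getJobContext(TaskUtil.getNamespacedJobName(workflowName, jobName));
  if (jobContext == null) {
    return;
  }
  // Each task partition records the participant it was assigned to and its last known state.
  for (int pId : jobContext.getPartitionSet()) {
    System.out.println("partition " + pId + " on " + jobContext.getAssignedParticipant(pId)
        + " -> " + jobContext.getPartitionState(pId));
  }
}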
Use of org.apache.helix.task.JobContext in project helix by apache.
The class TestJobQueueCleanUp, method testJobQueueAutoCleanUp.
@Test
public void testJobQueueAutoCleanUp() throws InterruptedException {
int capacity = 10;
String queueName = TestHelper.getTestMethodName();
JobQueue.Builder builder = TaskTestUtil.buildJobQueue(queueName, capacity);
WorkflowConfig.Builder cfgBuilder = new WorkflowConfig.Builder(builder.getWorkflowConfig());
cfgBuilder.setJobPurgeInterval(1000);
builder.setWorkflowConfig(cfgBuilder.build());
JobConfig.Builder jobBuilder = new JobConfig.Builder()
    .setTargetResource(WorkflowGenerator.DEFAULT_TGT_DB)
    .setCommand(MockTask.TASK_COMMAND)
    .setMaxAttemptsPerTask(2)
    .setJobCommandConfigMap(ImmutableMap.of(MockTask.SUCCESS_COUNT_BEFORE_FAIL, String.valueOf(capacity / 2)))
    .setExpiry(200L);
Set<String> deletedJobs = new HashSet<String>();
Set<String> remainJobs = new HashSet<String>();
for (int i = 0; i < capacity; i++) {
builder.enqueueJob("JOB" + i, jobBuilder);
if (i < capacity / 2) {
deletedJobs.add("JOB" + i);
} else {
remainJobs.add(TaskUtil.getNamespacedJobName(queueName, "JOB" + i));
}
}
_driver.start(builder.build());
_driver.pollForJobState(queueName, TaskUtil.getNamespacedJobName(queueName, "JOB" + (capacity - 1)), TaskState.FAILED);
Thread.sleep(2000);
WorkflowConfig config = _driver.getWorkflowConfig(queueName);
Assert.assertEquals(config.getJobDag().getAllNodes(), remainJobs);
WorkflowContext context = _driver.getWorkflowContext(queueName);
Assert.assertEquals(context.getJobStates().keySet(), remainJobs);
Assert.assertTrue(remainJobs.containsAll(context.getJobStartTimes().keySet()));
for (String job : deletedJobs) {
JobConfig cfg = _driver.getJobConfig(job);
JobContext ctx = _driver.getJobContext(job);
Assert.assertNull(cfg);
Assert.assertNull(ctx);
}
}
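A minimal sketch of the queue setup this test relies on: a JobQueue whose WorkflowConfig enables periodic purging and whose jobs carry a short expiry. It assumes a connected TaskDriver ("driver") and the WorkflowConfig.Builder(String) constructor; the queue name, job name, target resource, and task command are placeholders.
JobQueue.Builder queueBuilder = new JobQueue.Builder("myQueue");
WorkflowConfig.Builder cfgBuilder = new WorkflowConfig.Builder("myQueue");
cfgBuilder.setJobPurgeInterval(1000); // purge expired jobs roughly every second
queueBuilder.setWorkflowConfig(cfgBuilder.build());
JobConfig.Builder jobBuilder = new JobConfig.Builder()
    .setTargetResource("MyDB")
    .setCommand("MyTaskCommand") // must match a TaskFactory registered on the participants
    .setExpiry(200L); // job config/context become purgeable 200 ms after the job finishes
queueBuilder.enqueueJob("myJob", jobBuilder);
driver.start(queueBuilder.build());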
Use of org.apache.helix.task.JobContext in project helix by apache.
The class TaskTestUtil, method pollForWorkflowParallelState.
// Polls the workflow context to verify that:
// 1. different jobs in the same workflow are RUNNING at the same time, and
// 2. when overlap assignment is disallowed, no two jobs in the same workflow are RUNNING on the same instance.
// Use this method with caution: it assumes the workflow doesn't finish too quickly, so that the number of jobs
// running in parallel can actually be counted.
public static boolean pollForWorkflowParallelState(TaskDriver driver, String workflowName) throws InterruptedException {
WorkflowConfig workflowConfig = driver.getWorkflowConfig(workflowName);
Assert.assertNotNull(workflowConfig);
WorkflowContext workflowContext = null;
while (workflowContext == null) {
workflowContext = driver.getWorkflowContext(workflowName);
Thread.sleep(100);
}
int maxRunningCount = 0;
boolean finished = false;
while (!finished) {
finished = true;
int runningCount = 0;
workflowContext = driver.getWorkflowContext(workflowName);
for (String jobName : workflowConfig.getJobDag().getAllNodes()) {
TaskState jobState = workflowContext.getJobState(jobName);
if (jobState == TaskState.IN_PROGRESS) {
++runningCount;
finished = false;
}
}
if (runningCount > maxRunningCount) {
maxRunningCount = runningCount;
}
List<JobContext> jobContextList = new ArrayList<JobContext>();
for (String jobName : workflowConfig.getJobDag().getAllNodes()) {
JobContext jobContext = driver.getJobContext(jobName);
if (jobContext != null) {
jobContextList.add(driver.getJobContext(jobName));
}
}
if (!workflowConfig.isAllowOverlapJobAssignment()) {
Set<String> instances = new HashSet<String>();
for (JobContext jobContext : jobContextList) {
for (int partition : jobContext.getPartitionSet()) {
String instance = jobContext.getAssignedParticipant(partition);
TaskPartitionState taskPartitionState = jobContext.getPartitionState(partition);
if (instance == null) {
continue;
}
if (taskPartitionState != TaskPartitionState.INIT && taskPartitionState != TaskPartitionState.RUNNING) {
continue;
}
if (instances.contains(instance)) {
return false;
}
TaskPartitionState state = jobContext.getPartitionState(partition);
if (state != TaskPartitionState.COMPLETED) {
instances.add(instance);
}
}
}
}
Thread.sleep(100);
}
return maxRunningCount > 1 && (workflowConfig.isJobQueue() ? maxRunningCount <= workflowConfig.getParallelJobs() : true);
}
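As a usage note, a sketch of the workflow-level settings this poller inspects (getParallelJobs and isAllowOverlapJobAssignment). The corresponding builder setters are assumed to mirror those getters; "driver" and the queue name are placeholders.
JobQueue.Builder queueBuilder = new JobQueue.Builder("parallelQueue");
WorkflowConfig.Builder cfgBuilder = new WorkflowConfig.Builder("parallelQueue");
cfgBuilder.setParallelJobs(2); // allow at most two jobs of the queue to run concurrently
cfgBuilder.setAllowOverlapJobAssignment(false); // two running jobs must not share an instance
queueBuilder.setWorkflowConfig(cfgBuilder.build());
driver.start(queueBuilder.build());
// The poller should then observe more than one RUNNING job without any instance overlap.
Assert.assertTrue(TaskTestUtil.pollForWorkflowParallelState(driver, "parallelQueue"));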
Use of org.apache.helix.task.JobContext in project helix by apache.
The class TestTaskRebalancer, method partitionSet.
@Test
public void partitionSet() throws Exception {
final String jobResource = "partitionSet";
ImmutableList<String> targetPartitions = ImmutableList.of("TestDB_1", "TestDB_2", "TestDB_3", "TestDB_5", "TestDB_8", "TestDB_13");
// construct and submit our basic workflow
Map<String, String> commandConfig = ImmutableMap.of(TIMEOUT_CONFIG, String.valueOf(100));
JobConfig.Builder jobBuilder = JobConfig.Builder.fromMap(WorkflowGenerator.DEFAULT_JOB_CONFIG);
jobBuilder.setJobCommandConfigMap(commandConfig).setMaxAttemptsPerTask(1).setTargetPartitions(targetPartitions);
Workflow flow = WorkflowGenerator.generateSingleJobWorkflowBuilder(jobResource, jobBuilder).build();
_driver.start(flow);
// wait for job completeness/timeout
_driver.pollForWorkflowState(jobResource, TaskState.COMPLETED);
// see if resulting context completed successfully for our partition set
String namespacedName = TaskUtil.getNamespacedJobName(jobResource);
JobContext ctx = _driver.getJobContext(namespacedName);
WorkflowContext workflowContext = _driver.getWorkflowContext(jobResource);
Assert.assertNotNull(ctx);
Assert.assertNotNull(workflowContext);
Assert.assertEquals(workflowContext.getJobState(namespacedName), TaskState.COMPLETED);
for (String pName : targetPartitions) {
int i = ctx.getPartitionsByTarget().get(pName).get(0);
Assert.assertEquals(ctx.getPartitionState(i), TaskPartitionState.COMPLETED);
Assert.assertEquals(ctx.getPartitionNumAttempts(i), 1);
}
}
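A minimal sketch of the getPartitionsByTarget lookup used above, which maps each target partition name to the task partition ids created for it. It assumes java.util imports, a connected TaskDriver ("driver"), and placeholder workflow/job names.
JobContext ctx = driver.getJobContext(TaskUtil.getNamespacedJobName("myWorkflow", "myJob"));
Map<String, List<Integer>> byTarget = ctx.getPartitionsByTarget();
for (Map.Entry<String, List<Integer>> entry : byTarget.entrySet()) {
  for (int pId : entry.getValue()) {
    // Report the task partition's state and attempt count for each target partition.
    System.out.println(entry.getKey() + " -> task partition " + pId
        + ", state " + ctx.getPartitionState(pId)
        + ", attempts " + ctx.getPartitionNumAttempts(pId));
  }
}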
Use of org.apache.helix.task.JobContext in project helix by apache.
The class TestTaskRebalancer, method timeouts.
@Test
public void timeouts() throws Exception {
final String jobResource = "timeouts";
JobConfig.Builder jobBuilder = JobConfig.Builder.fromMap(WorkflowGenerator.DEFAULT_JOB_CONFIG);
jobBuilder.setJobCommandConfigMap(WorkflowGenerator.DEFAULT_COMMAND_CONFIG).setMaxAttemptsPerTask(2).setTimeoutPerTask(100);
Workflow flow = WorkflowGenerator.generateSingleJobWorkflowBuilder(jobResource, jobBuilder).build();
_driver.start(flow);
// Wait until the job reports failure.
_driver.pollForWorkflowState(jobResource, TaskState.FAILED);
// Check that all partitions timed out up to maxAttempts
JobContext ctx = _driver.getJobContext(TaskUtil.getNamespacedJobName(jobResource));
int maxAttempts = 0;
boolean sawTimedoutTask = false;
for (int i = 0; i < _numParitions; i++) {
TaskPartitionState state = ctx.getPartitionState(i);
if (state != null) {
if (state == TaskPartitionState.TIMED_OUT) {
sawTimedoutTask = true;
}
// At least one task timed out; the others might have been aborted due to the job failure.
Assert.assertTrue(state == TaskPartitionState.TIMED_OUT || state == TaskPartitionState.TASK_ABORTED);
maxAttempts = Math.max(maxAttempts, ctx.getPartitionNumAttempts(i));
}
}
Assert.assertTrue(sawTimedoutTask);
Assert.assertEquals(maxAttempts, 2);
}
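For context, a minimal sketch of a user task that can exceed setTimeoutPerTask(100) and be canceled, which is the situation the assertions above check for. Only the Task and TaskResult interfaces from org.apache.helix.task are assumed; the class name and sleep durations are illustrative.
public class SlowTask implements Task {
  private volatile boolean _canceled = false;

  @Override
  public TaskResult run() {
    long end = System.currentTimeMillis() + 5000; // deliberately longer than a 100 ms per-task timeout
    while (System.currentTimeMillis() < end) {
      if (_canceled) {
        // The rebalancer cancels a task once it exceeds its timeout; the partition is then marked TIMED_OUT.
        return new TaskResult(TaskResult.Status.CANCELED, "canceled after timeout");
      }
      try {
        Thread.sleep(50);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        return new TaskResult(TaskResult.Status.CANCELED, "interrupted");
      }
    }
    return new TaskResult(TaskResult.Status.COMPLETED, "done");
  }

  @Override
  public void cancel() {
    _canceled = true;
  }
}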