Use of org.apache.druid.indexer.TaskState in project druid by druid-io.
Class ParallelIndexSupervisorTask, method runSinglePhaseParallel.
/**
* Run the single phase parallel indexing for best-effort rollup. In this mode, each sub task created by
* the supervisor task reads data and generates segments individually.
*/
private TaskStatus runSinglePhaseParallel(TaskToolbox toolbox) throws Exception
{
  ingestionState = IngestionState.BUILD_SEGMENTS;
  ParallelIndexTaskRunner<SinglePhaseSubTask, PushedSegmentsReport> parallelSinglePhaseRunner =
      createRunner(toolbox, this::createSinglePhaseTaskRunner);
  final TaskState state = runNextPhase(parallelSinglePhaseRunner);
  TaskStatus taskStatus;
  if (state.isSuccess()) {
    //noinspection ConstantConditions
    publishSegments(toolbox, parallelSinglePhaseRunner.getReports());
    if (awaitSegmentAvailabilityTimeoutMillis > 0) {
      waitForSegmentAvailability(parallelSinglePhaseRunner.getReports());
    }
    taskStatus = TaskStatus.success(getId());
  } else {
    // there is only success or failure after running
    Preconditions.checkState(state.isFailure(), "Unrecognized state after task is complete[%s]", state);
    final String errorMessage;
    if (parallelSinglePhaseRunner.getStopReason() != null) {
      errorMessage = parallelSinglePhaseRunner.getStopReason();
    } else {
      errorMessage = StringUtils.format(TASK_PHASE_FAILURE_MSG, parallelSinglePhaseRunner.getName());
    }
    taskStatus = TaskStatus.failure(getId(), errorMessage);
  }
  toolbox.getTaskReportFileWriter().write(
      getId(),
      getTaskCompletionReports(taskStatus, segmentAvailabilityConfirmationCompleted)
  );
  return taskStatus;
}
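The branching above relies on TaskState being a terminal, two-outcome value once runNextPhase returns: the Preconditions.checkState call asserts that a completed phase is either a success or a failure. A minimal sketch of that contract, as a simplified stand-in rather than Druid's actual enum:

enum SketchTaskState
{
  // Simplified stand-in for org.apache.druid.indexer.TaskState, illustrating the
  // invariant the checkState above depends on: once a task or phase completes,
  // its state is either SUCCESS or FAILED, never RUNNING.
  RUNNING, SUCCESS, FAILED;

  boolean isSuccess()
  {
    return this == SUCCESS;
  }

  boolean isFailure()
  {
    return this == FAILED;
  }

  boolean isComplete()
  {
    return isSuccess() || isFailure();
  }
}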
Use of org.apache.druid.indexer.TaskState in project druid by druid-io.
Class ParallelIndexSupervisorTask, method runHashPartitionMultiPhaseParallel.
@VisibleForTesting
TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception
{
  TaskState state;
  ParallelIndexIngestionSpec ingestionSchemaToUse = ingestionSchema;
  if (!(ingestionSchema.getTuningConfig().getPartitionsSpec() instanceof HashedPartitionsSpec)) {
    // only range and hash partitioning are supported for multi-phase parallel ingestion; see runMultiPhaseParallel()
    throw new ISE(
        "forceGuaranteedRollup is set but partitionsSpec [%s] is not a single_dim or hash partition spec.",
        ingestionSchema.getTuningConfig().getPartitionsSpec()
    );
  }
  final Map<Interval, Integer> intervalToNumShards;
  HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) ingestionSchema.getTuningConfig().getPartitionsSpec();
  final boolean needsInputSampling =
      partitionsSpec.getNumShards() == null
      || ingestionSchemaToUse.getDataSchema().getGranularitySpec().inputIntervals().isEmpty();
  if (needsInputSampling) {
    // 0. need to determine intervals and numShards by scanning the data
    LOG.info("Needs to determine intervals or numShards, beginning %s phase.", PartialDimensionCardinalityTask.TYPE);
    ParallelIndexTaskRunner<PartialDimensionCardinalityTask, DimensionCardinalityReport> cardinalityRunner =
        createRunner(toolbox, this::createPartialDimensionCardinalityRunner);
    state = runNextPhase(cardinalityRunner);
    if (state.isFailure()) {
      String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, cardinalityRunner.getName());
      return TaskStatus.failure(getId(), errMsg);
    }
    if (cardinalityRunner.getReports().isEmpty()) {
      String msg = "No valid rows for hash partitioning. All rows may have invalid timestamps or have been filtered out.";
      LOG.warn(msg);
      return TaskStatus.success(getId(), msg);
    }
    if (partitionsSpec.getNumShards() == null) {
      int effectiveMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null
                                       ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT
                                       : partitionsSpec.getMaxRowsPerSegment();
      LOG.info("effective maxRowsPerSegment is: " + effectiveMaxRowsPerSegment);
      intervalToNumShards = determineNumShardsFromCardinalityReport(
          cardinalityRunner.getReports().values(),
          effectiveMaxRowsPerSegment
      );
    } else {
      intervalToNumShards = CollectionUtils.mapValues(
          mergeCardinalityReports(cardinalityRunner.getReports().values()),
          k -> partitionsSpec.getNumShards()
      );
    }
    ingestionSchemaToUse = rewriteIngestionSpecWithIntervalsIfMissing(ingestionSchemaToUse, intervalToNumShards.keySet());
  } else {
    // numShards will be determined in PartialHashSegmentGenerateTask
    intervalToNumShards = null;
  }
  // 1. Partial segment generation phase
  final ParallelIndexIngestionSpec segmentCreateIngestionSpec = ingestionSchemaToUse;
  ParallelIndexTaskRunner<PartialHashSegmentGenerateTask, GeneratedPartitionsReport> indexingRunner =
      createRunner(toolbox, f -> createPartialHashSegmentGenerateRunner(toolbox, segmentCreateIngestionSpec, intervalToNumShards));
  state = runNextPhase(indexingRunner);
  if (state.isFailure()) {
    String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, indexingRunner.getName());
    return TaskStatus.failure(getId(), errMsg);
  }
  // 2. Partial segment merge phase
  // partition (interval, partitionId) -> partition locations
  Map<Partition, List<PartitionLocation>> partitionToLocations = getPartitionToLocations(indexingRunner.getReports());
  final List<PartialSegmentMergeIOConfig> ioConfigs = createGenericMergeIOConfigs(
      ingestionSchema.getTuningConfig().getTotalNumMergeTasks(),
      partitionToLocations
  );
  final ParallelIndexIngestionSpec segmentMergeIngestionSpec = ingestionSchemaToUse;
  final ParallelIndexTaskRunner<PartialGenericSegmentMergeTask, PushedSegmentsReport> mergeRunner =
      createRunner(toolbox, tb -> createPartialGenericSegmentMergeRunner(tb, ioConfigs, segmentMergeIngestionSpec));
  state = runNextPhase(mergeRunner);
  TaskStatus taskStatus;
  if (state.isSuccess()) {
    //noinspection ConstantConditions
    publishSegments(toolbox, mergeRunner.getReports());
    if (awaitSegmentAvailabilityTimeoutMillis > 0) {
      waitForSegmentAvailability(mergeRunner.getReports());
    }
    taskStatus = TaskStatus.success(getId());
  } else {
    // there is only success or failure after running
    Preconditions.checkState(state.isFailure(), "Unrecognized state after task is complete[%s]", state);
    String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, mergeRunner.getName());
    taskStatus = TaskStatus.failure(getId(), errMsg);
  }
  toolbox.getTaskReportFileWriter().write(
      getId(),
      getTaskCompletionReports(taskStatus, segmentAvailabilityConfirmationCompleted)
  );
  return taskStatus;
}
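determineNumShardsFromCardinalityReport is not shown here, but the surrounding code implies its job: turn per-interval cardinality estimates into per-interval shard counts bounded by effectiveMaxRowsPerSegment. A hypothetical sketch of that arithmetic (the method name sketchDetermineNumShards and the Map<Interval, Long> input shape are assumptions for illustration, not Druid's API):

import java.util.HashMap;
import java.util.Map;
import org.joda.time.Interval;

static Map<Interval, Integer> sketchDetermineNumShards(
    Map<Interval, Long> intervalToCardinality, // assumed shape: estimated distinct rows per interval
    int effectiveMaxRowsPerSegment
)
{
  final Map<Interval, Integer> intervalToNumShards = new HashMap<>();
  for (Map.Entry<Interval, Long> entry : intervalToCardinality.entrySet()) {
    // Ceiling division: enough shards that no shard is expected to exceed
    // effectiveMaxRowsPerSegment rows, with at least one shard per interval.
    long numShards = (entry.getValue() + effectiveMaxRowsPerSegment - 1) / effectiveMaxRowsPerSegment;
    intervalToNumShards.put(entry.getKey(), (int) Math.max(1, numShards));
  }
  return intervalToNumShards;
}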
Use of org.apache.druid.indexer.TaskState in project druid by druid-io.
Class HttpRemoteTaskRunner, method taskComplete.
// CAUTION: This method calls RemoteTaskRunnerWorkItem.setResult(..) which results in TaskQueue.notifyStatus() being called
// because that is attached by TaskQueue to task result future. So, this method must not be called with "statusLock"
// held. See https://github.com/apache/druid/issues/6201
private void taskComplete(
    HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem,
    WorkerHolder workerHolder,
    TaskStatus taskStatus
)
{
  Preconditions.checkState(!Thread.holdsLock(statusLock), "Current thread must not hold statusLock.");
  Preconditions.checkNotNull(taskRunnerWorkItem, "taskRunnerWorkItem");
  Preconditions.checkNotNull(taskStatus, "taskStatus");
  if (workerHolder != null) {
    log.info(
        "Worker[%s] completed task[%s] with status[%s]",
        workerHolder.getWorker().getHost(),
        taskStatus.getId(),
        taskStatus.getStatusCode()
    );
    // Worker is done with this task
    workerHolder.setLastCompletedTaskTime(DateTimes.nowUtc());
  }
  if (taskRunnerWorkItem.getResult().isDone()) {
    // This is not the first complete event.
    try {
      TaskState lastKnownState = taskRunnerWorkItem.getResult().get().getStatusCode();
      if (taskStatus.getStatusCode() != lastKnownState) {
        log.warn(
            "The state of the new task complete event is different from its last known state. "
            + "New state[%s], last known state[%s]",
            taskStatus.getStatusCode(),
            lastKnownState
        );
      }
    } catch (InterruptedException e) {
      log.warn(e, "Interrupted while getting the last known task status.");
      Thread.currentThread().interrupt();
    } catch (ExecutionException e) {
      // This case should not really happen.
      log.warn(e, "Failed to get the last known task status. Ignoring this failure.");
    }
  } else {
    // Notify interested parties
    taskRunnerWorkItem.setResult(taskStatus);
    TaskRunnerUtils.notifyStatusChanged(listeners, taskStatus.getId(), taskStatus);
    // Update success/failure counters; blacklist the node if there are too many failures.
    if (workerHolder != null) {
      blacklistWorkerIfNeeded(taskStatus, workerHolder);
    }
  }
  synchronized (statusLock) {
    statusLock.notifyAll();
  }
}
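The CAUTION comment and the Thread.holdsLock precondition exist because setResult(..) completes a future whose listeners (such as the one TaskQueue attaches for notifyStatus) can run synchronously on the completing thread. A small self-contained illustration of that behavior using Guava's SettableFuture (the class name and printed message are just for this sketch):

import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.SettableFuture;

public class ListenerRunsOnCompletingThread
{
  public static void main(String[] args)
  {
    final Object statusLock = new Object();
    final SettableFuture<String> result = SettableFuture.create();
    result.addListener(
        // A direct-executor listener runs on whichever thread completes the
        // future, so any lock that thread holds is still held inside the
        // listener. A listener that then takes other locks can deadlock.
        () -> System.out.println("listener holds statusLock? " + Thread.holdsLock(statusLock)),
        MoreExecutors.directExecutor()
    );
    synchronized (statusLock) {
      result.set("SUCCESS"); // listener fires here, on this thread, under statusLock
    }
  }
}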
Use of org.apache.druid.indexer.TaskState in project druid by druid-io.
Class RemoteTaskRunner, method taskComplete.
private void taskComplete(
    RemoteTaskRunnerWorkItem taskRunnerWorkItem,
    @Nullable ZkWorker zkWorker,
    TaskStatus taskStatus
)
{
  Preconditions.checkNotNull(taskRunnerWorkItem, "taskRunnerWorkItem");
  Preconditions.checkNotNull(taskStatus, "taskStatus");
  if (zkWorker != null) {
    log.info(
        "Worker[%s] completed task[%s] with status[%s]",
        zkWorker.getWorker().getHost(),
        taskStatus.getId(),
        taskStatus.getStatusCode()
    );
    // Worker is done with this task
    zkWorker.setLastCompletedTaskTime(DateTimes.nowUtc());
  } else {
    log.info("Workerless task[%s] completed with status[%s]", taskStatus.getId(), taskStatus.getStatusCode());
  }
  // Move from running -> complete.
  // If the task was running and this is the first complete event,
  // previousComplete should be null and removedRunning should not.
  final RemoteTaskRunnerWorkItem previousComplete = completeTasks.put(taskStatus.getId(), taskRunnerWorkItem);
  final RemoteTaskRunnerWorkItem removedRunning = runningTasks.remove(taskStatus.getId());
  if (previousComplete != null && removedRunning != null) {
    log.warn(
        "This is not the first complete event for task[%s], but it was still known as running. "
        + "Ignoring the previously known running status.",
        taskStatus.getId()
    );
  }
  if (previousComplete != null) {
    // This is not the first complete event for the same task.
    try {
      // getResult().get() must return immediately.
      TaskState lastKnownState = previousComplete.getResult().get(1, TimeUnit.MILLISECONDS).getStatusCode();
      if (taskStatus.getStatusCode() != lastKnownState) {
        log.warn(
            "The state of the new task complete event is different from its last known state. "
            + "New state[%s], last known state[%s]",
            taskStatus.getStatusCode(),
            lastKnownState
        );
      }
    } catch (InterruptedException e) {
      log.warn(e, "Interrupted while getting the last known task status.");
      Thread.currentThread().interrupt();
    } catch (ExecutionException | TimeoutException e) {
      // This case should not really happen.
      log.warn(e, "Failed to get the last known task status. Ignoring this failure.");
    }
  } else {
    // Update success/failure counters
    if (zkWorker != null) {
      if (taskStatus.isSuccess()) {
        zkWorker.resetContinuouslyFailedTasksCount();
        if (blackListedWorkers.remove(zkWorker)) {
          zkWorker.setBlacklistedUntil(null);
          log.info("[%s] removed from blacklist because a task finished with SUCCESS", zkWorker.getWorker());
        }
      } else if (taskStatus.isFailure()) {
        zkWorker.incrementContinuouslyFailedTasksCount();
      }
      // Blacklist node if there are too many failures.
      synchronized (blackListedWorkers) {
        if (zkWorker.getContinuouslyFailedTasksCount() > config.getMaxRetriesBeforeBlacklist()
            && blackListedWorkers.size() <= zkWorkers.size() * (config.getMaxPercentageBlacklistWorkers() / 100.0) - 1) {
          zkWorker.setBlacklistedUntil(DateTimes.nowUtc().plus(config.getWorkerBlackListBackoffTime()));
          if (blackListedWorkers.add(zkWorker)) {
            log.info(
                "Blacklisting [%s] until [%s] after [%,d] failed tasks in a row.",
                zkWorker.getWorker(),
                zkWorker.getBlacklistedUntil(),
                zkWorker.getContinuouslyFailedTasksCount()
            );
          }
        }
      }
    }
    // Notify interested parties
    taskRunnerWorkItem.setResult(taskStatus);
    TaskRunnerUtils.notifyStatusChanged(listeners, taskStatus.getId(), taskStatus);
  }
}
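The blacklist guard in the synchronized block above combines two conditions: the worker must have exceeded the retry limit, and blacklisting it must not push the blacklist above maxPercentageBlacklistWorkers percent of the cluster. A hypothetical helper restating that arithmetic in isolation (the method name and parameter list are assumptions for illustration):

// For example, with 10 workers and maxPercentageBlacklistWorkers = 20, the
// right-hand side is 10 * 0.20 - 1 = 1, so the guard passes only while at most
// one worker is already blacklisted, capping the blacklist at two workers (20%).
static boolean sketchMayBlacklist(
    int continuouslyFailedTasks,
    int maxRetriesBeforeBlacklist,
    int currentlyBlacklisted,
    int totalWorkers,
    int maxPercentageBlacklistWorkers
)
{
  return continuouslyFailedTasks > maxRetriesBeforeBlacklist
         && currentlyBlacklisted <= totalWorkers * (maxPercentageBlacklistWorkers / 100.0) - 1;
}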
Use of org.apache.druid.indexer.TaskState in project druid by druid-io.
Class ParallelIndexSupervisorTask, method runRangePartitionMultiPhaseParallel.
@VisibleForTesting
TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception
{
  ParallelIndexIngestionSpec ingestionSchemaToUse = ingestionSchema;
  ParallelIndexTaskRunner<PartialDimensionDistributionTask, DimensionDistributionReport> distributionRunner =
      createRunner(toolbox, this::createPartialDimensionDistributionRunner);
  TaskState distributionState = runNextPhase(distributionRunner);
  if (distributionState.isFailure()) {
    String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, distributionRunner.getName());
    return TaskStatus.failure(getId(), errMsg);
  }
  Map<Interval, PartitionBoundaries> intervalToPartitions =
      determineAllRangePartitions(distributionRunner.getReports().values());
  if (intervalToPartitions.isEmpty()) {
    String msg = "No valid rows for single dimension partitioning. All rows may have invalid timestamps or multiple dimension values.";
    LOG.warn(msg);
    return TaskStatus.success(getId(), msg);
  }
  ingestionSchemaToUse = rewriteIngestionSpecWithIntervalsIfMissing(ingestionSchemaToUse, intervalToPartitions.keySet());
  final ParallelIndexIngestionSpec segmentCreateIngestionSpec = ingestionSchemaToUse;
  ParallelIndexTaskRunner<PartialRangeSegmentGenerateTask, GeneratedPartitionsReport> indexingRunner =
      createRunner(toolbox, tb -> createPartialRangeSegmentGenerateRunner(tb, intervalToPartitions, segmentCreateIngestionSpec));
  TaskState indexingState = runNextPhase(indexingRunner);
  if (indexingState.isFailure()) {
    String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, indexingRunner.getName());
    return TaskStatus.failure(getId(), errMsg);
  }
  // partition (interval, partitionId) -> partition locations
  Map<Partition, List<PartitionLocation>> partitionToLocations = getPartitionToLocations(indexingRunner.getReports());
  final List<PartialSegmentMergeIOConfig> ioConfigs = createGenericMergeIOConfigs(
      ingestionSchema.getTuningConfig().getTotalNumMergeTasks(),
      partitionToLocations
  );
  final ParallelIndexIngestionSpec segmentMergeIngestionSpec = ingestionSchemaToUse;
  ParallelIndexTaskRunner<PartialGenericSegmentMergeTask, PushedSegmentsReport> mergeRunner =
      createRunner(toolbox, tb -> createPartialGenericSegmentMergeRunner(tb, ioConfigs, segmentMergeIngestionSpec));
  TaskState mergeState = runNextPhase(mergeRunner);
  TaskStatus taskStatus;
  if (mergeState.isSuccess()) {
    publishSegments(toolbox, mergeRunner.getReports());
    if (awaitSegmentAvailabilityTimeoutMillis > 0) {
      waitForSegmentAvailability(mergeRunner.getReports());
    }
    taskStatus = TaskStatus.success(getId());
  } else {
    // there is only success or failure after running
    Preconditions.checkState(mergeState.isFailure(), "Unrecognized state after task is complete[%s]", mergeState);
    String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, mergeRunner.getName());
    taskStatus = TaskStatus.failure(getId(), errMsg);
  }
  toolbox.getTaskReportFileWriter().write(
      getId(),
      getTaskCompletionReports(taskStatus, segmentAvailabilityConfirmationCompleted)
  );
  return taskStatus;
}
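determineAllRangePartitions and PartitionBoundaries are not shown here, but the phase structure implies the idea: the distribution phase samples each interval's partition-dimension values, and boundaries are cut from the sorted sample so each range holds roughly a segment's worth of rows. A hypothetical sketch of deriving boundaries from a sorted sample (the method name and the targetRowsPerSegment parameter are assumptions for illustration, not Druid's API):

import java.util.ArrayList;
import java.util.List;

static List<String> sketchPartitionBoundaries(List<String> sortedValues, int targetRowsPerSegment)
{
  final List<String> boundaries = new ArrayList<>();
  for (int i = targetRowsPerSegment; i < sortedValues.size(); i += targetRowsPerSegment) {
    final String cut = sortedValues.get(i);
    // Skip duplicate cuts so boundaries stay strictly increasing; rows are then
    // routed to a partition by searching for their dimension value among the cuts.
    if (boundaries.isEmpty() || !boundaries.get(boundaries.size() - 1).equals(cut)) {
      boundaries.add(cut);
    }
  }
  return boundaries;
}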