use of org.apache.druid.indexing.common.TaskToolbox in project druid by druid-io.
the class ParallelIndexSupervisorTask method runHashPartitionMultiPhaseParallel.
@VisibleForTesting
TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception {
TaskState state;
ParallelIndexIngestionSpec ingestionSchemaToUse = ingestionSchema;
if (!(ingestionSchema.getTuningConfig().getPartitionsSpec() instanceof HashedPartitionsSpec)) {
// only range and hash partitioning is supported for multiphase parallel ingestion, see runMultiPhaseParallel()
throw new ISE("forceGuaranteedRollup is set but partitionsSpec [%s] is not a single_dim or hash partition spec.", ingestionSchema.getTuningConfig().getPartitionsSpec());
}
final Map<Interval, Integer> intervalToNumShards;
HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) ingestionSchema.getTuningConfig().getPartitionsSpec();
final boolean needsInputSampling = partitionsSpec.getNumShards() == null || ingestionSchemaToUse.getDataSchema().getGranularitySpec().inputIntervals().isEmpty();
if (needsInputSampling) {
// 0. need to determine intervals and numShards by scanning the data
LOG.info("Needs to determine intervals or numShards, beginning %s phase.", PartialDimensionCardinalityTask.TYPE);
ParallelIndexTaskRunner<PartialDimensionCardinalityTask, DimensionCardinalityReport> cardinalityRunner = createRunner(toolbox, this::createPartialDimensionCardinalityRunner);
state = runNextPhase(cardinalityRunner);
if (state.isFailure()) {
String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, cardinalityRunner.getName());
return TaskStatus.failure(getId(), errMsg);
}
if (cardinalityRunner.getReports().isEmpty()) {
String msg = "No valid rows for hash partitioning." + " All rows may have invalid timestamps or have been filtered out.";
LOG.warn(msg);
return TaskStatus.success(getId(), msg);
}
if (partitionsSpec.getNumShards() == null) {
int effectiveMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT : partitionsSpec.getMaxRowsPerSegment();
LOG.info("effective maxRowsPerSegment is: " + effectiveMaxRowsPerSegment);
intervalToNumShards = determineNumShardsFromCardinalityReport(cardinalityRunner.getReports().values(), effectiveMaxRowsPerSegment);
} else {
intervalToNumShards = CollectionUtils.mapValues(mergeCardinalityReports(cardinalityRunner.getReports().values()), k -> partitionsSpec.getNumShards());
}
ingestionSchemaToUse = rewriteIngestionSpecWithIntervalsIfMissing(ingestionSchemaToUse, intervalToNumShards.keySet());
} else {
// numShards will be determined in PartialHashSegmentGenerateTask
intervalToNumShards = null;
}
// 1. Partial segment generation phase
final ParallelIndexIngestionSpec segmentCreateIngestionSpec = ingestionSchemaToUse;
ParallelIndexTaskRunner<PartialHashSegmentGenerateTask, GeneratedPartitionsReport> indexingRunner = createRunner(toolbox, f -> createPartialHashSegmentGenerateRunner(toolbox, segmentCreateIngestionSpec, intervalToNumShards));
state = runNextPhase(indexingRunner);
if (state.isFailure()) {
String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, indexingRunner.getName());
return TaskStatus.failure(getId(), errMsg);
}
// 2. Partial segment merge phase
// partition (interval, partitionId) -> partition locations
Map<Partition, List<PartitionLocation>> partitionToLocations = getPartitionToLocations(indexingRunner.getReports());
final List<PartialSegmentMergeIOConfig> ioConfigs = createGenericMergeIOConfigs(ingestionSchema.getTuningConfig().getTotalNumMergeTasks(), partitionToLocations);
final ParallelIndexIngestionSpec segmentMergeIngestionSpec = ingestionSchemaToUse;
final ParallelIndexTaskRunner<PartialGenericSegmentMergeTask, PushedSegmentsReport> mergeRunner = createRunner(toolbox, tb -> createPartialGenericSegmentMergeRunner(tb, ioConfigs, segmentMergeIngestionSpec));
state = runNextPhase(mergeRunner);
TaskStatus taskStatus;
if (state.isSuccess()) {
// noinspection ConstantConditions
publishSegments(toolbox, mergeRunner.getReports());
if (awaitSegmentAvailabilityTimeoutMillis > 0) {
waitForSegmentAvailability(mergeRunner.getReports());
}
taskStatus = TaskStatus.success(getId());
} else {
// there is only success or failure after running....
Preconditions.checkState(state.isFailure(), "Unrecognized state after task is complete[%s]", state);
String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, mergeRunner.getName());
taskStatus = TaskStatus.failure(getId(), errMsg);
}
toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports(taskStatus, segmentAvailabilityConfirmationCompleted));
return taskStatus;
}
use of org.apache.druid.indexing.common.TaskToolbox in project druid by druid-io.
the class PartialSegmentMergeTask method runTask.
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
// Group partitionLocations by interval and partitionId
final Map<Interval, Int2ObjectMap<List<PartitionLocation>>> intervalToBuckets = new HashMap<>();
for (PartitionLocation location : ioConfig.getPartitionLocations()) {
intervalToBuckets.computeIfAbsent(location.getInterval(), k -> new Int2ObjectOpenHashMap<>()).computeIfAbsent(location.getBucketId(), k -> new ArrayList<>()).add(location);
}
final List<TaskLock> locks = toolbox.getTaskActionClient().submit(new SurrogateAction<>(supervisorTaskId, new LockListAction()));
final Map<Interval, String> intervalToVersion = Maps.newHashMapWithExpectedSize(locks.size());
locks.forEach(lock -> {
if (lock.isRevoked()) {
throw new ISE("Lock[%s] is revoked", lock);
}
final String mustBeNull = intervalToVersion.put(lock.getInterval(), lock.getVersion());
if (mustBeNull != null) {
throw new ISE("Unexpected state: Two versions([%s], [%s]) for the same interval[%s]", lock.getVersion(), mustBeNull, lock.getInterval());
}
});
final Stopwatch fetchStopwatch = Stopwatch.createStarted();
final Map<Interval, Int2ObjectMap<List<File>>> intervalToUnzippedFiles = fetchSegmentFiles(toolbox, intervalToBuckets);
final long fetchTime = fetchStopwatch.elapsed(TimeUnit.SECONDS);
fetchStopwatch.stop();
LOG.info("Fetch took [%s] seconds", fetchTime);
final ParallelIndexSupervisorTaskClient taskClient = toolbox.getSupervisorTaskClientFactory().build(new ClientBasedTaskInfoProvider(toolbox.getIndexingServiceClient()), getId(), // always use a single http thread
1, getTuningConfig().getChatHandlerTimeout(), getTuningConfig().getChatHandlerNumRetries());
final File persistDir = toolbox.getPersistDir();
org.apache.commons.io.FileUtils.deleteQuietly(persistDir);
FileUtils.mkdirp(persistDir);
final Set<DataSegment> pushedSegments = mergeAndPushSegments(toolbox, getDataSchema(), getTuningConfig(), persistDir, intervalToVersion, intervalToUnzippedFiles);
taskClient.report(supervisorTaskId, new PushedSegmentsReport(getId(), Collections.emptySet(), pushedSegments, ImmutableMap.of()));
return TaskStatus.success(getId());
}
use of org.apache.druid.indexing.common.TaskToolbox in project druid by druid-io.
the class SingleTaskBackgroundRunner method run.
@Override
public ListenableFuture<TaskStatus> run(final Task task) {
if (runningItem == null) {
final TaskToolbox toolbox = toolboxFactory.build(task);
final Object taskPriorityObj = task.getContextValue(TaskThreadPriority.CONTEXT_KEY);
int taskPriority = 0;
try {
taskPriority = taskPriorityObj == null ? 0 : Numbers.parseInt(taskPriorityObj);
} catch (NumberFormatException e) {
log.error(e, "Error parsing task priority [%s] for task [%s]", taskPriorityObj, task.getId());
}
// Ensure an executor for that priority exists
executorService = buildExecutorService(taskPriority);
final ListenableFuture<TaskStatus> statusFuture = executorService.submit(new SingleTaskBackgroundRunnerCallable(task, location, toolbox));
runningItem = new SingleTaskBackgroundRunnerWorkItem(task, location, statusFuture);
return statusFuture;
} else {
throw new ISE("Already running task[%s]", runningItem.getTask().getId());
}
}
use of org.apache.druid.indexing.common.TaskToolbox in project druid by druid-io.
the class AppenderatorDriverRealtimeIndexTaskTest method runTask.
private ListenableFuture<TaskStatus> runTask(final Task task) {
try {
taskStorage.insert(task, TaskStatus.running(task.getId()));
} catch (EntryExistsException e) {
// suppress
}
taskLockbox.syncFromStorage();
final TaskToolbox toolbox = taskToolboxFactory.build(task);
return taskExec.submit(() -> {
try {
if (task.isReady(toolbox.getTaskActionClient())) {
return task.run(toolbox);
} else {
throw new ISE("Task is not ready");
}
} catch (Exception e) {
log.warn(e, "Task failed");
throw e;
}
});
}
use of org.apache.druid.indexing.common.TaskToolbox in project druid by druid-io.
the class IndexTaskTest method testWaitForSegmentAvailabilityMultipleSegmentsSuccess.
@Test
public void testWaitForSegmentAvailabilityMultipleSegmentsSuccess() throws IOException {
final File tmpDir = temporaryFolder.newFolder();
TaskToolbox mockToolbox = EasyMock.createMock(TaskToolbox.class);
DataSegment mockDataSegment1 = EasyMock.createMock(DataSegment.class);
DataSegment mockDataSegment2 = EasyMock.createMock(DataSegment.class);
List<DataSegment> segmentsToWaitFor = new ArrayList<>();
segmentsToWaitFor.add(mockDataSegment1);
segmentsToWaitFor.add(mockDataSegment2);
IndexTask indexTask = new IndexTask(null, null, createDefaultIngestionSpec(jsonMapper, tmpDir, new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, null), null, createTuningConfigWithMaxRowsPerSegment(2, true), false, false), null);
EasyMock.expect(mockDataSegment1.getInterval()).andReturn(Intervals.of("1970-01-01/1971-01-01")).once();
EasyMock.expect(mockDataSegment1.getVersion()).andReturn("dummyString").once();
EasyMock.expect(mockDataSegment1.getShardSpec()).andReturn(EasyMock.createMock(ShardSpec.class)).once();
EasyMock.expect(mockDataSegment1.getId()).andReturn(SegmentId.dummy("MockDataSource")).once();
EasyMock.expect(mockDataSegment2.getInterval()).andReturn(Intervals.of("1971-01-01/1972-01-01")).once();
EasyMock.expect(mockDataSegment2.getVersion()).andReturn("dummyString").once();
EasyMock.expect(mockDataSegment2.getShardSpec()).andReturn(EasyMock.createMock(ShardSpec.class)).once();
EasyMock.expect(mockDataSegment2.getId()).andReturn(SegmentId.dummy("MockDataSource")).once();
EasyMock.expect(mockToolbox.getSegmentHandoffNotifierFactory()).andReturn(new NoopSegmentHandoffNotifierFactory()).once();
EasyMock.expect(mockToolbox.getEmitter()).andReturn(new NoopServiceEmitter()).anyTimes();
EasyMock.expect(mockDataSegment1.getDataSource()).andReturn("MockDataSource").once();
EasyMock.replay(mockToolbox);
EasyMock.replay(mockDataSegment1, mockDataSegment2);
Assert.assertTrue(indexTask.waitForSegmentAvailability(mockToolbox, segmentsToWaitFor, 30000));
EasyMock.verify(mockToolbox);
EasyMock.verify(mockDataSegment1, mockDataSegment2);
}
Aggregations