Use of io.prestosql.execution.TaskStatus in project hetu-core by openlookeng.
The class TaskSystemTable, method cursor.
@Override
public RecordCursor cursor(ConnectorTransactionHandle transactionHandle, ConnectorSession session, TupleDomain<Integer> constraint)
{
    Builder table = InMemoryRecordSet.builder(TASK_TABLE);
    for (TaskInfo taskInfo : taskManager.getAllTaskInfo()) {
        TaskStats stats = taskInfo.getStats();
        TaskStatus taskStatus = taskInfo.getTaskStatus();
        table.addRow(
                nodeId,
                taskStatus.getTaskId().toString(),
                taskStatus.getTaskId().getStageId().toString(),
                taskStatus.getTaskId().getQueryId().toString(),
                taskStatus.getState().toString(),
                (long) stats.getTotalDrivers(),
                (long) stats.getQueuedDrivers(),
                (long) stats.getRunningDrivers(),
                (long) stats.getCompletedDrivers(),
                toMillis(stats.getTotalScheduledTime()),
                toMillis(stats.getTotalCpuTime()),
                toMillis(stats.getTotalBlockedTime()),
                toBytes(stats.getRawInputDataSize()),
                stats.getRawInputPositions(),
                toBytes(stats.getProcessedInputDataSize()),
                stats.getProcessedInputPositions(),
                toBytes(stats.getOutputDataSize()),
                stats.getOutputPositions(),
                toBytes(stats.getPhysicalWrittenDataSize()),
                toTimeStamp(stats.getCreateTime()),
                toTimeStamp(stats.getFirstStartTime()),
                toTimeStamp(taskInfo.getLastHeartbeat()),
                toTimeStamp(stats.getEndTime()));
    }
    return table.build().cursor();
}
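The addRow call relies on null-safe conversion helpers (toMillis, toBytes, toTimeStamp) so that missing durations, sizes, or timestamps surface as NULL columns instead of zeros. Those helpers are private to TaskSystemTable; the sketch below is only an illustration of what such null-safe conversions typically look like, assuming the airlift Duration and DataSize types (the toTimeStamp helper is omitted here).

import io.airlift.units.DataSize;
import io.airlift.units.Duration;

final class ConversionHelpers
{
    private ConversionHelpers() {}

    // Null-safe conversion of an airlift Duration to milliseconds; a null duration
    // stays null so the system-table column renders as NULL rather than 0.
    static Long toMillis(Duration duration)
    {
        return duration == null ? null : duration.toMillis();
    }

    // Null-safe conversion of an airlift DataSize to a byte count.
    static Long toBytes(DataSize dataSize)
    {
        return dataSize == null ? null : dataSize.toBytes();
    }
}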
Use of io.prestosql.execution.TaskStatus in project hetu-core by openlookeng.
The class HttpRemoteTask, method sendUpdate.
private synchronized void sendUpdate()
{
    if (abandoned.get()) {
        // Snapshot: Corresponding task has been canceled to resume. Stop any communication with it.
        return;
    }
    TaskStatus taskStatus = getTaskStatus();
    // don't update if the task hasn't been started yet or if it is already finished
    if (!needsUpdate.get() || taskStatus.getState().isDone()) {
        return;
    }

    // if there is a request already running, wait for it to complete
    if (this.currentRequest != null && !this.currentRequest.isDone()) {
        return;
    }

    // if throttled due to error, asynchronously wait for timeout and try again
    ListenableFuture<?> errorRateLimit = updateErrorTracker.acquireRequestPermit();
    if (!errorRateLimit.isDone()) {
        errorRateLimit.addListener(this::sendUpdate, executor);
        return;
    }

    List<TaskSource> sources = getSources();
    Optional<PlanFragment> fragment = sendPlan.get() ? Optional.of(planFragment) : Optional.empty();
    TaskUpdateRequest updateRequest = new TaskUpdateRequest(
            // so receiver can verify if the instance id matches
            instanceId,
            session.toSessionRepresentation(),
            session.getIdentity().getExtraCredentials(),
            fragment,
            sources,
            outputBuffers.get(),
            totalPartitions,
            parent);
    byte[] taskUpdateRequestJson = taskUpdateRequestCodec.toBytes(updateRequest);
    if (fragment.isPresent()) {
        stats.updateWithPlanBytes(taskUpdateRequestJson.length);
    }

    HttpUriBuilder uriBuilder = getHttpUriBuilder(taskStatus);
    Request request = setContentTypeHeaders(isBinaryEncoding, preparePost())
            .setUri(uriBuilder.build())
            .setBodyGenerator(StaticBodyGenerator.createStaticBodyGenerator(taskUpdateRequestJson))
            .build();

    ResponseHandler responseHandler;
    if (isBinaryEncoding) {
        responseHandler = createFullSmileResponseHandler((SmileCodec<TaskInfo>) taskInfoCodec);
    }
    else {
        responseHandler = createAdaptingJsonResponseHandler(unwrapJsonCodec(taskInfoCodec));
    }

    updateErrorTracker.startRequest();

    ListenableFuture<BaseResponse<TaskInfo>> future = httpClient.executeAsync(request, responseHandler);
    currentRequest = future;
    currentRequestStartNanos = System.nanoTime();

    // The needsUpdate flag needs to be set to false BEFORE adding the Future callback since callback might change the flag value
    // and does so without grabbing the instance lock.
    needsUpdate.set(false);
    Futures.addCallback(future, new SimpleHttpResponseHandler<>(new UpdateResponseHandler(sources), request.getUri(), stats), executor);
}
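Two details of sendUpdate are worth calling out: when the error tracker has not yet granted a request permit, the method registers itself as a listener on that future and returns, so the update is retried without blocking; and needsUpdate is cleared before the callback is attached, because the callback may set it again concurrently. The following standalone sketch (illustrative names only, not hetu-core APIs) shows the same re-arm pattern with Guava futures.

import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.SettableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicBoolean;

// Minimal sketch of the "re-arm and return" pattern used by sendUpdate: if a permit
// future is not yet done, schedule this same method to run again when it completes
// instead of blocking the caller.
public class ReArmExample
{
    private final Executor executor = Executors.newSingleThreadExecutor();
    private final AtomicBoolean needsUpdate = new AtomicBoolean(true);

    void sendUpdate(ListenableFuture<?> permit)
    {
        if (!needsUpdate.get()) {
            return;
        }
        if (!permit.isDone()) {
            // try again once the permit is granted; do not block
            permit.addListener(() -> sendUpdate(permit), executor);
            return;
        }
        // clear the flag BEFORE issuing the request, so a concurrent change re-triggers an update
        needsUpdate.set(false);
        // ... issue the HTTP request here ...
    }

    public static void main(String[] args)
    {
        ReArmExample example = new ReArmExample();
        SettableFuture<Void> permit = SettableFuture.create();
        example.sendUpdate(permit);   // registers a listener and returns immediately
        permit.set(null);             // listener fires and the update is sent
    }
}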
Use of io.prestosql.execution.TaskStatus in project hetu-core by openlookeng.
The class HttpRemoteTask, method cancel.
@Override
public synchronized void cancel()
{
    try (SetThreadName ignored = new SetThreadName("HttpRemoteTask-%s", taskId)) {
        TaskStatus taskStatus = getTaskStatus();
        if (taskStatus.getState().isDone()) {
            return;
        }
        sendCancelRequest(taskStatus, TaskState.CANCELED, "cancel");
    }
}
Use of io.prestosql.execution.TaskStatus in project hetu-core by openlookeng.
The class HttpRemoteTask, method doScheduleAsyncCleanupRequest.
private void doScheduleAsyncCleanupRequest(Backoff cleanupBackoff, Request request, String action)
{
    ResponseHandler responseHandler;
    if (isBinaryEncoding) {
        responseHandler = createFullSmileResponseHandler((SmileCodec<TaskInfo>) taskInfoCodec);
    }
    else {
        responseHandler = createAdaptingJsonResponseHandler(unwrapJsonCodec(taskInfoCodec));
    }
    Futures.addCallback(httpClient.executeAsync(request, responseHandler), new FutureCallback<BaseResponse<TaskInfo>>() {
        @Override
        public void onSuccess(BaseResponse<TaskInfo> result)
        {
            try {
                updateTaskInfo(result.getValue());
            }
            finally {
                if (!getTaskInfo().getTaskStatus().getState().isDone()) {
                    cleanUpLocally();
                }
            }
        }

        @Override
        public void onFailure(Throwable t)
        {
            if (cancelledToResume.get()) {
                // Remote worker is probably unreachable. Don't make additional attempts.
                cleanUpLocally();
                return;
            }
            if (t instanceof RejectedExecutionException && httpClient.isClosed()) {
                logError(t, "Unable to %s task at %s. HTTP client is closed.", action, request.getUri());
                cleanUpLocally();
                return;
            }

            // record failure
            if (cleanupBackoff.failure()) {
                logError(t, "Unable to %s task at %s. Back off depleted.", action, request.getUri());
                cleanUpLocally();
                return;
            }

            // reschedule
            long delayNanos = cleanupBackoff.getBackoffDelayNanos();
            if (delayNanos == 0) {
                doScheduleAsyncCleanupRequest(cleanupBackoff, request, action);
            }
            else {
                errorScheduledExecutor.schedule(() -> doScheduleAsyncCleanupRequest(cleanupBackoff, request, action), delayNanos, NANOSECONDS);
            }
        }

        private void cleanUpLocally()
        {
            // Update the taskInfo with the new taskStatus.

            // Generally, we send a cleanup request to the worker, and update the TaskInfo on
            // the coordinator based on what we fetched from the worker. If we somehow cannot
            // get the cleanup request to the worker, the TaskInfo that we fetch for the worker
            // likely will not say the task is done however many times we try. In this case,
            // we have to set the local query info directly so that we stop trying to fetch
            // updated TaskInfo from the worker. This way, the task on the worker eventually
            // expires due to lack of activity.

            // This is required because the query state machine depends on TaskInfo (instead of task status)
            // to transition its own state.
            // TODO: Update the query state machine and stage state machine to depend on TaskStatus instead

            // Since this TaskInfo is updated in the client, the "complete" flag will not be set,
            // indicating that the stats may not reflect the final stats on the worker.
            TaskStatus taskStatus = getTaskStatus();
            if (cancelledToResume.get()) {
                // When the task is cancelled to resume, make sure it gets the new state, so the query can start resuming.
                // The task state is checked in QueryInfo#areAllStagesDone.
                taskStatus = TaskStatus.failWith(taskStatus, CANCELED_TO_RESUME, ImmutableList.of());
            }
            updateTaskInfo(getTaskInfo().withTaskStatus(taskStatus));
        }
    }, executor);
}
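The onFailure branch implements a simple retry loop: give up when the backoff budget is depleted, retry immediately when the next delay is zero, and otherwise reschedule itself on errorScheduledExecutor. The standalone sketch below reproduces that shape with a plain ScheduledExecutorService and a hypothetical attempt counter in place of hetu-core's Backoff tracker.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

// Generic sketch of the retry loop: each failure either retries immediately (zero delay)
// or reschedules itself with a growing delay, and gives up after a fixed budget.
public class RetryWithBackoff
{
    private static final ScheduledExecutorService SCHEDULER = Executors.newSingleThreadScheduledExecutor();
    private static final int MAX_ATTEMPTS = 5;

    static void attemptCleanup(Runnable request, AtomicInteger attempts)
    {
        try {
            request.run();
        }
        catch (RuntimeException failure) {
            int attempt = attempts.incrementAndGet();
            if (attempt >= MAX_ATTEMPTS) {
                // back off depleted; stop retrying
                System.err.println("Back off depleted, giving up: " + failure);
                return;
            }
            // growing delay: 0ms, 100ms, 200ms, 400ms, ...
            long delayMillis = attempt == 1 ? 0 : 100L << (attempt - 2);
            if (delayMillis == 0) {
                attemptCleanup(request, attempts);
            }
            else {
                SCHEDULER.schedule(() -> attemptCleanup(request, attempts), delayMillis, TimeUnit.MILLISECONDS);
            }
        }
    }
}

In the real code the work item is the asynchronous HTTP cleanup request and the failure path is driven from the FutureCallback's onFailure rather than a thrown exception, but the decision structure is the same.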
Use of io.prestosql.execution.TaskStatus in project hetu-core by openlookeng.
The class SqlQueryScheduler, method createStages.
private List<SqlStageExecution> createStages(
        ExchangeLocationsConsumer parent,
        AtomicInteger nextStageId,
        LocationFactory locationFactory,
        StageExecutionPlan plan,
        NodeScheduler nodeScheduler,
        RemoteTaskFactory remoteTaskFactory,
        Session session,
        int splitBatchSize,
        BiFunction<PartitioningHandle, Integer, NodePartitionMap> partitioningCache,
        NodePartitioningManager nodePartitioningManager,
        ExecutorService queryExecutor,
        ScheduledExecutorService schedulerExecutor,
        FailureDetector failureDetector,
        NodeTaskMap nodeTaskMap,
        ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers,
        ImmutableMap.Builder<StageId, StageLinkage> stageLinkages,
        boolean isSnapshotEnabled,
        QuerySnapshotManager snapshotManager,
        Map<StageId, Integer> stageTaskCounts)
{
    ImmutableList.Builder<SqlStageExecution> localStages = ImmutableList.builder();

    StageId stageId = new StageId(queryStateMachine.getQueryId(), nextStageId.getAndIncrement());
    SqlStageExecution stageExecution = createSqlStageExecution(stageId, locationFactory.createStageLocation(stageId), plan.getFragment(), plan.getTables(), remoteTaskFactory, session, summarizeTaskInfo, nodeTaskMap, queryExecutor, failureDetector, schedulerStats, dynamicFilterService, snapshotManager);
    localStages.add(stageExecution);

    Optional<int[]> bucketToPartition;
    PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
    boolean keepConsumerOnFeederNodes = !plan.getFragment().getFeederCTEId().isPresent() && plan.getFragment().getFeederCTEParentId().isPresent();
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Entry<PlanNodeId, SplitSource> entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
        PlanNodeId planNodeId = entry.getKey();
        SplitSource splitSource = entry.getValue();
        CatalogName catalogName = splitSource.getCatalogName();
        if (isInternalSystemConnector(catalogName)) {
            catalogName = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
        if (isSnapshotEnabled) {
            // When snapshot is enabled, no task can be added after the query has started running,
            // otherwise assumptions about how many "input channels" exist may be broken.
            nodeSelector.lockDownNodes();
        }
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);

        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        stageSchedulers.put(stageId, newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize, session, heuristicIndexerManager));
        bucketToPartition = Optional.of(new int[1]);
    }
    else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        bucketToPartition = Optional.of(new int[1]);
    }
    else {
        Map<PlanNodeId, SplitSource> splitSources = plan.getSplitSources();
        if (!splitSources.isEmpty()) {
            // contains local source
            List<PlanNodeId> schedulingOrder = plan.getFragment().getPartitionedSources();
            CatalogName catalogName = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
            List<ConnectorPartitionHandle> connectorPartitionHandles;
            boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
            if (groupedExecutionForStage) {
                connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
                checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
            }
            else {
                connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
            }

            BucketNodeMap bucketNodeMap;
            List<InternalNode> stageNodeList;
            if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
                // no remote source
                boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
                if (isSnapshotEnabled) {
                    NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
                    int nodeCount;
                    if (stageTaskCounts != null) {
                        // Resuming: need to create the same number of tasks as the old stage.
                        nodeCount = stageTaskCounts.get(stageId);
                    }
                    else {
                        // Scheduling: reserve some nodes for resuming
                        nodeCount = calculateTaskCount(nodeSelector.selectableNodeCount());
                    }
                    stageNodeList = new ArrayList<>(nodeSelector.selectRandomNodes(nodeCount));
                    checkCondition(stageNodeList.size() == nodeCount, NO_NODES_AVAILABLE, "Snapshot: not enough worker nodes to resume expected number of tasks: " + nodeCount);
                    // Make sure bucketNodeMap uses the same node list
                    bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule, stageNodeList);
                }
                else {
                    bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
                    stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes).allNodes());
                }

                // verify execution is consistent with planner's decision on dynamic lifespan schedule
                verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);

                Collections.shuffle(stageNodeList);
                bucketToPartition = Optional.empty();
            }
            else {
                // cannot use dynamic lifespan schedule
                verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());

                // remote source requires nodePartitionMap
                NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
                if (groupedExecutionForStage) {
                    checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
                }
                stageNodeList = nodePartitionMap.getPartitionToNode();
                bucketNodeMap = nodePartitionMap.asBucketNodeMap();
                bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
            }

            stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(stageExecution, splitSources, plan.getFragment().getStageExecutionDescriptor(), schedulingOrder, stageNodeList, bucketNodeMap, splitBatchSize, getConcurrentLifespansPerNode(session), nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes), connectorPartitionHandles, session, heuristicIndexerManager));
        }
        else {
            // all sources are remote
            NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
            List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            stageSchedulers.put(stageId, new FixedCountScheduler(stageExecution, partitionToNode));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
    }

    ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
    for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
        if (visitedPlanFrags.contains(subStagePlan.getFragment().getId())) {
            continue;
        }

        visitedPlanFrags.add(subStagePlan.getFragment().getId());
        List<SqlStageExecution> subTree = createStages(stageExecution::addExchangeLocations, nextStageId, locationFactory, subStagePlan.withBucketToPartition(bucketToPartition), nodeScheduler, remoteTaskFactory, session, splitBatchSize, partitioningCache, nodePartitioningManager, queryExecutor, schedulerExecutor, failureDetector, nodeTaskMap, stageSchedulers, stageLinkages, isSnapshotEnabled, snapshotManager, stageTaskCounts);
        localStages.addAll(subTree);

        SqlStageExecution childStage = subTree.get(0);
        childStagesBuilder.add(childStage);
        Optional<RemoteSourceNode> parentNode = plan.getFragment().getRemoteSourceNodes().stream().filter(x -> x.getSourceFragmentIds().contains(childStage.getFragment().getId())).findAny();

        checkArgument(parentNode.isPresent(), "Couldn't find parent of a CTE node");
        childStage.setParentId(parentNode.get().getId());
    }
    Set<SqlStageExecution> childStages = childStagesBuilder.build();
    stageExecution.addStateChangeListener(newState -> {
        if (newState.isDone() && newState != StageState.RESCHEDULING) {
            // Snapshot: For "rescheduling", tasks are already cancelled (for resume)
            childStages.forEach(SqlStageExecution::cancel);
        }
    });
    stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));

    if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStages.stream().map(SqlStageExecution::getAllTasks).flatMap(Collection::stream).map(RemoteTask::getTaskStatus).collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream().map(RemoteTask::getTaskStatus).collect(toList());

        ScaledWriterScheduler scheduler = new ScaledWriterScheduler(stageExecution, sourceTasksProvider, writerTasksProvider, nodeScheduler.createNodeSelector(null, keepConsumerOnFeederNodes, feederScheduledNodes), schedulerExecutor, getWriterMinSize(session), isSnapshotEnabled, stageTaskCounts != null ? stageTaskCounts.get(stageId) : null);
        whenAllStages(childStages, StageState::isDone).addListener(scheduler::finish, directExecutor());
        stageSchedulers.put(stageId, scheduler);
    }

    return localStages.build();
}
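createStages builds the stage tree recursively: each child stage is created by a recursive call, and the parent registers a state-change listener that cancels its children once the parent reaches a terminal state, except when it is merely RESCHEDULING for a snapshot resume. The simplified sketch below (stand-in Stage and State types, not the real SqlStageExecution API) shows that cancellation cascade in isolation.

import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

// Illustrative cascade: a parent stage cancels its children when it becomes done,
// unless the "done" state is the snapshot-related RESCHEDULING state.
public class StageCascadeExample
{
    enum State
    {
        RUNNING, RESCHEDULING, FINISHED, CANCELED;

        // In this sketch every non-RUNNING state counts as done, so the RESCHEDULING
        // exclusion below actually matters, mirroring the check in createStages.
        boolean isDone()
        {
            return this != RUNNING;
        }
    }

    static class Stage
    {
        private final String name;
        private final List<Consumer<State>> listeners = new ArrayList<>();

        Stage(String name)
        {
            this.name = name;
        }

        void addStateChangeListener(Consumer<State> listener)
        {
            listeners.add(listener);
        }

        void transitionTo(State newState)
        {
            listeners.forEach(listener -> listener.accept(newState));
        }

        void cancel()
        {
            System.out.println("cancel " + name);
        }
    }

    public static void main(String[] args)
    {
        Stage parentStage = new Stage("parent");
        List<Stage> children = List.of(new Stage("child-1"), new Stage("child-2"));

        // Mirror of the listener in createStages: cancel children when the parent is done,
        // but not when it is merely rescheduling for a snapshot resume.
        parentStage.addStateChangeListener(newState -> {
            if (newState.isDone() && newState != State.RESCHEDULING) {
                children.forEach(Stage::cancel);
            }
        });

        parentStage.transitionTo(State.RESCHEDULING); // no cancellation
        parentStage.transitionTo(State.FINISHED);     // prints "cancel child-1", "cancel child-2"
    }
}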