Use of org.apache.mesos.v1.Protos.AgentID in project Singularity by HubSpot.
From the class SingularityMesosOfferScheduler, method checkOffers.
Collection<SingularityOfferHolder> checkOffers(final Map<String, Offer> offers, long start) {
  if (offers.isEmpty()) {
    LOG.debug("No offers to check");
    return Collections.emptyList();
  }

  final List<SingularityTaskRequestHolder> sortedTaskRequestHolders = getSortedDueTaskRequests();
  final int numDueTasks = sortedTaskRequestHolders.size();

  final Map<String, SingularityOfferHolder> offerHolders = offers.values().stream()
      .collect(Collectors.groupingBy(o -> o.getAgentId().getValue()))
      .entrySet()
      .stream()
      .filter(e -> e.getValue().size() > 0)
      .map(e -> {
        List<Offer> offersList = e.getValue();
        String agentId = e.getKey();
        return new SingularityOfferHolder(
            offersList,
            numDueTasks,
            agentAndRackHelper.getRackIdOrDefault(offersList.get(0)),
            agentId,
            offersList.get(0).getHostname(),
            agentAndRackHelper.getTextAttributes(offersList.get(0)),
            agentAndRackHelper.getReservedAgentAttributes(offersList.get(0))
        );
      })
      .collect(Collectors.toMap(SingularityOfferHolder::getAgentId, Function.identity()));

  if (sortedTaskRequestHolders.isEmpty()) {
    return offerHolders.values();
  }

  final AtomicInteger tasksScheduled = new AtomicInteger(0);
  Map<String, RequestUtilization> requestUtilizations = usageManager.getRequestUtilizations(false);
  List<SingularityTaskId> activeTaskIds = taskManager.getActiveTaskIds();
  Map<String, SingularityAgentUsageWithId> currentUsages = usageManager.getAllCurrentAgentUsage();

  List<CompletableFuture<Void>> currentUsagesFutures = new ArrayList<>();
  for (SingularityOfferHolder offerHolder : offerHolders.values()) {
    currentUsagesFutures.add(runAsync(() -> {
      String agentId = offerHolder.getAgentId();
      Optional<SingularityAgentUsageWithId> maybeUsage = Optional.ofNullable(currentUsages.get(agentId));
      if (configuration.isReCheckMetricsForLargeNewTaskCount() && maybeUsage.isPresent()) {
        long newTaskCount = taskManager.getActiveTaskIds().stream()
            .filter(t -> t.getStartedAt() > maybeUsage.get().getTimestamp()
                && t.getSanitizedHost().equals(offerHolder.getSanitizedHost()))
            .count();
        if (newTaskCount >= maybeUsage.get().getNumTasks() / 2) {
          try {
            MesosAgentMetricsSnapshotObject metricsSnapshot = usageHelper.getMetricsSnapshot(offerHolder.getHostname());
            if (metricsSnapshot.getSystemLoad5Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad1Threshold()
                || metricsSnapshot.getSystemLoad1Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad5Threshold()) {
              // Come back to this agent after we have collected more metrics
              LOG.info(
                  "Skipping evaluation of {} until new metrics are collected. Current load is load1: {}, load5: {}",
                  offerHolder.getHostname(),
                  metricsSnapshot.getSystemLoad1Min(),
                  metricsSnapshot.getSystemLoad5Min()
              );
              currentUsages.remove(agentId);
            }
          } catch (Throwable t) {
            LOG.warn("Could not check metrics for host {}, skipping", offerHolder.getHostname());
            currentUsages.remove(agentId);
          }
        }
      }
    }));
  }
  CompletableFutures.allOf(currentUsagesFutures).join();

  List<CompletableFuture<Void>> usagesWithScoresFutures = new ArrayList<>();
  Map<String, SingularityAgentUsageWithCalculatedScores> currentUsagesById = new ConcurrentHashMap<>();
  for (SingularityAgentUsageWithId usage : currentUsages.values()) {
    if (offerHolders.containsKey(usage.getAgentId())) {
      usagesWithScoresFutures.add(runAsync(() ->
          currentUsagesById.put(
              usage.getAgentId(),
              new SingularityAgentUsageWithCalculatedScores(
                  usage,
                  mesosConfiguration.getScoreUsingSystemLoad(),
                  getMaxProbableUsageForAgent(activeTaskIds, requestUtilizations, offerHolders.get(usage.getAgentId()).getSanitizedHost()),
                  mesosConfiguration.getLoad5OverloadedThreshold(),
                  mesosConfiguration.getLoad1OverloadedThreshold(),
                  usage.getTimestamp()
              )
          )
      ));
    }
  }
  CompletableFutures.allOf(usagesWithScoresFutures).join();

  long startCheck = System.currentTimeMillis();
  LOG.debug("Found agent usages and scores after {}ms", startCheck - start);

  Map<SingularityDeployKey, Optional<SingularityDeployStatistics>> deployStatsCache = new ConcurrentHashMap<>();
  Set<String> overloadedHosts = Sets.newConcurrentHashSet();
  AtomicInteger noMatches = new AtomicInteger();

  // We spend much of the offer check loop waiting on request-level locks. Wait for the locks in parallel,
  // but ensure that the actual offer checks run serially so that a single offer is not over-committed.
  ReentrantLock offerCheckTempLock = new ReentrantLock(false);
  CompletableFutures.allOf(
      sortedTaskRequestHolders.stream()
          .collect(Collectors.groupingBy(t -> t.getTaskRequest().getRequest().getId()))
          .entrySet()
          .stream()
          .map(entry -> runAsync(() -> {
            lock.tryRunWithRequestLock(
                () -> {
                  offerCheckTempLock.lock();
                  try {
                    long startRequest = System.currentTimeMillis();
                    int evaluated = 0;
                    for (SingularityTaskRequestHolder taskRequestHolder : entry.getValue()) {
                      long now = System.currentTimeMillis();
                      boolean isOfferLoopTakingTooLong = now - startCheck > mesosConfiguration.getOfferLoopTimeoutMillis();
                      boolean isRequestInOfferLoopTakingTooLong =
                          (now - startRequest > mesosConfiguration.getOfferLoopRequestTimeoutMillis() && evaluated > 1);
                      if (isOfferLoopTakingTooLong || isRequestInOfferLoopTakingTooLong) {
                        LOG.warn(
                            "{} is holding the offer lock for too long, skipping remaining {} tasks for scheduling",
                            taskRequestHolder.getTaskRequest().getRequest().getId(),
                            entry.getValue().size() - evaluated
                        );
                        break;
                      }
                      evaluated++;

                      List<SingularityTaskId> activeTaskIdsForRequest =
                          leaderCache.getActiveTaskIdsForRequest(taskRequestHolder.getTaskRequest().getRequest().getId());
                      if (isTooManyInstancesForRequest(taskRequestHolder.getTaskRequest(), activeTaskIdsForRequest)) {
                        LOG.debug(
                            "Skipping pending task {}, too many instances already running",
                            taskRequestHolder.getTaskRequest().getPendingTask().getPendingTaskId()
                        );
                        continue;
                      }

                      Map<String, Double> scorePerOffer = new ConcurrentHashMap<>();
                      for (SingularityOfferHolder offerHolder : offerHolders.values()) {
                        if (!isOfferFull(offerHolder)) {
                          if (calculateScore(requestUtilizations, currentUsagesById, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, offerHolder, deployStatsCache, overloadedHosts)
                              > mesosConfiguration.getGoodEnoughScoreThreshold()) {
                            break;
                          }
                        }
                      }

                      if (!scorePerOffer.isEmpty()) {
                        SingularityOfferHolder bestOffer =
                            offerHolders.get(Collections.max(scorePerOffer.entrySet(), Map.Entry.comparingByValue()).getKey());
                        LOG.info("Best offer {}/1 is on {}", scorePerOffer.get(bestOffer.getAgentId()), bestOffer.getSanitizedHost());
                        acceptTask(bestOffer, taskRequestHolder);
                        tasksScheduled.getAndIncrement();
                        updateAgentUsageScores(taskRequestHolder, currentUsagesById, bestOffer.getAgentId(), requestUtilizations);
                      } else {
                        noMatches.getAndIncrement();
                      }
                    }
                  } finally {
                    offerCheckTempLock.unlock();
                  }
                },
                entry.getKey(),
                String.format("%s#%s", getClass().getSimpleName(), "checkOffers"),
                mesosConfiguration.getOfferLoopRequestTimeoutMillis(),
                TimeUnit.MILLISECONDS
            );
          }))
          .collect(Collectors.toList())
  ).join();

  LOG.info(
      "{} tasks scheduled, {} tasks remaining after examining {} offers ({} overloaded hosts, {} had no offer matches)",
      tasksScheduled,
      numDueTasks - tasksScheduled.get(),
      offers.size(),
      overloadedHosts.size(),
      noMatches.get()
  );
  return offerHolders.values();
}
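The AgentID usage in this method is the initial grouping of offers by offer.getAgentId().getValue(), so that every offer from the same agent lands in a single SingularityOfferHolder. A minimal standalone sketch of that grouping step is below; the class and method names in the sketch are made up for illustration, only the Mesos protobuf calls come from the snippet above.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.mesos.v1.Protos.Offer;

public class OfferGroupingSketch {
  // Groups offers by the string value of their AgentID, mirroring the first step of checkOffers.
  public static Map<String, List<Offer>> groupByAgent(List<Offer> offers) {
    return offers.stream()
        .collect(Collectors.groupingBy(offer -> offer.getAgentId().getValue()));
  }
}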
Use of org.apache.mesos.v1.Protos.AgentID in project Singularity by HubSpot.
From the class SingularityMesosSchedulerClient, method kill.
public void kill(TaskID taskId, AgentID agentId, KillPolicy killPolicy) {
  Builder kill = build().setKill(
      Kill.newBuilder()
          .setTaskId(taskId)
          .setAgentId(agentId)
          .setKillPolicy(killPolicy)
  );
  sendCall(kill, Type.KILL);
}
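A possible caller-side sketch for kill, assuming an already constructed SingularityMesosSchedulerClient; the id values and the 30-second grace period are placeholders, not values taken from Singularity.

import java.util.concurrent.TimeUnit;
import org.apache.mesos.v1.Protos.AgentID;
import org.apache.mesos.v1.Protos.DurationInfo;
import org.apache.mesos.v1.Protos.KillPolicy;
import org.apache.mesos.v1.Protos.TaskID;

class KillCallSketch {
  // Issues a KILL call for a single task; all values below are hypothetical.
  static void killTask(SingularityMesosSchedulerClient client) {
    TaskID taskId = TaskID.newBuilder().setValue("example-task-id").build();
    AgentID agentId = AgentID.newBuilder().setValue("example-agent-id").build();
    KillPolicy killPolicy = KillPolicy.newBuilder()
        .setGracePeriod(DurationInfo.newBuilder().setNanoseconds(TimeUnit.SECONDS.toNanos(30)))
        .build();
    client.kill(taskId, agentId, killPolicy);
  }
}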
Use of org.apache.mesos.v1.Protos.AgentID in project Singularity by HubSpot.
From the class SingularityMesosSchedulerClient, method frameworkMessage.
/**
 * Sent by the scheduler to send arbitrary binary data to the executor. Mesos neither interprets this data nor
 * makes any guarantees about the delivery of this message to the executor. The data is raw bytes encoded in Base64.
 *
 * @param executorId the executor that should receive the message
 * @param agentId    the agent on which the executor is running
 * @param data       the raw bytes to deliver to the executor
 */
public void frameworkMessage(ExecutorID executorId, AgentID agentId, byte[] data) {
  Builder message = build().setMessage(
      Message.newBuilder()
          .setAgentId(agentId)
          .setExecutorId(executorId)
          .setData(ByteString.copyFrom(data))
  );
  sendCall(message, Type.MESSAGE);
}
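A possible caller-side sketch for frameworkMessage; the executor id, agent id, and payload are placeholders. The caller supplies raw bytes only, since the method itself copies them into a ByteString.

import java.nio.charset.StandardCharsets;
import org.apache.mesos.v1.Protos.AgentID;
import org.apache.mesos.v1.Protos.ExecutorID;

class FrameworkMessageCallSketch {
  // Sends an arbitrary payload to one executor; all values below are hypothetical.
  static void sendPayload(SingularityMesosSchedulerClient client) {
    ExecutorID executorId = ExecutorID.newBuilder().setValue("example-executor-id").build();
    AgentID agentId = AgentID.newBuilder().setValue("example-agent-id").build();
    byte[] data = "{\"action\":\"reload\"}".getBytes(StandardCharsets.UTF_8);
    client.frameworkMessage(executorId, agentId, data);
  }
}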
Use of org.apache.mesos.v1.Protos.AgentID in project Singularity by HubSpot.
From the class SingularityMesosSchedulerClient, method acknowledge.
/**
 * Sent by the scheduler to acknowledge a status update. Note that with the new API, schedulers are responsible
 * for explicitly acknowledging the receipt of status updates that have status.uuid set. These status updates
 * are retried until they are acknowledged by the scheduler. The scheduler must not acknowledge status updates
 * that do not have status.uuid set, as they are not retried. The uuid field contains raw bytes encoded in Base64.
 *
 * @param agentId the agent that sent the status update
 * @param taskId  the task the status update refers to
 * @param uuid    the uuid of the status update being acknowledged
 */
public void acknowledge(AgentID agentId, TaskID taskId, ByteString uuid) {
  Builder acknowledge = build().setAcknowledge(
      Acknowledge.newBuilder()
          .setAgentId(agentId)
          .setTaskId(taskId)
          .setUuid(uuid)
  );
  sendCall(acknowledge, Type.ACKNOWLEDGE);
}
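Following the javadoc above, a caller would acknowledge only status updates that carry a uuid. A hedged sketch of that guard is below; the helper class is hypothetical, while the TaskStatus accessors are standard Mesos v1 protobuf methods.

import org.apache.mesos.v1.Protos.TaskStatus;

class AcknowledgeCallSketch {
  // Acknowledges a status update only when a uuid is present, since updates
  // without a uuid are not retried by Mesos and must not be acknowledged.
  static void maybeAcknowledge(SingularityMesosSchedulerClient client, TaskStatus status) {
    if (status.hasUuid() && status.hasAgentId()) {
      client.acknowledge(status.getAgentId(), status.getTaskId(), status.getUuid());
    }
  }
}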
Use of org.apache.mesos.v1.Protos.AgentID in project Singularity by HubSpot.
From the class SingularityMesosSchedulerClient, method connect.
/**
 * Sets up the connection and blocks while waiting for calls from the Mesos master.
 */
private void connect(URI mesosMasterURI, FrameworkInfo frameworkInfo, SingularityMesosScheduler scheduler) throws URISyntaxException {
  MesosClientBuilder<Call, Event> clientBuilder = ProtobufMesosClientBuilder.schedulerUsingProtos()
      .mesosUri(mesosMasterURI)
      .applicationUserAgentEntry(UserAgentEntries.userAgentEntryForMavenArtifact("com.hubspot.singularity", "SingularityService"))
      .onSendEventBackpressureBuffer()
      .onSendErrorRetry()
      .onBackpressureBuffer(
          scheduler.getEventBufferSize(),
          () -> {
            String message = String.format("Overflow of event buffer (%s), singularity could not keep up!", scheduler.getEventBufferSize());
            scheduler.onUncaughtException(new EventBufferOverflowException(message));
          },
          BackpressureOverflow.ON_OVERFLOW_ERROR
      );

  Call subscribeCall = Call.newBuilder()
      .setType(Call.Type.SUBSCRIBE)
      .setFrameworkId(frameworkInfo.getId())
      .setSubscribe(Call.Subscribe.newBuilder().setFrameworkInfo(frameworkInfo).build())
      .build();
  MesosClientBuilder<Call, Event> subscribe = clientBuilder.subscribe(subscribeCall);
  this.scheduler = scheduler;

  subscribe.processStream(unicastEvents -> {
    final Observable<Event> events = unicastEvents.share();
    events.filter(event -> event.getType() == Event.Type.ERROR)
        .map(event -> event.getError().getMessage())
        .subscribe(scheduler::error, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.FAILURE)
        .map(Event::getFailure)
        .subscribe(scheduler::failure, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.HEARTBEAT)
        .subscribe(scheduler::heartbeat, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.INVERSE_OFFERS)
        .map(event -> event.getInverseOffers().getInverseOffersList())
        .subscribe(scheduler::inverseOffers, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.MESSAGE)
        .map(Event::getMessage)
        .subscribe(scheduler::message, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.OFFERS)
        .map(event -> event.getOffers().getOffersList())
        .subscribe(scheduler::resourceOffers, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.RESCIND)
        .map(event -> event.getRescind().getOfferId())
        .subscribe(scheduler::rescind, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.RESCIND_INVERSE_OFFER)
        .map(event -> event.getRescindInverseOffer().getInverseOfferId())
        .subscribe(scheduler::rescindInverseOffer, scheduler::onUncaughtException);
    events.filter(event -> event.getType() == Event.Type.SUBSCRIBED)
        .map(Event::getSubscribed)
        .subscribe(
            subscribed -> {
              this.frameworkId = subscribed.getFrameworkId();
              scheduler.subscribed(subscribed);
            },
            scheduler::onSubscribeException
        );
    events.filter(event -> event.getType() == Event.Type.UPDATE)
        .map(event -> event.getUpdate().getStatus())
        .filter(status -> {
          if (!status.hasAgentId() || !status.getAgentId().hasValue()) {
            LOG.warn("Filtering out status update without agentId {}", status);
            return false;
          } else {
            return true;
          }
        })
        .subscribe(scheduler::statusUpdate, scheduler::onUncaughtException);

    // This is the observable that is responsible for sending calls to the mesos master.
    PublishSubject<Optional<SinkOperation<Call>>> p = PublishSubject.create();
    // toSerialized handles the fact that we can add calls on different threads.
    publisher = p.toSerialized();
    return publisher.onBackpressureBuffer();
  });

  MesosClient<Call, Event> client = clientBuilder.build();
  openStream = client.openStream();
  try {
    openStream.await();
  } catch (Throwable t) {
    if (Throwables.getCausalChain(t).stream().anyMatch(throwable -> throwable instanceof InterruptedException)) {
      LOG.warn("Observable interrupted, closed stream from mesos");
    } else {
      LOG.error("Observable was unexpectedly closed", t);
      scheduler.onUncaughtException(t);
    }
  }
}
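connect is private and receives its FrameworkInfo from the surrounding client code. A sketch of the kind of FrameworkInfo that could flow into it is below; every value is a placeholder rather than Singularity's real configuration.

import java.util.concurrent.TimeUnit;
import org.apache.mesos.v1.Protos.FrameworkID;
import org.apache.mesos.v1.Protos.FrameworkInfo;

class FrameworkInfoSketch {
  // Builds a FrameworkInfo with placeholder values; an id is included here because
  // connect copies frameworkInfo.getId() into the SUBSCRIBE call.
  static FrameworkInfo buildFrameworkInfo() {
    return FrameworkInfo.newBuilder()
        .setUser("singularity")
        .setName("Singularity")
        .setId(FrameworkID.newBuilder().setValue("Singularity"))
        .setFailoverTimeout(TimeUnit.DAYS.toSeconds(7))
        .setCheckpoint(true)
        .build();
  }
}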