use of org.apache.accumulo.server.master.state.TServerInstance in project accumulo by apache.
the class TabletGroupWatcher method flushChanges.
/**
 * Writes the tablet state changes collected by the caller to the tablet state store and asks
 * tablet servers to load the resulting assignments.
 *
 * <p>
 * The steps, in order:
 * <ol>
 * <li>Tablets in {@code assignedToDeadServers} are suspended (when {@code canSuspendTablets()}
 * is true) or unassigned in the store, and the logs of those servers are marked as closed.</li>
 * <li>Tablets in {@code suspendedToGoneServers} are unsuspended in the store.</li>
 * <li>If there is at least one current tablet server, the balancer is asked to place the
 * {@code unassigned} tablets; every accepted placement is appended to {@code assignments}.</li>
 * <li>Non-empty {@code assignments} are recorded in the store as future locations.</li>
 * <li>{@code assigned} is appended to {@code assignments} and each entry is sent to its server,
 * if a connection to that server is available.</li>
 * </ol>
 *
 * @param currentTServers
 *          tablet servers considered current, with their status
 * @param assignments
 *          assignments to record as future locations; this list is appended to by this method
 * @param assigned
 *          assignments that are sent to their servers without being written as future locations
 * @param assignedToDeadServers
 *          tablet states to suspend or unassign
 * @param logsForDeadServers
 *          log paths per server, passed to the store and marked as closed
 * @param suspendedToGoneServers
 *          tablet states to unsuspend
 * @param unassigned
 *          tablets nominated for assignment, handed to the balancer
 */
private void flushChanges(SortedMap<TServerInstance, TabletServerStatus> currentTServers, List<Assignment> assignments, List<Assignment> assigned, List<TabletLocationState> assignedToDeadServers, Map<TServerInstance, List<Path>> logsForDeadServers, List<TabletLocationState> suspendedToGoneServers, Map<KeyExtent, TServerInstance> unassigned) throws DistributedStoreException, TException, WalMarkerException {
  boolean tabletsSuspendable = canSuspendTablets();
  if (!assignedToDeadServers.isEmpty()) {
    // Cap the number of entries printed so the debug line stays readable.
    int maxServersToShow = min(assignedToDeadServers.size(), 100);
    Master.log.debug("{} assigned to dead servers: {}...", assignedToDeadServers.size(), assignedToDeadServers.subList(0, maxServersToShow));
    Master.log.debug("logs for dead servers: {}", logsForDeadServers);
    if (tabletsSuspendable) {
      store.suspend(assignedToDeadServers, logsForDeadServers, master.getSteadyTime());
    } else {
      store.unassign(assignedToDeadServers, logsForDeadServers);
    }
    this.master.markDeadServerLogsAsClosed(logsForDeadServers);
    this.master.nextEvent.event("Marked %d tablets as suspended because they don't have current servers", assignedToDeadServers.size());
  }
  if (!suspendedToGoneServers.isEmpty()) {
    // Report the list that is actually being unsuspended (previously this printed
    // assignedToDeadServers, which is unrelated to this branch).
    int maxServersToShow = min(suspendedToGoneServers.size(), 100);
    Master.log.debug("{} suspended to gone servers: {}...", suspendedToGoneServers.size(), suspendedToGoneServers.subList(0, maxServersToShow));
    store.unsuspend(suspendedToGoneServers);
  }
  if (!currentTServers.isEmpty()) {
    Map<KeyExtent, TServerInstance> assignedOut = new HashMap<>();
    final StringBuilder builder = new StringBuilder(64);
    // The balancer only gets read-only views; its answers come back through assignedOut.
    this.master.tabletBalancer.getAssignments(Collections.unmodifiableSortedMap(currentTServers), Collections.unmodifiableMap(unassigned), assignedOut);
    for (Entry<KeyExtent, TServerInstance> assignment : assignedOut.entrySet()) {
      if (unassigned.containsKey(assignment.getKey())) {
        if (assignment.getValue() != null) {
          if (!currentTServers.containsKey(assignment.getValue())) {
            Master.log.warn("balancer assigned {} to a tablet server that is not current {} ignoring", assignment.getKey(), assignment.getValue());
            continue;
          }
          if (builder.length() > 0) {
            builder.append(ASSIGNMENT_BUFFER_SEPARATOR);
          }
          builder.append(assignment);
          // Don't let the log message get too gigantic
          if (builder.length() > ASSINGMENT_BUFFER_MAX_LENGTH) {
            builder.append("]");
            Master.log.debug("{} assigning tablets: [{}", store.name(), builder.toString());
            builder.setLength(0);
          }
          assignments.add(new Assignment(assignment.getKey(), assignment.getValue()));
        }
      } else {
        Master.log.warn("{} load balancer assigning tablet that was not nominated for assignment {}", store.name(), assignment.getKey());
      }
    }
    if (builder.length() > 0) {
      // Make sure to log any leftover assignments
      builder.append("]");
      Master.log.debug("{} assigning tablets: [{}", store.name(), builder.toString());
    }
    if (!unassigned.isEmpty() && assignedOut.isEmpty()) {
      Master.log.warn("Load balancer failed to assign any tablets");
    }
  }
  if (!assignments.isEmpty()) {
    Master.log.info(String.format("Assigning %d tablets", assignments.size()));
    store.setFutureLocations(assignments);
  }
  // Entries from 'assigned' are added only after setFutureLocations, so they are sent to
  // their servers below without being written as future locations here.
  assignments.addAll(assigned);
  for (Assignment a : assignments) {
    TServerConnection conn = this.master.tserverSet.getConnection(a.server);
    if (conn != null) {
      conn.assignTablet(this.master.masterLock, a.tablet);
    } else {
      Master.log.warn("Could not connect to server {}", a.server);
    }
    // Called whether or not the server could be reached.
    master.assignedTablet(a.tablet);
  }
}
use of org.apache.accumulo.server.master.state.TServerInstance in project accumulo by apache.
the class TabletGroupWatcher method repairMetadata.
/**
 * Attempts to repair an inconsistent location entry for the tablet identified by {@code row}.
 *
 * <p>
 * Scans the row's current-location and future-location columns. If the combination looks wrong
 * (both kinds present, or more than one of a kind), the first entry whose server is not found
 * in {@code master.tserverSet} is deleted and the method returns. Only one entry is removed per
 * call. Any failure is logged and not propagated.
 *
 * @param row
 *          metadata row of the tablet to inspect
 */
private void repairMetadata(Text row) {
  Master.log.debug("Attempting repair on {}", row);
  // Attempt to find the dead server entry and remove it.
  try {
    Map<Key, Value> future = new HashMap<>();
    Map<Key, Value> assigned = new HashMap<>();
    KeyExtent extent = new KeyExtent(row, new Value(new byte[] {0}));
    String table = MetadataTable.NAME;
    if (extent.isMeta()) {
      table = RootTable.NAME;
    }
    Scanner scanner = this.master.getConnector().createScanner(table, Authorizations.EMPTY);
    try {
      scanner.fetchColumnFamily(CurrentLocationColumnFamily.NAME);
      scanner.fetchColumnFamily(FutureLocationColumnFamily.NAME);
      scanner.setRange(new Range(row));
      for (Entry<Key, Value> entry : scanner) {
        if (entry.getKey().getColumnFamily().equals(CurrentLocationColumnFamily.NAME)) {
          assigned.put(entry.getKey(), entry.getValue());
        } else if (entry.getKey().getColumnFamily().equals(FutureLocationColumnFamily.NAME)) {
          future.put(entry.getKey(), entry.getValue());
        }
      }
    } finally {
      // Release the scanner once all entries have been copied out.
      scanner.close();
    }
    if (future.size() > 0 && assigned.size() > 0) {
      Master.log.warn("Found a tablet assigned and hosted, attempting to repair");
    } else if (future.size() > 1 && assigned.size() == 0) {
      Master.log.warn("Found a tablet assigned to multiple servers, attempting to repair");
    } else if (future.size() == 0 && assigned.size() > 1) {
      Master.log.warn("Found a tablet hosted on multiple servers, attempting to repair");
    } else {
      Master.log.info("Attempted a repair, but nothing seems to be obviously wrong. {} {}", assigned, future);
      return;
    }
    // Future entries are examined before current ones.
    Iterator<Entry<Key, Value>> iter = Iterators.concat(future.entrySet().iterator(), assigned.entrySet().iterator());
    while (iter.hasNext()) {
      Entry<Key, Value> entry = iter.next();
      TServerInstance alive = master.tserverSet.find(entry.getValue().toString());
      if (alive == null) {
        Master.log.info("Removing entry {}", entry);
        BatchWriter bw = this.master.getConnector().createBatchWriter(table, new BatchWriterConfig());
        try {
          Mutation m = new Mutation(entry.getKey().getRow());
          m.putDelete(entry.getKey().getColumnFamily(), entry.getKey().getColumnQualifier());
          bw.addMutation(m);
        } finally {
          // Always close the writer, even if adding the mutation failed.
          bw.close();
        }
        return;
      }
    }
    Master.log.error("Metadata table is inconsistent at {} and all assigned/future tservers are still online.", row);
  } catch (Throwable e) {
    // Best effort: a failed repair is logged rather than propagated.
    Master.log.error("Error attempting repair of metadata " + row + ": " + e, e);
  }
}
use of org.apache.accumulo.server.master.state.TServerInstance in project accumulo by apache.
the class MasterReplicationCoordinator method getServicerAddress.
/**
 * Authenticates the caller, picks one of the online tablet servers at random and returns the
 * replication service address stored for that server under the instance's
 * {@code ReplicationConstants.ZOO_TSERVERS} node.
 *
 * @param remoteTableId
 *          id of the remote table; only used in the authentication failure log message
 * @param creds
 *          credentials of the caller
 * @return the data read for the chosen server, decoded as UTF-8
 * @throws ReplicationCoordinatorException
 *           if authentication fails, no tablet servers are online, or the address cannot be
 *           read
 */
@Override
public String getServicerAddress(String remoteTableId, TCredentials creds) throws ReplicationCoordinatorException, TException {
  try {
    security.authenticateUser(master.rpcCreds(), creds);
  } catch (ThriftSecurityException e) {
    log.error("{} failed to authenticate for replication to {}", creds.getPrincipal(), remoteTableId);
    throw new ReplicationCoordinatorException(ReplicationCoordinatorErrorCode.CANNOT_AUTHENTICATE, "Could not authenticate " + creds.getPrincipal());
  }
  Set<TServerInstance> tservers = master.onlineTabletServers();
  if (tservers.isEmpty()) {
    throw new ReplicationCoordinatorException(ReplicationCoordinatorErrorCode.NO_AVAILABLE_SERVERS, "No tservers are available for replication");
  }
  TServerInstance tserver = getRandomTServer(tservers, rand.nextInt(tservers.size()));
  String replServiceAddr;
  try {
    replServiceAddr = new String(reader.getData(ZooUtil.getRoot(inst) + ReplicationConstants.ZOO_TSERVERS + "/" + tserver.hostPort(), null), UTF_8);
  } catch (KeeperException | InterruptedException e) {
    if (e instanceof InterruptedException) {
      // Restore the interrupt flag so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
    }
    log.error("Could not fetch repliation service port for tserver", e);
    throw new ReplicationCoordinatorException(ReplicationCoordinatorErrorCode.SERVICE_CONFIGURATION_UNAVAILABLE, "Could not determine port for replication service running at " + tserver.hostPort());
  }
  return replServiceAddr;
}
use of org.apache.accumulo.server.master.state.TServerInstance in project accumulo by apache.
the class GarbageCollectWriteAheadLogs method collect.
/**
 * Runs one write-ahead-log garbage collection pass and records its statistics in
 * {@code status}.
 *
 * <p>
 * The pass gathers the known logs, drops the ones still in use according to the tablet state
 * and the replication table, removes the remaining files and then removes tablet server
 * markers. Each stage is traced with its own span and logs its own elapsed time. If the
 * metadata or replication scan fails the pass ends early without updating
 * {@code status.lastLog}. Exceptions are logged, never propagated.
 *
 * @param status
 *          garbage collector status whose {@code currentLog} is filled in and, on a completed
 *          pass, rotated into {@code lastLog}
 */
public void collect(GCStatus status) {
  Span span = Trace.start("getCandidates");
  try {
    status.currentLog.started = System.currentTimeMillis();
    Map<TServerInstance, Set<UUID>> logsByServer = new HashMap<>();
    Map<UUID, Pair<WalState, Path>> logsState = new HashMap<>();
    // Scan for log file info first: the order is important
    // Consider:
    // * get live servers
    // * new server gets a lock, creates a log
    // * get logs
    // * the log appears to belong to a dead server
    long count = getCurrent(logsByServer, logsState);
    long fileScanStop = System.currentTimeMillis();
    log.info(String.format("Fetched %d files for %d servers in %.2f seconds", count, logsByServer.size(), (fileScanStop - status.currentLog.started) / 1000.));
    status.currentLog.candidates = count;
    span.stop();
    // now it's safe to get the liveServers
    Set<TServerInstance> currentServers = liveServers.getCurrentServers();
    Map<UUID, TServerInstance> uuidToTServer;
    span = Trace.start("removeEntriesInUse");
    try {
      uuidToTServer = removeEntriesInUse(logsByServer, currentServers, logsState);
      count = uuidToTServer.size();
    } catch (Exception ex) {
      log.error("Unable to scan metadata table", ex);
      return;
    } finally {
      span.stop();
    }
    long logEntryScanStop = System.currentTimeMillis();
    log.info(String.format("%d log entries scanned in %.2f seconds", count, (logEntryScanStop - fileScanStop) / 1000.));
    span = Trace.start("removeReplicationEntries");
    try {
      count = removeReplicationEntries(uuidToTServer);
    } catch (Exception ex) {
      log.error("Unable to scan replication table", ex);
      return;
    } finally {
      span.stop();
    }
    long replicationEntryScanStop = System.currentTimeMillis();
    log.info(String.format("%d replication entries scanned in %.2f seconds", count, (replicationEntryScanStop - logEntryScanStop) / 1000.));
    span = Trace.start("removeFiles");
    // Only logs that survived both filters above are handed to removeFiles.
    logsState.keySet().retainAll(uuidToTServer.keySet());
    count = removeFiles(logsState.values(), status);
    long removeStop = System.currentTimeMillis();
    // Measure from the end of the replication scan so this stage's time does not include the
    // replication scan that was already reported above.
    log.info(String.format("%d total logs removed from %d servers in %.2f seconds", count, logsByServer.size(), (removeStop - replicationEntryScanStop) / 1000.));
    span.stop();
    span = Trace.start("removeMarkers");
    count = removeTabletServerMarkers(uuidToTServer, logsByServer, currentServers);
    long removeMarkersStop = System.currentTimeMillis();
    log.info(String.format("%d markers removed in %.2f seconds", count, (removeMarkersStop - removeStop) / 1000.));
    span.stop();
    // NOTE(review): 'finished' is stamped with the file-removal stop time, not the later
    // marker-removal stop time - confirm this is intended.
    status.currentLog.finished = removeStop;
    status.lastLog = status.currentLog;
    status.currentLog = new GcCycleStats();
  } catch (Exception e) {
    log.error("exception occured while garbage collecting write ahead logs", e);
  } finally {
    span.stop();
  }
}
use of org.apache.accumulo.server.master.state.TServerInstance in project accumulo by apache.
the class GarbageCollectWriteAheadLogs method removeEntriesInUse.
/**
 * Works out which candidate logs are not in use.
 *
 * <p>
 * Starts from every log id in {@code candidates} and removes:
 * <ul>
 * <li>all logs of a server that a tablet is still assigned to while that server is dead,</li>
 * <li>all logs of a server when any tablet still references one of that server's logs,</li>
 * <li>logs of live servers whose recorded state is not {@code WalState.UNREFERENCED}.</li>
 * </ul>
 * Servers dropped by the first two rules are also removed from {@code candidates}.
 *
 * @param candidates
 *          log ids grouped by the server they belong to; entries may be removed
 * @param liveServers
 *          servers currently considered live
 * @param logsState
 *          state and path for each log id
 * @return the remaining log ids, each mapped to the server it belongs to
 */
private Map<UUID, TServerInstance> removeEntriesInUse(Map<TServerInstance, Set<UUID>> candidates, Set<TServerInstance> liveServers, Map<UUID, Pair<WalState, Path>> logsState) throws IOException, KeeperException, InterruptedException {
  // Reverse index: log id -> owning server, for every candidate log.
  Map<UUID, TServerInstance> unused = new HashMap<>();
  for (Entry<TServerInstance, Set<UUID>> serverLogs : candidates.entrySet()) {
    TServerInstance owner = serverLogs.getKey();
    for (UUID logId : serverLogs.getValue()) {
      unused.put(logId, owner);
    }
  }

  // remove any entries if there's a log reference (recovery hasn't finished)
  for (Iterator<TabletLocationState> tablets = store.iterator(); tablets.hasNext();) {
    TabletLocationState tablet = tablets.next();

    // Easiest to just ignore all the WALs for the dead server.
    if (TabletState.ASSIGNED_TO_DEAD_SERVER == tablet.getState(liveServers)) {
      dropLogsOfServer(candidates, unused, tablet.current);
    }

    // The tablet still references WALs: skip every log of the server that made them.
    for (Collection<String> walGroup : tablet.walogs) {
      for (String walPath : walGroup) {
        UUID referencedId = path2uuid(new Path(walPath));
        TServerInstance owner = unused.get(referencedId);
        dropLogsOfServer(candidates, unused, owner);
      }
    }
  }

  // Remove OPEN and CLOSED logs for live servers: they are still in use
  for (TServerInstance live : liveServers) {
    Set<UUID> liveLogIds = candidates.get(live);
    if (liveLogIds == null) {
      // Server may not have any logs yet
      continue;
    }
    for (UUID logId : liveLogIds) {
      Pair<WalState, Path> stateAndPath = logsState.get(logId);
      if (WalState.UNREFERENCED != stateAndPath.getFirst()) {
        unused.remove(logId);
      }
    }
  }
  return unused;
}

/**
 * Removes {@code server} from {@code candidates} and, if it had any logs there, removes each of
 * those log ids from {@code unused}.
 */
private static void dropLogsOfServer(Map<TServerInstance, Set<UUID>> candidates, Map<UUID, TServerInstance> unused, TServerInstance server) {
  Set<UUID> idsToIgnore = candidates.remove(server);
  if (idsToIgnore == null) {
    return;
  }
  for (UUID logId : idsToIgnore) {
    unused.remove(logId);
  }
}
Aggregations