Search in sources :

Example 6 with ExclusionSet

use of com.ms.silverking.cloud.meta.ExclusionSet in project SilverKing by Morgan-Stanley.

the class NodeRingMaster2 method getAllCurrentNonExcludedReplicaServers.

/**
 * Collects all replicas of the current resolved replica map and passes them
 * through {@code filterByIP} of the latest exclusion set.
 *
 * @return an immutable copy of the filtered replica set
 */
private Set<IPAndPort> getAllCurrentNonExcludedReplicaServers() {
    Set<IPAndPort> everyReplica = curMapState.getResolvedReplicaMap().allReplicas();
    // Deliberately uses latestExclusionSet rather than the exclusion set
    // held by curMapState.
    ExclusionSet exclusions = latestExclusionSet;
    return ImmutableSet.copyOf(exclusions.filterByIP(everyReplica));
}
Also used : IPAndPort(com.ms.silverking.net.IPAndPort) ExclusionSet(com.ms.silverking.cloud.meta.ExclusionSet)

Example 7 with ExclusionSet

use of com.ms.silverking.cloud.meta.ExclusionSet in project SilverKing by Morgan-Stanley.

the class HealthMonitor method updateInstanceExclusionSet.

/**
 * Recomputes the instance exclusion set and writes it to ZooKeeper when it changed.
 * <p>
 * The new set is the latest stored set (or an empty set when no latest path
 * exists) plus the hosts of {@code guiltySuspects}, minus the hosts of
 * {@code newActiveNodes}. Any exception is logged and not propagated.
 *
 * @param guiltySuspects nodes whose hosts are added to the exclusion set
 * @param newActiveNodes nodes whose hosts are removed from the exclusion set
 */
private void updateInstanceExclusionSet(Set<IPAndPort> guiltySuspects, Set<IPAndPort> newActiveNodes) {
    try {
        if (instanceExclusionZK == null) {
            Log.warning("Unable to mark as bad as exclusionZK is null");
            return;
        }

        // Report what is about to be marked bad / good.
        if (guiltySuspects.isEmpty()) {
            Log.warning("No guilty suspects");
        } else {
            Log.warning("Marking as bad");
            Log.warning(String.format("%s\n", CollectionUtil.toString(hostStringSet(guiltySuspects))));
        }
        if (newActiveNodes.isEmpty()) {
            Log.warning("No newly good nodes");
        } else {
            Log.warning("Marking as good");
            Log.warning(String.format("%s\n", CollectionUtil.toString(hostStringSet(newActiveNodes))));
        }

        // Start from the stored set, or from an empty one if nothing was stored yet.
        Log.warning(String.format("Latest exclusion set path %s", instanceExclusionZK.getLatestZKPath()));
        ExclusionSet previous;
        if (instanceExclusionZK.getLatestZKPath() == null) {
            previous = ExclusionSet.emptyExclusionSet(0);
        } else {
            previous = instanceExclusionZK.readLatestFromZK();
        }

        ExclusionSet updated = previous.add(hostStringSet(guiltySuspects)).remove(hostStringSet(newActiveNodes));
        // Window of vulnerability: the read above and the write below are not atomic.
        // For now, callers are relied upon to ensure this isn't violated externally.
        Log.warning(String.format("Old exclusion set %d %s", previous.size(), previous));
        Log.warning(String.format("New exclusion set %d %s", updated.size(), updated));

        if (updated.equals(previous)) {
            Log.warning("No change in exclusion set");
        } else {
            Log.warning("Writing exclusion set");
            instanceExclusionZK.writeToZK(updated);
            Log.warning(String.format("Latest exclusion set path after write %s", instanceExclusionZK.getLatestZKPath()));
        }
    } catch (Exception e) {
        Log.logErrorWarning(e, "Exception in updateInstanceExclusionSet");
    }
}
Also used : ExclusionSet(com.ms.silverking.cloud.meta.ExclusionSet) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) CmdLineException(org.kohsuke.args4j.CmdLineException)

Example 8 with ExclusionSet

use of com.ms.silverking.cloud.meta.ExclusionSet in project SilverKing by Morgan-Stanley.

the class SKAdmin method execClusterCommandGroup.

/**
 * Resolves the active and passive servers to operate on and runs each of the
 * given commands against them, in order.
 * <p>
 * Outline:
 * <ul>
 *   <li>Read the active host groups, their class vars and the host group table.</li>
 *   <li>Determine the valid active servers, restricted to the specified,
 *       non-excluded targets, and verify their eligibility for the commands.</li>
 *   <li>Determine the valid passive servers the same way.</li>
 *   <li>For each command, build the server-to-command map, display it, and
 *       (unless display-only) execute it.</li>
 *   <li>After StartNodes, wait for the nodes to be running; servers that fail
 *       an attempt are added to the instance exclusions and dropped before the
 *       next attempt.</li>
 * </ul>
 *
 * @param commands the commands to run, in order
 * @return {@code true} if every command completed; {@code false} if a command
 *         execution failed, a replica set was found to be excluded while waiting,
 *         or the nodes were not running after the last timeout
 * @throws IOException     propagated from the helpers this method calls
 * @throws KeeperException propagated from the helpers this method calls
 */
private boolean execClusterCommandGroup(SKAdminCommand[] commands) throws IOException, KeeperException {
    Set<String> requestedTargets = CollectionUtil.parseSet(options.targets, ",");

    Set<String> activeHostGroupNames = dhtConfig.getHostGroups();
    Log.warning("hostGroupNames: ", CollectionUtil.toString(activeHostGroupNames));

    Map<String, ClassVars> hostGroupToClassVars = getHostGroupToClassVarsMap(dhtConfig);
    Log.warning("hostGroupToClassVars: ", CollectionUtil.mapToString(hostGroupToClassVars));

    Log.warning("ringConfig: ", ringConfig);
    String hostGroupTableName = ringConfig.getCloudConfiguration().getHostGroupTableName();
    Log.warning("hostGroupTableName: ", hostGroupTableName);
    HostGroupTable hostGroupTable = getHostGroupTable(hostGroupTableName, dhtMC.getZooKeeper().getZKConfig());

    // FUTURE - Do more validation of configuration. E.g. prevent a server from being both
    // active and passive, the ring from containing servers without class vars, etc.
    Set<String> validActiveServers = findValidActiveServers(activeHostGroupNames, hostGroupTable, ringTree);
    validActiveServers = retainOnlySpecifiedAndNonExcludedServers(validActiveServers, requestedTargets);
    verifyServerEligibility(validActiveServers, commands);
    Log.warning("validActiveServers: ", CollectionUtil.toString(validActiveServers));

    // Allow StopNodes with empty validActiveServers if the target is activeDaemons:
    // when every command is StopNodes there is nothing left to do.
    if (options.targetsEqualsActiveDaemonsTarget() && validActiveServers.isEmpty()) {
        boolean onlyStopNodes = true;
        for (SKAdminCommand candidate : commands) {
            if (candidate != SKAdminCommand.StopNodes) {
                onlyStopNodes = false;
                break;
            }
        }
        if (onlyStopNodes) {
            return true;
        }
    }

    // Targets that are not valid active servers are treated as passive targets.
    Set<String> passiveTargetServers = new HashSet<>(requestedTargets);
    passiveTargetServers.removeAll(validActiveServers);

    Set<String> passiveNodeHostGroupNames = dhtConfig.getPassiveNodeHostGroupsAsSet();
    Log.warning("passiveNodeHostGroupNames: ", CollectionUtil.toString(passiveNodeHostGroupNames));

    Set<String> validPassiveServers;
    if (passiveTargetServers.isEmpty()) {
        validPassiveServers = findValidPassiveServers(passiveNodeHostGroupNames, hostGroupTable);
    } else {
        validPassiveServers = ImmutableSet.copyOf(passiveTargetServers);
    }
    validPassiveServers = retainOnlySpecifiedAndNonExcludedServers(validPassiveServers, passiveTargetServers);
    Log.warning("validPassiveServers: ", CollectionUtil.toString(validPassiveServers));

    if (Arrays.contains(commands, SKAdminCommand.ClearData) && !options.targetsEqualsExclusionsTarget()) {
        Log.countdownWarning("*** Clearing ALL data ***", unsafeWarningCountdown);
    }

    for (SKAdminCommand command : commands) {
        Log.warning("Executing cluster command: ", command);
        Map<String, String[]> serverCommands = createServerCommands(command, validActiveServers, validPassiveServers, hostGroupTable, hostGroupToClassVars, activeHostGroupNames, passiveNodeHostGroupNames);
        displayCommandMap(serverCommands);

        if (!options.displayOnly) {
            // Active servers take precedence; passive servers are used only when no active ones remain.
            Set<String> execServers = validActiveServers.isEmpty() ? validPassiveServers : validActiveServers;
            if (!execCommandMap(serverCommands, execServers, hostGroupTable)) {
                // A failed command aborts the remaining commands.
                return false;
            }
        }

        if (command.equals(SKAdminCommand.StartNodes)) {
            Log.warning("Waiting for nodes to enter running state...");
            int[] timeouts = NumUtil.parseIntArray(options.timeoutSeconds, ",");
            boolean allRunning = false;
            int attemptIndex = 0;
            do {
                Log.warningf("attemptIndex: %d\ttimeout: %d", attemptIndex, timeouts[attemptIndex]);
                if (replicaSetExcludedByExclusions(exclusionSet)) {
                    return false;
                }
                Pair<Set<IPAndPort>, Boolean> waitOutcome = waitUntilRunning(IPAndPort.set(validActiveServers, dhtConfig.getPort()), timeouts[attemptIndex]);
                Set<IPAndPort> failedServers = waitOutcome.getV1();
                if (waitOutcome.getV2()) {
                    allRunning = true;
                } else if (++attemptIndex < timeouts.length) {
                    // Another attempt remains: exclude the failed servers and retry without them.
                    Log.warningf("Adding to instance exclusion set: %s", failedServers);
                    if (options.excludeInstanceExclusions) {
                        exclusionSet = exclusionSet.addByIPAndPort(failedServers);
                    }
                    addToInstanceExclusions(failedServers);
                    validActiveServers = removeServers(validActiveServers, failedServers);
                }
            } while (!allRunning && attemptIndex < timeouts.length);
            if (!allRunning) {
                return false;
            }
        }
    }
    return true;
}
Also used : IPAndPort(com.ms.silverking.net.IPAndPort) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) ExclusionSet(com.ms.silverking.cloud.meta.ExclusionSet) HostGroupTable(com.ms.silverking.cloud.config.HostGroupTable) ClassVars(com.ms.silverking.cloud.dht.meta.ClassVars)

Example 9 with ExclusionSet

use of com.ms.silverking.cloud.meta.ExclusionSet in project SilverKing by Morgan-Stanley.

the class SKAdmin method replicaSetExcludedByExclusions.

/**
 * Checks the given exclusion set against the resolved replica map of the
 * current ring tree, logging every replica set reported as excluded.
 *
 * @param es the exclusion set to check
 * @return {@code true} if the replica map reports at least one excluded
 *         replica set; {@code false} otherwise
 * @throws KeeperException propagated from reading the current tree
 * @throws IOException     propagated from reading the current tree
 */
private boolean replicaSetExcludedByExclusions(ExclusionSet es) throws KeeperException, IOException {
    InstantiatedRingTree currentTree = readCurrentTree();
    ResolvedReplicaMap resolvedMap = currentTree.getResolvedMap(ringConfig.getRingParentName(), new ReplicaNaiveIPPrioritizer());
    List<Set<IPAndPort>> excluded = resolvedMap.getExcludedReplicaSets(es.asIPAndPortSet(0));

    if (excluded.isEmpty()) {
        return false;
    }

    Log.warning("Exclusion set excludes at least one replica set:");
    for (Set<IPAndPort> replicaSet : excluded) {
        Log.warningf("%s", replicaSet);
    }
    return true;
}
Also used : IPAndPort(com.ms.silverking.net.IPAndPort) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) ExclusionSet(com.ms.silverking.cloud.meta.ExclusionSet) ReplicaNaiveIPPrioritizer(com.ms.silverking.cloud.dht.daemon.ReplicaNaiveIPPrioritizer) InstantiatedRingTree(com.ms.silverking.cloud.toporing.InstantiatedRingTree) ResolvedReplicaMap(com.ms.silverking.cloud.toporing.ResolvedReplicaMap)

Example 10 with ExclusionSet

use of com.ms.silverking.cloud.meta.ExclusionSet in project SilverKing by Morgan-Stanley.

the class SKAdmin method createServerCommands.

/**
 * Builds the map of server to tokenized command line for one admin command.
 * <p>
 * For {@code ClearInstanceExclusionsData} the servers are those of the
 * instance exclusion set; for every other command they are the valid active
 * plus valid passive servers. Servers for which no class vars are found are
 * left out of the result.
 *
 * @param command                   the command to create per-server command lines for
 * @param validActiveServers        active servers to include (unused for ClearInstanceExclusionsData)
 * @param validPassiveServers       passive servers to include (unused for ClearInstanceExclusionsData)
 * @param hostGroupTable            passed through to the class-vars lookup
 * @param hostGroupToClassVars      passed through to the class-vars lookup
 * @param activeHostGroupNames      passed through to the class-vars lookup
 * @param passiveNodeHostGroupNames passed through to the class-vars lookup
 * @return map of server to its command, split on whitespace
 * @throws RuntimeException if the instance exclusions exclude a replica set,
 *                          if reading them fails, if a destructive StartSKFS
 *                          is requested, or if the command is unsupported
 */
private Map<String, String[]> createServerCommands(SKAdminCommand command, Set<String> validActiveServers, Set<String> validPassiveServers, HostGroupTable hostGroupTable, Map<String, ClassVars> hostGroupToClassVars, Set<String> activeHostGroupNames, Set<String> passiveNodeHostGroupNames) {
    Set<String> targetServers = new HashSet<>();
    if (command != SKAdminCommand.ClearInstanceExclusionsData) {
        targetServers.addAll(validActiveServers);
        targetServers.addAll(validPassiveServers);
    } else {
        try {
            ExclusionSet instanceExclusions = getInstanceExclusions();
            if (replicaSetExcludedByExclusions(instanceExclusions)) {
                Log.warning("Can't clear instance exclusions data. At least one replica set is entirely excluded.");
                throw new RuntimeException("Entire replica set excluded");
            }
            Log.warning("Servers to clear data from:\n", instanceExclusions);
            Log.countdownWarning("*** Clearing instance exclusions data ***", unsafeWarningCountdown);
            targetServers.addAll(instanceExclusions.getServers());
        } catch (KeeperException | IOException cause) {
            throw new RuntimeException("Exception calling getInstanceExclusions()", cause);
        }
    }

    Map<String, String[]> commandsByServer = new HashMap<>();
    for (String server : targetServers) {
        ClassVars classVars = getServerClassVars(server, hostGroupTable, activeHostGroupNames, passiveNodeHostGroupNames, hostGroupToClassVars);
        if (classVars == null) {
            // No class vars for this server: no command is generated for it.
            continue;
        }

        String rawCommand;
        switch (command) {
            case StartNodes:
                rawCommand = createStartCommand(dhtConfig, classVars, options);
                break;
            case StopNodes:
                rawCommand = createStopCommand(dhtConfig, classVars);
                break;
            case ClearInstanceExclusionsData:
            case ClearData:
                rawCommand = createClearDataCommand(dhtConfig, classVars);
                break;
            case StartSKFS:
                if (options.destructive) {
                    throw new RuntimeException("Destructive StartSKFS not supported");
                }
                // falls through: StartSKFS uses the same command as CheckSKFS
            case CheckSKFS:
                rawCommand = createCheckSKFSCommand(dhtConfig, classVars);
                break;
            case StopSKFS:
                rawCommand = createStopSKFSCommand(dhtConfig, classVars);
                break;
            default:
                throw new RuntimeException("Unsupported command: " + command);
        }
        commandsByServer.put(server, rawCommand.split("\\s+"));
    }
    return commandsByServer;
}
Also used : ExclusionSet(com.ms.silverking.cloud.meta.ExclusionSet) ClassVars(com.ms.silverking.cloud.dht.meta.ClassVars) IOException(java.io.IOException) KeeperException(org.apache.zookeeper.KeeperException)

Aggregations

ExclusionSet (com.ms.silverking.cloud.meta.ExclusionSet): 14; IOException (java.io.IOException): 8; KeeperException (org.apache.zookeeper.KeeperException): 8; ServerSetExtensionZK (com.ms.silverking.cloud.meta.ServerSetExtensionZK): 5; ExclusionZK (com.ms.silverking.cloud.meta.ExclusionZK): 4; IPAndPort (com.ms.silverking.net.IPAndPort): 4; CmdLineException (org.kohsuke.args4j.CmdLineException): 4; HostGroupTable (com.ms.silverking.cloud.config.HostGroupTable): 3; InstantiatedRingTree (com.ms.silverking.cloud.toporing.InstantiatedRingTree): 3; ResolvedReplicaMap (com.ms.silverking.cloud.toporing.ResolvedReplicaMap): 3; ImmutableSet (com.google.common.collect.ImmutableSet): 2; ReplicaNaiveIPPrioritizer (com.ms.silverking.cloud.dht.daemon.ReplicaNaiveIPPrioritizer): 2; InvalidTransitionException (com.ms.silverking.cloud.dht.daemon.storage.convergence.InvalidTransitionException): 2; ClassVars (com.ms.silverking.cloud.dht.meta.ClassVars): 2; HostGroupTableZK (com.ms.silverking.cloud.meta.HostGroupTableZK): 2; StoragePolicyGroup (com.ms.silverking.cloud.storagepolicy.StoragePolicyGroup): 2; Topology (com.ms.silverking.cloud.topology.Topology): 2; TopologyZK (com.ms.silverking.cloud.topology.TopologyZK): 2; InvalidRingException (com.ms.silverking.cloud.toporing.InvalidRingException): 2; RingTree (com.ms.silverking.cloud.toporing.RingTree): 2