
Example 1 with ReplicationQueueInfo

Use of org.apache.hadoop.hbase.replication.ReplicationQueueInfo in project hbase by apache.

In class ReplicationZKNodeCleaner, method getUnDeletedQueues:

/**
   * @return a map from each replicator to the queue ids it still holds for removed peers
   * @throws IOException if the replication queues cannot be read from ZooKeeper
   */
public Map<String, List<String>> getUnDeletedQueues() throws IOException {
    Map<String, List<String>> undeletedQueues = new HashMap<>();
    Set<String> peerIds = new HashSet<>(this.replicationPeers.getAllPeerIds());
    try {
        List<String> replicators = this.queuesClient.getListOfReplicators();
        for (String replicator : replicators) {
            List<String> queueIds = this.queuesClient.getAllQueues(replicator);
            for (String queueId : queueIds) {
                ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
                if (!peerIds.contains(queueInfo.getPeerId())) {
                    undeletedQueues.computeIfAbsent(replicator, (key) -> new ArrayList<>()).add(queueId);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Undeleted replication queue for removed peer found: " + String.format("[removedPeerId=%s, replicator=%s, queueId=%s]", queueInfo.getPeerId(), replicator, queueId));
                    }
                }
            }
        }
    } catch (KeeperException ke) {
        throw new IOException("Failed to get the replication queues of all replicators", ke);
    }
    return undeletedQueues;
}
Also used: KeeperException(org.apache.zookeeper.KeeperException) ZKUtil(org.apache.hadoop.hbase.zookeeper.ZKUtil) Abortable(org.apache.hadoop.hbase.Abortable) ReplicationQueuesClientArguments(org.apache.hadoop.hbase.replication.ReplicationQueuesClientArguments) Set(java.util.Set) ReplicationFactory(org.apache.hadoop.hbase.replication.ReplicationFactory) IOException(java.io.IOException) HashMap(java.util.HashMap) ReplicationPeers(org.apache.hadoop.hbase.replication.ReplicationPeers) ReplicationStateZKBase(org.apache.hadoop.hbase.replication.ReplicationStateZKBase) ZooKeeperWatcher(org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) ReplicationQueueInfo(org.apache.hadoop.hbase.replication.ReplicationQueueInfo) ReplicationQueuesClient(org.apache.hadoop.hbase.replication.ReplicationQueuesClient) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Entry(java.util.Map.Entry) Log(org.apache.commons.logging.Log) LogFactory(org.apache.commons.logging.LogFactory) InterfaceAudience(org.apache.hadoop.hbase.classification.InterfaceAudience)
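All of the examples on this page pivot on the same parsing step: a replication queue id begins with the id of the peer it feeds, so a queue whose parsed peer id is not among the currently registered peers belongs to a removed peer. A minimal sketch of that check in isolation; the peer ids and queue id below are hypothetical:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;

public class RemovedPeerCheckSketch {
    public static void main(String[] args) {
        // Peers currently registered with the cluster (hypothetical ids).
        Set<String> peerIds = new HashSet<>(Arrays.asList("1", "3"));
        // A non-recovered queue id is simply the peer id.
        ReplicationQueueInfo queueInfo = new ReplicationQueueInfo("2");
        if (!peerIds.contains(queueInfo.getPeerId())) {
            System.out.println("Queue belongs to removed peer " + queueInfo.getPeerId());
        }
    }
}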

Example 2 with ReplicationQueueInfo

Use of org.apache.hadoop.hbase.replication.ReplicationQueueInfo in project hbase by apache.

In class DumpReplicationQueues, method dumpQueues:

public String dumpQueues(ZKWatcher zkw, Set<String> peerIds, boolean hdfs) throws Exception {
    ReplicationQueueStorage queueStorage;
    StringBuilder sb = new StringBuilder();
    queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf());
    Set<ServerName> liveRegionServers = ZKUtil.listChildrenNoWatch(zkw, zkw.getZNodePaths().rsZNode).stream().map(ServerName::parseServerName).collect(Collectors.toSet());
    // Loop over every replicator and dump each of its queues
    List<ServerName> regionservers = queueStorage.getListOfReplicators();
    if (regionservers == null || regionservers.isEmpty()) {
        return sb.toString();
    }
    for (ServerName regionserver : regionservers) {
        List<String> queueIds = queueStorage.getAllQueues(regionserver);
        if (!liveRegionServers.contains(regionserver)) {
            deadRegionServers.add(regionserver.getServerName());
        }
        for (String queueId : queueIds) {
            ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
            List<String> wals = queueStorage.getWALsInQueue(regionserver, queueId);
            Collections.sort(wals);
            if (!peerIds.contains(queueInfo.getPeerId())) {
                deletedQueues.add(regionserver + "/" + queueId);
                sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, true, hdfs));
            } else {
                sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, false, hdfs));
            }
        }
    }
    return sb.toString();
}
Also used: ReplicationQueueInfo(org.apache.hadoop.hbase.replication.ReplicationQueueInfo) ServerName(org.apache.hadoop.hbase.ServerName) ReplicationQueueStorage(org.apache.hadoop.hbase.replication.ReplicationQueueStorage)
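dumpQueues flags a replicator as dead when it still owns queues but no longer has a live znode under rsZNode. The same set arithmetic in isolation, detached from ZooKeeper; the server names are hypothetical and this is a sketch, not part of the tool's API:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DeadReplicatorSketch {
    /** Replicators that still hold queues but are not in the live set are dead. */
    static Set<String> findDeadReplicators(List<String> replicatorsWithQueues,
            Set<String> liveRegionServers) {
        Set<String> dead = new HashSet<>(replicatorsWithQueues);
        dead.removeAll(liveRegionServers);
        return dead;
    }

    public static void main(String[] args) {
        Set<String> live = new HashSet<>(Arrays.asList("rs1.example.com,16020,111"));
        List<String> withQueues = Arrays.asList(
            "rs1.example.com,16020,111",
            // rs2 crashed, but its queues are still in ZooKeeper:
            "rs2.example.com,16020,222");
        System.out.println(findDeadReplicators(withQueues, live));
        // prints [rs2.example.com,16020,222]
    }
}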

Example 3 with ReplicationQueueInfo

Use of org.apache.hadoop.hbase.replication.ReplicationQueueInfo in project hbase by apache.

In class ReplicationSourceManager, method claimQueue:

void claimQueue(ServerName deadRS, String queue) {
    // Wait a bit before transferring the queue; this sleep may not be enough in some cases.
    try {
        Thread.sleep(sleepBeforeFailover + (long) (ThreadLocalRandom.current().nextFloat() * sleepBeforeFailover));
    } catch (InterruptedException e) {
        LOG.warn("Interrupted while waiting before transferring a queue.");
        Thread.currentThread().interrupt();
    }
    // We are about to claim the dead RS's queue; bail out first if we are shutting down
    if (server.isStopped()) {
        LOG.info("Not transferring queue since we are shutting down");
        return;
    }
    // After claiming the queues from a dead region server, we will skip starting the
    // RecoveredReplicationSource if the peer has been removed. But it is possible that a
    // peer with peerId = 2 is removed and a peer with peerId = 2 is added again during the
    // failover, so we need to get a copy of the replication peer first to decide whether we
    // should start the RecoveredReplicationSource. If the latest peer is not the old peer,
    // we should also skip starting the RecoveredReplicationSource; otherwise the rs will
    // abort (see HBASE-20475).
    String peerId = new ReplicationQueueInfo(queue).getPeerId();
    ReplicationPeerImpl oldPeer = replicationPeers.getPeer(peerId);
    if (oldPeer == null) {
        LOG.info("Not transferring queue since the replication peer {} for queue {} does not exist", peerId, queue);
        return;
    }
    Pair<String, SortedSet<String>> claimedQueue;
    try {
        claimedQueue = queueStorage.claimQueue(deadRS, queue, server.getServerName());
    } catch (ReplicationException e) {
        LOG.error("ReplicationException: cannot claim dead region ({})'s " + "replication queue. Znode : ({})" + " Possible solution: check if znode size exceeds jute.maxBuffer value. " + " If so, increase it for both client and server side.", deadRS, queueStorage.getRsNode(deadRS), e);
        server.abort("Failed to claim queue from dead regionserver.", e);
        return;
    }
    if (claimedQueue.getSecond().isEmpty()) {
        return;
    }
    String queueId = claimedQueue.getFirst();
    Set<String> walsSet = claimedQueue.getSecond();
    ReplicationPeerImpl peer = replicationPeers.getPeer(peerId);
    if (peer == null || peer != oldPeer) {
        LOG.warn("Skipping failover for peer {} of node {}, peer is null", peerId, deadRS);
        abortWhenFail(() -> queueStorage.removeQueue(server.getServerName(), queueId));
        return;
    }
    if (server instanceof ReplicationSyncUp.DummyServer && peer.getPeerState().equals(PeerState.DISABLED)) {
        LOG.warn("Peer {} is disabled. ReplicationSyncUp tool will skip " + "replicating data to this peer.", peerId);
        return;
    }
    ReplicationSourceInterface src;
    try {
        src = createSource(queueId, peer);
    } catch (IOException e) {
        LOG.error("Can not create replication source for peer {} and queue {}", peerId, queueId, e);
        server.abort("Failed to create replication source after claiming queue.", e);
        return;
    }
    // synchronized on oldsources to avoid adding recovered source for the to-be-removed peer
    synchronized (oldsources) {
        peer = replicationPeers.getPeer(src.getPeerId());
        if (peer == null || peer != oldPeer) {
            src.terminate("Recovered queue doesn't belong to any current peer");
            deleteQueue(queueId);
            return;
        }
        // Do not start a recovered source for a sync replication peer that is in STANDBY
        // state (and not transiting out of it) or is transiting to STANDBY, since its edits
        // would just be replicated back.
        if (peer.getPeerConfig().isSyncReplication()) {
            Pair<SyncReplicationState, SyncReplicationState> stateAndNewState = peer.getSyncReplicationStateAndNewState();
            if ((stateAndNewState.getFirst().equals(SyncReplicationState.STANDBY) && stateAndNewState.getSecond().equals(SyncReplicationState.NONE)) || stateAndNewState.getSecond().equals(SyncReplicationState.STANDBY)) {
                src.terminate("Sync replication peer is in STANDBY state");
                deleteQueue(queueId);
                return;
            }
        }
        // track sources in walsByIdRecoveredQueues
        Map<String, NavigableSet<String>> walsByGroup = new HashMap<>();
        walsByIdRecoveredQueues.put(queueId, walsByGroup);
        for (String wal : walsSet) {
            String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal);
            NavigableSet<String> wals = walsByGroup.get(walPrefix);
            if (wals == null) {
                wals = new TreeSet<>();
                walsByGroup.put(walPrefix, wals);
            }
            wals.add(wal);
        }
        oldsources.add(src);
        LOG.info("Added source for recovered queue {}", src.getQueueId());
        for (String wal : walsSet) {
            LOG.trace("Enqueueing log from recovered queue for source: " + src.getQueueId());
            src.enqueueLog(new Path(oldLogDir, wal));
        }
        src.startup();
    }
}
Also used: Path(org.apache.hadoop.fs.Path) NavigableSet(java.util.NavigableSet) ReplicationQueueInfo(org.apache.hadoop.hbase.replication.ReplicationQueueInfo) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ReplicationPeerImpl(org.apache.hadoop.hbase.replication.ReplicationPeerImpl) SyncReplicationState(org.apache.hadoop.hbase.replication.SyncReplicationState) IOException(java.io.IOException) SortedSet(java.util.SortedSet) ReplicationException(org.apache.hadoop.hbase.replication.ReplicationException)
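claimQueue relies on the fact that claiming appends the dead server's name to the queue id, so the peer id can still be parsed out of the recovered id. A short sketch of that round trip; the server name is hypothetical, and the return type of getDeadRegionServers() is assumed to be List<ServerName> as on recent branches (older branches used List<String>):

import java.util.List;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;

public class RecoveredQueueIdSketch {
    public static void main(String[] args) {
        // After a claim, the queue id carries the dead server's name as a suffix.
        String claimedQueueId = "2-rs1.example.com,16020,1546300800000";
        ReplicationQueueInfo info = new ReplicationQueueInfo(claimedQueueId);
        System.out.println(info.getPeerId());         // prints 2
        System.out.println(info.isQueueRecovered());  // prints true
        // The servers this queue was recovered from:
        List<ServerName> dead = info.getDeadRegionServers();
        System.out.println(dead);
    }
}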

Example 4 with ReplicationQueueInfo

Use of org.apache.hadoop.hbase.replication.ReplicationQueueInfo in project hbase by apache.

In class ReplicationChecker, method getUnDeletedQueues:

private Map<ServerName, List<String>> getUnDeletedQueues() throws ReplicationException {
    Map<ServerName, List<String>> undeletedQueues = new HashMap<>();
    Set<String> peerIds = new HashSet<>(peerStorage.listPeerIds());
    for (ServerName replicator : queueStorage.getListOfReplicators()) {
        for (String queueId : queueStorage.getAllQueues(replicator)) {
            ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
            if (!peerIds.contains(queueInfo.getPeerId())) {
                undeletedQueues.computeIfAbsent(replicator, key -> new ArrayList<>()).add(queueId);
                LOG.debug("Undeleted replication queue for removed peer found: " + "[removedPeerId={}, replicator={}, queueId={}]", queueInfo.getPeerId(), replicator, queueId);
            }
        }
    }
    return undeletedQueues;
}
Also used: HbckErrorReporter(org.apache.hadoop.hbase.util.HbckErrorReporter) Logger(org.slf4j.Logger) ReplicationStorageFactory(org.apache.hadoop.hbase.replication.ReplicationStorageFactory) LoggerFactory(org.slf4j.LoggerFactory) Set(java.util.Set) HashMap(java.util.HashMap) ZKWatcher(org.apache.hadoop.hbase.zookeeper.ZKWatcher) ReplicationQueueStorage(org.apache.hadoop.hbase.replication.ReplicationQueueStorage) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) ReplicationQueueInfo(org.apache.hadoop.hbase.replication.ReplicationQueueInfo) InterfaceAudience(org.apache.yetus.audience.InterfaceAudience) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) ReplicationPeerStorage(org.apache.hadoop.hbase.replication.ReplicationPeerStorage) ServerName(org.apache.hadoop.hbase.ServerName) ReplicationException(org.apache.hadoop.hbase.replication.ReplicationException)

Example 5 with ReplicationQueueInfo

Use of org.apache.hadoop.hbase.replication.ReplicationQueueInfo in project hbase by apache.

In class ReplicationChecker, method checkUnDeletedQueues:

public void checkUnDeletedQueues() throws ReplicationException {
    undeletedQueueIds = getUnDeletedQueues();
    undeletedQueueIds.forEach((replicator, queueIds) -> {
        queueIds.forEach(queueId -> {
            ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
            String msg = "Undeleted replication queue for removed peer found: " + String.format("[removedPeerId=%s, replicator=%s, queueId=%s]", queueInfo.getPeerId(), replicator, queueId);
            errorReporter.reportError(HbckErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE, msg);
        });
    });
    undeletedHFileRefsPeerIds = getUndeletedHFileRefsPeers();
    undeletedHFileRefsPeerIds.stream().map(peerId -> "Undeleted replication hfile-refs queue for removed peer " + peerId + " found").forEach(msg -> errorReporter.reportError(HbckErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE, msg));
}
Also used: HbckErrorReporter(org.apache.hadoop.hbase.util.HbckErrorReporter) Logger(org.slf4j.Logger) ReplicationStorageFactory(org.apache.hadoop.hbase.replication.ReplicationStorageFactory) LoggerFactory(org.slf4j.LoggerFactory) Set(java.util.Set) HashMap(java.util.HashMap) ZKWatcher(org.apache.hadoop.hbase.zookeeper.ZKWatcher) ReplicationQueueStorage(org.apache.hadoop.hbase.replication.ReplicationQueueStorage) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) ReplicationQueueInfo(org.apache.hadoop.hbase.replication.ReplicationQueueInfo) InterfaceAudience(org.apache.yetus.audience.InterfaceAudience) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) ReplicationPeerStorage(org.apache.hadoop.hbase.replication.ReplicationPeerStorage) ServerName(org.apache.hadoop.hbase.ServerName) ReplicationException(org.apache.hadoop.hbase.replication.ReplicationException)
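The two ReplicationChecker methods above are meant to be driven check-first, fix-second from an hbck-style repair flow. A sketch of that flow under stated assumptions: the (Configuration, ZKWatcher, HbckErrorReporter) constructor and the hasUnDeletedQueues()/fixUnDeletedQueues() methods match recent 2.x branches, but treat the exact signatures as assumptions since they have shifted between versions:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.util.HbckErrorReporter;
import org.apache.hadoop.hbase.util.hbck.ReplicationChecker;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;

public class ReplicationRepairSketch {
    /** Flag queues left behind by removed peers, then delete what was flagged. */
    static void checkAndFixReplication(Configuration conf, HbckErrorReporter reporter)
            throws Exception {
        try (ZKWatcher zkw = new ZKWatcher(conf, "replication-check", null)) {
            // Constructor arguments are an assumption; they differ across versions.
            ReplicationChecker checker = new ReplicationChecker(conf, zkw, reporter);
            checker.checkUnDeletedQueues(); // reports UNDELETED_REPLICATION_QUEUE errors
            if (checker.hasUnDeletedQueues()) {
                checker.fixUnDeletedQueues(); // removes the leftover queues flagged above
            }
        }
    }
}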

Aggregations

ReplicationQueueInfo (org.apache.hadoop.hbase.replication.ReplicationQueueInfo): 11 usages
ServerName (org.apache.hadoop.hbase.ServerName): 5 usages
ReplicationQueueStorage (org.apache.hadoop.hbase.replication.ReplicationQueueStorage): 5 usages
HashMap (java.util.HashMap): 4 usages
ReplicationException (org.apache.hadoop.hbase.replication.ReplicationException): 4 usages
IOException (java.io.IOException): 3 usages
ArrayList (java.util.ArrayList): 3 usages
HashSet (java.util.HashSet): 3 usages
List (java.util.List): 3 usages
Map (java.util.Map): 3 usages
Set (java.util.Set): 3 usages
Configuration (org.apache.hadoop.conf.Configuration): 3 usages
ReplicationPeerStorage (org.apache.hadoop.hbase.replication.ReplicationPeerStorage): 2 usages
ReplicationPeers (org.apache.hadoop.hbase.replication.ReplicationPeers): 2 usages
ReplicationQueuesClient (org.apache.hadoop.hbase.replication.ReplicationQueuesClient): 2 usages
ReplicationQueuesClientArguments (org.apache.hadoop.hbase.replication.ReplicationQueuesClientArguments): 2 usages
ReplicationStorageFactory (org.apache.hadoop.hbase.replication.ReplicationStorageFactory): 2 usages
HbckErrorReporter (org.apache.hadoop.hbase.util.HbckErrorReporter): 2 usages
ZKWatcher (org.apache.hadoop.hbase.zookeeper.ZKWatcher): 2 usages
InterfaceAudience (org.apache.yetus.audience.InterfaceAudience): 2 usages