use of org.apache.bookkeeper.replication.ReplicationException.BKAuditException in project bookkeeper by apache.
the class BookieLedgerIndexer method getBookieToLedgerIndex.
/**
 * Builds a bookie-to-ledgers index by walking every ledger known to the
 * ledger manager, reading its metadata and recording the ledger against
 * each bookie that appears in any of its ensembles.
 *
 * <p>The scan itself is asynchronous; this method blocks on a latch until
 * the final callback of the scan has fired. A ledger whose metadata read
 * reports {@code NoSuchLedgerExistsException} is logged and counted as a
 * success, so ledgers deleted during the scan do not fail the index.
 *
 * @return map keyed by the bookie address string, valued by the ids of the
 *         ledgers that list the bookie in an ensemble
 * @throws BKAuditException
 *             if the waiting thread is interrupted, or if the scan finishes
 *             with a result code other than OK
 */
public Map<String, Set<Long>> getBookieToLedgerIndex() throws BKAuditException {
    final ConcurrentHashMap<String, Set<Long>> index = new ConcurrentHashMap<String, Set<Long>>();
    // Released by the final callback once every ledger has been processed.
    final CountDownLatch scanFinished = new CountDownLatch(1);
    // Holds the single result code reported by the final callback.
    final List<Integer> scanResult = new ArrayList<Integer>(1);

    Processor<Long> perLedger = new Processor<Long>() {
        @Override
        public void process(final Long ledgerId, final AsyncCallback.VoidCallback iterCallback) {
            ledgerManager.readLedgerMetadata(ledgerId, new GenericCallback<LedgerMetadata>() {
                @Override
                public void operationComplete(int rc, LedgerMetadata ledgerMetadata) {
                    int outcome = rc;
                    if (rc == BKException.Code.NoSuchLedgerExistsException) {
                        // A ledger that vanished mid-scan is not an error for indexing.
                        LOG.info("Ignoring replication of already deleted ledger {}", ledgerId);
                        outcome = BKException.Code.OK;
                    } else if (rc != BKException.Code.OK) {
                        LOG.warn("Unable to read the ledger:" + ledgerId + " information");
                    } else {
                        // Register the ledger under every bookie of every ensemble.
                        for (List<BookieSocketAddress> members : ledgerMetadata.getEnsembles().values()) {
                            for (BookieSocketAddress member : members) {
                                putLedger(index, member.toString(), ledgerId);
                            }
                        }
                    }
                    iterCallback.processResult(outcome, null, null);
                }
            });
        }
    };

    AsyncCallback.VoidCallback onScanDone = new AsyncCallback.VoidCallback() {
        @Override
        public void processResult(int rc, String s, Object obj) {
            scanResult.add(rc);
            scanFinished.countDown();
        }
    };
    // NOTE(review): the trailing two codes are presumably the success and failure
    // result codes used by the iteration - confirm against asyncProcessLedgers.
    ledgerManager.asyncProcessLedgers(perLedger, onScanDone, null, BKException.Code.OK,
            BKException.Code.ReadException);

    try {
        scanFinished.await();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new BKAuditException("Exception while getting the bookie-ledgers", e);
    }

    boolean succeeded = scanResult.contains(BKException.Code.OK);
    if (!succeeded) {
        throw new BKAuditException("Exception while getting the bookie-ledgers",
                BKException.create(scanResult.get(0)));
    }
    return index;
}
use of org.apache.bookkeeper.replication.ReplicationException.BKAuditException in project bookkeeper by apache.
the class Auditor method checkAllLedgers.
/**
 * List all the ledgers and check them individually. This should not
 * be run very often.
 *
 * <p>A dedicated ZooKeeper session, BookKeeper client and admin handle are
 * created for the duration of the check. Each one is released in its own
 * {@code finally} block, in the order admin, client, ZooKeeper, so that a
 * failure while constructing or closing one of them can no longer leak the
 * resources that were created before it.
 *
 * @throws BKAuditException
 *             if the thread is interrupted while waiting for the check to finish
 * @throws BKException
 *             if the ledger iteration completes with a result code other than OK
 * @throws IOException
 *             declared for the ZooKeeper/BookKeeper client setup
 * @throws InterruptedException
 *             declared for the client setup and for closing the resources
 * @throws KeeperException
 *             declared for the ZooKeeper/BookKeeper client setup
 */
void checkAllLedgers() throws BKAuditException, BKException, IOException, InterruptedException, KeeperException {
    final ZooKeeper newzk = ZooKeeperClient.newBuilder()
            .connectString(conf.getZkServers())
            .sessionTimeoutMs(conf.getZkTimeout())
            .build();
    try {
        final BookKeeper client = new BookKeeper(new ClientConfiguration(conf), newzk);
        try {
            final BookKeeperAdmin admin = new BookKeeperAdmin(client, statsLogger);
            try {
                runAllLedgersCheck(client, admin);
            } finally {
                admin.close();
            }
        } finally {
            client.close();
        }
    } finally {
        newzk.close();
    }
}

/**
 * Iterates over every ledger through the ledger manager, checks each one with a
 * {@link LedgerChecker} and waits for the iteration to complete.
 *
 * <p>If ledger replication is disabled, or the under-replication manager is
 * unavailable, the per-ledger processor releases the wait without invoking
 * the iteration callback, which aborts the periodic check early.
 */
private void runAllLedgersCheck(final BookKeeper client, final BookKeeperAdmin admin)
        throws BKAuditException, BKException, IOException, InterruptedException, KeeperException {
    final LedgerChecker checker = new LedgerChecker(client);
    final AtomicInteger returnCode = new AtomicInteger(BKException.Code.OK);
    final CountDownLatch processDone = new CountDownLatch(1);
    Processor<Long> checkLedgersProcessor = new Processor<Long>() {
        @Override
        public void process(final Long ledgerId, final AsyncCallback.VoidCallback callback) {
            try {
                if (!ledgerUnderreplicationManager.isLedgerReplicationEnabled()) {
                    LOG.info("Ledger rereplication has been disabled, aborting periodic check");
                    processDone.countDown();
                    return;
                }
            } catch (ReplicationException.UnavailableException ue) {
                LOG.error("Underreplication manager unavailable running periodic check", ue);
                processDone.countDown();
                return;
            }
            LedgerHandle lh = null;
            try {
                lh = admin.openLedgerNoRecovery(ledgerId);
                checker.checkLedger(lh, new ProcessLostFragmentsCb(lh, callback),
                        conf.getAuditorLedgerVerificationPercentage());
                // we collect the following stats to get a measure of the
                // distribution of a single ledger within the bk cluster
                // the higher the number of fragments/bookies, the more distributed it is
                numFragmentsPerLedger.registerSuccessfulValue(lh.getNumFragments());
                numBookiesPerLedger.registerSuccessfulValue(lh.getNumBookies());
                numLedgersChecked.inc();
            } catch (BKException.BKNoSuchLedgerExistsException bknsle) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Ledger was deleted before we could check it", bknsle);
                }
                // A ledger deleted in the meantime is not a failure of the check.
                callback.processResult(BKException.Code.OK, null, null);
                return;
            } catch (BKException bke) {
                LOG.error("Couldn't open ledger " + ledgerId, bke);
                callback.processResult(BKException.Code.BookieHandleNotAvailableException, null, null);
                return;
            } catch (InterruptedException ie) {
                LOG.error("Interrupted opening ledger", ie);
                Thread.currentThread().interrupt();
                callback.processResult(BKException.Code.InterruptedException, null, null);
                return;
            } finally {
                // NOTE(review): the handle is closed as soon as checkLedger returns, while
                // the check reports through a callback - confirm closing here is safe.
                if (lh != null) {
                    try {
                        lh.close();
                    } catch (BKException bke) {
                        LOG.warn("Couldn't close ledger " + ledgerId, bke);
                    } catch (InterruptedException ie) {
                        LOG.warn("Interrupted closing ledger " + ledgerId, ie);
                        Thread.currentThread().interrupt();
                    }
                }
            }
        }
    };
    ledgerManager.asyncProcessLedgers(checkLedgersProcessor, new AsyncCallback.VoidCallback() {
        @Override
        public void processResult(int rc, String s, Object obj) {
            returnCode.set(rc);
            processDone.countDown();
        }
    }, null, BKException.Code.OK, BKException.Code.ReadException);
    try {
        processDone.await();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new BKAuditException("Exception while checking ledgers", e);
    }
    if (returnCode.get() != BKException.Code.OK) {
        throw BKException.create(returnCode.get());
    }
}
use of org.apache.bookkeeper.replication.ReplicationException.BKAuditException in project bookkeeper by apache.
the class Auditor method publishSuspectedLedgers.
/**
 * Marks every given ledger as under-replicated on behalf of the failed bookie.
 *
 * @param bookieIP
 *            identifier of the failed bookie, passed through to the
 *            under-replication manager and used in log messages
 * @param ledgers
 *            ids of the ledgers attributed to that bookie; {@code null} or
 *            empty means there is nothing to publish
 * @throws BKAuditException
 *             if the under-replication manager is unavailable while marking
 *             a ledger; publishing stops at the first such failure
 */
private void publishSuspectedLedgers(String bookieIP, Set<Long> ledgers) throws BKAuditException {
    final boolean nothingToPublish = (ledgers == null) || ledgers.isEmpty();
    if (nothingToPublish) {
        // The failed bookie held no ledgers, so its failure can be ignored.
        LOG.info("There is no ledgers for the failed bookie: {}", bookieIP);
        return;
    }

    LOG.info("Following ledgers: {} of bookie: {} are identified as underreplicated", ledgers, bookieIP);
    numUnderReplicatedLedger.registerSuccessfulValue(ledgers.size());

    for (Long suspect : ledgers) {
        try {
            ledgerUnderreplicationManager.markLedgerUnderreplicated(suspect, bookieIP);
        } catch (UnavailableException unavailable) {
            final String reason =
                    "Failed to publish underreplicated ledger: " + suspect + " of bookie: " + bookieIP;
            throw new BKAuditException(reason, unavailable);
        }
    }
}
use of org.apache.bookkeeper.replication.ReplicationException.BKAuditException in project bookkeeper by apache.
the class Auditor method submitLostBookieRecoveryDelayChangedEvent.
/**
 * Submits a task to the auditor executor that reacts to a change of the
 * lostBookieRecoveryDelay setting.
 *
 * <p>The task waits until ledger replication is enabled, reads the current
 * delay, cancels any pending audit task, and then either runs the audit
 * immediately (delay is 0 or equal to the value seen before this change) or,
 * if an audit task was pending, reschedules it with the new delay.
 *
 * @return the future of the submitted task, or an already-failed future
 *         carrying a {@code BKAuditException} if the executor is shut down
 */
synchronized Future<?> submitLostBookieRecoveryDelayChangedEvent() {
    if (executor.isShutdown()) {
        SettableFuture<Void> rejected = SettableFuture.<Void>create();
        rejected.setException(new BKAuditException("Auditor shutting down"));
        return rejected;
    }
    return executor.submit(new Runnable() {
        @Override
        public void run() {
            // -1 means the new delay could not be read during this run.
            int newDelay = -1;
            // Shared by the immediate and the delayed path.
            final Runnable runAuditAndReset = () -> {
                startAudit(false);
                auditTask = null;
                bookiesToBeAudited.clear();
            };
            try {
                waitIfLedgerReplicationDisabled();
                newDelay = Auditor.this.ledgerUnderreplicationManager.getLostBookieRecoveryDelay();

                // Cancel any pending audit task so that it can be rescheduled
                // according to the new lostBookieRecoveryDelay.
                if (auditTask != null) {
                    LOG.info("lostBookieRecoveryDelay period has been changed so canceling the pending AuditTask");
                    auditTask.cancel(false);
                    numDelayedBookieAuditsCancelled.inc();
                }

                // A delay of 0, or a delay reset to its previous value, is treated
                // as a signal to trigger the audit immediately.
                final boolean auditNow =
                        (newDelay == 0) || (newDelay == lostBookieRecoveryDelayBeforeChange);
                if (auditNow) {
                    LOG.info("lostBookieRecoveryDelay has been set to 0 or reset to its previous value, " + "so starting AuditTask. Current lostBookieRecoveryDelay: {}, " + "previous lostBookieRecoveryDelay: {}", newDelay, lostBookieRecoveryDelayBeforeChange);
                    runAuditAndReset.run();
                } else if (auditTask != null) {
                    LOG.info("lostBookieRecoveryDelay has been set to {}, so rescheduling AuditTask accordingly", newDelay);
                    auditTask = executor.schedule(runAuditAndReset, newDelay, TimeUnit.SECONDS);
                    numBookieAuditsDelayed.inc();
                }
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
                LOG.error("Interrupted while for LedgersReplication to be enabled ", ie);
            } catch (UnavailableException ue) {
                LOG.error("Exception while reading from ZK", ue);
            } finally {
                // Remember the delay that was read so the next change can be compared with it.
                if (newDelay != -1) {
                    lostBookieRecoveryDelayBeforeChange = newDelay;
                }
            }
        }
    });
}
use of org.apache.bookkeeper.replication.ReplicationException.BKAuditException in project bookkeeper by apache.
the class BookKeeperAdmin method decommissionBookie.
/**
 * Triggers an audit and then blocks until no ledger depends on the given
 * decommissioning bookie any more.
 *
 * <p>The steps are: refuse to proceed if the bookie is still listed as
 * available or read-only; trigger the audit; sleep 30 seconds to give the
 * auditor and the replication process time to start; wait until the ledgers
 * indexed under this bookie have been replicated elsewhere; finally, as a
 * double check, wait for any ledger that is still listed as under-replicated
 * because of this bookie.
 *
 * @param bookieAddress
 *            address of the decommissioning bookie
 * @throws CompatibilityException
 * @throws UnavailableException
 * @throws KeeperException
 * @throws InterruptedException
 * @throws IOException
 * @throws BKAuditException
 * @throws TimeoutException
 * @throws BKException
 *             created from {@code Code.IllegalOpException} when the bookie
 *             has not been shut down yet
 */
public void decommissionBookie(BookieSocketAddress bookieAddress) throws CompatibilityException, UnavailableException, KeeperException, InterruptedException, IOException, BKAuditException, TimeoutException, BKException {
    final boolean stillRegistered =
            getAvailableBookies().contains(bookieAddress) || getReadOnlyBookies().contains(bookieAddress);
    if (stillRegistered) {
        LOG.error("Bookie: {} is not shutdown yet", bookieAddress);
        throw BKException.create(BKException.Code.IllegalOpException);
    }

    triggerAudit();

    // Give the Auditor a chance to run its forced audit task and let the
    // under-replication machinery start re-replicating before we look.
    final long auditGracePeriodMs = 30 * 1000;
    Thread.sleep(auditGracePeriodMs);

    // Look up the ledgers stored on this bookie through the bookie-to-ledger index.
    final BookieLedgerIndexer indexer = new BookieLedgerIndexer(bkc.ledgerManager);
    final Map<String, Set<Long>> ledgersByBookie = indexer.getBookieToLedgerIndex();
    final Set<Long> hostedLedgers = ledgersByBookie.get(bookieAddress.toString());
    if (hostedLedgers != null && !hostedLedgers.isEmpty()) {
        // Wait until all of these ledgers have been replicated to other bookies,
        // i.e. their metadata no longer lists this bookie in an ensemble.
        waitForLedgersToBeReplicated(hostedLedgers, bookieAddress, bkc.ledgerManager);
    }

    // Double check: is any ledger still listed as under-replicated because of this bookie?
    Predicate<List<String>> missingOnThisBookie =
            replicas -> replicas.contains(bookieAddress.toString());
    Iterator<Long> pending = underreplicationManager.listLedgersToRereplicate(missingOnThisBookie);
    if (!pending.hasNext()) {
        return;
    }

    // Some remain, so wait until those ledgers are properly re-replicated as well.
    LOG.info("Still in some underreplicated ledgers metadata, this bookie is part of its ensemble. " + "Have to make sure that those ledger fragments are rereplicated");
    List<Long> stillUnderreplicated = new ArrayList<>();
    while (pending.hasNext()) {
        stillUnderreplicated.add(pending.next());
    }
    waitForLedgersToBeReplicated(stillUnderreplicated, bookieAddress, bkc.ledgerManager);
}
Aggregations