use of org.apache.zookeeper_voltpatches.KeeperException.NodeExistsException in project voltdb by VoltDB.
the class InvocationDispatcher method takeShutdownSaveSnapshot.
/**
 * Handles a shutdown request that asks for a snapshot to be saved before shutting down.
 *
 * <p>On non-Pro editions the parameters are stripped and the invocation is dispatched
 * directly. Otherwise the first parameter (a {@code Long} or a numeric {@code String})
 * is compared against the mzxid of the cluster operation mode ZooKeeper node, a
 * shutdown save guard node is created, and a snapshot is requested. When the snapshot
 * succeeds the parameterless invocation is dispatched through an alternate adapter;
 * when it fails a graceful failure is sent to the original client and nothing further
 * is dispatched.
 *
 * @param task    the invocation; its first parameter carries the expected ZooKeeper transaction id
 * @param handler the handler of the originating connection (consulted for admin status)
 * @param ccxn    the originating connection, used to transmit snapshot failure responses
 * @param user    the authenticated user, passed through to {@code dispatch}
 * @param bypass  the override check, passed through to {@code dispatch}
 * @return a failure response when the request is rejected, the result of {@code dispatch}
 *         on non-Pro editions, or {@code null} once the snapshot has been requested
 *         (the outcome is then reported from the snapshot response callback)
 */
private final ClientResponseImpl takeShutdownSaveSnapshot(final StoredProcedureInvocation task, final InvocationClientHandler handler, final Connection ccxn, final AuthUser user, OverrideCheck bypass) {
    // shutdown save snapshot is available for Pro edition only
    if (!MiscUtils.isPro()) {
        task.setParams();
        return dispatch(task, handler, ccxn, user, bypass, false);
    }

    // The first parameter may arrive either as a Long or as its decimal string form.
    Object p0 = task.getParams().getParam(0);
    final long zkTxnId;
    if (p0 instanceof Long) {
        zkTxnId = ((Long) p0).longValue();
    } else if (p0 instanceof String) {
        try {
            zkTxnId = Long.parseLong((String) p0);
        } catch (NumberFormatException e) {
            return gracefulFailureResponse("Incorrect argument type", task.clientHandle);
        }
    } else {
        return gracefulFailureResponse("Incorrect argument type", task.clientHandle);
    }

    VoltDBInterface voltdb = VoltDB.instance();
    if (!voltdb.isPreparingShuttingdown()) {
        log.warn("Ignoring shutdown save snapshot request as VoltDB is not shutting down");
        return unexpectedFailureResponse("Ignoring shutdown save snapshot request as VoltDB is not shutting down", task.clientHandle);
    }

    final ZooKeeper zk = voltdb.getHostMessenger().getZK();
    // network threads are blocked from making zookeeper calls
    Future<Long> fut = voltdb.getSES(true).submit(new Callable<Long>() {
        @Override
        public Long call() {
            try {
                Stat stat = zk.exists(VoltZK.operationMode, false);
                if (stat == null) {
                    VoltDB.crashLocalVoltDB("cluster operation mode zookeeper node does not exist");
                    return Long.MIN_VALUE;
                }
                return stat.getMzxid();
            } catch (KeeperException | InterruptedException e) {
                VoltDB.crashLocalVoltDB("Failed to stat the cluster operation zookeeper node", true, e);
                return Long.MIN_VALUE;
            }
        }
    });
    try {
        // The request is only honored if the operation mode node has not been
        // modified since the transaction id supplied by the caller.
        if (fut.get().longValue() != zkTxnId) {
            return unexpectedFailureResponse("Internal error: cannot write a startup snapshot because the " + "current system state is not consistent with an orderly shutdown. " + "Please try \"voltadmin shutdown --save\" again.", task.clientHandle);
        }
    } catch (InterruptedException | ExecutionException e1) {
        VoltDB.crashLocalVoltDB("Failed to stat the cluster operation zookeeper node", true, e1);
        return null;
    }

    NodeSettings paths = m_catalogContext.get().getNodeSettings();
    String data;
    try {
        data = new JSONStringer().object().keySymbolValuePair(SnapshotUtil.JSON_TERMINUS, zkTxnId).endObject().toString();
    } catch (JSONException e) {
        VoltDB.crashLocalVoltDB("Failed to create startup snapshot save command", true, e);
        return null;
    }
    log.info("Saving startup snapshot");
    consoleLog.info("Taking snapshot to save database contents");

    // The follow-up dispatch is routed through an alternate adapter and handler
    // identified by SHUTDONW_SAVE_CID instead of the originating connection.
    final SimpleClientResponseAdapter alternateAdapter = new SimpleClientResponseAdapter(ClientInterface.SHUTDONW_SAVE_CID, "Blocking Startup Snapshot Save");
    final InvocationClientHandler alternateHandler = new InvocationClientHandler() {
        @Override
        public boolean isAdmin() {
            return handler.isAdmin();
        }

        @Override
        public long connectionId() {
            return ClientInterface.SHUTDONW_SAVE_CID;
        }
    };

    // Remember the client's own handle before it is replaced on the task; every
    // response sent back to the original client must carry this handle.
    final long sourceHandle = task.clientHandle;
    task.setClientHandle(alternateAdapter.registerCallback(SimpleClientResponseAdapter.NULL_CALLBACK));

    SnapshotUtil.SnapshotResponseHandler savCallback = new SnapshotUtil.SnapshotResponseHandler() {
        @Override
        public void handleResponse(ClientResponse r) {
            if (r == null) {
                String msg = "Snapshot save failed. The database is paused and the shutdown has been cancelled";
                transmitResponseMessage(gracefulFailureResponse(msg, sourceHandle), ccxn, sourceHandle);
                // Nothing more to do: without a response there is no snapshot, and
                // continuing would dereference null below.
                return;
            }
            if (r.getStatus() != ClientResponse.SUCCESS) {
                String msg = "Snapshot save failed: " + r.getStatusString() + ". The database is paused and the shutdown has been cancelled";
                ClientResponseImpl resp = new ClientResponseImpl(ClientResponse.GRACEFUL_FAILURE, r.getResults(), msg, sourceHandle);
                transmitResponseMessage(resp, ccxn, sourceHandle);
                // The client has been told the shutdown is cancelled, so the
                // follow-up invocation must not be dispatched.
                return;
            }
            consoleLog.info("Snapshot taken successfully");
            task.setParams();
            dispatch(task, alternateHandler, alternateAdapter, user, bypass, false);
        }
    };

    // network threads are blocked from making zookeeper calls
    final byte[] guardContent = data.getBytes(StandardCharsets.UTF_8);
    Future<Boolean> guardFuture = voltdb.getSES(true).submit(new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            try {
                ZKUtil.asyncMkdirs(zk, VoltZK.shutdown_save_guard, guardContent).get();
            } catch (NodeExistsException itIsOk) {
                // The guard already exists: report "not created" to the caller.
                return false;
            } catch (InterruptedException | KeeperException e) {
                VoltDB.crashLocalVoltDB("Failed to create shutdown save guard zookeeper node", true, e);
                return false;
            }
            return true;
        }
    });
    boolean created;
    try {
        created = guardFuture.get().booleanValue();
    } catch (InterruptedException | ExecutionException e) {
        VoltDB.crashLocalVoltDB("Failed to create shutdown save guard zookeeper node", true, e);
        return null;
    }
    if (!created) {
        // task.clientHandle now holds the alternate adapter's handle, so answer
        // the original client with the handle it actually sent.
        return unexpectedFailureResponse("Internal error: detected concurrent invocations of \"voltadmin shutdown --save\"", sourceHandle);
    }

    voltdb.getClientInterface().bindAdapter(alternateAdapter, null);
    SnapshotUtil.requestSnapshot(sourceHandle, paths.resolve(paths.getSnapshoth()).toPath().toUri().toString(), SnapshotUtil.getShutdownSaveNonce(zkTxnId), true, SnapshotFormat.NATIVE, SnapshotPathType.SNAP_AUTO, data, savCallback, true);
    return null;
}
use of org.apache.zookeeper_voltpatches.KeeperException.NodeExistsException in project voltdb by VoltDB.
the class SnapshotDaemon method scheduleSnapshotForLater.
/*
 * Schedule a user snapshot request for later since the database was busy.
 * Continue doing this as long as the error response returned by the DB is snapshot in progress.
 * Since the snapshot is being scheduled for later we will send an immediate response to the client
 * via ZK relay.
 *
 * requestObj     - the request payload handed to initiateSnapshotSave on each reattempt
 * requestId      - suffix appended to VoltZK.user_snapshot_response to form the response node path
 * isFirstAttempt - true only on the initial call; controls whether the "queued" response is written
 *
 * The reattempt is run on m_es after m_userSnapshotRetryInterval seconds.
 */
private void scheduleSnapshotForLater(final String requestObj, final String requestId, final boolean isFirstAttempt) throws Exception {
    /*
     * Only need to send the queue response the first time we attempt to schedule the snapshot
     * for later. It may be necessary to reschedule via this function multiple times.
     */
    if (isFirstAttempt) {
        SNAP_LOG.info("A user snapshot request could not be immediately fulfilled and will be reattempted later");
        /*
         * Construct a result to send to the client right now via ZK
         * saying we queued it to run later
         */
        VoltTable result = SnapshotUtil.constructNodeResultsTable();
        result.addRow(-1, CoreUtils.getHostnameOrAddress(), "", "SUCCESS", "SNAPSHOT REQUEST QUEUED");
        final ClientResponseImpl queuedResponse = new ClientResponseImpl(ClientResponseImpl.SUCCESS, new VoltTable[] { result }, "Snapshot request could not be fulfilled because a snapshot " + "is in progress. It was queued for execution", 0);
        ByteBuffer buf = ByteBuffer.allocate(queuedResponse.getSerializedSize());
        m_zk.create(VoltZK.user_snapshot_response + requestId, queuedResponse.flattenToBuffer(buf).array(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    }
    /*
     * Now queue the request for later
     */
    final Runnable r = new Runnable() {
        @Override
        public void run() {
            try {
                /*
                 * Construct a callback to handle the response to the
                 * @SnapshotSave invocation that will reattempt the user snapshot
                 */
                final long handle = m_nextCallbackHandle++;
                m_procedureCallbacks.put(handle, new ProcedureCallback() {
                    @Override
                    public void clientCallback(ClientResponse clientResponse) {
                        m_lastInitiationTs = null;
                        try {
                            /*
                             * If there is an error then we are done
                             * attempting this user snapshot. The params must be bad
                             * or things are broken.
                             */
                            if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
                                SNAP_LOG.error(clientResponse.getStatusString());
                                //Reset the watch, in case this is recoverable
                                userSnapshotRequestExistenceCheck(true);
                                return;
                            }
                            VoltTable[] results = clientResponse.getResults();
                            //Do this check to avoid an NPE
                            if (results == null || results.length == 0 || results[0].getRowCount() < 1) {
                                SNAP_LOG.error("Queued user snapshot request reattempt received an unexpected response" + " and will not be reattempted. The client response is (status: " + clientResponse.getStatus() + " " + clientResponse.getStatusString() + " result: " + (results != null && results.length > 0 ? results[0] : "null") + ")");
                                /*
                                 * Don't think this should happen, reset the watch to allow later requests
                                 */
                                userSnapshotRequestExistenceCheck(true);
                                return;
                            }
                            // Scan every row: a "SNAPSHOT IN PROGRESS" failure means retry,
                            // any other failure means give up.
                            VoltTable result = results[0];
                            boolean snapshotInProgress = false;
                            boolean haveFailure = false;
                            while (result.advanceRow()) {
                                if (result.getString("RESULT").equals("FAILURE")) {
                                    if (result.getString("ERR_MSG").equals("SNAPSHOT IN PROGRESS")) {
                                        snapshotInProgress = true;
                                    } else {
                                        haveFailure = true;
                                    }
                                }
                            }
                            /*
                             * If a snapshot was in progress, reattempt later, otherwise,
                             * if there was a failure, abort the attempt and log.
                             */
                            if (snapshotInProgress) {
                                SNAP_LOG.info("Queued user snapshot was reattempted, but a snapshot was " + " still in progress. It will be reattempted.");
                                //Turtles all the way down
                                scheduleSnapshotForLater(requestObj, requestId, false);
                            } else if (haveFailure) {
                                SNAP_LOG.info("Queued user snapshot was attempted, but there was a failure.");
                                try {
                                    ClientResponseImpl rimpl = (ClientResponseImpl) clientResponse;
                                    saveResponseToZKAndReset(requestId, rimpl);
                                } catch (NodeExistsException e) {
                                    // used to pass null as request ID to avoid this check if the request ID
                                    // already existed, this gives us the same behavior with a pre-existing
                                    // request ID
                                }
                                //Log the details of the failure, after resetting the watch in case of some odd NPE
                                result.resetRowPosition();
                                SNAP_LOG.info(result);
                            } else {
                                try {
                                    SNAP_LOG.debug("Queued user snapshot was successfully requested, saving to path " + VoltZK.user_snapshot_response + requestId);
                                    /*
                                     * Snapshot was started no problem, reset the watch for new requests
                                     */
                                    ClientResponseImpl rimpl = (ClientResponseImpl) clientResponse;
                                    saveResponseToZKAndReset(requestId, rimpl);
                                } catch (NodeExistsException e) {
                                    // used to pass null as request ID to avoid this check if the request ID
                                    // already existed, this gives us the same behavior with a pre-existing
                                    // request ID
                                }
                                return;
                            }
                        } catch (Exception e) {
                            SNAP_LOG.error("Error processing procedure callback for user snapshot", e);
                            try {
                                userSnapshotRequestExistenceCheck(true);
                            } catch (Exception e1) {
                                VoltDB.crashLocalVoltDB("Error resetting watch for user snapshot requests", true, e1);
                            }
                        }
                    }
                });
                initiateSnapshotSave(handle, new Object[] { requestObj }, false);
            } catch (Exception e) {
                // Record why the reattempt could not be initiated before resetting the
                // watch; otherwise the queued request disappears without a trace.
                SNAP_LOG.error("Error initiating queued user snapshot request", e);
                try {
                    userSnapshotRequestExistenceCheck(true);
                } catch (Exception e1) {
                    VoltDB.crashLocalVoltDB("Error checking for existence of user snapshots", true, e1);
                }
            }
        }
    };
    m_es.schedule(r, m_userSnapshotRetryInterval, TimeUnit.SECONDS);
}
use of org.apache.zookeeper_voltpatches.KeeperException.NodeExistsException in project voltdb by VoltDB.
the class SnapshotDaemon method leaderElection.
/**
 * Elects the snapshot leader through ZooKeeper.
 * The elected leader watches for truncation and user snapshot requests.
 *
 * Each pass sets a watch on the leader node; if the node is absent this host tries
 * to create it, and if another host wins that race the loop runs again.
 */
private void leaderElection() {
    loggingLog.info("Starting leader election for snapshot truncation daemon");
    try {
        for (;;) {
            // A fresh watcher is registered on every pass; when the leader node is
            // deleted it re-runs the election on the daemon's executor.
            final Watcher leaderNodeWatcher = new Watcher() {
                @Override
                public void process(WatchedEvent event) {
                    switch (event.getType()) {
                        case NodeDeleted:
                            loggingLog.info("Detected the snapshot truncation leader's ephemeral node deletion");
                            final Runnable rerunElection = new Runnable() {
                                @Override
                                public void run() {
                                    leaderElection();
                                }
                            };
                            m_es.execute(rerunElection);
                            break;
                        default:
                            // Other event types are of no interest here.
                            break;
                    }
                }
            };
            final Stat leaderNode = m_zk.exists(VoltZK.snapshot_truncation_master, leaderNodeWatcher);

            if (leaderNode != null) {
                loggingLog.info("Leader election concluded, a leader already exists");
                return;
            }

            try {
                m_zk.create(VoltZK.snapshot_truncation_master, null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
                m_isAutoSnapshotLeader = true;
                if (m_lastKnownSchedule != null) {
                    makeActivePrivate(m_lastKnownSchedule);
                }
                electedTruncationLeader();
                return;
            } catch (NodeExistsException lostRace) {
                // Another host created the node first; loop around and check again.
            }
        }
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Exception in snapshot daemon electing master via ZK", true, e);
    }
}
Aggregations