use of org.apache.zookeeper_voltpatches.KeeperException.NoNodeException in project voltdb by VoltDB.
the class SnapshotCompletionMonitor method processSnapshotChildrenChanged.
/**
 * Handles a change in the children of the completed-snapshots ZooKeeper node.
 *
 * Lists the current children (passing m_newSnapshotWatcher as the watcher for that
 * listing), works out which children were not in the previously remembered set,
 * and for each of those reads the node data with a data-change watcher attached and
 * passes the bytes to processSnapshotData. A child that no longer exists when it is
 * read is skipped; any other exception crashes the local node.
 *
 * @param event the event that triggered this call; it is not read by this method
 */
private void processSnapshotChildrenChanged(final WatchedEvent event) {
    try {
        final TreeSet<String> currentSnapshots =
                new TreeSet<String>(m_zk.getChildren(VoltZK.completed_snapshots, m_newSnapshotWatcher));

        // Whatever is listed now but was not in the previous listing is new.
        final TreeSet<String> unseenSnapshots = new TreeSet<String>(currentSnapshots);
        unseenSnapshots.removeAll(m_lastKnownSnapshots);
        m_lastKnownSnapshots = currentSnapshots;

        for (final String snapshotName : unseenSnapshots) {
            final String snapshotNode = VoltZK.completed_snapshots + "/" + snapshotName;

            // A fresh watcher per node: on a data change, hand the event to the
            // executor so the re-read happens off the ZooKeeper callback.
            final Watcher dataWatcher = new Watcher() {
                @Override
                public void process(final WatchedEvent dataEvent) {
                    switch (dataEvent.getType()) {
                    case NodeDataChanged:
                        m_es.execute(new Runnable() {
                            @Override
                            public void run() {
                                processSnapshotDataChangedEvent(dataEvent);
                            }
                        });
                        break;
                    default:
                        break;
                    }
                }
            };

            try {
                processSnapshotData(m_zk.getData(snapshotNode, dataWatcher, null));
            } catch (NoNodeException e) {
                // The node is gone by the time it is read; nothing to process for it.
            }
        }
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Exception in snapshot completion monitor", true, e);
    }
}
use of org.apache.zookeeper_voltpatches.KeeperException.NoNodeException in project voltdb by VoltDB.
the class SnapshotCompletionMonitor method processSnapshotDataChangedEvent.
/**
 * Handles a data-changed notification for a single snapshot node.
 *
 * Re-reads the data at the path carried by the event, attaching a new watcher that
 * schedules this same method on m_es when the data changes again, and passes the
 * bytes to processSnapshotData. A missing node is ignored; any other exception
 * crashes the local node.
 *
 * @param event the watch event whose path identifies the node to re-read
 */
private void processSnapshotDataChangedEvent(final WatchedEvent event) {
    try {
        final String changedNode = event.getPath();

        // Watcher handed to getData so the next data change is also delivered.
        final Watcher rewatcher = new Watcher() {
            @Override
            public void process(final WatchedEvent dataEvent) {
                switch (dataEvent.getType()) {
                case NodeDataChanged:
                    m_es.execute(new Runnable() {
                        @Override
                        public void run() {
                            processSnapshotDataChangedEvent(dataEvent);
                        }
                    });
                    break;
                default:
                    break;
                }
            }
        };

        final byte[] payload = m_zk.getData(changedNode, rewatcher, null);
        processSnapshotData(payload);
    } catch (NoNodeException e) {
        // The node no longer exists; there is nothing to re-read.
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Exception in snapshot completion monitor", true, e);
    }
}
use of org.apache.zookeeper_voltpatches.KeeperException.NoNodeException in project voltdb by VoltDB.
the class SnapshotSiteProcessor method logSnapshotCompleteToZK.
/**
 * Records this host's completion of a snapshot on the snapshot's node under
 * VoltZK.completed_snapshots, then trims old completion records.
 *
 * The node's JSON is read, its "hostCount" is decremented by one, "didSucceed" is
 * set (and "isTruncation" forced to false on failure), the extra digest data is
 * merged in, and the result is written back conditioned on the version that was
 * read. A missing node or a version conflict causes the whole read-modify-write to
 * be retried; the retry loop gives up by crashing the local node after 10 minutes.
 *
 * @param txnId             transaction id of the snapshot; also the node name
 * @param snapshotSuccess   whether the snapshot succeeded on this host
 * @param extraSnapshotData additional digest data merged into the node's JSON
 */
private static void logSnapshotCompleteToZK(long txnId, boolean snapshotSuccess, ExtensibleSnapshotDigestData extraSnapshotData) {
    final ZooKeeper zk = VoltDB.instance().getHostMessenger().getZK();

    // Timeout after 10 minutes
    final long deadline = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(10);
    final String completionNode = VoltZK.completed_snapshots + "/" + txnId;

    while (true) {
        if (deadline < System.currentTimeMillis()) {
            VoltDB.crashLocalVoltDB("Timed out logging snapshot completion to ZK");
        }

        final Stat nodeStat = new Stat();
        byte[] current = null;
        try {
            current = zk.getData(completionNode, false, nodeStat);
        } catch (NoNodeException e) {
            // if the node doesn't exist yet, retry
            continue;
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("This ZK get should never fail", true, e);
        }

        if (current == null) {
            VoltDB.crashLocalVoltDB("Data should not be null if the node exists", false, null);
        }

        try {
            final JSONObject digest = new JSONObject(new String(current, "UTF-8"));
            if (txnId != digest.getLong("txnId")) {
                VoltDB.crashLocalVoltDB("TxnId should match", false, null);
            }

            digest.put("hostCount", digest.getInt("hostCount") - 1);
            digest.put("didSucceed", snapshotSuccess);
            if (!snapshotSuccess) {
                digest.put("isTruncation", false);
            }
            extraSnapshotData.mergeToZooKeeper(digest, SNAP_LOG);

            final byte[] serialized = digest.toString().getBytes("UTF-8");
            if (serialized.length > 5000000) {
                SNAP_LOG.warn("ZooKeeper node for snapshot digest unexpectedly large: " + serialized.length);
            }

            // Conditional write: only succeeds if nobody changed the node since the read.
            zk.setData(completionNode, serialized, nodeStat.getVersion());
        } catch (KeeperException.BadVersionException e) {
            // The node changed between the read and the write; start over.
            continue;
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("This ZK call should never fail", true, e);
        }

        break;
    }

    /*
     * If we are running without command logging there will be no consumer for
     * the completed snapshot messages. Consume them here to bound space usage in ZK.
     */
    try {
        final TreeSet<String> completed = new TreeSet<String>(zk.getChildren(VoltZK.completed_snapshots, false));
        while (completed.size() > 30) {
            // Smallest name in the sorted set is removed first.
            final String lowest = completed.first();
            try {
                zk.delete(VoltZK.completed_snapshots + "/" + lowest, -1);
            } catch (NoNodeException e) {
                // Already deleted; fall through and drop it from the local set.
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("Deleting a snapshot completion record from ZK should only fail with NoNodeException", true, e);
            }
            completed.remove(lowest);
        }
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Retrieving list of completed snapshots from ZK should never fail", true, e);
    }
}
use of org.apache.zookeeper_voltpatches.KeeperException.NoNodeException in project voltdb by VoltDB.
the class SnapshotSiteProcessor method doSnapshotWork.
/**
 * Performs one increment of snapshot work for this site: streams another block
 * from the next table that still has tasks and, once no tasks remain, runs the
 * end-of-snapshot bookkeeping.
 *
 * No schedule means don't try and schedule snapshot work because this is a blocking
 * task from completeSnapshotWork. This avoids creating thousands of task objects.
 *
 * @param context    execution context passed to the table streamers and to the
 *                   post-snapshot tasks
 * @param noSchedule when true, snapshot work is not rescheduled if output buffers
 *                   are unavailable; the flag is also forwarded to getOutputBuffers
 * @return as written, always {@code null}: the local return value is never assigned
 *         after its initialization
 */
public Future<?> doSnapshotWork(SystemProcedureExecutionContext context, boolean noSchedule) {
    // NOTE(review): retval is never assigned below, so every return yields null.
    ListenableFuture<?> retval = null;

    /*
     * This thread will null out the reference to m_snapshotTableTasks when
     * a snapshot is finished. If the snapshot buffer is loaned out that means
     * it is pending I/O somewhere so there is no work to do until it comes back.
     */
    if (m_snapshotTableTasks == null) {
        return retval;
    }

    if (m_snapshotTargets == null) {
        return null;
    }

    /*
     * Try to serialize a block from a table, if the table is finished,
     * remove the tasks from the task map and move on to the next table. If a block is
     * successfully serialized, break out of the loop and release the site thread for more
     * transaction work.
     */
    Iterator<Map.Entry<Integer, Collection<SnapshotTableTask>>> taskIter = m_snapshotTableTasks.asMap().entrySet().iterator();
    while (taskIter.hasNext()) {
        Map.Entry<Integer, Collection<SnapshotTableTask>> taskEntry = taskIter.next();
        final int tableId = taskEntry.getKey();
        final Collection<SnapshotTableTask> tableTasks = taskEntry.getValue();

        final List<BBContainer> outputBuffers = getOutputBuffers(tableTasks, noSchedule);
        if (outputBuffers == null) {
            // Not enough buffers available
            if (!noSchedule) {
                rescheduleSnapshotWork();
            }
            break;
        }

        // Stream more and add a listener to handle any failures
        Pair<ListenableFuture, Boolean> streamResult = m_streamers.get(tableId).streamMore(context, outputBuffers, null);
        if (streamResult.getFirst() != null) {
            final ListenableFuture writeFutures = streamResult.getFirst();
            writeFutures.addListener(new Runnable() {
                @Override
                public void run() {
                    try {
                        // get() is only used to surface a failure of the write.
                        writeFutures.get();
                    } catch (Throwable t) {
                        // Only the first failure for this site is logged and recorded.
                        if (m_perSiteLastSnapshotSucceded) {
                            if (t instanceof StreamSnapshotTimeoutException || t.getCause() instanceof StreamSnapshotTimeoutException) {
                                //This error is already logged by the watchdog when it generates the exception
                            } else {
                                SNAP_LOG.error("Error while attempting to write snapshot data", t);
                            }
                            m_perSiteLastSnapshotSucceded = false;
                        }
                    }
                }
            }, CoreUtils.SAMETHREADEXECUTOR);
        }

        /*
         * The table streamer will return false when there is no more data left to pull from that table. The
         * enclosing loop ensures that the next table is then addressed.
         */
        if (!streamResult.getSecond()) {
            asyncTerminateReplicatedTableTasks(tableTasks);
            // Log while tableTasks still holds its contents; it must not be read
            // after the removal below.
            SNAP_LOG.debug("Finished snapshot tasks for table " + tableId + ": " + tableTasks);
            // XXX: Guava's multimap will clear the tableTasks collection when the entry is
            // removed from the containing map, so don't use the collection after removal!
            taskIter.remove();
        } else {
            break;
        }
    }

    /*
     * If there are no more tasks then this particular EE is finished doing snapshot work
     * Check the AtomicInteger to find out if this is the last one.
     */
    if (m_snapshotTableTasks.isEmpty()) {
        SNAP_LOG.debug("Finished with tasks");
        // In case this is a non-blocking snapshot, do the post-snapshot tasks here.
        runPostSnapshotTasks(context);

        // Keep a local reference to the targets before clearing the fields; the
        // terminator thread below closes them through this reference.
        final ArrayList<SnapshotDataTarget> snapshotTargets = m_snapshotTargets;
        m_snapshotTargets = null;
        m_snapshotTableTasks = null;

        boolean IamLast = false;
        synchronized (ExecutionSitesCurrentlySnapshotting) {
            if (!ExecutionSitesCurrentlySnapshotting.contains(this)) {
                VoltDB.crashLocalVoltDB("Currently snapshotting site didn't find itself in set of snapshotting sites", true, null);
            }
            IamLast = ExecutionSitesCurrentlySnapshotting.size() == 1;
            // The last site stays in the set until its terminator thread has finished.
            if (!IamLast) {
                ExecutionSitesCurrentlySnapshotting.remove(this);
            }
        }

        /*
         * If this is the last one then this EE must close all the SnapshotDataTargets.
         * Done in a separate thread so the EE can go and do other work. It will
         * sync every file descriptor and that may block for a while.
         */
        if (IamLast) {
            SNAP_LOG.debug("I AM LAST!");
            final long txnId = m_lastSnapshotTxnId;
            final ExtensibleSnapshotDigestData snapshotDataForZookeeper = m_extraSnapshotData;
            m_extraSnapshotData = null;
            final Thread terminatorThread = new Thread("Snapshot terminator") {
                @Override
                public void run() {
                    boolean snapshotSucceeded = true;
                    try {
                        /*
                         * Be absolutely sure the snapshot is finished
                         * and synced to disk before another is started
                         */
                        for (Thread t : m_snapshotTargetTerminators) {
                            if (t == this) {
                                continue;
                            }
                            try {
                                t.join();
                            } catch (InterruptedException e) {
                                // NOTE(review): this return still runs the finally block
                                // below with snapshotSucceeded == true and without the
                                // targets having been closed - confirm this is intended.
                                return;
                            }
                        }
                        for (final SnapshotDataTarget t : snapshotTargets) {
                            try {
                                t.close();
                            } catch (IOException e) {
                                // NOTE(review): rethrowing here skips close() on the
                                // remaining targets and the completion tasks - confirm.
                                snapshotSucceeded = false;
                                throw new RuntimeException(e);
                            } catch (InterruptedException e) {
                                snapshotSucceeded = false;
                                throw new RuntimeException(e);
                            }
                        }

                        // Drain and run the queued completion tasks; a failing task
                        // is logged and does not stop the others.
                        Runnable r = null;
                        while ((r = m_tasksOnSnapshotCompletion.poll()) != null) {
                            try {
                                r.run();
                            } catch (Exception e) {
                                SNAP_LOG.error("Error running snapshot completion task", e);
                            }
                        }
                    } finally {
                        try {
                            VoltDB.instance().getHostMessenger().getZK().delete(VoltZK.nodes_currently_snapshotting + "/" + VoltDB.instance().getHostMessenger().getHostId(), -1);
                        } catch (NoNodeException e) {
                            SNAP_LOG.warn("Expect the snapshot node to already exist during deletion", e);
                        } catch (Exception e) {
                            VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
                        } finally {
                            /*
                             * Remove this last site from the set here after the terminator has run
                             * so that new snapshots won't start until
                             * everything is on disk for the previous snapshot. This prevents a really long
                             * snapshot initiation procedure from occurring because it has to contend for
                             * filesystem resources
                             *
                             * Do this before logSnapshotCompleteToZK() because the ZK operations are slow,
                             * and they can trigger snapshot completion interests to fire before this site
                             * removes itself from the set. The next snapshot request may come in and see
                             * this snapshot is still in progress.
                             */
                            ExecutionSitesCurrentlySnapshotting.remove(SnapshotSiteProcessor.this);
                        }
                        logSnapshotCompleteToZK(txnId, snapshotSucceeded, snapshotDataForZookeeper);
                    }
                }
            };

            m_snapshotTargetTerminators.add(terminatorThread);
            terminatorThread.start();
        }
    }
    return retval;
}
use of org.apache.zookeeper_voltpatches.KeeperException.NoNodeException in project voltdb by VoltDB.
the class TestZK method testFailureKillsEphemeral.
/**
 * Checks that an ephemeral node created through client 0 can no longer be read
 * through client 1 after failSite(0) has been called.
 *
 * The test passes only when the second read throws NoNodeException.
 */
@Test
public void testFailureKillsEphemeral() throws Exception {
    ZooKeeper zk = getClient(0);
    zk.create("/foo", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
    // Expected value first, actual second, so a failure is reported correctly.
    assertEquals(0, zk.getData("/foo", false, null).length);

    failSite(0);

    zk = getClient(1);
    try {
        zk.getData("/foo", false, null);
    } catch (NoNodeException e) {
        // The node is gone, which is the outcome under test.
        return;
    }
    fail("Ephemeral node /foo was still readable after failSite(0)");
}
Aggregations