Use of org.voltdb.rejoin.StreamSnapshotDataTarget.StreamSnapshotTimeoutException in project voltdb by VoltDB.
From the class SnapshotSiteProcessor, method doSnapshotWork.
/**
 * Performs one increment of snapshot work for this site.
 *
 * <p>Walks the pending per-table snapshot tasks. For each table it obtains output
 * buffers and asks that table's streamer to stream more data. If the streamer reports
 * that the table has no more data, the table's tasks are removed and the next table is
 * tried; otherwise the loop stops so the site thread can go back to transaction work.
 *
 * <p>Once no tasks remain, the post-snapshot tasks are run and this site's snapshot
 * state ({@code m_snapshotTargets}, {@code m_snapshotTableTasks}) is cleared. If this
 * site is the last member of {@code ExecutionSitesCurrentlySnapshotting}, a
 * "Snapshot terminator" thread is started that closes every snapshot target, runs the
 * queued completion tasks, deletes this host's "currently snapshotting" ZooKeeper node,
 * removes this site from the set, and finally logs snapshot completion to ZooKeeper.
 *
 * @param context    execution context handed to the table streamer and to
 *                   {@code runPostSnapshotTasks}
 * @param noSchedule when {@code true}, do not reschedule snapshot work if output buffers
 *                   are unavailable, because this is a blocking task from
 *                   completeSnapshotWork (this avoids creating thousands of task
 *                   objects); the flag is also forwarded to {@code getOutputBuffers}
 * @return always {@code null} in this implementation
 */
public Future<?> doSnapshotWork(SystemProcedureExecutionContext context, boolean noSchedule) {
    /*
     * This thread will null out the reference to m_snapshotTableTasks when
     * a snapshot is finished. If the snapshot buffer is loaned out that means
     * it is pending I/O somewhere so there is no work to do until it comes back.
     */
    if (m_snapshotTableTasks == null) {
        return null;
    }
    if (m_snapshotTargets == null) {
        return null;
    }

    /*
     * Try to serialize a block from a table, if the table is finished,
     * remove the tasks from the task map and move on to the next table. If a block is
     * successfully serialized, break out of the loop and release the site thread for more
     * transaction work.
     */
    Iterator<Map.Entry<Integer, Collection<SnapshotTableTask>>> taskIter =
            m_snapshotTableTasks.asMap().entrySet().iterator();
    while (taskIter.hasNext()) {
        Map.Entry<Integer, Collection<SnapshotTableTask>> taskEntry = taskIter.next();
        final int tableId = taskEntry.getKey();
        final Collection<SnapshotTableTask> tableTasks = taskEntry.getValue();

        final List<BBContainer> outputBuffers = getOutputBuffers(tableTasks, noSchedule);
        if (outputBuffers == null) {
            // Not enough buffers available
            if (!noSchedule) {
                rescheduleSnapshotWork();
            }
            break;
        }

        // Stream more and add a listener to handle any failures
        Pair<ListenableFuture, Boolean> streamResult =
                m_streamers.get(tableId).streamMore(context, outputBuffers, null);
        if (streamResult.getFirst() != null) {
            final ListenableFuture<?> writeFutures = streamResult.getFirst();
            writeFutures.addListener(new Runnable() {
                @Override
                public void run() {
                    try {
                        // Only called to surface a failure of the write, the result is unused.
                        writeFutures.get();
                    } catch (Throwable t) {
                        // The flag guards the logging so only the first failure is reported.
                        if (m_perSiteLastSnapshotSucceded) {
                            if (t instanceof StreamSnapshotTimeoutException
                                    || t.getCause() instanceof StreamSnapshotTimeoutException) {
                                //This error is already logged by the watchdog when it generates the exception
                            } else {
                                SNAP_LOG.error("Error while attempting to write snapshot data", t);
                            }
                            m_perSiteLastSnapshotSucceded = false;
                        }
                    }
                }
            }, CoreUtils.SAMETHREADEXECUTOR);
        }

        /*
         * The table streamer will return false when there is no more data left to pull from that table. The
         * enclosing loop ensures that the next table is then addressed.
         */
        if (!streamResult.getSecond()) {
            asyncTerminateReplicatedTableTasks(tableTasks);
            // XXX: Guava's multimap will clear the tableTasks collection when the entry is
            // removed from the containing map, so don't use the collection after removal!
            // The log message is therefore built BEFORE the removal; building it afterwards
            // would always print an empty collection.
            final String finishedMessage =
                    "Finished snapshot tasks for table " + tableId + ": " + tableTasks;
            taskIter.remove();
            SNAP_LOG.debug(finishedMessage);
        } else {
            break;
        }
    }

    /*
     * If there are no more tasks then this particular EE is finished doing snapshot work
     * Check the AtomicInteger to find out if this is the last one.
     */
    if (m_snapshotTableTasks.isEmpty()) {
        SNAP_LOG.debug("Finished with tasks");
        // In case this is a non-blocking snapshot, do the post-snapshot tasks here.
        runPostSnapshotTasks(context);

        // Keep a local handle on the targets for the terminator thread, then clear the
        // site's snapshot state.
        final ArrayList<SnapshotDataTarget> snapshotTargets = m_snapshotTargets;
        m_snapshotTargets = null;
        m_snapshotTableTasks = null;

        boolean IamLast = false;
        synchronized (ExecutionSitesCurrentlySnapshotting) {
            if (!ExecutionSitesCurrentlySnapshotting.contains(this)) {
                VoltDB.crashLocalVoltDB(
                        "Currently snapshotting site didn't find itself in set of snapshotting sites",
                        true, null);
            }
            IamLast = ExecutionSitesCurrentlySnapshotting.size() == 1;
            // The last site stays in the set; the terminator thread removes it once it
            // has finished.
            if (!IamLast) {
                ExecutionSitesCurrentlySnapshotting.remove(this);
            }
        }

        /*
         * If this is the last one then this EE must close all the SnapshotDataTargets.
         * Done in a separate thread so the EE can go and do other work. It will
         * sync every file descriptor and that may block for a while.
         */
        if (IamLast) {
            SNAP_LOG.debug("I AM LAST!");
            final long txnId = m_lastSnapshotTxnId;
            final ExtensibleSnapshotDigestData snapshotDataForZookeeper = m_extraSnapshotData;
            m_extraSnapshotData = null;
            final Thread terminatorThread = new Thread("Snapshot terminator") {
                @Override
                public void run() {
                    boolean snapshotSucceeded = true;
                    try {
                        /*
                         * Be absolutely sure the snapshot is finished
                         * and synced to disk before another is started
                         */
                        for (Thread t : m_snapshotTargetTerminators) {
                            if (t == this) {
                                continue;
                            }
                            try {
                                t.join();
                            } catch (InterruptedException e) {
                                // NOTE(review): returning here skips closing the targets, yet the
                                // finally block below still reports snapshotSucceeded == true.
                                // Confirm whether an interrupted terminator should report failure.
                                return;
                            }
                        }
                        for (final SnapshotDataTarget t : snapshotTargets) {
                            try {
                                t.close();
                            } catch (IOException e) {
                                snapshotSucceeded = false;
                                throw new RuntimeException(e);
                            } catch (InterruptedException e) {
                                snapshotSucceeded = false;
                                throw new RuntimeException(e);
                            }
                        }
                        // Drain the completion tasks; a failing task is logged and does not
                        // stop the remaining ones from running.
                        Runnable r = null;
                        while ((r = m_tasksOnSnapshotCompletion.poll()) != null) {
                            try {
                                r.run();
                            } catch (Exception e) {
                                SNAP_LOG.error("Error running snapshot completion task", e);
                            }
                        }
                    } finally {
                        try {
                            VoltDB.instance().getHostMessenger().getZK().delete(
                                    VoltZK.nodes_currently_snapshotting + "/"
                                            + VoltDB.instance().getHostMessenger().getHostId(),
                                    -1);
                        } catch (NoNodeException e) {
                            SNAP_LOG.warn("Expect the snapshot node to already exist during deletion", e);
                        } catch (Exception e) {
                            VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
                        } finally {
                            /*
                             * Remove this last site from the set here after the terminator has run
                             * so that new snapshots won't start until
                             * everything is on disk for the previous snapshot. This prevents a really long
                             * snapshot initiation procedure from occurring because it has to contend for
                             * filesystem resources
                             *
                             * Do this before logSnapshotCompleteToZK() because the ZK operations are slow,
                             * and they can trigger snapshot completion interests to fire before this site
                             * removes itself from the set. The next snapshot request may come in and see
                             * this snapshot is still in progress.
                             */
                            ExecutionSitesCurrentlySnapshotting.remove(SnapshotSiteProcessor.this);
                        }
                        logSnapshotCompleteToZK(txnId, snapshotSucceeded, snapshotDataForZookeeper);
                    }
                }
            };
            m_snapshotTargetTerminators.add(terminatorThread);
            terminatorThread.start();
        }
    }
    return null;
}
Aggregations