Search in sources :

Example 1 with StreamSnapshotTimeoutException

Use of org.voltdb.rejoin.StreamSnapshotDataTarget.StreamSnapshotTimeoutException in project voltdb by VoltDB.

The method doSnapshotWork of the class SnapshotSiteProcessor.

/**
 * Performs one increment of snapshot work for this site. Walks the pending per-table
 * tasks, streaming data until either a table still has data left (one block was
 * serialized), no output buffers are available, or every table is finished. When the
 * task map becomes empty, the site finishes its part of the snapshot and, if it is the
 * last snapshotting site, starts a "Snapshot terminator" thread that closes the
 * snapshot targets.
 *
 * <p>No schedule means don't try and schedule snapshot work because this is a blocking
 * task from completeSnapshotWork. This avoids creating thousands of task objects.
 *
 * @param context    execution context; forwarded to the table streamer and to
 *                   {@code runPostSnapshotTasks}
 * @param noSchedule when {@code true}, {@code rescheduleSnapshotWork()} is not called if
 *                   no output buffers are available; also forwarded to
 *                   {@code getOutputBuffers}
 * @return always {@code null} as written: {@code retval} is initialized to {@code null}
 *         and never reassigned
 */
public Future<?> doSnapshotWork(SystemProcedureExecutionContext context, boolean noSchedule) {
    ListenableFuture<?> retval = null;
    /*
     * This thread will null out the reference to m_snapshotTableTasks when
     * a snapshot is finished. If the snapshot buffer is loaned out that means
     * it is pending I/O somewhere so there is no work to do until it comes back.
     */
    if (m_snapshotTableTasks == null) {
        return retval;
    }
    // Targets are nulled together with the tasks below; without them there is nothing to do.
    if (m_snapshotTargets == null) {
        return null;
    }
    /*
     * Try to serialize a block from a table, if the table is finished,
     * remove the tasks from the task map and move on to the next table. If a block is
     * successfully serialized, break out of the loop and release the site thread for more
     * transaction work.
     */
    Iterator<Map.Entry<Integer, Collection<SnapshotTableTask>>> taskIter = m_snapshotTableTasks.asMap().entrySet().iterator();
    while (taskIter.hasNext()) {
        Map.Entry<Integer, Collection<SnapshotTableTask>> taskEntry = taskIter.next();
        final int tableId = taskEntry.getKey();
        final Collection<SnapshotTableTask> tableTasks = taskEntry.getValue();
        final List<BBContainer> outputBuffers = getOutputBuffers(tableTasks, noSchedule);
        if (outputBuffers == null) {
            // Not enough buffers available
            if (!noSchedule) {
                rescheduleSnapshotWork();
            }
            break;
        }
        // Stream more and add a listener to handle any failures
        Pair<ListenableFuture, Boolean> streamResult = m_streamers.get(tableId).streamMore(context, outputBuffers, null);
        if (streamResult.getFirst() != null) {
            final ListenableFuture writeFutures = streamResult.getFirst();
            writeFutures.addListener(new Runnable() {

                @Override
                public void run() {
                    try {
                        // get() is called only to surface a failure of the write future.
                        writeFutures.get();
                    } catch (Throwable t) {
                        // Only the first failure per snapshot is handled; later ones find the
                        // flag already false and are ignored.
                        if (m_perSiteLastSnapshotSucceded) {
                            if (t instanceof StreamSnapshotTimeoutException || t.getCause() instanceof StreamSnapshotTimeoutException) {
                            //This error is already logged by the watchdog when it generates the exception
                            } else {
                                SNAP_LOG.error("Error while attempting to write snapshot data", t);
                            }
                            m_perSiteLastSnapshotSucceded = false;
                        }
                    }
                }
            }, CoreUtils.SAMETHREADEXECUTOR);
        }
        /*
         * The table streamer will return false when there is no more data left to pull from that table. The
         * enclosing loop ensures that the next table is then addressed.
         */
        if (!streamResult.getSecond()) {
            asyncTerminateReplicatedTableTasks(tableTasks);
            // Log BEFORE removing the entry: the message includes tableTasks, and the
            // removal below empties that collection.
            SNAP_LOG.debug("Finished snapshot tasks for table " + tableId + ": " + tableTasks);
            // XXX: Guava's multimap will clear the tableTasks collection when the entry is
            // removed from the containing map, so don't use the collection after removal!
            taskIter.remove();
        } else {
            // More data remains for this table; stop here and resume on the next call.
            break;
        }
    }
    /*
     * If there are no more tasks then this particular EE is finished doing snapshot work
     * Check the set of currently snapshotting sites to find out if this is the last one.
     */
    if (m_snapshotTableTasks.isEmpty()) {
        SNAP_LOG.debug("Finished with tasks");
        // In case this is a non-blocking snapshot, do the post-snapshot tasks here.
        runPostSnapshotTasks(context);
        // Keep a local reference for the terminator thread before clearing the fields;
        // the null fields make later calls return early at the top of this method.
        final ArrayList<SnapshotDataTarget> snapshotTargets = m_snapshotTargets;
        m_snapshotTargets = null;
        m_snapshotTableTasks = null;
        boolean IamLast = false;
        synchronized (ExecutionSitesCurrentlySnapshotting) {
            if (!ExecutionSitesCurrentlySnapshotting.contains(this)) {
                VoltDB.crashLocalVoltDB("Currently snapshotting site didn't find itself in set of snapshotting sites", true, null);
            }
            IamLast = ExecutionSitesCurrentlySnapshotting.size() == 1;
            // The last site stays in the set until its terminator thread has finished
            // (removed in the terminator's finally block below).
            if (!IamLast) {
                ExecutionSitesCurrentlySnapshotting.remove(this);
            }
        }
        /*
         * If this is the last one then this EE must close all the SnapshotDataTargets.
         * Done in a separate thread so the EE can go and do other work. It will
         * sync every file descriptor and that may block for a while.
         */
        if (IamLast) {
            SNAP_LOG.debug("I AM LAST!");
            final long txnId = m_lastSnapshotTxnId;
            final ExtensibleSnapshotDigestData snapshotDataForZookeeper = m_extraSnapshotData;
            m_extraSnapshotData = null;
            final Thread terminatorThread = new Thread("Snapshot terminator") {

                @Override
                public void run() {
                    boolean snapshotSucceeded = true;
                    try {
                        /*
                         * Be absolutely sure the snapshot is finished
                         * and synced to disk before another is started
                         */
                        for (Thread t : m_snapshotTargetTerminators) {
                            if (t == this) {
                                continue;
                            }
                            try {
                                t.join();
                            } catch (InterruptedException e) {
                                // NOTE(review): returning here skips closing the targets and the
                                // completion tasks, yet the finally block still runs and reports
                                // snapshotSucceeded == true. Confirm this is intended.
                                return;
                            }
                        }
                        for (final SnapshotDataTarget t : snapshotTargets) {
                            try {
                                t.close();
                            } catch (IOException e) {
                                // NOTE(review): the first failure aborts the loop, so remaining
                                // targets are not closed and completion tasks are not run.
                                snapshotSucceeded = false;
                                throw new RuntimeException(e);
                            } catch (InterruptedException e) {
                                snapshotSucceeded = false;
                                throw new RuntimeException(e);
                            }
                        }
                        // Drain and run the queued completion tasks; a failing task is logged
                        // and does not prevent the remaining tasks from running.
                        Runnable r = null;
                        while ((r = m_tasksOnSnapshotCompletion.poll()) != null) {
                            try {
                                r.run();
                            } catch (Exception e) {
                                SNAP_LOG.error("Error running snapshot completion task", e);
                            }
                        }
                    } finally {
                        try {
                            // Remove this host's node under nodes_currently_snapshotting (-1 = any version).
                            VoltDB.instance().getHostMessenger().getZK().delete(VoltZK.nodes_currently_snapshotting + "/" + VoltDB.instance().getHostMessenger().getHostId(), -1);
                        } catch (NoNodeException e) {
                            SNAP_LOG.warn("Expect the snapshot node to already exist during deletion", e);
                        } catch (Exception e) {
                            VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
                        } finally {
                            /*
                             * Remove this last site from the set here after the terminator has run
                             * so that new snapshots won't start until
                             * everything is on disk for the previous snapshot. This prevents a really long
                             * snapshot initiation procedure from occurring because it has to contend for
                             * filesystem resources
                             *
                             * Do this before logSnapshotCompleteToZK() because the ZK operations are slow,
                             * and they can trigger snapshot completion interests to fire before this site
                             * removes itself from the set. The next snapshot request may come in and see
                             * this snapshot is still in progress.
                             */
                            ExecutionSitesCurrentlySnapshotting.remove(SnapshotSiteProcessor.this);
                        }
                        logSnapshotCompleteToZK(txnId, snapshotSucceeded, snapshotDataForZookeeper);
                    }
                }
            };
            // Register the thread so later terminators can join on it (see the join loop above).
            m_snapshotTargetTerminators.add(terminatorThread);
            terminatorThread.start();
        }
    }
    return retval;
}
Also used : NoNodeException(org.apache.zookeeper_voltpatches.KeeperException.NoNodeException) StreamSnapshotTimeoutException(org.voltdb.rejoin.StreamSnapshotDataTarget.StreamSnapshotTimeoutException) IOException(java.io.IOException) StreamSnapshotTimeoutException(org.voltdb.rejoin.StreamSnapshotDataTarget.StreamSnapshotTimeoutException) NoNodeException(org.apache.zookeeper_voltpatches.KeeperException.NoNodeException) KeeperException(org.apache.zookeeper_voltpatches.KeeperException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Collection(java.util.Collection) BBContainer(org.voltcore.utils.DBBPool.BBContainer) ListenableFuture(com.google_voltpatches.common.util.concurrent.ListenableFuture) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google_voltpatches.common.collect.ImmutableMap)

Aggregations

ImmutableMap (com.google_voltpatches.common.collect.ImmutableMap)1 ListenableFuture (com.google_voltpatches.common.util.concurrent.ListenableFuture)1 IOException (java.io.IOException)1 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 ExecutionException (java.util.concurrent.ExecutionException)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 KeeperException (org.apache.zookeeper_voltpatches.KeeperException)1 NoNodeException (org.apache.zookeeper_voltpatches.KeeperException.NoNodeException)1 BBContainer (org.voltcore.utils.DBBPool.BBContainer)1 StreamSnapshotTimeoutException (org.voltdb.rejoin.StreamSnapshotDataTarget.StreamSnapshotTimeoutException)1