Search in sources :

Example 1 with Checkpoint

use of org.exist.storage.txn.Checkpoint in project exist by eXist-db.

the class RecoveryManager method recover.

/**
 * Checks if the database is in a consistent state. If not, start a recovery run.
 *
 * The method scans the last log file and tries to find the last checkpoint
 * record. If the checkpoint record is the last record in the file,
 * the database was closed cleanly and is in a consistent state. If not, a
 * recovery run is started beginning at the last checkpoint found.
 *
 * @throws LogException Reading of journal failed.
 * @return if recover was successful
 */
public boolean recover() throws LogException {
    boolean recoveryRun = false;
    final List<Path> files;
    try (final Stream<Path> fileStream = journalRecovery.getFiles.get()) {
        files = fileStream.collect(Collectors.toList());
    } catch (final IOException ioe) {
        throw new LogException("Unable to find journal files in data dir", ioe);
    }
    // find the last log file in the data directory
    final short lastNum = Journal.findLastFile(files.stream());
    if (-1 < lastNum) {
        // load the last log file
        final Path last = journalRecovery.getFile.apply(lastNum);
        // scan the last log file and record the last checkpoint found
        try (JournalReader reader = new JournalReader(broker, last, lastNum)) {
            // try to read the last log record to see if it is a checkpoint
            boolean checkpointFound = false;
            try {
                final Loggable lastLog = reader.lastEntry();
                if (lastLog != null && lastLog.getLogType() == LogEntryTypes.CHECKPOINT) {
                    final Checkpoint checkpoint = (Checkpoint) lastLog;
                    // record, we compare the LSN stored in it with the current LSN.
                    if (checkpoint.getStoredLsn().equals(checkpoint.getLsn())) {
                        checkpointFound = true;
                        LOG.debug("Database is in clean state. Last checkpoint: {}", checkpoint.getDateString());
                    }
                }
            } catch (final LogException e) {
                LOG.info("Reading last journal log entry failed: {}. Will scan the log...", e.getMessage());
                // if an exception occurs at this point, the journal file is probably incomplete,
                // which indicates a db crash
                checkpointFound = false;
            }
            if (!checkpointFound) {
                LOG.info("Unclean shutdown detected. Scanning journal...");
                broker.getBrokerPool().reportStatus("Unclean shutdown detected. Scanning log...");
                reader.positionFirst();
                final Long2ObjectMap<Loggable> txnsStarted = new Long2ObjectOpenHashMap<>();
                Checkpoint lastCheckpoint = null;
                Lsn lastLsn = Lsn.LSN_INVALID;
                Loggable next;
                try {
                    final ProgressBar progress = new ProgressBar("Scanning journal ", FileUtils.sizeQuietly(last));
                    while ((next = reader.nextEntry()) != null) {
                        // LOG.debug(next.dump());
                        progress.set(next.getLsn().getOffset());
                        if (next.getLogType() == LogEntryTypes.TXN_START) {
                            // new transaction starts: add it to the transactions table
                            txnsStarted.put(next.getTransactionId(), next);
                        } else if (next.getLogType() == LogEntryTypes.TXN_ABORT) {
                            // transaction aborted: remove it from the transactions table
                            txnsStarted.remove(next.getTransactionId());
                        } else if (next.getLogType() == LogEntryTypes.CHECKPOINT) {
                            txnsStarted.clear();
                            lastCheckpoint = (Checkpoint) next;
                        }
                        lastLsn = next.getLsn();
                    }
                } catch (final LogException e) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Caught exception while reading log", e);
                    }
                    LOG.warn("Last readable journal log entry lsn: {}", lastLsn);
                }
                // we need a recovery.
                if ((lastCheckpoint == null || !lastCheckpoint.getLsn().equals(lastLsn)) && txnsStarted.size() > 0) {
                    LOG.info("Dirty transactions: {}", txnsStarted.size());
                    // starting recovery: reposition the log reader to the last checkpoint
                    if (lastCheckpoint == null) {
                        reader.positionFirst();
                    } else {
                        reader.position(lastCheckpoint.getLsn());
                        next = reader.nextEntry();
                    }
                    recoveryRun = true;
                    try {
                        LOG.info("Running recovery...");
                        broker.getBrokerPool().reportStatus("Running recovery...");
                        try (final BlobStore blobStore = broker.getBrokerPool().getBlobStore()) {
                            try {
                                blobStore.openForRecovery();
                            } catch (final FileNotFoundException e) {
                                LOG.warn(e.getMessage(), e);
                            } catch (final IOException e) {
                                throw new LogException("Unable to Open the Blob Store for Recovery: " + e.getMessage(), e);
                            }
                            doRecovery(txnsStarted.size(), last, reader, lastLsn);
                        } catch (final IOException e) {
                            LOG.error("Error whilst closing the Blob Store after recovery: {}", e.getMessage(), e);
                        }
                    } catch (final LogException e) {
                        // if restartOnError == true, we try to bring up the database even if there
                        // are errors. Otherwise, an exception is thrown, which will stop the db initialization
                        broker.getBrokerPool().reportStatus(BrokerPool.SIGNAL_ABORTED);
                        if (restartOnError) {
                            LOG.error("Aborting recovery. eXist-db detected an error during recovery. This may not be fatal. Database will start up, but corruptions are likely.");
                        } else {
                            LOG.error("Aborting recovery. eXist-db detected an error during recovery. This may not be fatal. Please consider running a consistency check via the export tool and create a backup if problems are reported. The db should come up again if you restart it.");
                            throw e;
                        }
                    }
                } else {
                    LOG.info("Database is in clean state. Nothing to recover from the journal.");
                }
            }
        } finally {
            // remove .log files from directory even if recovery failed.
            // Re-applying them on a second start up attempt would definitely damage the db, so we better
            // delete them before user tries to launch again.
            cleanDirectory(files.stream());
            if (recoveryRun) {
                broker.repairPrimary();
                broker.sync(Sync.MAJOR);
            }
        }
    }
    journalRecovery.setCurrentFileNum.accept(lastNum);
    journalRecovery.switchFiles.get();
    return recoveryRun;
}
Also used : Path(java.nio.file.Path) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) Long2ObjectOpenHashMap(it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap) Checkpoint(org.exist.storage.txn.Checkpoint) ProgressBar(org.exist.util.ProgressBar) BlobStore(org.exist.storage.blob.BlobStore)

Example 2 with Checkpoint

use of org.exist.storage.txn.Checkpoint in project exist by eXist-db.

the class RecoveryManager method doRecovery.

/**
 * Called by {@link #recover()} to do the actual recovery.
 *
 * @param txnCount
 * @param last
 * @param reader
 * @param lastLsn
 *
 * @throws LogException
 */
private void doRecovery(final int txnCount, final Path last, final JournalReader reader, final Lsn lastLsn) throws LogException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Running recovery ...");
    }
    journalRecovery.setInRecovery.accept(true);
    try {
        // map to track running transactions
        final Long2ObjectMap<Loggable> runningTxns = new Long2ObjectOpenHashMap<>();
        // ------- REDO ---------
        if (LOG.isInfoEnabled()) {
            LOG.info("First pass: redoing {} transactions...", txnCount);
        }
        final ProgressBar progress = new ProgressBar("Redo ", FileUtils.sizeQuietly(last));
        Loggable next = null;
        int redoCnt = 0;
        try {
            while ((next = reader.nextEntry()) != null) {
                SanityCheck.ASSERT(next.getLogType() != LogEntryTypes.CHECKPOINT, "Found a checkpoint during recovery run! This should not ever happen.");
                if (next.getLogType() == LogEntryTypes.TXN_START) {
                    // new transaction starts: add it to the transactions table
                    runningTxns.put(next.getTransactionId(), next);
                } else if (next.getLogType() == LogEntryTypes.TXN_COMMIT) {
                    // transaction committed: remove it from the transactions table
                    runningTxns.remove(next.getTransactionId());
                    redoCnt++;
                } else if (next.getLogType() == LogEntryTypes.TXN_ABORT) {
                    // transaction aborted: remove it from the transactions table
                    runningTxns.remove(next.getTransactionId());
                }
                // LOG.debug("Redo: " + next.dump());
                // redo the log entry
                next.redo();
                progress.set(next.getLsn().getOffset());
                if (next.getLsn().equals(lastLsn)) // last readable entry reached. Stop here.
                {
                    break;
                }
            }
        } catch (final Exception e) {
            LOG.error("Exception caught while redoing transactions. Aborting recovery to avoid possible damage. " + "Before starting again, make sure to run a check via the emergency export tool.", e);
            if (next != null) {
                LOG.info("Log entry that caused the exception: {}", next.dump());
            }
            throw new LogException("Recovery aborted. ");
        } finally {
            LOG.info("Redo processed {} out of {} transactions.", redoCnt, txnCount);
        }
        // ------- UNDO ---------
        if (LOG.isInfoEnabled()) {
            LOG.info("Second pass: undoing dirty transactions. Uncommitted transactions: {}", runningTxns.size());
        }
        // see if there are uncommitted transactions pending
        if (runningTxns.size() > 0) {
            // do a reverse scan of the log, undoing all uncommitted transactions
            try {
                while ((next = reader.previousEntry()) != null) {
                    if (next.getLogType() == LogEntryTypes.TXN_START) {
                        if (runningTxns.get(next.getTransactionId()) != null) {
                            runningTxns.remove(next.getTransactionId());
                            if (runningTxns.size() == 0) // all dirty transactions undone
                            {
                                break;
                            }
                        }
                    } else if (next.getLogType() == LogEntryTypes.TXN_COMMIT) {
                    // ignore already committed transaction
                    } else if (next.getLogType() == LogEntryTypes.CHECKPOINT) {
                        // found last checkpoint: undo is completed
                        break;
                    }
                    // undo the log entry if it belongs to an uncommitted transaction
                    if (runningTxns.get(next.getTransactionId()) != null) {
                        // LOG.debug("Undo: " + next.dump());
                        next.undo();
                    }
                }
            } catch (final Exception e) {
                LOG.warn("Exception caught while undoing dirty transactions. Remaining transactions to be undone: {}. Aborting recovery to avoid possible damage. Before starting again, make sure to run a check via the emergency export tool.", runningTxns.size(), e);
                if (next != null) {
                    LOG.warn("Log entry that caused the exception: {}", next.dump());
                }
                throw new LogException("Recovery aborted", e);
            }
        }
    } finally {
        broker.sync(Sync.MAJOR);
        journalRecovery.setInRecovery.accept(false);
    }
}
Also used : Long2ObjectOpenHashMap(it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap) ProgressBar(org.exist.util.ProgressBar) Checkpoint(org.exist.storage.txn.Checkpoint) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException)

Example 3 with Checkpoint

use of org.exist.storage.txn.Checkpoint in project exist by eXist-db.

the class Journal method writeToLog.

/**
 * Write a log entry to the journal.
 *
 * @param entry the journal entry to write
 * @throws JournalException if the entry could not be written
 */
public synchronized void writeToLog(final Loggable entry) throws JournalException {
    if (currentBuffer == null) {
        throw new JournalException("Database is shut down.");
    }
    SanityCheck.ASSERT(!inRecovery, "Write to log during recovery. Should not happen!");
    final int size = entry.getLogSize();
    if (size > Short.MAX_VALUE) {
        throw new JournalException("Journal can only write log entries of less that 32KB");
    }
    final int required = size + LOG_ENTRY_BASE_LEN;
    if (required > currentBuffer.remaining()) {
        flushToLog(false);
    }
    try {
        // TODO(AR) this is needed as the journal is initialised by starting a transaction for loading the SymbolTable... before recovery! which is likely wrong!!! as Recovery Cannot run if the Journal file has been switched!
        final long pos = channel != null ? channel.position() : 0;
        currentLsn = new Lsn(currentJournalFileNumber, pos + currentBuffer.position() + 1);
    } catch (final IOException e) {
        throw new JournalException("Unable to create LSN for: " + entry.dump());
    }
    entry.setLsn(currentLsn);
    try {
        final int currentBufferEntryOffset = currentBuffer.position();
        // write entryHeader
        currentBuffer.put(entry.getLogType());
        currentBuffer.putLong(entry.getTransactionId());
        currentBuffer.putShort((short) size);
        // write entry data
        entry.write(currentBuffer);
        // write backlink
        currentBuffer.putShort((short) (size + LOG_ENTRY_HEADER_LEN));
        // write checksum
        final long checksum = xxHash64.hash(currentBuffer, currentBufferEntryOffset, currentBuffer.position() - currentBufferEntryOffset, XXHASH64_SEED);
        currentBuffer.putLong(checksum);
    } catch (final BufferOverflowException e) {
        throw new JournalException("Buffer overflow while writing log record: " + entry.dump(), e);
    }
    // NOTE: we don't track operations on txnStart or checkpoints!
    if (!(entry instanceof TxnStart || entry instanceof Checkpoint)) {
        pool.getTransactionManager().trackOperation(entry.getTransactionId());
    }
}
Also used : Checkpoint(org.exist.storage.txn.Checkpoint) TxnStart(org.exist.storage.txn.TxnStart) BufferOverflowException(java.nio.BufferOverflowException) Checkpoint(org.exist.storage.txn.Checkpoint)

Example 4 with Checkpoint

use of org.exist.storage.txn.Checkpoint in project exist by eXist-db.

the class Journal method checkpoint.

/**
 * Write a checkpoint record to the journal and flush it. If switchLogFiles is true,
 * a new journal will be started, but only if the file is larger than
 * {@link #journalSizeMin}. The old log is removed.
 *
 * @param txnId          The transaction id
 * @param switchLogFiles Indicates whether a new journal file should be started
 * @throws JournalException if the checkpoint could not be written to the journal.
 */
public synchronized void checkpoint(final long txnId, final boolean switchLogFiles) throws JournalException {
    LOG.debug("Checkpoint reached");
    writeToLog(new Checkpoint(txnId));
    if (switchLogFiles) {
        // if we switch files, we don't need to sync.
        // the file will be removed anyway.
        flushBuffer();
    } else {
        flushToLog(true, true);
    }
    try {
        if (switchLogFiles && channel != null && channel.position() > journalSizeMin) {
            final Path oldFile = getFile(currentJournalFileNumber);
            // takes ownership of channel and oldFile when `start` is called
            final RemoveRunnable removeRunnable = new RemoveRunnable(channel, oldFile);
            try {
                switchFiles();
            } catch (final LogException e) {
                LOG.warn("Failed to create new journal: {}", e.getMessage(), e);
            }
            final Thread removeThread = newInstanceThread(pool, "remove-journal", removeRunnable);
            removeThread.start();
        }
    } catch (final IOException e) {
        LOG.warn("IOException while writing checkpoint", e);
    }
}
Also used : Path(java.nio.file.Path) Checkpoint(org.exist.storage.txn.Checkpoint) ThreadUtils.newInstanceThread(org.exist.util.ThreadUtils.newInstanceThread)

Example 5 with Checkpoint

use of org.exist.storage.txn.Checkpoint in project exist by eXist-db.

the class Journal method shutdown.

/**
 * Shut down the journal. This will write a checkpoint record
 * to the log, so recovery manager knows the file has been
 * closed in a clean way.
 *
 * @param txnId      the transaction id.
 * @param checkpoint true if a checkpoint should be written before shitdown
 */
public synchronized void shutdown(final long txnId, final boolean checkpoint) {
    if (!initialised) {
        // no journal is initialized
        return;
    }
    if (currentBuffer == null) {
        // the db has probably shut down already
        return;
    }
    if (!BrokerPool.FORCE_CORRUPTION) {
        if (checkpoint) {
            LOG.info("Shutting down Journal with checkpoint...");
            try {
                writeToLog(new Checkpoint(txnId));
            } catch (final JournalException e) {
                LOG.error("An error occurred whilst writing a checkpoint to the Journal: {}", e.getMessage(), e);
            }
        }
        flushBuffer();
    }
    try {
        channel.close();
    } catch (final IOException e) {
        LOG.error("Unable to close Journal file: {}", e.getMessage(), e);
    }
    channel = null;
    fileLock.release();
    currentBuffer = null;
}
Also used : Checkpoint(org.exist.storage.txn.Checkpoint)

Aggregations

Checkpoint (org.exist.storage.txn.Checkpoint)5 Long2ObjectOpenHashMap (it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap)2 FileNotFoundException (java.io.FileNotFoundException)2 IOException (java.io.IOException)2 Path (java.nio.file.Path)2 ProgressBar (org.exist.util.ProgressBar)2 BufferOverflowException (java.nio.BufferOverflowException)1 BlobStore (org.exist.storage.blob.BlobStore)1 TxnStart (org.exist.storage.txn.TxnStart)1 ThreadUtils.newInstanceThread (org.exist.util.ThreadUtils.newInstanceThread)1