Use of org.exist.storage.txn.Checkpoint in project exist by eXist-db: class RecoveryManager, method recover().
/**
* Checks if the database is in a consistent state. If not, start a recovery run.
*
* The method scans the last log file and tries to find the last checkpoint
* record. If the checkpoint record is the last record in the file,
* the database was closed cleanly and is in a consistent state. If not, a
* recovery run is started beginning at the last checkpoint found.
*
* @throws LogException if reading of the journal failed.
* @return true if a recovery run was performed
*/
public boolean recover() throws LogException {
boolean recoveryRun = false;
final List<Path> files;
try (final Stream<Path> fileStream = journalRecovery.getFiles.get()) {
files = fileStream.collect(Collectors.toList());
} catch (final IOException ioe) {
throw new LogException("Unable to find journal files in data dir", ioe);
}
// find the last log file in the data directory
final short lastNum = Journal.findLastFile(files.stream());
if (-1 < lastNum) {
// load the last log file
final Path last = journalRecovery.getFile.apply(lastNum);
// scan the last log file and record the last checkpoint found
try (JournalReader reader = new JournalReader(broker, last, lastNum)) {
// try to read the last log record to see if it is a checkpoint
boolean checkpointFound = false;
try {
final Loggable lastLog = reader.lastEntry();
if (lastLog != null && lastLog.getLogType() == LogEntryTypes.CHECKPOINT) {
final Checkpoint checkpoint = (Checkpoint) lastLog;
// found a checkpoint: to be sure it is indeed the last record in the file, we compare the LSN stored in it with the current LSN.
if (checkpoint.getStoredLsn().equals(checkpoint.getLsn())) {
checkpointFound = true;
LOG.debug("Database is in clean state. Last checkpoint: {}", checkpoint.getDateString());
}
}
} catch (final LogException e) {
LOG.info("Reading last journal log entry failed: {}. Will scan the log...", e.getMessage());
// if an exception occurs at this point, the journal file is probably incomplete,
// which indicates a db crash
checkpointFound = false;
}
if (!checkpointFound) {
LOG.info("Unclean shutdown detected. Scanning journal...");
broker.getBrokerPool().reportStatus("Unclean shutdown detected. Scanning log...");
reader.positionFirst();
final Long2ObjectMap<Loggable> txnsStarted = new Long2ObjectOpenHashMap<>();
Checkpoint lastCheckpoint = null;
Lsn lastLsn = Lsn.LSN_INVALID;
Loggable next;
try {
final ProgressBar progress = new ProgressBar("Scanning journal ", FileUtils.sizeQuietly(last));
while ((next = reader.nextEntry()) != null) {
// LOG.debug(next.dump());
progress.set(next.getLsn().getOffset());
if (next.getLogType() == LogEntryTypes.TXN_START) {
// new transaction starts: add it to the transactions table
txnsStarted.put(next.getTransactionId(), next);
} else if (next.getLogType() == LogEntryTypes.TXN_ABORT) {
// transaction aborted: remove it from the transactions table
txnsStarted.remove(next.getTransactionId());
} else if (next.getLogType() == LogEntryTypes.CHECKPOINT) {
txnsStarted.clear();
lastCheckpoint = (Checkpoint) next;
}
lastLsn = next.getLsn();
}
} catch (final LogException e) {
if (LOG.isDebugEnabled()) {
LOG.debug("Caught exception while reading log", e);
}
LOG.warn("Last readable journal log entry lsn: {}", lastLsn);
}
// a recovery run is needed if the last checkpoint is not the last journal entry and dirty transactions remain
if ((lastCheckpoint == null || !lastCheckpoint.getLsn().equals(lastLsn)) && txnsStarted.size() > 0) {
LOG.info("Dirty transactions: {}", txnsStarted.size());
// starting recovery: reposition the log reader to the last checkpoint
if (lastCheckpoint == null) {
reader.positionFirst();
} else {
reader.position(lastCheckpoint.getLsn());
next = reader.nextEntry();
}
recoveryRun = true;
try {
LOG.info("Running recovery...");
broker.getBrokerPool().reportStatus("Running recovery...");
try (final BlobStore blobStore = broker.getBrokerPool().getBlobStore()) {
try {
blobStore.openForRecovery();
} catch (final FileNotFoundException e) {
LOG.warn(e.getMessage(), e);
} catch (final IOException e) {
throw new LogException("Unable to Open the Blob Store for Recovery: " + e.getMessage(), e);
}
doRecovery(txnsStarted.size(), last, reader, lastLsn);
} catch (final IOException e) {
LOG.error("Error whilst closing the Blob Store after recovery: {}", e.getMessage(), e);
}
} catch (final LogException e) {
// if restartOnError == true, we try to bring up the database even if there
// are errors. Otherwise, an exception is thrown, which will stop the db initialization
broker.getBrokerPool().reportStatus(BrokerPool.SIGNAL_ABORTED);
if (restartOnError) {
LOG.error("Aborting recovery. eXist-db detected an error during recovery. This may not be fatal. Database will start up, but corruptions are likely.");
} else {
LOG.error("Aborting recovery. eXist-db detected an error during recovery. This may not be fatal. Please consider running a consistency check via the export tool and create a backup if problems are reported. The db should come up again if you restart it.");
throw e;
}
}
} else {
LOG.info("Database is in clean state. Nothing to recover from the journal.");
}
}
} finally {
// remove .log files from the directory even if recovery failed.
// Re-applying them on a second start-up attempt would definitely damage the db,
// so we delete them before the user tries to launch again.
cleanDirectory(files.stream());
if (recoveryRun) {
broker.repairPrimary();
broker.sync(Sync.MAJOR);
}
}
}
journalRecovery.setCurrentFileNum.accept(lastNum);
journalRecovery.switchFiles.get();
return recoveryRun;
}
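The clean-shutdown test at the top of recover() hinges on a single comparison: the last journal entry must be a Checkpoint whose stored LSN equals the LSN at which it was read. A minimal sketch of that test as a standalone helper, using only types already shown above (the helper itself is hypothetical and not part of eXist-db):

// Hypothetical helper, not part of eXist-db: returns true if the last journal entry
// proves a clean shutdown.
private static boolean isCleanShutdown(final Loggable lastLog) {
    if (lastLog == null || lastLog.getLogType() != LogEntryTypes.CHECKPOINT) {
        return false;
    }
    final Checkpoint checkpoint = (Checkpoint) lastLog;
    // the checkpoint must sit exactly where it claims to sit in the file
    return checkpoint.getStoredLsn().equals(checkpoint.getLsn());
}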
Use of org.exist.storage.txn.Checkpoint in project exist by eXist-db: class RecoveryManager, method doRecovery().
/**
* Called by {@link #recover()} to do the actual recovery.
*
* @param txnCount the number of dirty transactions to redo
* @param last the journal file being recovered
* @param reader the journal reader, positioned at the last checkpoint
* @param lastLsn the LSN of the last readable journal entry
*
* @throws LogException if the recovery run fails
*/
private void doRecovery(final int txnCount, final Path last, final JournalReader reader, final Lsn lastLsn) throws LogException {
if (LOG.isInfoEnabled()) {
LOG.info("Running recovery ...");
}
journalRecovery.setInRecovery.accept(true);
try {
// map to track running transactions
final Long2ObjectMap<Loggable> runningTxns = new Long2ObjectOpenHashMap<>();
// ------- REDO ---------
if (LOG.isInfoEnabled()) {
LOG.info("First pass: redoing {} transactions...", txnCount);
}
final ProgressBar progress = new ProgressBar("Redo ", FileUtils.sizeQuietly(last));
Loggable next = null;
int redoCnt = 0;
try {
while ((next = reader.nextEntry()) != null) {
SanityCheck.ASSERT(next.getLogType() != LogEntryTypes.CHECKPOINT, "Found a checkpoint during recovery run! This should not ever happen.");
if (next.getLogType() == LogEntryTypes.TXN_START) {
// new transaction starts: add it to the transactions table
runningTxns.put(next.getTransactionId(), next);
} else if (next.getLogType() == LogEntryTypes.TXN_COMMIT) {
// transaction committed: remove it from the transactions table
runningTxns.remove(next.getTransactionId());
redoCnt++;
} else if (next.getLogType() == LogEntryTypes.TXN_ABORT) {
// transaction aborted: remove it from the transactions table
runningTxns.remove(next.getTransactionId());
}
// LOG.debug("Redo: " + next.dump());
// redo the log entry
next.redo();
progress.set(next.getLsn().getOffset());
// last readable entry reached: stop here
if (next.getLsn().equals(lastLsn)) {
break;
}
}
} catch (final Exception e) {
LOG.error("Exception caught while redoing transactions. Aborting recovery to avoid possible damage. " + "Before starting again, make sure to run a check via the emergency export tool.", e);
if (next != null) {
LOG.info("Log entry that caused the exception: {}", next.dump());
}
throw new LogException("Recovery aborted. ");
} finally {
LOG.info("Redo processed {} out of {} transactions.", redoCnt, txnCount);
}
// ------- UNDO ---------
if (LOG.isInfoEnabled()) {
LOG.info("Second pass: undoing dirty transactions. Uncommitted transactions: {}", runningTxns.size());
}
// see if there are uncommitted transactions pending
if (runningTxns.size() > 0) {
// do a reverse scan of the log, undoing all uncommitted transactions
try {
while ((next = reader.previousEntry()) != null) {
if (next.getLogType() == LogEntryTypes.TXN_START) {
if (runningTxns.get(next.getTransactionId()) != null) {
runningTxns.remove(next.getTransactionId());
// all dirty transactions undone
if (runningTxns.size() == 0) {
break;
}
}
} else if (next.getLogType() == LogEntryTypes.TXN_COMMIT) {
// ignore already committed transaction
} else if (next.getLogType() == LogEntryTypes.CHECKPOINT) {
// found last checkpoint: undo is completed
break;
}
// undo the log entry if it belongs to an uncommitted transaction
if (runningTxns.get(next.getTransactionId()) != null) {
// LOG.debug("Undo: " + next.dump());
next.undo();
}
}
} catch (final Exception e) {
LOG.warn("Exception caught while undoing dirty transactions. Remaining transactions to be undone: {}. Aborting recovery to avoid possible damage. Before starting again, make sure to run a check via the emergency export tool.", runningTxns.size(), e);
if (next != null) {
LOG.warn("Log entry that caused the exception: {}", next.dump());
}
throw new LogException("Recovery aborted", e);
}
}
} finally {
broker.sync(Sync.MAJOR);
journalRecovery.setInRecovery.accept(false);
}
}
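For orientation, a compressed sketch of the two passes above, with progress reporting, sanity checks, error handling and the lastLsn stopping condition stripped out. It is a fragment that assumes the same imports and fields as doRecovery() and a reader already positioned at the last checkpoint, not a drop-in replacement:

// Simplified sketch of doRecovery()'s structure (assumptions noted above).
final Long2ObjectMap<Loggable> uncommitted = new Long2ObjectOpenHashMap<>();
Loggable entry;
// pass 1 - REDO: replay forward, remembering transactions that never commit or abort
while ((entry = reader.nextEntry()) != null) {
    if (entry.getLogType() == LogEntryTypes.TXN_START) {
        uncommitted.put(entry.getTransactionId(), entry);
    } else if (entry.getLogType() == LogEntryTypes.TXN_COMMIT
            || entry.getLogType() == LogEntryTypes.TXN_ABORT) {
        uncommitted.remove(entry.getTransactionId());
    }
    entry.redo();
}
// pass 2 - UNDO: walk backwards, reversing entries of transactions still uncommitted
while (!uncommitted.isEmpty() && (entry = reader.previousEntry()) != null) {
    if (entry.getLogType() == LogEntryTypes.TXN_START) {
        uncommitted.remove(entry.getTransactionId());
    } else if (uncommitted.containsKey(entry.getTransactionId())) {
        entry.undo();
    }
}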
Use of org.exist.storage.txn.Checkpoint in project exist by eXist-db: class Journal, method writeToLog().
/**
* Write a log entry to the journal.
*
* @param entry the journal entry to write
* @throws JournalException if the entry could not be written
*/
public synchronized void writeToLog(final Loggable entry) throws JournalException {
if (currentBuffer == null) {
throw new JournalException("Database is shut down.");
}
SanityCheck.ASSERT(!inRecovery, "Write to log during recovery. Should not happen!");
final int size = entry.getLogSize();
if (size > Short.MAX_VALUE) {
throw new JournalException("Journal can only write log entries of less that 32KB");
}
final int required = size + LOG_ENTRY_BASE_LEN;
if (required > currentBuffer.remaining()) {
flushToLog(false);
}
try {
// TODO(AR) this is needed as the journal is initialised by starting a transaction for loading the SymbolTable... before recovery! which is likely wrong!!! as Recovery Cannot run if the Journal file has been switched!
final long pos = channel != null ? channel.position() : 0;
currentLsn = new Lsn(currentJournalFileNumber, pos + currentBuffer.position() + 1);
} catch (final IOException e) {
throw new JournalException("Unable to create LSN for: " + entry.dump());
}
entry.setLsn(currentLsn);
try {
final int currentBufferEntryOffset = currentBuffer.position();
// write entryHeader
currentBuffer.put(entry.getLogType());
currentBuffer.putLong(entry.getTransactionId());
currentBuffer.putShort((short) size);
// write entry data
entry.write(currentBuffer);
// write backlink
currentBuffer.putShort((short) (size + LOG_ENTRY_HEADER_LEN));
// write checksum
final long checksum = xxHash64.hash(currentBuffer, currentBufferEntryOffset, currentBuffer.position() - currentBufferEntryOffset, XXHASH64_SEED);
currentBuffer.putLong(checksum);
} catch (final BufferOverflowException e) {
throw new JournalException("Buffer overflow while writing log record: " + entry.dump(), e);
}
// NOTE: we don't track operations on txnStart or checkpoints!
if (!(entry instanceof TxnStart || entry instanceof Checkpoint)) {
pool.getTransactionManager().trackOperation(entry.getTransactionId());
}
}
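The byte layout that writeToLog() produces can be read directly from the put calls above; spelled out as a comment (field widths follow the Java types used, names are descriptive only):

// On-disk layout of a single journal entry, as written by writeToLog():
//
//   byte       logType
//   long       transactionId
//   short      size                        (length of the entry payload)
//   byte[size] entry payload               (entry.write(currentBuffer))
//   short      backLink = size + LOG_ENTRY_HEADER_LEN   (enables backward scans)
//   long       xxHash64 checksum over all preceding bytes of this entry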
Use of org.exist.storage.txn.Checkpoint in project exist by eXist-db: class Journal, method checkpoint().
/**
* Write a checkpoint record to the journal and flush it. If switchLogFiles is true,
* a new journal file will be started, but only if the current file is larger than
* {@link #journalSizeMin}. The old journal file is then removed.
*
* @param txnId The transaction id
* @param switchLogFiles Indicates whether a new journal file should be started
* @throws JournalException if the checkpoint could not be written to the journal.
*/
public synchronized void checkpoint(final long txnId, final boolean switchLogFiles) throws JournalException {
LOG.debug("Checkpoint reached");
writeToLog(new Checkpoint(txnId));
if (switchLogFiles) {
// if we switch files, we don't need to sync.
// the file will be removed anyway.
flushBuffer();
} else {
flushToLog(true, true);
}
try {
if (switchLogFiles && channel != null && channel.position() > journalSizeMin) {
final Path oldFile = getFile(currentJournalFileNumber);
// takes ownership of channel and oldFile when `start` is called
final RemoveRunnable removeRunnable = new RemoveRunnable(channel, oldFile);
try {
switchFiles();
} catch (final LogException e) {
LOG.warn("Failed to create new journal: {}", e.getMessage(), e);
}
final Thread removeThread = newInstanceThread(pool, "remove-journal", removeRunnable);
removeThread.start();
}
} catch (final IOException e) {
LOG.warn("IOException while writing checkpoint", e);
}
}
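An illustrative usage sketch of checkpoint(long, boolean); the variable names journal and txnId are assumptions, standing in for a Journal instance and a transaction id obtained elsewhere in eXist-db:

// Illustrative call sites only; `journal` and `txnId` are assumed to be in scope.
journal.checkpoint(txnId, false); // write, flush and sync the checkpoint, keep the current file
journal.checkpoint(txnId, true);  // write and flush, then switch to a new journal file
                                  // (only if the current one exceeds journalSizeMin) and
                                  // remove the old file on a background thread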
Use of org.exist.storage.txn.Checkpoint in project exist by eXist-db: class Journal, method shutdown().
/**
* Shut down the journal. This will write a checkpoint record
* to the log, so the recovery manager knows the file has been
* closed cleanly.
*
* @param txnId the transaction id.
* @param checkpoint true if a checkpoint should be written before shutdown
*/
public synchronized void shutdown(final long txnId, final boolean checkpoint) {
if (!initialised) {
// no journal is initialized
return;
}
if (currentBuffer == null) {
// the db has probably shut down already
return;
}
if (!BrokerPool.FORCE_CORRUPTION) {
if (checkpoint) {
LOG.info("Shutting down Journal with checkpoint...");
try {
writeToLog(new Checkpoint(txnId));
} catch (final JournalException e) {
LOG.error("An error occurred whilst writing a checkpoint to the Journal: {}", e.getMessage(), e);
}
}
flushBuffer();
}
try {
channel.close();
} catch (final IOException e) {
LOG.error("Unable to close Journal file: {}", e.getMessage(), e);
}
channel = null;
fileLock.release();
currentBuffer = null;
}
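And, again as a hedged sketch with journal and txnId assumed to be in scope, the two shutdown modes the javadoc describes:

// Clean shutdown: a final checkpoint is written, so the next start-up finds the journal consistent.
journal.shutdown(txnId, true);
// Shutdown without a checkpoint: the next start-up detects an unclean state and runs recovery.
journal.shutdown(txnId, false);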