Search in sources :

Example 1 with DataPageMetaData

use of herddb.core.PageSet.DataPageMetaData in project herddb by diennea.

the class TableStatus method deserialize.

public static TableStatus deserialize(ExtendedDataInputStream in) throws IOException {
    // version
    long version = in.readVLong();
    // flags for future implementations
    long flags = in.readVLong();
    if (version != 1 || flags != 0) {
        throw new DataStorageManagerException("corrupted table status");
    String tableName = in.readUTF();
    long ledgerId = in.readLong();
    long offset = in.readLong();
    long nextPageId = in.readLong();
    byte[] nextPrimaryKeyValue = in.readArray();
    int numActivePages = in.readVInt();
    Map<Long, DataPageMetaData> activePages = new HashMap<>(numActivePages);
    for (int i = 0; i < numActivePages; i++) {
        activePages.put(in.readVLong(), DataPageMetaData.deserialize(in));
    return new TableStatus(tableName, new LogSequenceNumber(ledgerId, offset), nextPrimaryKeyValue, nextPageId, activePages);
Also used : DataPageMetaData(herddb.core.PageSet.DataPageMetaData) HashMap(java.util.HashMap) LogSequenceNumber(herddb.log.LogSequenceNumber)

Example 2 with DataPageMetaData

use of herddb.core.PageSet.DataPageMetaData in project herddb by diennea.

the class TableManager method start.

// Boots this table manager: resets bootSequenceNumber to START_OF_TIME, restores
// nextPrimaryKeyValue / nextPageId / bootSequenceNumber from the stored table status
// (via a full table scan when keyToPage.requireLoadAtStartup() is true, otherwise via
// getLatestTableStatus), calls cleanupAfterBoot and finally sets started = true.
//
// NOTE(review): this listing looks truncated — closing braces, the terminator of the
// anonymous FullTableScanConsumer and some log arguments are missing. Confirm against
// the real source before relying on it.
public void start() throws DataStorageManagerException {
    // NOTE(review): nothing visible here ever adds to this map, so cleanupAfterBoot below
    // receives an empty key set as written — presumably the populating calls were lost.
    Map<Long, DataPageMetaData> activePagesAtBoot = new HashMap<>();
    bootSequenceNumber = LogSequenceNumber.START_OF_TIME;
    boolean requireLoadAtStartup = keyToPage.requireLoadAtStartup();
    if (requireLoadAtStartup) {
        // non persistent primary key index, we need a full table scan
        // NOTE(review): the message has a {0} placeholder but the parameter array is empty.
        LOGGER.log(Level.SEVERE, "loading in memory all the keys for table {0}", new Object[] { });
        dataStorageManager.fullTableScan(tableSpaceUUID, table.uuid, new FullTableScanConsumer() {

            // Page currently being scanned; null outside of a startPage/endPage pair.
            Long currentPage;

            // Restores the counters and the boot sequence number from the scanned status.
            public void acceptTableStatus(TableStatus tableStatus) {
                LOGGER.log(Level.SEVERE, "recovery table at " + tableStatus.sequenceNumber);
                nextPrimaryKeyValue.set(Bytes.toLong(tableStatus.nextPrimaryKeyValue, 0));
                nextPageId = tableStatus.nextPageId;
                bootSequenceNumber = tableStatus.sequenceNumber;

            public void startPage(long pageId) {
                currentPage = pageId;

            // Maps the record's key to the page being scanned.
            public void acceptRecord(Record record) {
                // NOTE(review): this comparison unboxes currentPage; if it is still null
                // (no startPage yet, or after endPage) a NullPointerException is thrown
                // instead of the IllegalStateException below.
                if (currentPage < 0) {
                    throw new IllegalStateException();
                keyToPage.put(record.key, currentPage);

            public void endPage() {
                currentPage = null;

            public void endTable() {
    } else {
        // The key-to-page index does not need rebuilding: only read the latest table status.
        // NOTE(review): the first element of the parameter array is missing before
        // ", table.uuid" — presumably the table name; confirm.
        LOGGER.log(Level.SEVERE, "loading table {0}, uuid {1}", new Object[] {, table.uuid });
        TableStatus tableStatus = dataStorageManager.getLatestTableStatus(tableSpaceUUID, table.uuid);
        LOGGER.log(Level.SEVERE, "recovery table at " + tableStatus.sequenceNumber);
        nextPrimaryKeyValue.set(Bytes.toLong(tableStatus.nextPrimaryKeyValue, 0));
        nextPageId = tableStatus.nextPageId;
        bootSequenceNumber = tableStatus.sequenceNumber;
    // Passes the ids of the pages considered active at boot to the storage manager.
    dataStorageManager.cleanupAfterBoot(tableSpaceUUID, table.uuid, activePagesAtBoot.keySet());
    // NOTE(review): the argument for placeholder {1} is missing between the two commas.
    LOGGER.log(Level.SEVERE, "loaded {0} keys for table {1}, newPageId {2}, nextPrimaryKeyValue {3}, activePages {4}", new Object[] { keyToPage.size(),, nextPageId, nextPrimaryKeyValue.get(), pageSet.getActivePages() + "" });
    started = true;
Also used : DataPageMetaData(herddb.core.PageSet.DataPageMetaData) FullTableScanConsumer(herddb.storage.FullTableScanConsumer) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) AtomicLong(java.util.concurrent.atomic.AtomicLong) TableStatus(herddb.storage.TableStatus) Record(herddb.model.Record)

Example 3 with DataPageMetaData

use of herddb.core.PageSet.DataPageMetaData in project herddb by diennea.

the class TableManager method checkpoint.

/**
 * @param sequenceNumber
 * @param dirtyThreshold
 * @param fillThreshold
 * @param checkpointTargetTime checkpoint target max milliseconds
 * @param compactionTargetTime compaction target max milliseconds
 * @return
 * @throws DataStorageManagerException
 */
// Runs a checkpoint of this table while holding the checkpoint write lock and returns a
// TableCheckpoint built from the log's last sequence number and the collected
// post-checkpoint actions. Returns null when the table was created in a transaction
// (createdInTransaction > 0). Timestamps are taken after each phase and a per-phase
// breakdown is logged when the whole run took more than 1000 ms.
//
// NOTE(review): this listing looks truncated — closing braces, comment delimiters, several
// statements (bodies of some "if" blocks, the "finally" body) and some expressions
// ("records =;", arguments before ", sequenceNumber") are missing. Confirm against the
// real source before relying on it.
private TableCheckpoint checkpoint(double dirtyThreshold, double fillThreshold, long checkpointTargetTime, long compactionTargetTime, boolean pin) throws DataStorageManagerException {
    // A table created by a transaction is not checkpointed; the caller gets null.
    if (createdInTransaction > 0) {
        LOGGER.log(Level.SEVERE, "checkpoint for table " + + " skipped," + "this table is created on transaction " + createdInTransaction + " which is not committed");
        return null;
    // Both thresholds are fractions of maxLogicalPageSize converted to absolute sizes.
    final long fillPageThreshold = (long) (fillThreshold * maxLogicalPageSize);
    final long dirtyPageThreshold = (long) (dirtyThreshold * maxLogicalPageSize);
    // Phase timestamps, only used for the "long checkpoint" report at the end.
    long start = System.currentTimeMillis();
    long end;
    long getlock;
    long pageAnalysis;
    long dirtyPagesFlush;
    long smallPagesFlush;
    long newPagesFlush;
    long keytopagecheckpoint;
    long indexcheckpoint;
    long tablecheckpoint;
    final List<PostCheckpointAction> actions = new ArrayList<>();
    TableCheckpoint result;
    boolean lockAcquired;
    // Bounded wait for the write lock: CHECKPOINT_LOCK_WRITE_TIMEOUT seconds.
    try {
        lockAcquired = checkpointLock.asWriteLock().tryLock(CHECKPOINT_LOCK_WRITE_TIMEOUT, TimeUnit.SECONDS);
    } catch (InterruptedException err) {
        // NOTE(review): the interrupt is wrapped but the thread's interrupt flag is not restored.
        throw new DataStorageManagerException("interrupted while waiting for checkpoint lock", err);
    if (!lockAcquired) {
        throw new DataStorageManagerException("timed out while waiting for checkpoint lock, write lock " + checkpointLock.writeLock());
    try {
        // Everything below is checkpointed at this log position.
        LogSequenceNumber sequenceNumber = log.getLastSequenceNumber();
        getlock = System.currentTimeMillis();
        checkPointRunning = true;
        // Instant after which the dirty-page loop is meant to stop (see the check at its end).
        final long checkpointLimitInstant = sumOverflowWise(getlock, checkpointTargetTime);
        final Map<Long, DataPageMetaData> activePages = pageSet.getActivePages();
        // Records accumulated for the next page to be created, and their estimated total size.
        Map<Bytes, Record> buffer = new HashMap<>();
        long bufferPageSize = 0;
        long flushedRecords = 0;
        final List<WeightedPage> flushingDirtyPages = new ArrayList<>();
        final List<WeightedPage> flushingSmallPages = new ArrayList<>();
        // NOTE(review): in the visible code nothing is added to flushedPages and the
        // flushedDirtyPages / flushedSmallPages / flushedNewPages counters are never
        // incremented — presumably those statements were lost from the listing.
        final List<Long> flushedPages = new ArrayList<>();
        int flushedDirtyPages = 0;
        int flushedSmallPages = 0;
        // Page analysis: classify every active page as dirty and/or small.
        for (Entry<Long, DataPageMetaData> ref : activePages.entrySet()) {
            final Long pageId = ref.getKey();
            final DataPageMetaData metadata = ref.getValue();
            final long dirt = metadata.dirt.sum();
                 * Check dirtiness (flush here even small pages if dirty. Small pages flush IGNORES dirty data
                 * handling).
            if (dirt > 0 && (dirt >= dirtyPageThreshold || metadata.size <= fillPageThreshold)) {
                flushingDirtyPages.add(new WeightedPage(pageId, dirt));
            /* Check emptiness (with a really dirty check to avoid to rewrite an unfillable page) */
            if (metadata.size <= fillPageThreshold && maxLogicalPageSize - metadata.avgRecordSize >= fillPageThreshold) {
                flushingSmallPages.add(new WeightedPage(pageId, metadata.size));
        /* Clean dirtier first */
        /* Clean smaller first */
        pageAnalysis = System.currentTimeMillis();
        /* Rebuild dirty pages with only records to be kept */
        for (WeightedPage weighted : flushingDirtyPages) {
            /* Page flushed */
            final DataPage dataPage = pages.get(weighted.pageId);
            final Collection<Record> records;
            // Page not found in the pages map: read its records from storage.
            if (dataPage == null) {
                records = dataStorageManager.readPage(tableSpaceUUID, table.uuid, weighted.pageId);
                LOGGER.log(Level.FINEST, "loaded dirty page {0} on tmp buffer: {1} records", new Object[] { weighted.pageId, records.size() });
            } else {
                // NOTE(review): the right-hand side of this assignment is missing.
                records =;
            for (Record record : records) {
                /* Avoid the record if has been modified or deleted */
                final Long currentPageId = keyToPage.get(record.key);
                // NOTE(review): the body of this check is missing from the listing.
                if (currentPageId == null || !weighted.pageId.equals(currentPageId)) {
                /* Flush the page if it would exceed max page size */
                if (bufferPageSize + DataPage.estimateEntrySize(record) > maxLogicalPageSize) {
                    createImmutablePage(buffer, bufferPageSize);
                    flushedRecords += buffer.size();
                    bufferPageSize = 0;
                    /* Do not clean old buffer! It will used in generated pages to avoid too many copies! */
                    buffer = new HashMap<>(buffer.size());
                buffer.put(record.key, record);
                bufferPageSize += DataPage.estimateEntrySize(record);
            /* Do not continue if we have used up all configured checkpoint time */
            if (checkpointLimitInstant <= System.currentTimeMillis()) {
        dirtyPagesFlush = System.currentTimeMillis();
             * If there is only one without additional data to add
             * rebuilding the page make no sense: is too probable to rebuild an identical page!
        if (flushingSmallPages.size() == 1 && buffer.isEmpty()) {
            boolean hasNewPagesData = newPages.values().stream().filter(p -> !p.isEmpty()).findAny().isPresent();
            // NOTE(review): the action taken when there is no new page data is missing.
            if (!hasNewPagesData) {
        // The compaction time budget starts when the dirty-page phase ends.
        final long compactionLimitInstant = sumOverflowWise(dirtyPagesFlush, compactionTargetTime);
        /* Rebuild too small pages */
        for (WeightedPage weighted : flushingSmallPages) {
            /* Page flushed */
            final DataPage dataPage = pages.get(weighted.pageId);
            final Collection<Record> records;
            if (dataPage == null) {
                records = dataStorageManager.readPage(tableSpaceUUID, table.uuid, weighted.pageId);
                LOGGER.log(Level.FINEST, "loaded small page {0} on tmp buffer: {1} records", new Object[] { weighted.pageId, records.size() });
            } else {
                // NOTE(review): the right-hand side of this assignment is missing.
                records =;
            // Unlike the dirty-page loop, no keyToPage check is visible here.
            for (Record record : records) {
                /* Flush the page if it would exceed max page size */
                if (bufferPageSize + DataPage.estimateEntrySize(record) > maxLogicalPageSize) {
                    createImmutablePage(buffer, bufferPageSize);
                    flushedRecords += buffer.size();
                    bufferPageSize = 0;
                    /* Do not clean old buffer! It will used in generated pages to avoid too many copies! */
                    buffer = new HashMap<>(buffer.size());
                buffer.put(record.key, record);
                bufferPageSize += DataPage.estimateEntrySize(record);
            final long now = System.currentTimeMillis();
                 * Do not continue if we have used up all configured compaction or checkpoint time (but still compact at
                 * least the smaller page (normally the leftover from last checkpoint)
            if (compactionLimitInstant <= now || checkpointLimitInstant <= now) {
        smallPagesFlush = System.currentTimeMillis();
             * Flush dirty records (and remaining records from previous step).
             * Any newpage remaining here is unflushed and is not set as dirty (if "dirty" were unloaded!).
             * Just write the pages as they are.
             * New empty pages won't be written
        long flushedNewPages = 0;
        for (DataPage dataPage : newPages.values()) {
            if (!dataPage.isEmpty()) {
                // The value returned by flushNewPageForCheckpoint is subtracted from the buffer size.
                bufferPageSize -= flushNewPageForCheckpoint(dataPage, buffer);
                // dataPage.makeImmutable();
                flushedRecords += dataPage.size();
        /* Flush remaining records */
        if (!buffer.isEmpty()) {
            createImmutablePage(buffer, bufferPageSize);
            flushedRecords += buffer.size();
            bufferPageSize = 0;
        /* Do not clean old buffer! It will used in generated pages to avoid too many copies! */
        newPagesFlush = System.currentTimeMillis();
        // NOTE(review): the argument for placeholder {0} is missing in this and the following
        // log calls — presumably the table name; confirm.
        LOGGER.log(Level.INFO, "checkpoint {0}, logpos {1}, flushed: {2} dirty pages, {3} small pages, {4} new pages, {5} records", new Object[] {, sequenceNumber, flushedDirtyPages, flushedSmallPages, flushedNewPages, flushedRecords });
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.log(Level.FINE, "checkpoint {0}, logpos {1}, flushed pages: {2}", new Object[] {, sequenceNumber, flushedPages.toString() });
        /* Checkpoint the key to page too */
        actions.addAll(keyToPage.checkpoint(sequenceNumber, pin));
        keytopagecheckpoint = System.currentTimeMillis();
        /* Checkpoint secondary indexes too */
        // NOTE(review): the argument of getIndexesOnTable is missing.
        final Map<String, AbstractIndexManager> indexes = tableSpaceManager.getIndexesOnTable(;
        if (indexes != null) {
            for (AbstractIndexManager indexManager : indexes.values()) {
                // Checkpoint at the same position of current TableManager
                actions.addAll(indexManager.checkpoint(sequenceNumber, pin));
        indexcheckpoint = System.currentTimeMillis();
        // Status written for this checkpoint: log position, next primary key, next page id, active pages.
        TableStatus tableStatus = new TableStatus(, sequenceNumber, Bytes.from_long(nextPrimaryKeyValue.get()).data, nextPageId, pageSet.getActivePages());
        actions.addAll(dataStorageManager.tableCheckpoint(tableSpaceUUID, table.uuid, tableStatus, pin));
        tablecheckpoint = System.currentTimeMillis();
        /* Remove flushed pages handled */
        for (Long pageId : flushedPages) {
            final DataPage page = pages.remove(pageId);
            /* Current dirty record page isn't known to page replacement policy */
            if (page != null && currentDirtyRecordsPage.get() != page.pageId) {
             * Can happen when at checkpoint start all pages are set as dirty or immutable (immutable or
             * unloaded) due do a deletion: all pages will be removed and no page will remain alive.
        if (newPages.isEmpty()) {
            /* Allocate live handles the correct policy load/unload of last dirty page */
        checkPointRunning = false;
        result = new TableCheckpoint(, sequenceNumber, actions);
        end = System.currentTimeMillis();
        LOGGER.log(Level.INFO, "checkpoint {0} finished, logpos {1}, {2} active pages, {3} dirty pages, " + "flushed {4} records, total time {5} ms", new Object[] {, sequenceNumber, pageSet.getActivePagesCount(), pageSet.getDirtyPagesCount(), flushedRecords, Long.toString(end - start) });
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.log(Level.FINE, "checkpoint {0} finished, logpos {1}, pageSet: {2}", new Object[] {, sequenceNumber, pageSet.toString() });
    // NOTE(review): the finally body is not visible; the write-lock release is expected
    // here — confirm. As written, checkPointRunning is only reset on the success path.
    } finally {
    // Report a per-phase breakdown when the checkpoint took more than one second.
    long delta = end - start;
    if (delta > 1000) {
        long delta_lock = getlock - start;
        long delta_pageAnalysis = pageAnalysis - getlock;
        long delta_dirtyPagesFlush = dirtyPagesFlush - pageAnalysis;
        long delta_smallPagesFlush = smallPagesFlush - dirtyPagesFlush;
        long delta_newPagesFlush = newPagesFlush - smallPagesFlush;
        long delta_keytopagecheckpoint = keytopagecheckpoint - newPagesFlush;
        long delta_indexcheckpoint = indexcheckpoint - keytopagecheckpoint;
        long delta_tablecheckpoint = tablecheckpoint - indexcheckpoint;
        long delta_unload = end - tablecheckpoint;
        LOGGER.log(Level.INFO, "long checkpoint for {0}, time {1}", new Object[] {, delta + " ms (" + delta_lock + "+" + delta_pageAnalysis + "+" + delta_dirtyPagesFlush + "+" + delta_smallPagesFlush + "+" + delta_newPagesFlush + "+" + delta_keytopagecheckpoint + "+" + delta_indexcheckpoint + "+" + delta_tablecheckpoint + "+" + delta_unload + ")" });
    return result;
Also used : DataStorageManagerException(herddb.storage.DataStorageManagerException) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DataPageMetaData(herddb.core.PageSet.DataPageMetaData) Bytes(herddb.utils.Bytes) TableStatus(herddb.storage.TableStatus) Record(herddb.model.Record) LogSequenceNumber(herddb.log.LogSequenceNumber) AtomicLong(java.util.concurrent.atomic.AtomicLong)


DataPageMetaData (herddb.core.PageSet.DataPageMetaData)3 HashMap (java.util.HashMap)3 LogSequenceNumber (herddb.log.LogSequenceNumber)2 Record (herddb.model.Record)2 TableStatus (herddb.storage.TableStatus)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2 AtomicLong (java.util.concurrent.atomic.AtomicLong)2 DataStorageManagerException (herddb.storage.DataStorageManagerException)1 FullTableScanConsumer (herddb.storage.FullTableScanConsumer)1 Bytes (herddb.utils.Bytes)1 ArrayList (java.util.ArrayList)1