Search in sources :

Example 1 with ReplayPosition

use of org.apache.cassandra.db.commitlog.ReplayPosition in project eiger by wlloyd.

the class ColumnFamilyStore method createCompactionWriter.

/**
 * Creates the writer used for the output of compacting the given sstables.
 * The writer's metadata collector is seeded with the replay position computed from
 * {@code sstables} and with the max timestamp reported by each of them.
 *
 * @param estimatedRows estimated row count handed to the new writer
 * @param location directory used to build the temporary sstable path
 * @param sstables the sstables being compacted
 * @return a writer targeting a temporary sstable path under {@code location}
 * @throws IOException declared by the original signature
 */
public SSTableWriter createCompactionWriter(long estimatedRows, File location, Collection<SSTableReader> sstables) throws IOException {
    final ReplayPosition inputsPosition = ReplayPosition.getReplayPosition(sstables);
    final SSTableMetadata.Collector collector = SSTableMetadata.createCollector().replayPosition(inputsPosition);
    // carry over the max timestamp of every sstable feeding this compaction
    for (SSTableReader input : sstables) {
        collector.updateMaxTimestamp(input.getMaxTimestamp());
    }
    return new SSTableWriter(getTempSSTablePath(location), estimatedRows, metadata, partitioner, collector);
}
Also used : ReplayPosition(org.apache.cassandra.db.commitlog.ReplayPosition)

Example 2 with ReplayPosition

use of org.apache.cassandra.db.commitlog.ReplayPosition in project eiger by wlloyd.

the class ColumnFamilyStore method maybeSwitchMemtable.

/**
 * Flush the given memtable and swap in a new one for its CFS, if it hasn't been frozen already. Threadsafe.
 *
 * @param oldMemtable the memtable the caller wants flushed; must still be the current memtable
 *                    of this store once the switch lock is held (asserted below)
 * @param writeCommitLog if true, the current commit log context is captured as the flush position
 *                       and completed segments are discarded after the flush; if false,
 *                       ReplayPosition.NONE is used and nothing is discarded
 * @return the Future of the post-flush task submitted to postFlushExecutor, or null if
 *         oldMemtable was already frozen (checked both before and after taking the lock)
 */
public Future<?> maybeSwitchMemtable(Memtable oldMemtable, final boolean writeCommitLog) {
    // cheap check without the lock; repeated below once the lock is held
    if (oldMemtable.isFrozen()) {
        logger.debug("memtable is already frozen; another thread must be flushing it");
        return null;
    }
    /*
         * If we can get the writelock, that means no new updates can come in and
         * all ongoing updates to memtables have completed. We can get the tail
         * of the log and use it as the starting position for log replay on recovery.
         *
         * This is why the lock taken here (Table.switchLock) needs to be global instead of
         * per-Table: we need to schedule discardCompletedSegments calls in the same order as
         * their contexts (commitlog position) were read, even though the flush executor
         * is multithreaded.
         */
    Table.switchLock.writeLock().lock();
    try {
        // re-check under the lock: another thread may have frozen the memtable in the meantime
        if (oldMemtable.isFrozen()) {
            logger.debug("memtable is already frozen; another thread must be flushing it");
            return null;
        }
        assert getMemtableThreadSafe() == oldMemtable;
        final ReplayPosition ctx = writeCommitLog ? CommitLog.instance.getContext() : ReplayPosition.NONE;
        logger.debug("flush position is {}", ctx);
        // submit the memtable for any indexed sub-cfses, and our own.
        final List<ColumnFamilyStore> icc = new ArrayList<ColumnFamilyStore>();
        // don't assume that this.memtable is dirty; forceFlush can bring us here during index build even if it is not
        for (ColumnFamilyStore cfs : concatWithIndexes()) {
            Memtable mt = cfs.getMemtableThreadSafe();
            if (!mt.isClean() && !mt.isFrozen()) {
                // We need to freeze indexes too because they can be concurrently flushed too (#3547)
                mt.freeze();
                icc.add(cfs);
            }
        }
        // one count per store whose memtable is enqueued for flush below
        final CountDownLatch latch = new CountDownLatch(icc.size());
        for (ColumnFamilyStore cfs : icc) {
            Memtable memtable = cfs.data.switchMemtable();
            logger.info("Enqueuing flush of {}", memtable);
            memtable.flushAndSignal(latch, flushWriter, ctx);
        }
        // reset the counter before incrementing so it never overflows past Integer.MAX_VALUE
        if (memtableSwitchCount == Integer.MAX_VALUE)
            memtableSwitchCount = 0;
        memtableSwitchCount++;
        // The post-flush task waits on the latch handed to the flushes enqueued above before it
        // discards commit log segments. Running it on postFlushExecutor lets that happen
        // while keeping the wait-for-flush (future.get) out of anything latency-sensitive.
        return postFlushExecutor.submit(new WrappedRunnable() {

            public void runMayThrow() throws InterruptedException, IOException {
                latch.await();
                if (!icc.isEmpty()) {
                    for (SecondaryIndex index : indexManager.getIndexesNotBackedByCfs()) {
                        // flush any non-cfs backed indexes
                        logger.info("Flushing SecondaryIndex {}", index);
                        index.forceBlockingFlush();
                    }
                }
                if (writeCommitLog) {
                    // if we're not writing to the commit log, we are replaying the log, so marking
                    // the log header with "you can discard anything written before the context" is not valid
                    CommitLog.instance.discardCompletedSegments(metadata.cfId, ctx);
                }
            }
        });
    } finally {
        Table.switchLock.writeLock().unlock();
    }
}
Also used : SecondaryIndex(org.apache.cassandra.db.index.SecondaryIndex) ReplayPosition(org.apache.cassandra.db.commitlog.ReplayPosition)

Example 3 with ReplayPosition

use of org.apache.cassandra.db.commitlog.ReplayPosition in project eiger by wlloyd.

the class ColumnFamilyStore method truncate.

/**
 * Truncate practically deletes the entire column family's data.
 *
 * The method flushes this column family, forces a new commit log segment, flushes every
 * column family, waits for active flushes, discards completed commit log segments, pauses
 * briefly, records the truncation timestamp, takes a snapshot, and finally submits the
 * truncation to the CompactionManager.
 *
 * @return a Future to the delete operation. Call the future's get() to make
 * sure the column family has been deleted
 * @throws IOException declared by the original signature
 * @throws ExecutionException declared by the original signature
 * @throws InterruptedException if the calling thread is interrupted, including during the
 * short pause taken before the truncation timestamp is read
 */
public Future<?> truncate() throws IOException, ExecutionException, InterruptedException {
    // We have two goals here:
    // - truncate should delete everything written before truncate was invoked
    // - but not delete anything that isn't part of the snapshot we create.
    // We accomplish this by first flushing manually, then snapshotting, and
    // recording the timestamp IN BETWEEN those actions. Any sstables created
    // with this timestamp or greater time, will not be marked for delete.
    //
    // Bonus complication: since we store replay position in sstable metadata,
    // truncating those sstables means we will replay any CL segments from the
    // beginning if we restart before they are discarded for normal reasons
    // post-truncate.  So we need to (a) force a new segment so the currently
    // active one can be discarded, and (b) flush *all* CFs so that unflushed
    // data in others don't keep any pre-truncate CL segments alive.
    //
    // Bonus bonus: simply forceFlush of all the CF is not enough, because if
    // for a given column family the memtable is clean, forceFlush will return
    // immediately, even though there could be a memtable being flushed at the same
    // time.  So to guarantee that all segments can be cleaned out, we need to
    // "waitForActiveFlushes" after the new segment has been created.
    logger.debug("truncating {}", columnFamily);
    // flush the CF being truncated before forcing the new segment
    forceBlockingFlush();
    CommitLog.instance.forceNewSegment();
    ReplayPosition position = CommitLog.instance.getContext();
    // now flush everyone else.  re-flushing ourselves is not necessary, but harmless
    for (ColumnFamilyStore cfs : ColumnFamilyStore.all()) {
        cfs.forceFlush();
    }
    waitForActiveFlushes();
    // if everything was clean, flush won't have called discard
    CommitLog.instance.discardCompletedSegments(metadata.cfId, position);
    // sleep a little to make sure that our truncatedAt comes after any sstable
    // that was part of the flushed we forced; otherwise on a tie, it won't get deleted.
    // An interrupt here propagates as the InterruptedException this method already
    // declares, rather than being converted into an AssertionError.
    Thread.sleep(100);
    long truncatedAt = System.currentTimeMillis();
    snapshot(Table.getTimestampedSnapshotName(columnFamily));
    return CompactionManager.instance.submitTruncate(this, truncatedAt);
}
Also used : ReplayPosition(org.apache.cassandra.db.commitlog.ReplayPosition)

Example 4 with ReplayPosition

use of org.apache.cassandra.db.commitlog.ReplayPosition in project eiger by wlloyd.

the class SSTableMetadataSerializerTest method testSerialization.

/**
 * Round-trips an SSTableMetadata through its serializer and checks that the row-size
 * histogram, column-count histogram, replay position, max timestamp and partitioner
 * name all come back equal to what was put in.
 */
@Test
public void testSerialization() throws IOException {
    // expected values fed into the collector
    final String partitionerName = RandomPartitioner.class.getCanonicalName();
    final long expectedMaxTimestamp = 4162517136L;
    final ReplayPosition expectedPosition = new ReplayPosition(11L, 12);
    final EstimatedHistogram expectedRowSizes = new EstimatedHistogram(new long[] { 1L, 2L }, new long[] { 3L, 4L, 5L });
    final EstimatedHistogram expectedColumnCounts = new EstimatedHistogram(new long[] { 6L, 7L }, new long[] { 8L, 9L, 10L });

    // build the metadata under test
    final SSTableMetadata.Collector metadataCollector = SSTableMetadata.createCollector()
            .estimatedRowSize(expectedRowSizes)
            .estimatedColumnCount(expectedColumnCounts)
            .replayPosition(expectedPosition);
    metadataCollector.updateMaxTimestamp(expectedMaxTimestamp);
    final SSTableMetadata written = metadataCollector.finalizeMetadata(partitionerName);

    // serialize into an in-memory buffer
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    SSTableMetadata.serializer.serialize(written, new DataOutputStream(sink));

    // deserialize from the bytes just written
    final DataInputStream source = new DataInputStream(new ByteArrayInputStream(sink.toByteArray()));
    final Descriptor descriptor = new Descriptor(Descriptor.CURRENT_VERSION, new File("."), "", "", 0, false);
    final SSTableMetadata read = SSTableMetadata.serializer.deserialize(source, descriptor);

    // every field must match both the serialized object and the raw expected value
    assert read.estimatedRowSize.equals(written.estimatedRowSize);
    assert read.estimatedRowSize.equals(expectedRowSizes);
    assert read.estimatedColumnCount.equals(written.estimatedColumnCount);
    assert read.estimatedColumnCount.equals(expectedColumnCounts);
    assert read.replayPosition.equals(written.replayPosition);
    assert read.replayPosition.equals(expectedPosition);
    assert read.maxTimestamp == expectedMaxTimestamp;
    assert read.maxTimestamp == written.maxTimestamp;
    assert partitionerName.equals(read.partitioner);
}
Also used : RandomPartitioner(org.apache.cassandra.dht.RandomPartitioner) EstimatedHistogram(org.apache.cassandra.utils.EstimatedHistogram) ByteArrayInputStream(java.io.ByteArrayInputStream) DataOutputStream(java.io.DataOutputStream) ReplayPosition(org.apache.cassandra.db.commitlog.ReplayPosition) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataInputStream(java.io.DataInputStream) File(java.io.File) Test(org.junit.Test)

Aggregations

ReplayPosition (org.apache.cassandra.db.commitlog.ReplayPosition)4 ByteArrayInputStream (java.io.ByteArrayInputStream)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 DataInputStream (java.io.DataInputStream)1 DataOutputStream (java.io.DataOutputStream)1 File (java.io.File)1 SecondaryIndex (org.apache.cassandra.db.index.SecondaryIndex)1 RandomPartitioner (org.apache.cassandra.dht.RandomPartitioner)1 EstimatedHistogram (org.apache.cassandra.utils.EstimatedHistogram)1 Test (org.junit.Test)1