Search in sources :

Example 6 with RangesAtEndpoint

use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.

the class DiskBoundaryManager method getDiskBoundaryValue.

/**
 * Computes the disk boundaries for a table from this node's local ranges and the table's
 * writeable data directories.
 *
 * The ring version and the directories version are each sampled before the data they guard is
 * read and compared again afterwards; if a version moved in between, the read is repeated.
 *
 * @param cfs the table to compute boundaries for
 * @return boundaries stamped with the ring version and directories version they were computed
 *         against; the positions are {@code null} when there are no local ranges
 */
private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs) {
    RangesAtEndpoint ownedRanges;
    long observedRingVersion;
    TokenMetadata metadata;
    do {
        metadata = StorageService.instance.getTokenMetadata();
        observedRingVersion = metadata.getRingVersion();
        // When replacing same address, the node marks itself as UN locally, so that case is
        // deliberately excluded from the bootstrap path.
        boolean bootstrapping = StorageService.instance.isBootstrapMode() && !StorageService.isReplacingSameAddress();
        if (bootstrapping) {
            PendingRangeCalculatorService.instance.blockUntilFinished();
            ownedRanges = metadata.getPendingRanges(cfs.keyspace.getName(), FBUtilities.getBroadcastAddressAndPort());
        } else {
            // The future, settled token metadata is used on purpose: if a node is being decommissioned we
            // want to stream from that node to the correct location on disk, otherwise new files would be
            // put in the wrong places. This minimizes the amount of data that has to be moved by
            // rebalancedisks once everything has settled.
            ownedRanges = cfs.keyspace.getReplicationStrategy().getAddressReplicas(metadata.cloneAfterAllSettled(), FBUtilities.getBroadcastAddressAndPort());
        }
        logger.debug("Got local ranges {} (ringVersion = {})", ownedRanges, observedRingVersion);
        // A different ring version at this point means the ring might have changed before the
        // local ranges were calculated - recalculate.
    } while (observedRingVersion != metadata.getRingVersion());

    int observedDirectoriesVersion;
    Directories.DataDirectory[] writeableDirs;
    do {
        observedDirectoriesVersion = DisallowedDirectories.getDirectoriesVersion();
        writeableDirs = cfs.getDirectories().getWriteableLocations();
        // A changed directories version means the locations just read may be stale - recalculate.
    } while (observedDirectoriesVersion != DisallowedDirectories.getDirectoriesVersion());

    // Without any local range there is nothing to split across the directories.
    if (ownedRanges == null || ownedRanges.isEmpty()) {
        return new DiskBoundaries(cfs, writeableDirs, null, observedRingVersion, observedDirectoriesVersion);
    }
    List<PartitionPosition> boundaryPositions = getDiskBoundaries(ownedRanges, cfs.getPartitioner(), writeableDirs);
    return new DiskBoundaries(cfs, writeableDirs, boundaryPositions, observedRingVersion, observedDirectoriesVersion);
}
Also used : RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint) TokenMetadata(org.apache.cassandra.locator.TokenMetadata) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint)

Example 7 with RangesAtEndpoint

use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.

the class ViewBuilder method build.

/**
 * Runs one round of the view build: works out which local ranges still need building, submits a
 * {@link ViewBuilderTask} per pending range and registers a callback that either starts the next
 * round, records an interruption, or schedules a delayed retry.
 *
 * Returns immediately when the builder has been stopped, and calls {@code finish()} when there is
 * neither a new nor a pending range left.
 */
private synchronized void build() {
    if (isStopped) {
        logger.debug("Stopped build for view({}.{}) after covering {} keys", ksName, view.name, keysBuilt);
        return;
    }

    // Local ranges for which the view has neither been built already nor is currently building
    RangesAtEndpoint localReplicas = StorageService.instance.getLocalReplicas(ksName);
    Replicas.temporaryAssertFull(localReplicas);
    Set<Range<Token>> newRanges = localReplicas.ranges()
                                               .stream()
                                               .flatMap(range -> range.subtractAll(builtRanges).stream())
                                               .flatMap(range -> range.subtractAll(pendingRanges.keySet()).stream())
                                               .collect(Collectors.toSet());

    // Nothing new and nothing pending: the build is complete
    if (newRanges.isEmpty() && pendingRanges.isEmpty()) {
        finish();
        return;
    }

    // Split the new local ranges (when the partitioner offers a splitter) and register each one as
    // pending, with no last token and zero keys built so far
    DatabaseDescriptor.getPartitioner()
                      .splitter()
                      .map(splitter -> splitter.split(newRanges, NUM_TASKS))
                      .orElse(newRanges)
                      .forEach(range -> pendingRanges.put(range, Pair.<Token, Long>create(null, 0L)));

    // Submit one view build task per pending range. Every submitted task is also recorded in
    // `tasks` so that it can be stopped later.
    List<Future<Long>> submitted = pendingRanges.entrySet()
                                                .stream()
                                                .map(entry -> {
                                                    Pair<Token, Long> progress = entry.getValue();
                                                    return new ViewBuilderTask(baseCfs, view, entry.getKey(), progress.left, progress.right);
                                                })
                                                .peek(tasks::add)
                                                .map(CompactionManager.instance::submitViewBuilder)
                                                .collect(toList());

    // Once all tasks are done: process any new local range and mark the view as built, or do a
    // delayed retry if the tasks did not succeed
    Future<List<Long>> allTasks = FutureCombiner.allOf(submitted);
    allTasks.addCallback(new FutureCallback<List<Long>>() {

        public void onSuccess(List<Long> result) {
            keysBuilt += result.stream().mapToLong(Long::longValue).sum();
            builtRanges.addAll(pendingRanges.keySet());
            pendingRanges.clear();
            // Start another round to pick up ranges that became local in the meantime
            build();
        }

        public void onFailure(Throwable t) {
            if (!(t instanceof CompactionInterruptedException)) {
                ScheduledExecutors.nonPeriodicTasks.schedule(() -> loadStatusAndBuild(), 5, TimeUnit.MINUTES);
                logger.warn("Materialized View failed to complete, sleeping 5 minutes before restarting", t);
                return;
            }
            internalStop(true);
            keysBuilt = tasks.stream().mapToLong(ViewBuilderTask::keysBuilt).sum();
            logger.info("Interrupted build for view({}.{}) after covering {} keys", ksName, view.name, keysBuilt);
        }
    });
    this.future = allTasks;
}
Also used : ScheduledExecutors(org.apache.cassandra.concurrent.ScheduledExecutors) CompactionManager(org.apache.cassandra.db.compaction.CompactionManager) LoggerFactory(org.slf4j.LoggerFactory) Range(org.apache.cassandra.dht.Range) CompactionInterruptedException(org.apache.cassandra.db.compaction.CompactionInterruptedException) SystemKeyspace(org.apache.cassandra.db.SystemKeyspace) SystemDistributedKeyspace(org.apache.cassandra.schema.SystemDistributedKeyspace) Token(org.apache.cassandra.dht.Token) Replicas(org.apache.cassandra.locator.Replicas) Pair(org.apache.cassandra.utils.Pair) Map(java.util.Map) DatabaseDescriptor(org.apache.cassandra.config.DatabaseDescriptor) FutureCombiner(org.apache.cassandra.utils.concurrent.FutureCombiner) Logger(org.slf4j.Logger) FBUtilities(org.apache.cassandra.utils.FBUtilities) Set(java.util.Set) StorageService(org.apache.cassandra.service.StorageService) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Maps(com.google.common.collect.Maps) Sets(com.google.common.collect.Sets) FutureCallback(com.google.common.util.concurrent.FutureCallback) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) ColumnFamilyStore(org.apache.cassandra.db.ColumnFamilyStore) Future(org.apache.cassandra.utils.concurrent.Future) ImmediateFuture(org.apache.cassandra.utils.concurrent.ImmediateFuture) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint) CompactionInterruptedException(org.apache.cassandra.db.compaction.CompactionInterruptedException) Token(org.apache.cassandra.dht.Token) Range(org.apache.cassandra.dht.Range) CompactionManager(org.apache.cassandra.db.compaction.CompactionManager) Future(org.apache.cassandra.utils.concurrent.Future) ImmediateFuture(org.apache.cassandra.utils.concurrent.ImmediateFuture) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList)

Example 8 with RangesAtEndpoint

use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.

the class LocalSessions method handlePrepareMessage.

/**
 * The PrepareConsistentRequest promotes the parent repair session to a consistent incremental
 * session, and isolates the data to be repaired from the rest of the table's data
 *
 * No response is sent to the repair coordinator until the data preparation / isolation has completed
 * successfully. If the data preparation fails, a failure message is sent to the coordinator,
 * cancelling the session.
 *
 * If the parent repair session cannot be retrieved, the error is logged together with its cause, a
 * failure response is sent to the coordinator and no local session is created.
 *
 * @param from the endpoint the request was received from (only used for trace logging here)
 * @param request the prepare request, carrying the parent session id, the coordinator and the participants
 */
public void handlePrepareMessage(InetAddressAndPort from, PrepareConsistentRequest request) {
    logger.trace("received {} from {}", request, from);
    UUID sessionID = request.parentSession;
    InetAddressAndPort coordinator = request.coordinator;
    Set<InetAddressAndPort> peers = request.participants;
    ActiveRepairService.ParentRepairSession parentSession;
    try {
        parentSession = getParentRepairSession(sessionID);
    } catch (Throwable e) {
        // The throwable is passed as the trailing argument so that its message and stack trace are
        // logged; without it the reason for the failed lookup is lost.
        logger.error("Error retrieving ParentRepairSession for session {}, responding with failure", sessionID, e);
        sendMessage(coordinator, Message.out(PREPARE_CONSISTENT_RSP, new PrepareConsistentResponse(sessionID, getBroadcastAddressAndPort(), false)));
        return;
    }
    LocalSession session = createSessionUnsafe(sessionID, parentSession, peers);
    putSessionUnsafe(session);
    logger.info("Beginning local incremental repair session {}", session);
    // Dedicated executor for this session, sized by the number of tables in the parent session;
    // it is shut down by the callback below on both the success and the failure path.
    ExecutorService executor = executorFactory().pooled("Repair-" + sessionID, parentSession.getColumnFamilyStores().size());
    KeyspaceRepairManager repairManager = parentSession.getKeyspace().getRepairManager();
    RangesAtEndpoint tokenRanges = filterLocalRanges(parentSession.getKeyspace().getName(), parentSession.getRanges());
    // The trailing supplier reports true once the session has left the PREPARING state.
    Future<List<Void>> repairPreparation = prepareSession(repairManager, sessionID, parentSession.getColumnFamilyStores(), tokenRanges, executor, () -> session.getState() != PREPARING);
    repairPreparation.addCallback(new FutureCallback<List<Void>>() {

        public void onSuccess(@Nullable List<Void> result) {
            try {
                logger.info("Prepare phase for incremental repair session {} completed", sessionID);
                if (session.getState() != FAILED)
                    setStateAndSave(session, PREPARED);
                else
                    logger.info("Session {} failed before anticompaction completed", sessionID);
                // The success flag is read from the session state again at this point.
                Message<PrepareConsistentResponse> message = Message.out(PREPARE_CONSISTENT_RSP, new PrepareConsistentResponse(sessionID, getBroadcastAddressAndPort(), session.getState() != FAILED));
                sendMessage(coordinator, message);
            } finally {
                executor.shutdown();
            }
        }

        public void onFailure(Throwable t) {
            try {
                // Interruptions and unknown sessions are logged without a stack trace; anything else
                // is logged as an error with the throwable attached.
                if (Throwables.anyCauseMatches(t, (throwable) -> throwable instanceof CompactionInterruptedException))
                    logger.info("Anticompaction interrupted for session {}: {}", sessionID, t.getMessage());
                else if (Throwables.anyCauseMatches(t, (throwable) -> throwable instanceof NoSuchRepairSessionException))
                    logger.warn("No such repair session: {}", sessionID);
                else
                    logger.error("Prepare phase for incremental repair session {} failed", sessionID, t);
                sendMessage(coordinator, Message.out(PREPARE_CONSISTENT_RSP, new PrepareConsistentResponse(sessionID, getBroadcastAddressAndPort(), false)));
                failSession(sessionID, false);
            } finally {
                executor.shutdown();
            }
        }
    });
}
Also used : NoSuchRepairSessionException(org.apache.cassandra.repair.NoSuchRepairSessionException) KeyspaceRepairManager(org.apache.cassandra.repair.KeyspaceRepairManager) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) STATUS_REQ(org.apache.cassandra.net.Verb.STATUS_REQ) CompactionInterruptedException(org.apache.cassandra.db.compaction.CompactionInterruptedException) ByteBuffer(java.nio.ByteBuffer) BooleanSupplier(java.util.function.BooleanSupplier) Map(java.util.Map) DatabaseDescriptor(org.apache.cassandra.config.DatabaseDescriptor) PendingStats(org.apache.cassandra.repair.consistent.admin.PendingStats) Verify(com.google.common.base.Verify) ImmutableSet(com.google.common.collect.ImmutableSet) FBUtilities(org.apache.cassandra.utils.FBUtilities) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) Collection(java.util.Collection) FinalizeCommit(org.apache.cassandra.repair.messages.FinalizeCommit) Set(java.util.Set) UUID(java.util.UUID) Instant(java.time.Instant) CopyOnWriteArraySet(java.util.concurrent.CopyOnWriteArraySet) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint) FinalizePropose(org.apache.cassandra.repair.messages.FinalizePropose) State(org.apache.cassandra.repair.consistent.ConsistentSession.State) List(java.util.List) FAILED_SESSION_MSG(org.apache.cassandra.net.Verb.FAILED_SESSION_MSG) ColumnFamilyStore(org.apache.cassandra.db.ColumnFamilyStore) Future(org.apache.cassandra.utils.concurrent.Future) Throwables(org.apache.cassandra.utils.Throwables) FINALIZE_PROMISE_MSG(org.apache.cassandra.net.Verb.FINALIZE_PROMISE_MSG) FailureDetector(org.apache.cassandra.gms.FailureDetector) SchemaConstants(org.apache.cassandra.schema.SchemaConstants) DataInputBuffer(org.apache.cassandra.io.util.DataInputBuffer) InetAddressAndPort(org.apache.cassandra.locator.InetAddressAndPort) Iterables(com.google.common.collect.Iterables) 
PendingStat(org.apache.cassandra.repair.consistent.admin.PendingStat) PrepareConsistentResponse(org.apache.cassandra.repair.messages.PrepareConsistentResponse) DataOutputBuffer(org.apache.cassandra.io.util.DataOutputBuffer) TableId(org.apache.cassandra.schema.TableId) PREPARE_CONSISTENT_RSP(org.apache.cassandra.net.Verb.PREPARE_CONSISTENT_RSP) Range(org.apache.cassandra.dht.Range) HashMap(java.util.HashMap) Message(org.apache.cassandra.net.Message) QueryProcessor(org.apache.cassandra.cql3.QueryProcessor) FinalizePromise(org.apache.cassandra.repair.messages.FinalizePromise) SystemKeyspace(org.apache.cassandra.db.SystemKeyspace) ArrayList(java.util.ArrayList) Schema(org.apache.cassandra.schema.Schema) HashSet(java.util.HashSet) UTF8Type(org.apache.cassandra.db.marshal.UTF8Type) Token(org.apache.cassandra.dht.Token) ActiveRepairService(org.apache.cassandra.service.ActiveRepairService) Lists(com.google.common.collect.Lists) CleanupSummary(org.apache.cassandra.repair.consistent.admin.CleanupSummary) STATUS_RSP(org.apache.cassandra.net.Verb.STATUS_RSP) ExecutorService(java.util.concurrent.ExecutorService) Nullable(javax.annotation.Nullable) MessagingService(org.apache.cassandra.net.MessagingService) Logger(org.slf4j.Logger) BytesType(org.apache.cassandra.db.marshal.BytesType) StatusResponse(org.apache.cassandra.repair.messages.StatusResponse) StorageService(org.apache.cassandra.service.StorageService) IOException(java.io.IOException) PrepareConsistentRequest(org.apache.cassandra.repair.messages.PrepareConsistentRequest) UnknownHostException(java.net.UnknownHostException) Ints(com.google.common.primitives.Ints) FutureCallback(com.google.common.util.concurrent.FutureCallback) Replica(org.apache.cassandra.locator.Replica) FailSession(org.apache.cassandra.repair.messages.FailSession) TimeUnit(java.util.concurrent.TimeUnit) RepairMessage(org.apache.cassandra.repair.messages.RepairMessage) UUIDType(org.apache.cassandra.db.marshal.UUIDType) 
IPartitioner(org.apache.cassandra.dht.IPartitioner) Global.executorFactory(org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory) UntypedResultSet(org.apache.cassandra.cql3.UntypedResultSet) Preconditions(com.google.common.base.Preconditions) VisibleForTesting(com.google.common.annotations.VisibleForTesting) StatusRequest(org.apache.cassandra.repair.messages.StatusRequest) InetAddressAndPort(org.apache.cassandra.locator.InetAddressAndPort) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint) ActiveRepairService(org.apache.cassandra.service.ActiveRepairService) PrepareConsistentResponse(org.apache.cassandra.repair.messages.PrepareConsistentResponse) Message(org.apache.cassandra.net.Message) RepairMessage(org.apache.cassandra.repair.messages.RepairMessage) CompactionInterruptedException(org.apache.cassandra.db.compaction.CompactionInterruptedException) KeyspaceRepairManager(org.apache.cassandra.repair.KeyspaceRepairManager) ExecutorService(java.util.concurrent.ExecutorService) NoSuchRepairSessionException(org.apache.cassandra.repair.NoSuchRepairSessionException) List(java.util.List) ArrayList(java.util.ArrayList) UUID(java.util.UUID)

Example 9 with RangesAtEndpoint

use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.

the class CompactionManager method doAntiCompaction.

/**
 * Anticompacts the unrepaired sstables of a transaction on behalf of a pending repair session.
 *
 * Sstables that are already repaired are left out. The remaining ones are grouped by the table's
 * compaction strategy manager, and each group is anticompacted in its own transaction split off
 * from {@code txn}. The number of original sstables and the number of resulting sstables are logged.
 *
 * @param cfs the table whose sstables are being anticompacted
 * @param ranges full and transient ranges to be placed into one of the new sstables. The repaired table will be
 *   tracked via the {@link org.apache.cassandra.io.sstable.metadata.StatsMetadata#pendingRepair} field.
 * @param txn a transaction over the repaired sstables to anticompact
 * @param pendingRepair the repair session we're anti-compacting for
 * @param isCancelled function that indicates if active anti-compaction should be canceled
 */
private void doAntiCompaction(ColumnFamilyStore cfs, RangesAtEndpoint ranges, LifecycleTransaction txn, UUID pendingRepair, BooleanSupplier isCancelled) {
    int originalCount = txn.originals().size();
    logger.info("Performing anticompaction on {} sstables for {}", originalCount, pendingRepair);

    Set<SSTableReader> originals = txn.originals();
    // Repairs can take place on both unrepaired (incremental + full) and repaired (full) data.
    // Anti-compacting repaired sstables too would give more accurate repairedAt values, but no actual
    // repairedAt value is currently used, so splitting sstables just for that is not worth it.
    Set<SSTableReader> unrepairedSSTables = originals.stream()
                                                     .filter(sstable -> !sstable.isRepaired())
                                                     .collect(Collectors.toSet());
    cfs.metric.bytesAnticompacted.inc(SSTableReader.getTotalBytes(unrepairedSSTables));

    Collection<Collection<SSTableReader>> groups = cfs.getCompactionStrategyManager().groupSSTablesForAntiCompaction(unrepairedSSTables);

    // Anticompact each group in its own split transaction, which is closed when the group is done
    int resultingCount = 0;
    for (Collection<SSTableReader> group : groups) {
        try (LifecycleTransaction groupTxn = txn.split(group)) {
            resultingCount += antiCompactGroup(cfs, ranges, groupTxn, pendingRepair, isCancelled);
        }
    }

    logger.info("Anticompaction completed successfully, anticompacted from {} to {} sstable(s) for {}.", originalCount, resultingCount, pendingRepair);
}
Also used : NoSuchRepairSessionException(org.apache.cassandra.repair.NoSuchRepairSessionException) WrappedExecutorPlus(org.apache.cassandra.concurrent.WrappedExecutorPlus) SSTableSet(org.apache.cassandra.db.lifecycle.SSTableSet) File(org.apache.cassandra.io.util.File) LoggerFactory(org.slf4j.LoggerFactory) org.apache.cassandra.db(org.apache.cassandra.db) CompactionExecutor.compactionThreadGroup(org.apache.cassandra.db.compaction.CompactionManager.CompactionExecutor.compactionThreadGroup) TabularData(javax.management.openmbean.TabularData) org.apache.cassandra.utils(org.apache.cassandra.utils) Global.nanoTime(org.apache.cassandra.utils.Clock.Global.nanoTime) SSTableReader(org.apache.cassandra.io.sstable.format.SSTableReader) BooleanSupplier(java.util.function.BooleanSupplier) ExecutorFactory(org.apache.cassandra.concurrent.ExecutorFactory) Collections.singleton(java.util.Collections.singleton) NO_PENDING_REPAIR(org.apache.cassandra.service.ActiveRepairService.NO_PENDING_REPAIR) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ViewBuilderTask(org.apache.cassandra.db.view.ViewBuilderTask) com.google.common.collect(com.google.common.collect) DatabaseDescriptor(org.apache.cassandra.config.DatabaseDescriptor) Predicate(java.util.function.Predicate) UNREPAIRED_SSTABLE(org.apache.cassandra.service.ActiveRepairService.UNREPAIRED_SSTABLE) TombstoneOption(org.apache.cassandra.schema.CompactionParams.TombstoneOption) Collectors(java.util.stream.Collectors) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint) LifecycleTransaction(org.apache.cassandra.db.lifecycle.LifecycleTransaction) FileUtils(org.apache.cassandra.io.util.FileUtils) Future(org.apache.cassandra.utils.concurrent.Future) TableMetadata(org.apache.cassandra.schema.TableMetadata) StatsMetadata(org.apache.cassandra.io.sstable.metadata.StatsMetadata) ILifecycleTransaction(org.apache.cassandra.db.lifecycle.ILifecycleTransaction) FutureTask.callable(org.apache.cassandra.concurrent.FutureTask.callable) 
java.util(java.util) OpenDataException(javax.management.openmbean.OpenDataException) Range(org.apache.cassandra.dht.Range) Callable(java.util.concurrent.Callable) WrappedLifecycleTransaction(org.apache.cassandra.db.lifecycle.WrappedLifecycleTransaction) RateLimiter(com.google.common.util.concurrent.RateLimiter) Schema(org.apache.cassandra.schema.Schema) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) Token(org.apache.cassandra.dht.Token) ActiveRepairService(org.apache.cassandra.service.ActiveRepairService) TableMetrics(org.apache.cassandra.metrics.TableMetrics) ConfigurationException(org.apache.cassandra.exceptions.ConfigurationException) UnfilteredRowIterator(org.apache.cassandra.db.rows.UnfilteredRowIterator) CompactionMetrics(org.apache.cassandra.metrics.CompactionMetrics) AbstractBounds(org.apache.cassandra.dht.AbstractBounds) Component(org.apache.cassandra.io.sstable.Component) Descriptor(org.apache.cassandra.io.sstable.Descriptor) MetadataCollector(org.apache.cassandra.io.sstable.metadata.MetadataCollector) Refs(org.apache.cassandra.utils.concurrent.Refs) ExecutorService(java.util.concurrent.ExecutorService) Uninterruptibles(com.google.common.util.concurrent.Uninterruptibles) Logger(org.slf4j.Logger) SSTableRewriter(org.apache.cassandra.io.sstable.SSTableRewriter) SecondaryIndexBuilder(org.apache.cassandra.index.SecondaryIndexBuilder) AutoSavingCache(org.apache.cassandra.cache.AutoSavingCache) ISSTableScanner(org.apache.cassandra.io.sstable.ISSTableScanner) StorageService(org.apache.cassandra.service.StorageService) IOException(java.io.IOException) Bounds(org.apache.cassandra.dht.Bounds) SSTableIntervalTree(org.apache.cassandra.db.lifecycle.SSTableIntervalTree) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) PreviewKind(org.apache.cassandra.streaming.PreviewKind) Holder(org.apache.cassandra.db.compaction.CompactionInfo.Holder) 
ImmediateFuture(org.apache.cassandra.utils.concurrent.ImmediateFuture) Global.executorFactory(org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory) DatabaseDescriptor.getConcurrentCompactors(org.apache.cassandra.config.DatabaseDescriptor.getConcurrentCompactors) Preconditions(com.google.common.base.Preconditions) VisibleForTesting(com.google.common.annotations.VisibleForTesting) IndexSummaryRedistribution(org.apache.cassandra.io.sstable.IndexSummaryRedistribution) SSTableWriter(org.apache.cassandra.io.sstable.format.SSTableWriter) View(org.apache.cassandra.db.lifecycle.View) SSTableReader(org.apache.cassandra.io.sstable.format.SSTableReader) LifecycleTransaction(org.apache.cassandra.db.lifecycle.LifecycleTransaction) ILifecycleTransaction(org.apache.cassandra.db.lifecycle.ILifecycleTransaction) WrappedLifecycleTransaction(org.apache.cassandra.db.lifecycle.WrappedLifecycleTransaction) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint)

Example 10 with RangesAtEndpoint

use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.

the class CompactionManager method antiCompactGroup.

/**
 * Anticompacts one group of sstables: every partition of the transaction's originals is appended to
 * one of three rewriters (full, transient or unrepaired) depending on which of the given ranges
 * contains the partition's token.
 *
 * @param cfs the table the sstables belong to
 * @param ranges the full and transient ranges; must not be empty
 * @param txn transaction over the sstables of this group
 * @param pendingRepair the repair session passed to the full and transient writers
 * @param isCancelled consulted when a CompactionInterruptedException is caught, to tell a
 *   cancellation apart from an error; also handed to the anticompaction iterator
 * @return the total number of sstables finished by the three rewriters, or 0 when the transaction
 *   has no originals
 * @throws IllegalArgumentException if {@code ranges} is empty (from Preconditions.checkArgument)
 */
@VisibleForTesting
int antiCompactGroup(ColumnFamilyStore cfs, RangesAtEndpoint ranges, LifecycleTransaction txn, UUID pendingRepair, BooleanSupplier isCancelled) {
    Preconditions.checkArgument(!ranges.isEmpty(), "need at least one full or transient range");
    // Largest maxDataAge among the originals; handed to all three rewriters below
    long groupMaxDataAge = -1;
    for (Iterator<SSTableReader> i = txn.originals().iterator(); i.hasNext(); ) {
        SSTableReader sstable = i.next();
        if (groupMaxDataAge < sstable.maxDataAge)
            groupMaxDataAge = sstable.maxDataAge;
    }
    if (txn.originals().size() == 0) {
        logger.info("No valid anticompactions for this group, All sstables were compacted and are no longer available");
        return 0;
    }
    logger.info("Anticompacting {} in {}.{} for {}", txn.originals(), cfs.keyspace.getName(), cfs.getTableName(), pendingRepair);
    Set<SSTableReader> sstableAsSet = txn.originals();
    File destination = cfs.getDirectories().getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(sstableAsSet, OperationType.ANTICOMPACTION));
    int nowInSec = FBUtilities.nowInSeconds();
    RateLimiter limiter = getRateLimiter();
    /**
     * HACK WARNING
     *
     * We have multiple writers operating over the same Transaction, producing different sets of sstables that all
     * logically replace the transaction's originals.  The SSTableRewriter assumes it has exclusive control over
     * the transaction state, and this will lead to temporarily inconsistent sstable/tracker state if we do not
     * take special measures to avoid it.
     *
     * Specifically, if a number of rewriter have prepareToCommit() invoked in sequence, then two problematic things happen:
     *   1. The obsoleteOriginals() call of the first rewriter immediately remove the originals from the tracker, despite
     *      their having been only partially replaced.  To avoid this, we must either avoid obsoleteOriginals() or checkpoint()
     *   2. The LifecycleTransaction may only have prepareToCommit() invoked once, and this will checkpoint() also.
     *
     * Similarly commit() would finalise partially complete on-disk state.
     *
     * To avoid these problems, we introduce a SharedTxn that proxies all calls onto the underlying transaction
     * except prepareToCommit(), checkpoint(), obsoleteOriginals(), and commit().
     * We then invoke these methods directly once each of the rewriter has updated the transaction
     * with their share of replacements.
     *
     * Note that for the same essential reason we also explicitly disable early open.
     * By noop-ing checkpoint we avoid any of the problems with early open, but by continuing to explicitly
     * disable it we also prevent any of the extra associated work from being performed.
     */
    class SharedTxn extends WrappedLifecycleTransaction {

        public SharedTxn(ILifecycleTransaction delegate) {
            super(delegate);
        }

        // No-op: returns the accumulated throwable untouched
        public Throwable commit(Throwable accumulate) {
            return accumulate;
        }

        public void prepareToCommit() {
        }

        public void checkpoint() {
        }

        public void obsoleteOriginals() {
        }

        // No-op as well, so closing the try-with-resources below does not close the wrapped transaction
        public void close() {
        }
    }
    CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
    // All three rewriters share the same SharedTxn wrapper and are constructed without early opening
    try (SharedTxn sharedTxn = new SharedTxn(txn);
        SSTableRewriter fullWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
        SSTableRewriter transWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
        SSTableRewriter unrepairedWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
        AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(txn.originals());
        CompactionController controller = new CompactionController(cfs, sstableAsSet, getDefaultGcBefore(cfs, nowInSec));
        CompactionIterator ci = getAntiCompactionIterator(scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID(), active, isCancelled)) {
        int expectedBloomFilterSize = Math.max(cfs.metadata().params.minIndexInterval, (int) (SSTableReader.getApproximateKeyCount(sstableAsSet)));
        // The full and transient writers are created with pendingRepair, the unrepaired writer with
        // NO_PENDING_REPAIR. NOTE(review): the boolean argument (true only for transWriter) is
        // presumably the "is transient" flag - confirm against createWriterForAntiCompaction.
        fullWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, false, sstableAsSet, txn));
        transWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, true, sstableAsSet, txn));
        unrepairedWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, NO_PENDING_REPAIR, false, sstableAsSet, txn));
        // With no full (or no transient) ranges the corresponding checker rejects every token
        Predicate<Token> fullChecker = !ranges.onlyFull().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyFull().ranges()) : t -> false;
        Predicate<Token> transChecker = !ranges.onlyTransient().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyTransient().ranges()) : t -> false;
        double compressionRatio = scanners.getCompressionRatio();
        // Fall back to a ratio of 1.0 when the scanners report no compression ratio
        if (compressionRatio == MetadataCollector.NO_COMPRESSION_RATIO)
            compressionRatio = 1.0;
        long lastBytesScanned = 0;
        while (ci.hasNext()) {
            try (UnfilteredRowIterator partition = ci.next()) {
                Token token = partition.partitionKey().getToken();
                // if this row is contained in the full or transient ranges, append it to the appropriate sstable
                // (the full ranges are tested first, so they win if both checkers would match)
                if (fullChecker.test(token)) {
                    fullWriter.append(partition);
                } else if (transChecker.test(token)) {
                    transWriter.append(partition);
                } else {
                    // otherwise, append it to the unrepaired sstable
                    unrepairedWriter.append(partition);
                }
                // Rate limiting is driven by the scanned-bytes total before and after this partition
                long bytesScanned = scanners.getTotalBytesScanned();
                compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio);
                lastBytesScanned = bytesScanned;
            }
        }
        fullWriter.prepareToCommit();
        transWriter.prepareToCommit();
        unrepairedWriter.prepareToCommit();
        // SharedTxn turned these into no-ops for the rewriters, so they are invoked on the real
        // transaction here, once all three rewriters have been prepared
        txn.checkpoint();
        txn.obsoleteOriginals();
        txn.prepareToCommit();
        // Copy the finished sstables into lists before committing; they are used for the log line
        // and the return value below
        List<SSTableReader> fullSSTables = new ArrayList<>(fullWriter.finished());
        List<SSTableReader> transSSTables = new ArrayList<>(transWriter.finished());
        List<SSTableReader> unrepairedSSTables = new ArrayList<>(unrepairedWriter.finished());
        fullWriter.commit();
        transWriter.commit();
        unrepairedWriter.commit();
        txn.commit();
        logger.info("Anticompacted {} in {}.{} to full = {}, transient = {}, unrepaired = {} for {}", sstableAsSet, cfs.keyspace.getName(), cfs.getTableName(), fullSSTables, transSSTables, unrepairedSSTables, pendingRepair);
        return fullSSTables.size() + transSSTables.size() + unrepairedSSTables.size();
    } catch (Throwable e) {
        // An interruption while isCancelled reports true is logged as a cancellation; anything else is
        // passed to JVMStabilityInspector and logged as an error. The throwable is rethrown either way.
        if (e instanceof CompactionInterruptedException && isCancelled.getAsBoolean()) {
            logger.info("Anticompaction has been canceled for session {}", pendingRepair);
            logger.trace(e.getMessage(), e);
        } else {
            JVMStabilityInspector.inspectThrowable(e);
            logger.error("Error anticompacting " + txn + " for " + pendingRepair, e);
        }
        throw e;
    }
}
Also used : UnfilteredRowIterator(org.apache.cassandra.db.rows.UnfilteredRowIterator) ILifecycleTransaction(org.apache.cassandra.db.lifecycle.ILifecycleTransaction) Token(org.apache.cassandra.dht.Token) SSTableRewriter(org.apache.cassandra.io.sstable.SSTableRewriter) SSTableReader(org.apache.cassandra.io.sstable.format.SSTableReader) Range(org.apache.cassandra.dht.Range) RangesAtEndpoint(org.apache.cassandra.locator.RangesAtEndpoint) RateLimiter(com.google.common.util.concurrent.RateLimiter) WrappedLifecycleTransaction(org.apache.cassandra.db.lifecycle.WrappedLifecycleTransaction) File(org.apache.cassandra.io.util.File) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

RangesAtEndpoint (org.apache.cassandra.locator.RangesAtEndpoint)32 Token (org.apache.cassandra.dht.Token)22 Range (org.apache.cassandra.dht.Range)20 Replica (org.apache.cassandra.locator.Replica)17 SSTableReader (org.apache.cassandra.io.sstable.format.SSTableReader)10 EndpointsByReplica (org.apache.cassandra.locator.EndpointsByReplica)10 ArrayList (java.util.ArrayList)9 List (java.util.List)9 Test (org.junit.Test)9 Logger (org.slf4j.Logger)9 LoggerFactory (org.slf4j.LoggerFactory)9 Collection (java.util.Collection)8 DatabaseDescriptor (org.apache.cassandra.config.DatabaseDescriptor)8 InetAddressAndPort (org.apache.cassandra.locator.InetAddressAndPort)8 VisibleForTesting (com.google.common.annotations.VisibleForTesting)7 Set (java.util.Set)7 UUID (java.util.UUID)7 Replica.fullReplica (org.apache.cassandra.locator.Replica.fullReplica)7 IOException (java.io.IOException)6 Replica.transientReplica (org.apache.cassandra.locator.Replica.transientReplica)6