use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.
the class DiskBoundaryManager method getDiskBoundaryValue.
private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs) {
RangesAtEndpoint localRanges;
long ringVersion;
TokenMetadata tmd;
do {
tmd = StorageService.instance.getTokenMetadata();
ringVersion = tmd.getRingVersion();
if (StorageService.instance.isBootstrapMode() && // When replacing same address, the node marks itself as UN locally
!StorageService.isReplacingSameAddress()) {
PendingRangeCalculatorService.instance.blockUntilFinished();
localRanges = tmd.getPendingRanges(cfs.keyspace.getName(), FBUtilities.getBroadcastAddressAndPort());
} else {
// The reason we use the future settled TMD is that if we decommission a node, we want to stream
// from that node to the correct location on disk; if we didn't, we would put new files in the wrong places.
// We do this to minimize the amount of data we need to move in rebalancedisks once everything has settled.
localRanges = cfs.keyspace.getReplicationStrategy().getAddressReplicas(tmd.cloneAfterAllSettled(), FBUtilities.getBroadcastAddressAndPort());
}
logger.debug("Got local ranges {} (ringVersion = {})", localRanges, ringVersion);
} // if ringVersion is different here it means that it might have changed
// before we calculated localRanges - recalculate
while (ringVersion != tmd.getRingVersion());
int directoriesVersion;
Directories.DataDirectory[] dirs;
do {
directoriesVersion = DisallowedDirectories.getDirectoriesVersion();
dirs = cfs.getDirectories().getWriteableLocations();
} // if directoriesVersion has changed we need to recalculate
while (directoriesVersion != DisallowedDirectories.getDirectoriesVersion());
if (localRanges == null || localRanges.isEmpty())
return new DiskBoundaries(cfs, dirs, null, ringVersion, directoriesVersion);
List<PartitionPosition> positions = getDiskBoundaries(localRanges, cfs.getPartitioner(), dirs);
return new DiskBoundaries(cfs, dirs, positions, ringVersion, directoriesVersion);
}
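Both do/while loops above follow the same optimistic pattern: read a version counter, compute a value that depends on a consistent snapshot, and recompute if the counter moved while the work was in progress. A minimal, self-contained sketch of that pattern, assuming a hypothetical VersionedSource interface (the names are illustrative, not Cassandra APIs):
final class VersionedRecompute {
    /** Hypothetical stand-in for the TokenMetadata / DisallowedDirectories version counters. */
    interface VersionedSource<T> {
        long version();   // cheap, monotonically increasing version read
        T snapshot();     // potentially expensive computation tied to that version
    }
    /** Recompute until the version observed before and after the work agrees. */
    static <T> T computeStable(VersionedSource<T> source) {
        long before;
        T value;
        do {
            before = source.version();
            value = source.snapshot();
            // if the version changed while we were computing, the snapshot may mix old and
            // new state, so loop and recalculate - exactly what getDiskBoundaryValue does
        } while (before != source.version());
        return value;
    }
}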
use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.
the class ViewBuilder method build.
private synchronized void build() {
if (isStopped) {
logger.debug("Stopped build for view({}.{}) after covering {} keys", ksName, view.name, keysBuilt);
return;
}
// Get the local ranges for which the view hasn't already been built and isn't currently building
RangesAtEndpoint replicatedRanges = StorageService.instance.getLocalReplicas(ksName);
Replicas.temporaryAssertFull(replicatedRanges);
Set<Range<Token>> newRanges = replicatedRanges.ranges().stream().map(r -> r.subtractAll(builtRanges)).flatMap(Set::stream).map(r -> r.subtractAll(pendingRanges.keySet())).flatMap(Set::stream).collect(Collectors.toSet());
// If there are no new or pending ranges we should finish the build
if (newRanges.isEmpty() && pendingRanges.isEmpty()) {
finish();
return;
}
// Split the new local ranges and add them to the pending set
DatabaseDescriptor.getPartitioner().splitter().map(s -> s.split(newRanges, NUM_TASKS)).orElse(newRanges).forEach(r -> pendingRanges.put(r, Pair.<Token, Long>create(null, 0L)));
// Submit a new view build task for each building range.
// We keep a record of all the submitted tasks to be able to stop them.
List<Future<Long>> futures = pendingRanges.entrySet().stream().map(e -> new ViewBuilderTask(baseCfs, view, e.getKey(), e.getValue().left, e.getValue().right)).peek(tasks::add).map(CompactionManager.instance::submitViewBuilder).collect(toList());
// Add a callback to process any new local ranges that may appear and to mark the view as built, doing a
// delayed retry if the tasks don't succeed
Future<List<Long>> future = FutureCombiner.allOf(futures);
future.addCallback(new FutureCallback<List<Long>>() {
public void onSuccess(List<Long> result) {
keysBuilt += result.stream().mapToLong(x -> x).sum();
builtRanges.addAll(pendingRanges.keySet());
pendingRanges.clear();
build();
}
public void onFailure(Throwable t) {
if (t instanceof CompactionInterruptedException) {
internalStop(true);
keysBuilt = tasks.stream().mapToLong(ViewBuilderTask::keysBuilt).sum();
logger.info("Interrupted build for view({}.{}) after covering {} keys", ksName, view.name, keysBuilt);
} else {
ScheduledExecutors.nonPeriodicTasks.schedule(() -> loadStatusAndBuild(), 5, TimeUnit.MINUTES);
logger.warn("Materialized View failed to complete, sleeping 5 minutes before restarting", t);
}
}
});
this.future = future;
}
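The bookkeeping in build() amounts to: take what the node currently owns, subtract what is already built or already in flight, queue the remainder as tasks, and re-run the whole check once the tasks complete. A simplified, hypothetical sketch of that bookkeeping using plain chunk ids instead of token ranges (built / pending here are simple sets, not the real Cassandra types):
import java.util.HashSet;
import java.util.Set;
final class CoverageTracker {
    private final Set<Long> built = new HashSet<>();    // analogous to builtRanges
    private final Set<Long> pending = new HashSet<>();  // analogous to pendingRanges.keySet()
    /** Returns the chunks that still need a build task, and marks them as in flight. */
    Set<Long> newWork(Set<Long> currentlyOwned) {
        Set<Long> newChunks = new HashSet<>(currentlyOwned);
        newChunks.removeAll(built);    // mirrors r.subtractAll(builtRanges)
        newChunks.removeAll(pending);  // mirrors r.subtractAll(pendingRanges.keySet())
        pending.addAll(newChunks);     // mark as pending before submitting tasks
        return newChunks;
    }
    /** On task success: promote pending work to built; the caller then re-checks ownership. */
    void onTasksSucceeded() {
        built.addAll(pending);
        pending.clear();
        // the caller would now call newWork(...) again, mirroring the recursive build() call
    }
}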
use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.
the class LocalSessions method handlePrepareMessage.
/**
* The PrepareConsistentRequest promotes the parent repair session to a consistent incremental
* session, and isolates the data to be repaired from the rest of the table's data
*
* No response is sent to the repair coordinator until the data preparation / isolation has completed
* successfully. If the data preparation fails, a failure message is sent to the coordinator,
* cancelling the session.
*/
public void handlePrepareMessage(InetAddressAndPort from, PrepareConsistentRequest request) {
logger.trace("received {} from {}", request, from);
UUID sessionID = request.parentSession;
InetAddressAndPort coordinator = request.coordinator;
Set<InetAddressAndPort> peers = request.participants;
ActiveRepairService.ParentRepairSession parentSession;
try {
parentSession = getParentRepairSession(sessionID);
} catch (Throwable e) {
logger.error("Error retrieving ParentRepairSession for session {}, responding with failure", sessionID);
sendMessage(coordinator, Message.out(PREPARE_CONSISTENT_RSP, new PrepareConsistentResponse(sessionID, getBroadcastAddressAndPort(), false)));
return;
}
LocalSession session = createSessionUnsafe(sessionID, parentSession, peers);
putSessionUnsafe(session);
logger.info("Beginning local incremental repair session {}", session);
ExecutorService executor = executorFactory().pooled("Repair-" + sessionID, parentSession.getColumnFamilyStores().size());
KeyspaceRepairManager repairManager = parentSession.getKeyspace().getRepairManager();
RangesAtEndpoint tokenRanges = filterLocalRanges(parentSession.getKeyspace().getName(), parentSession.getRanges());
Future<List<Void>> repairPreparation = prepareSession(repairManager, sessionID, parentSession.getColumnFamilyStores(), tokenRanges, executor, () -> session.getState() != PREPARING);
repairPreparation.addCallback(new FutureCallback<List<Void>>() {
public void onSuccess(@Nullable List<Void> result) {
try {
logger.info("Prepare phase for incremental repair session {} completed", sessionID);
if (session.getState() != FAILED)
setStateAndSave(session, PREPARED);
else
logger.info("Session {} failed before anticompaction completed", sessionID);
Message<PrepareConsistentResponse> message = Message.out(PREPARE_CONSISTENT_RSP, new PrepareConsistentResponse(sessionID, getBroadcastAddressAndPort(), session.getState() != FAILED));
sendMessage(coordinator, message);
} finally {
executor.shutdown();
}
}
public void onFailure(Throwable t) {
try {
if (Throwables.anyCauseMatches(t, (throwable) -> throwable instanceof CompactionInterruptedException))
logger.info("Anticompaction interrupted for session {}: {}", sessionID, t.getMessage());
else if (Throwables.anyCauseMatches(t, (throwable) -> throwable instanceof NoSuchRepairSessionException))
logger.warn("No such repair session: {}", sessionID);
else
logger.error("Prepare phase for incremental repair session {} failed", sessionID, t);
sendMessage(coordinator, Message.out(PREPARE_CONSISTENT_RSP, new PrepareConsistentResponse(sessionID, getBroadcastAddressAndPort(), false)));
failSession(sessionID, false);
} finally {
executor.shutdown();
}
}
});
}
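The onFailure branch classifies the error by walking the cause chain (Throwables.anyCauseMatches). A hypothetical, plain-Java sketch of such a cause-chain test, for illustration only; the actual utility used by Cassandra may be implemented differently:
import java.util.function.Predicate;
final class CauseChain {
    /** True if the throwable or any of its causes satisfies the predicate. */
    static boolean anyCauseMatches(Throwable t, Predicate<Throwable> predicate) {
        for (Throwable cause = t; cause != null; cause = cause.getCause()) {
            if (predicate.test(cause))
                return true;
        }
        // note: a hardened version would also guard against cyclic cause chains
        return false;
    }
}
With a check of this shape, the callback can tell an interrupted anticompaction or a missing repair session apart from a genuinely unexpected failure before sending the negative PrepareConsistentResponse.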
use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.
the class CompactionManager method doAntiCompaction.
/**
* Splits up an sstable into two new sstables. The first of the new tables will store repaired ranges, the second
* will store the non-repaired ranges. Once anticompaction is completed, the original sstable is marked as compacted
* and subsequently deleted.
* @param cfs
* @param txn a transaction over the repaired sstables to anticompact
* @param ranges full and transient ranges to be placed into one of the new sstables. The repaired table will be tracked via
* the {@link org.apache.cassandra.io.sstable.metadata.StatsMetadata#pendingRepair} field.
* @param pendingRepair the repair session we're anti-compacting for
* @param isCancelled function that indicates if active anti-compaction should be canceled
*/
private void doAntiCompaction(ColumnFamilyStore cfs, RangesAtEndpoint ranges, LifecycleTransaction txn, UUID pendingRepair, BooleanSupplier isCancelled) {
int originalCount = txn.originals().size();
logger.info("Performing anticompaction on {} sstables for {}", originalCount, pendingRepair);
// Group SSTables
Set<SSTableReader> sstables = txn.originals();
// Repairs can take place on both unrepaired (incremental + full) and repaired (full) data.
// Although anti-compaction could work on repaired sstables as well and would result in having more accurate
// repairedAt values for these, we still avoid anti-compacting already repaired sstables, as we currently don't
// make use of any actual repairedAt value and splitting up sstables just for that is not worth it at this point.
Set<SSTableReader> unrepairedSSTables = sstables.stream().filter((s) -> !s.isRepaired()).collect(Collectors.toSet());
cfs.metric.bytesAnticompacted.inc(SSTableReader.getTotalBytes(unrepairedSSTables));
Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategyManager().groupSSTablesForAntiCompaction(unrepairedSSTables);
// iterate over sstables to check if the full / transient / unrepaired ranges intersect them.
int antiCompactedSSTableCount = 0;
for (Collection<SSTableReader> sstableGroup : groupedSSTables) {
try (LifecycleTransaction groupTxn = txn.split(sstableGroup)) {
int antiCompacted = antiCompactGroup(cfs, ranges, groupTxn, pendingRepair, isCancelled);
antiCompactedSSTableCount += antiCompacted;
}
}
String format = "Anticompaction completed successfully, anticompacted from {} to {} sstable(s) for {}.";
logger.info(format, originalCount, antiCompactedSSTableCount, pendingRepair);
}
use of org.apache.cassandra.locator.RangesAtEndpoint in project cassandra by apache.
the class CompactionManager method antiCompactGroup.
@VisibleForTesting
int antiCompactGroup(ColumnFamilyStore cfs, RangesAtEndpoint ranges, LifecycleTransaction txn, UUID pendingRepair, BooleanSupplier isCancelled) {
Preconditions.checkArgument(!ranges.isEmpty(), "need at least one full or transient range");
long groupMaxDataAge = -1;
for (Iterator<SSTableReader> i = txn.originals().iterator(); i.hasNext(); ) {
SSTableReader sstable = i.next();
if (groupMaxDataAge < sstable.maxDataAge)
groupMaxDataAge = sstable.maxDataAge;
}
if (txn.originals().size() == 0) {
logger.info("No valid anticompactions for this group, All sstables were compacted and are no longer available");
return 0;
}
logger.info("Anticompacting {} in {}.{} for {}", txn.originals(), cfs.keyspace.getName(), cfs.getTableName(), pendingRepair);
Set<SSTableReader> sstableAsSet = txn.originals();
File destination = cfs.getDirectories().getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(sstableAsSet, OperationType.ANTICOMPACTION));
int nowInSec = FBUtilities.nowInSeconds();
RateLimiter limiter = getRateLimiter();
/**
* HACK WARNING
*
* We have multiple writers operating over the same Transaction, producing different sets of sstables that all
* logically replace the transaction's originals. The SSTableRewriter assumes it has exclusive control over
* the transaction state, and this will lead to temporarily inconsistent sstable/tracker state if we do not
* take special measures to avoid it.
*
* Specifically, if a number of rewriters have prepareToCommit() invoked in sequence, then two problematic things happen:
* 1. The obsoleteOriginals() call of the first rewriter immediately removes the originals from the tracker, despite
* their having been only partially replaced. To avoid this, we must either avoid obsoleteOriginals() or checkpoint()
* 2. The LifecycleTransaction may only have prepareToCommit() invoked once, and this will checkpoint() also.
*
* Similarly commit() would finalise partially complete on-disk state.
*
* To avoid these problems, we introduce a SharedTxn that proxies all calls onto the underlying transaction
* except prepareToCommit(), checkpoint(), obsoleteOriginals(), and commit().
* We then invoke these methods directly once each of the rewriters has updated the transaction
* with their share of replacements.
*
* Note that for the same essential reason we also explicitly disable early open.
* By noop-ing checkpoint we avoid any of the problems with early open, but by continuing to explicitly
* disable it we also prevent any of the extra associated work from being performed.
*/
class SharedTxn extends WrappedLifecycleTransaction {
public SharedTxn(ILifecycleTransaction delegate) {
super(delegate);
}
public Throwable commit(Throwable accumulate) {
return accumulate;
}
public void prepareToCommit() {
}
public void checkpoint() {
}
public void obsoleteOriginals() {
}
public void close() {
}
}
CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
try (SharedTxn sharedTxn = new SharedTxn(txn);
SSTableRewriter fullWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
SSTableRewriter transWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
SSTableRewriter unrepairedWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(txn.originals());
CompactionController controller = new CompactionController(cfs, sstableAsSet, getDefaultGcBefore(cfs, nowInSec));
CompactionIterator ci = getAntiCompactionIterator(scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID(), active, isCancelled)) {
int expectedBloomFilterSize = Math.max(cfs.metadata().params.minIndexInterval, (int) (SSTableReader.getApproximateKeyCount(sstableAsSet)));
fullWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, false, sstableAsSet, txn));
transWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, true, sstableAsSet, txn));
unrepairedWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, NO_PENDING_REPAIR, false, sstableAsSet, txn));
Predicate<Token> fullChecker = !ranges.onlyFull().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyFull().ranges()) : t -> false;
Predicate<Token> transChecker = !ranges.onlyTransient().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyTransient().ranges()) : t -> false;
double compressionRatio = scanners.getCompressionRatio();
if (compressionRatio == MetadataCollector.NO_COMPRESSION_RATIO)
compressionRatio = 1.0;
long lastBytesScanned = 0;
while (ci.hasNext()) {
try (UnfilteredRowIterator partition = ci.next()) {
Token token = partition.partitionKey().getToken();
// if this row is contained in the full or transient ranges, append it to the appropriate sstable
if (fullChecker.test(token)) {
fullWriter.append(partition);
} else if (transChecker.test(token)) {
transWriter.append(partition);
} else {
// otherwise, append it to the unrepaired sstable
unrepairedWriter.append(partition);
}
long bytesScanned = scanners.getTotalBytesScanned();
compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio);
lastBytesScanned = bytesScanned;
}
}
fullWriter.prepareToCommit();
transWriter.prepareToCommit();
unrepairedWriter.prepareToCommit();
txn.checkpoint();
txn.obsoleteOriginals();
txn.prepareToCommit();
List<SSTableReader> fullSSTables = new ArrayList<>(fullWriter.finished());
List<SSTableReader> transSSTables = new ArrayList<>(transWriter.finished());
List<SSTableReader> unrepairedSSTables = new ArrayList<>(unrepairedWriter.finished());
fullWriter.commit();
transWriter.commit();
unrepairedWriter.commit();
txn.commit();
logger.info("Anticompacted {} in {}.{} to full = {}, transient = {}, unrepaired = {} for {}", sstableAsSet, cfs.keyspace.getName(), cfs.getTableName(), fullSSTables, transSSTables, unrepairedSSTables, pendingRepair);
return fullSSTables.size() + transSSTables.size() + unrepairedSSTables.size();
} catch (Throwable e) {
if (e instanceof CompactionInterruptedException && isCancelled.getAsBoolean()) {
logger.info("Anticompaction has been canceled for session {}", pendingRepair);
logger.trace(e.getMessage(), e);
} else {
JVMStabilityInspector.inspectThrowable(e);
logger.error("Error anticompacting " + txn + " for " + pendingRepair, e);
}
throw e;
}
}
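Stripped of the writer plumbing, the per-partition decision in the loop above is a three-way route keyed on token ownership. A hypothetical sketch using plain longs for tokens and lists for the three outputs (the real code appends to SSTableRewriters and tests containment with Range.OrderedRangeContainmentChecker):
import java.util.List;
import java.util.function.Predicate;
final class AntiCompactionRouting {
    /** Each token goes to exactly one of the full, transient, or unrepaired outputs. */
    static void route(List<Long> tokens,
                      Predicate<Long> fullChecker,       // stands in for ranges.onlyFull() containment
                      Predicate<Long> transientChecker,  // stands in for ranges.onlyTransient() containment
                      List<Long> fullOut, List<Long> transientOut, List<Long> unrepairedOut) {
        for (long token : tokens) {
            if (fullChecker.test(token))
                fullOut.add(token);          // owned fully here: kept under the pending repair
            else if (transientChecker.test(token))
                transientOut.add(token);     // owned transiently here: kept under the pending repair
            else
                unrepairedOut.add(token);    // not covered by the repair ranges: stays unrepaired
        }
    }
}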