Use of org.apache.cassandra.locator.InetAddressAndPort in project cassandra by apache.
The class RepairSession, method start().
/**
 * Start RepairJob on given ColumnFamilies.
 *
 * This first validates that all replicas are available and, if they are,
 * creates RepairJobs and submits them to run on the given executor.
 *
 * @param executor Executor to run validation
 */
public void start(ExecutorPlus executor) {
    String message;
    if (terminated)
        return;

    logger.info("{} parentSessionId = {}: new session: will sync {} on range {} for {}.{}",
                previewKind.logPrefix(getId()), parentRepairSession, repairedNodes(), commonRange, keyspace, Arrays.toString(cfnames));
    Tracing.traceRepair("Syncing range {}", commonRange);

    if (!previewKind.isPreview()) {
        SystemDistributedKeyspace.startRepairs(getId(), parentRepairSession, keyspace, cfnames, commonRange);
    }

    if (commonRange.endpoints.isEmpty()) {
        logger.info("{} {}", previewKind.logPrefix(getId()),
                    message = String.format("No neighbors to repair with on range %s: session completed", commonRange));
        Tracing.traceRepair(message);
        trySuccess(new RepairSessionResult(id, keyspace, commonRange.ranges, Lists.<RepairResult>newArrayList(), commonRange.hasSkippedReplicas));
        if (!previewKind.isPreview()) {
            SystemDistributedKeyspace.failRepairs(getId(), keyspace, cfnames, new RuntimeException(message));
        }
        return;
    }

    // Check that all nodes are live
    for (InetAddressAndPort endpoint : commonRange.endpoints) {
        if (!FailureDetector.instance.isAlive(endpoint) && !commonRange.hasSkippedReplicas) {
            message = String.format("Cannot proceed on repair because a neighbor (%s) is dead: session failed", endpoint);
            logger.error("{} {}", previewKind.logPrefix(getId()), message);
            Exception e = new IOException(message);
            tryFailure(e);
            if (!previewKind.isPreview()) {
                SystemDistributedKeyspace.failRepairs(getId(), keyspace, cfnames, e);
            }
            return;
        }
    }

    // Create and submit a RepairJob for each ColumnFamily
    List<Future<RepairResult>> jobs = new ArrayList<>(cfnames.length);
    for (String cfname : cfnames) {
        RepairJob job = new RepairJob(this, cfname);
        executor.execute(job);
        jobs.add(job);
    }

    // When all RepairJobs are done without error, clean up and set the final result
    FBUtilities.allOf(jobs).addCallback(new FutureCallback<List<RepairResult>>() {
        public void onSuccess(List<RepairResult> results) {
            // this repair session is completed
            logger.info("{} {}", previewKind.logPrefix(getId()), "Session completed successfully");
            Tracing.traceRepair("Completed sync of range {}", commonRange);
            trySuccess(new RepairSessionResult(id, keyspace, commonRange.ranges, results, commonRange.hasSkippedReplicas));
            taskExecutor.shutdown();
            // mark this session as terminated
            terminate();
        }

        public void onFailure(Throwable t) {
            String msg = "{} Session completed with the following error";
            if (Throwables.anyCauseMatches(t, RepairException::shouldWarn))
                logger.warn(msg + ": {}", previewKind.logPrefix(getId()), t.getMessage());
            else
                logger.error(msg, previewKind.logPrefix(getId()), t);
            Tracing.traceRepair("Session completed with the following error: {}", t);
            forceShutdown(t);
        }
    });
}
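The method above follows a common fan-out pattern: submit one job per table, combine the job futures into a single future over the list of results, and attach one completion callback. Here is a minimal, self-contained sketch of that same pattern using only the JDK's CompletableFuture; the names SyncSketch, submitJob, and runAll are hypothetical illustrations, not Cassandra APIs.

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;

public class SyncSketch {
    // Hypothetical stand-in for a RepairJob: does some work and yields a result string.
    static CompletableFuture<String> submitJob(String table, ExecutorService executor) {
        return CompletableFuture.supplyAsync(() -> "synced " + table, executor);
    }

    // Combine all job futures into one future of a result list,
    // playing the role that FBUtilities.allOf plays above.
    static CompletableFuture<List<String>> runAll(List<String> tables, ExecutorService executor) {
        List<CompletableFuture<String>> jobs = tables.stream()
            .map(t -> submitJob(t, executor))
            .collect(Collectors.toList());
        return CompletableFuture.allOf(jobs.toArray(new CompletableFuture[0]))
            // allOf has already completed, so join() cannot block here.
            .thenApply(v -> jobs.stream().map(CompletableFuture::join).collect(Collectors.toList()));
    }

    public static void main(String[] args) {
        ExecutorService executor = Executors.newFixedThreadPool(2);
        runAll(List.of("users", "events"), executor)
            .whenComplete((results, t) -> {   // single callback, like addCallback above
                if (t != null)
                    System.err.println("session failed: " + t);
                else
                    System.out.println("session completed: " + results);
            })
            .join();
        executor.shutdown();
    }
}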
Use of org.apache.cassandra.locator.InetAddressAndPort in project cassandra by apache.
The class RepairSession, method repairedNodes().
private String repairedNodes() {
    StringBuilder sb = new StringBuilder();
    sb.append(FBUtilities.getBroadcastAddressAndPort());
    for (InetAddressAndPort ep : commonRange.endpoints)
        sb.append(", ").append(ep);
    return sb.toString();
}
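The same comma-separated list (local broadcast address first, then every endpoint) could also be built with a StringJoiner. A standalone sketch under that assumption, with plain strings standing in for InetAddressAndPort:

import java.util.List;
import java.util.StringJoiner;

public class JoinSketch {
    public static void main(String[] args) {
        String local = "127.0.0.1:7000";   // stands in for FBUtilities.getBroadcastAddressAndPort()
        List<String> endpoints = List.of("10.0.0.2:7000", "10.0.0.3:7000");

        StringJoiner sj = new StringJoiner(", ");
        sj.add(local);                     // the local node always comes first
        endpoints.forEach(sj::add);
        System.out.println(sj);            // 127.0.0.1:7000, 10.0.0.2:7000, 10.0.0.3:7000
    }
}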
Use of org.apache.cassandra.locator.InetAddressAndPort in project cassandra by apache.
The class RepairJob, method run().
/**
 * Runs repair job.
 *
 * This sets up the necessary tasks and runs them on the given {@code taskExecutor}.
 * After submitting all tasks, waits until validation with the replicas completes.
 */
@SuppressWarnings("UnstableApiUsage")
public void run() {
    Keyspace ks = Keyspace.open(desc.keyspace);
    ColumnFamilyStore cfs = ks.getColumnFamilyStore(desc.columnFamily);
    cfs.metric.repairsStarted.inc();

    List<InetAddressAndPort> allEndpoints = new ArrayList<>(session.commonRange.endpoints);
    allEndpoints.add(FBUtilities.getBroadcastAddressAndPort());

    Future<List<TreeResponse>> treeResponses;
    // Create a snapshot at all nodes unless we're using pure parallel repairs
    if (parallelismDegree != RepairParallelism.PARALLEL) {
        Future<List<InetAddressAndPort>> allSnapshotTasks;
        if (session.isIncremental) {
            // consistent repair does its own "snapshotting"
            allSnapshotTasks = ImmediateFuture.success(allEndpoints);
        } else {
            // Request a snapshot from all replicas
            List<Future<InetAddressAndPort>> snapshotTasks = new ArrayList<>(allEndpoints.size());
            for (InetAddressAndPort endpoint : allEndpoints) {
                SnapshotTask snapshotTask = new SnapshotTask(desc, endpoint);
                snapshotTasks.add(snapshotTask);
                taskExecutor.execute(snapshotTask);
            }
            allSnapshotTasks = FutureCombiner.allOf(snapshotTasks);
        }

        // When all snapshots complete, send validation requests
        treeResponses = allSnapshotTasks.flatMap(endpoints -> {
            if (parallelismDegree == RepairParallelism.SEQUENTIAL)
                return sendSequentialValidationRequest(endpoints);
            else
                return sendDCAwareValidationRequest(endpoints);
        }, taskExecutor);
    } else {
        // If not sequential, just send validation requests to all replicas
        treeResponses = sendValidationRequest(allEndpoints);
    }

    // When all validations complete, submit sync tasks
    Future<List<SyncStat>> syncResults = treeResponses.flatMap(
        session.optimiseStreams && !session.pullRepair ? this::optimisedSyncing : this::standardSyncing,
        taskExecutor);

    // When all syncs complete, set the final result
    syncResults.addCallback(new FutureCallback<List<SyncStat>>() {
        public void onSuccess(List<SyncStat> stats) {
            if (!session.previewKind.isPreview()) {
                logger.info("{} {}.{} is fully synced", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily);
                SystemDistributedKeyspace.successfulRepairJob(session.getId(), desc.keyspace, desc.columnFamily);
            }
            cfs.metric.repairsCompleted.inc();
            trySuccess(new RepairResult(desc, stats));
        }

        /**
         * Snapshot, validation and sync failures are all handled here
         */
        public void onFailure(Throwable t) {
            // Make sure all validation tasks have cleaned up the off-heap Merkle trees they might contain.
            validationTasks.forEach(ValidationTask::abort);
            if (!session.previewKind.isPreview()) {
                logger.warn("{} {}.{} sync failed", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily);
                SystemDistributedKeyspace.failedRepairJob(session.getId(), desc.keyspace, desc.columnFamily, t);
            }
            cfs.metric.repairsCompleted.inc();
            tryFailure(t instanceof NoSuchRepairSessionExceptionWrapper ? ((NoSuchRepairSessionExceptionWrapper) t).wrapped : t);
        }
    }, taskExecutor);
}
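The control flow in run() is a three-stage asynchronous pipeline: snapshot (when not purely parallel), then validation, then sync, with each stage chained onto the previous one via flatMap. The JDK analogue of that flatMap is CompletableFuture.thenCompose. Below is a compact sketch of the same shape; the stage methods are hypothetical placeholders, not Cassandra code.

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class PipelineSketch {
    static CompletableFuture<List<String>> snapshot(List<String> endpoints, ExecutorService ex) {
        return CompletableFuture.supplyAsync(() -> endpoints, ex);   // pretend each node took a snapshot
    }
    static CompletableFuture<List<String>> validate(List<String> endpoints, ExecutorService ex) {
        return CompletableFuture.supplyAsync(() -> endpoints, ex);   // pretend merkle trees came back
    }
    static CompletableFuture<String> sync(List<String> trees, ExecutorService ex) {
        return CompletableFuture.supplyAsync(() -> "synced " + trees.size() + " replicas", ex);
    }

    public static void main(String[] args) {
        ExecutorService ex = Executors.newFixedThreadPool(2);
        List<String> endpoints = List.of("10.0.0.2:7000", "10.0.0.3:7000");
        // thenCompose plays the role of flatMap: the next asynchronous stage
        // starts only after the previous one has completed.
        String result = snapshot(endpoints, ex)
            .thenCompose(eps -> validate(eps, ex))
            .thenCompose(trees -> sync(trees, ex))
            .join();
        System.out.println(result);
        ex.shutdown();
    }
}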
Use of org.apache.cassandra.locator.InetAddressAndPort in project cassandra by apache.
The class RepairJob, method sendValidationRequest().
/**
 * Creates {@link ValidationTask}s and submits them to the task executor in parallel.
 *
 * @param endpoints Endpoint addresses to send validation requests to
 * @return Future that yields all {@link TreeResponse}s from the replicas, if every validation succeeds.
 */
private Future<List<TreeResponse>> sendValidationRequest(Collection<InetAddressAndPort> endpoints) {
    String message = String.format("Requesting merkle trees for %s (to %s)", desc.columnFamily, endpoints);
    logger.info("{} {}", session.previewKind.logPrefix(desc.sessionId), message);
    Tracing.traceRepair(message);

    int nowInSec = getNowInSeconds();
    List<Future<TreeResponse>> tasks = new ArrayList<>(endpoints.size());
    for (InetAddressAndPort endpoint : endpoints) {
        ValidationTask task = newValidationTask(endpoint, nowInSec);
        tasks.add(task);
        session.trackValidationCompletion(Pair.create(desc, endpoint), task);
        taskExecutor.execute(task);
    }
    return FutureCombiner.allOf(tasks);
}
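A key property of the combined future returned here is its failure semantics: if any single validation fails, the future over the whole result list fails. With the JDK's CompletableFuture the same behavior falls out of allOf plus join. A sketch demonstrating one failing endpoint failing the combined future; ValidationFanOut and requestTree are hypothetical names, not Cassandra APIs.

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.stream.Collectors;

public class ValidationFanOut {
    // Hypothetical validation request: fails for one endpoint to show error propagation.
    static CompletableFuture<String> requestTree(String endpoint) {
        return CompletableFuture.supplyAsync(() -> {
            if (endpoint.endsWith(".3:7000"))
                throw new IllegalStateException("validation failed on " + endpoint);
            return "tree from " + endpoint;
        });
    }

    public static void main(String[] args) {
        List<CompletableFuture<String>> tasks = List.of("10.0.0.2:7000", "10.0.0.3:7000")
            .stream().map(ValidationFanOut::requestTree).collect(Collectors.toList());

        CompletableFuture<List<String>> all =
            CompletableFuture.allOf(tasks.toArray(new CompletableFuture[0]))
                .thenApply(v -> tasks.stream().map(CompletableFuture::join).collect(Collectors.toList()));
        try {
            System.out.println(all.join());
        } catch (CompletionException e) {
            // One failed task fails the combined future, as with FutureCombiner.allOf
            System.err.println("combined future failed: " + e.getCause().getMessage());
        }
    }
}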
Use of org.apache.cassandra.locator.InetAddressAndPort in project cassandra by apache.
The class RepairRunnable, method getNeighborsAndRanges().
private NeighborsAndRanges getNeighborsAndRanges() throws RepairException {
    Set<InetAddressAndPort> allNeighbors = new HashSet<>();
    List<CommonRange> commonRanges = new ArrayList<>();

    // Pre-calculate the output of getLocalReplicas and pass it to getNeighbors,
    // so it is not recomputed for every range
    Iterable<Range<Token>> keyspaceLocalRanges = storageService.getLocalReplicas(keyspace).ranges();

    for (Range<Token> range : options.getRanges()) {
        EndpointsForRange neighbors = ActiveRepairService.getNeighbors(keyspace, keyspaceLocalRanges, range,
                                                                       options.getDataCenters(), options.getHosts());
        if (neighbors.isEmpty()) {
            if (options.ignoreUnreplicatedKeyspaces()) {
                logger.info("{} Found no neighbors for range {} for {} - ignoring since repairing with --ignore-unreplicated-keyspaces", parentSession, range, keyspace);
                continue;
            } else {
                throw RepairException.warn(String.format("Nothing to repair for %s in %s - aborting", range, keyspace));
            }
        }
        addRangeToNeighbors(commonRanges, range, neighbors);
        allNeighbors.addAll(neighbors.endpoints());
    }

    if (options.ignoreUnreplicatedKeyspaces() && allNeighbors.isEmpty()) {
        throw new SkipRepairException(String.format("Nothing to repair for %s in %s - unreplicated keyspace is ignored since repair was called with --ignore-unreplicated-keyspaces", options.getRanges(), keyspace));
    }

    progressCounter.incrementAndGet();

    boolean shouldExcludeDeadParticipants = options.isForcedRepair();
    if (shouldExcludeDeadParticipants) {
        Set<InetAddressAndPort> actualNeighbors = Sets.newHashSet(Iterables.filter(allNeighbors, FailureDetector.instance::isAlive));
        shouldExcludeDeadParticipants = !allNeighbors.equals(actualNeighbors);
        allNeighbors = actualNeighbors;
    }
    return new NeighborsAndRanges(shouldExcludeDeadParticipants, allNeighbors, commonRanges);
}
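The forced-repair branch at the end keeps only live neighbors and records whether any were actually dropped; that comparison is what drives the shouldExcludeDeadParticipants flag. A standalone sketch of the same filter-and-compare step using plain java.util streams; the isAlive predicate below is a hypothetical stand-in for FailureDetector.instance::isAlive.

import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;

public class LiveFilterSketch {
    public static void main(String[] args) {
        Set<String> allNeighbors = Set.of("10.0.0.2:7000", "10.0.0.3:7000", "10.0.0.4:7000");
        Predicate<String> isAlive = ep -> !ep.startsWith("10.0.0.4");   // pretend .4 is down

        Set<String> actualNeighbors = allNeighbors.stream()
            .filter(isAlive)
            .collect(Collectors.toSet());

        // Dead participants are flagged as excluded only if the filter removed someone;
        // this mirrors the shouldExcludeDeadParticipants computation above.
        boolean excludedDeadParticipants = !allNeighbors.equals(actualNeighbors);
        System.out.println("live neighbors: " + actualNeighbors);
        System.out.println("excluded dead participants: " + excludedDeadParticipants);
    }
}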