use of alluxio.exception.status.FailedPreconditionException in project alluxio by Alluxio.
the class AbstractClient method retryRPCInternal.
private synchronized <V> V retryRPCInternal(RetryPolicy retryPolicy, RpcCallable<V> rpc, Supplier<Void> onRetry) throws AlluxioStatusException {
Exception ex = null;
while (retryPolicy.attempt()) {
if (mClosed) {
throw new FailedPreconditionException("Client is closed");
}
connect();
try {
return rpc.call();
} catch (StatusRuntimeException e) {
AlluxioStatusException se = AlluxioStatusException.fromStatusRuntimeException(e);
if (se.getStatusCode() == Status.Code.UNAVAILABLE || se.getStatusCode() == Status.Code.CANCELLED || se.getStatusCode() == Status.Code.UNAUTHENTICATED || e.getCause() instanceof UnresolvedAddressException) {
ex = se;
} else {
throw se;
}
}
LOG.debug("Rpc failed ({}): ", retryPolicy.getAttemptCount(), ex);
onRetry.get();
disconnect();
}
throw new UnavailableException("Failed after " + retryPolicy.getAttemptCount() + " attempts: " + ex.toString(), ex);
}
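The loop above retries only when the failure maps to one of the transient gRPC codes (UNAVAILABLE, CANCELLED, UNAUTHENTICATED) or an unresolved address, and rethrows everything else immediately. Below is a minimal, self-contained sketch of that pattern rather than Alluxio's actual API: the Rpc interface, the isRetryable check, and the simple attempt counter are hypothetical stand-ins for RpcCallable, the status-code test, and RetryPolicy.

import java.net.ConnectException;
import java.util.function.Supplier;

public final class RetryLoopSketch {

  /** Hypothetical stand-in for Alluxio's RpcCallable. */
  @FunctionalInterface
  interface Rpc<V> {
    V call() throws Exception;
  }

  /** Retries the call up to maxAttempts times, rethrowing non-retryable failures immediately. */
  static <V> V retry(int maxAttempts, Rpc<V> rpc, Supplier<Void> onRetry) throws Exception {
    Exception last = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        return rpc.call();
      } catch (Exception e) {
        if (!isRetryable(e)) {
          throw e; // mirrors the "throw se" branch: unexpected errors propagate at once
        }
        last = e;
        onRetry.get(); // hook for re-resolving the address, re-authenticating, etc.
      }
    }
    throw new Exception("Failed after " + maxAttempts + " attempts", last);
  }

  /** Placeholder for the UNAVAILABLE/CANCELLED/UNAUTHENTICATED check in the real method. */
  static boolean isRetryable(Exception e) {
    return e instanceof ConnectException;
  }

  public static void main(String[] args) throws Exception {
    System.out.println(retry(3, () -> "ok", () -> null));
  }
}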
use of alluxio.exception.status.FailedPreconditionException in project alluxio by Alluxio.
the class LoadDefinition method selectExecutors.
@Override
public Set<Pair<WorkerInfo, ArrayList<LoadTask>>> selectExecutors(LoadConfig config, List<WorkerInfo> jobWorkerInfoList, SelectExecutorsContext context) throws Exception {
Map<String, WorkerInfo> jobWorkersByAddress = jobWorkerInfoList.stream().collect(Collectors.toMap(info -> info.getAddress().getHost(), info -> info));
// Filter out workers which have no local job worker available.
List<String> missingJobWorkerHosts = new ArrayList<>();
List<BlockWorkerInfo> workers = new ArrayList<>();
for (BlockWorkerInfo worker : context.getFsContext().getCachedWorkers()) {
if (jobWorkersByAddress.containsKey(worker.getNetAddress().getHost())) {
String workerHost = worker.getNetAddress().getHost().toUpperCase();
if (!isEmptySet(config.getExcludedWorkerSet()) && config.getExcludedWorkerSet().contains(workerHost)) {
continue;
}
// If locality ids are specified, the candidate worker must match at least one of them
boolean match = false;
if (worker.getNetAddress().getTieredIdentity().getTiers() != null) {
if (!(isEmptySet(config.getLocalityIds()) && isEmptySet(config.getExcludedLocalityIds()))) {
boolean exclude = false;
for (LocalityTier tier : worker.getNetAddress().getTieredIdentity().getTiers()) {
if (!isEmptySet(config.getExcludedLocalityIds()) && config.getExcludedLocalityIds().contains(tier.getValue().toUpperCase())) {
exclude = true;
break;
}
if (!isEmptySet(config.getLocalityIds()) && config.getLocalityIds().contains(tier.getValue().toUpperCase())) {
match = true;
break;
}
}
if (exclude) {
continue;
}
}
}
// Accept the worker if the user specified neither a worker set nor locality ids, if a locality id matched, or if the worker is in the specified worker set
if ((isEmptySet(config.getWorkerSet()) && isEmptySet(config.getLocalityIds())) || match || (!isEmptySet(config.getWorkerSet()) && config.getWorkerSet().contains(workerHost))) {
workers.add(worker);
}
} else {
LOG.warn("Worker on host {} has no local job worker", worker.getNetAddress().getHost());
missingJobWorkerHosts.add(worker.getNetAddress().getHost());
}
}
// Mapping from worker to the block ids that worker is supposed to load.
Multimap<WorkerInfo, LoadTask> assignments = LinkedListMultimap.create();
AlluxioURI uri = new AlluxioURI(config.getFilePath());
for (FileBlockInfo blockInfo : context.getFileSystem().getStatus(uri).getFileBlockInfos()) {
List<BlockWorkerInfo> workersWithoutBlock = getWorkersWithoutBlock(workers, blockInfo);
int neededReplicas = config.getReplication() - blockInfo.getBlockInfo().getLocations().size();
if (workersWithoutBlock.size() < neededReplicas) {
String missingJobWorkersMessage = "";
if (!missingJobWorkerHosts.isEmpty()) {
missingJobWorkersMessage = ". The following workers could not be used because they have " + "no local job workers: " + missingJobWorkerHosts;
}
throw new FailedPreconditionException(String.format("Failed to find enough block workers to replicate to. Needed %s but only found %s. " + "Available workers without the block: %s" + missingJobWorkersMessage, neededReplicas, workersWithoutBlock.size(), workersWithoutBlock));
}
Collections.shuffle(workersWithoutBlock);
for (int i = 0; i < neededReplicas; i++) {
String address = workersWithoutBlock.get(i).getNetAddress().getHost();
WorkerInfo jobWorker = jobWorkersByAddress.get(address);
assignments.put(jobWorker, new LoadTask(blockInfo.getBlockInfo().getBlockId(), workersWithoutBlock.get(i).getNetAddress()));
}
}
Set<Pair<WorkerInfo, ArrayList<LoadTask>>> result = Sets.newHashSet();
for (Map.Entry<WorkerInfo, Collection<LoadTask>> assignment : assignments.asMap().entrySet()) {
Collection<LoadTask> loadTasks = assignment.getValue();
List<List<LoadTask>> partitionedTasks = CommonUtils.partition(Lists.newArrayList(loadTasks), JOBS_PER_WORKER);
for (List<LoadTask> tasks : partitionedTasks) {
if (!tasks.isEmpty()) {
result.add(new Pair<>(assignment.getKey(), Lists.newArrayList(tasks)));
}
}
}
return result;
}
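The tail of selectExecutors groups load tasks per job worker and then splits each worker's group into fixed-size batches of JOBS_PER_WORKER. As a rough, self-contained illustration of that grouping-and-partitioning step, the sketch below uses Guava's Lists.partition in place of Alluxio's CommonUtils.partition; the worker names, block ids, and batch size of 10 are hypothetical.

import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import java.util.List;

public final class AssignmentSketch {
  public static void main(String[] args) {
    int batchSize = 10; // hypothetical stand-in for JOBS_PER_WORKER
    Multimap<String, Long> assignments = LinkedListMultimap.create();
    // Assign block ids 0..24 to two workers round-robin.
    for (long blockId = 0; blockId < 25; blockId++) {
      assignments.put(blockId % 2 == 0 ? "worker-a" : "worker-b", blockId);
    }
    // Split each worker's block list into batches; each batch would become one task group.
    assignments.asMap().forEach((worker, blocks) -> {
      List<List<Long>> batches = Lists.partition(Lists.newArrayList(blocks), batchSize);
      batches.forEach(batch -> System.out.println(worker + " -> " + batch));
    });
  }
}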
use of alluxio.exception.status.FailedPreconditionException in project alluxio by Alluxio.
the class DefaultFileSystemMaster method deleteInternal.
/**
* Implements file deletion.
* <p>
* This method does not delete blocks. Instead, it returns deleted inodes so that their blocks can
* be deleted after the inode deletion journal entry has been written. We cannot delete blocks
* earlier because the inode deletion may fail, leaving us with an inode containing deleted blocks.
*
* @param rpcContext the rpc context
* @param inodePath the file {@link LockedInodePath}
* @param deleteContext the method options
*/
@VisibleForTesting
public void deleteInternal(RpcContext rpcContext, LockedInodePath inodePath, DeleteContext deleteContext) throws FileDoesNotExistException, IOException, DirectoryNotEmptyException, InvalidPathException {
Preconditions.checkState(inodePath.getLockPattern() == LockPattern.WRITE_EDGE);
// A crash after the UFS object is deleted but before the deletion is journaled will result in an inconsistency between Alluxio and UFS.
if (!inodePath.fullPathExists()) {
return;
}
long opTimeMs = System.currentTimeMillis();
Inode inode = inodePath.getInode();
if (inode == null) {
return;
}
boolean recursive = deleteContext.getOptions().getRecursive();
if (inode.isDirectory() && !recursive && mInodeStore.hasChildren(inode.asDirectory())) {
// Cannot delete a non-empty directory unless the delete is recursive
throw new DirectoryNotEmptyException(ExceptionMessage.DELETE_NONEMPTY_DIRECTORY_NONRECURSIVE, inode.getName());
}
if (mInodeTree.isRootId(inode.getId())) {
// The root cannot be deleted.
throw new InvalidPathException(ExceptionMessage.DELETE_ROOT_DIRECTORY.getMessage());
}
// Inodes for which deletion will be attempted
List<Pair<AlluxioURI, LockedInodePath>> inodesToDelete = new ArrayList<>();
// Add root of sub-tree to delete
inodesToDelete.add(new Pair<>(inodePath.getUri(), inodePath));
try (LockedInodePathList descendants = mInodeTree.getDescendants(inodePath)) {
for (LockedInodePath childPath : descendants) {
inodesToDelete.add(new Pair<>(mInodeTree.getPath(childPath.getInode()), childPath));
}
// Prepare to delete persisted inodes
UfsDeleter ufsDeleter = NoopUfsDeleter.INSTANCE;
if (!deleteContext.getOptions().getAlluxioOnly()) {
ufsDeleter = new SafeUfsDeleter(mMountTable, mInodeStore, inodesToDelete, deleteContext.getOptions().build());
}
// Inodes to delete from tree after attempting to delete from UFS
List<Pair<AlluxioURI, LockedInodePath>> revisedInodesToDelete = new ArrayList<>();
// Inodes that are not safe for recursive deletes
Set<Long> unsafeInodes = new HashSet<>();
// Alluxio URIs (and the reason for failure) which could not be deleted
List<Pair<String, String>> failedUris = new ArrayList<>();
// Walk the inodes children-first; if an inode is a file, we deal with the checkpoints and blocks as well.
for (int i = inodesToDelete.size() - 1; i >= 0; i--) {
rpcContext.throwIfCancelled();
Pair<AlluxioURI, LockedInodePath> inodePairToDelete = inodesToDelete.get(i);
AlluxioURI alluxioUriToDelete = inodePairToDelete.getFirst();
Inode inodeToDelete = inodePairToDelete.getSecond().getInode();
String failureReason = null;
if (unsafeInodes.contains(inodeToDelete.getId())) {
failureReason = ExceptionMessage.DELETE_FAILED_DIR_NONEMPTY.getMessage();
} else if (inodeToDelete.isPersisted()) {
// TODO(calvin): Add tests (ALLUXIO-1831)
if (mMountTable.isMountPoint(alluxioUriToDelete)) {
mMountTable.delete(rpcContext, alluxioUriToDelete, true);
} else {
if (!deleteContext.getOptions().getAlluxioOnly()) {
try {
checkUfsMode(alluxioUriToDelete, OperationType.WRITE);
// Attempt to delete node if all children were deleted successfully
ufsDeleter.delete(alluxioUriToDelete, inodeToDelete);
} catch (AccessControlException | IOException e) {
// In case ufs is not writable, we will still attempt to delete other entries
// if any as they may be from a different mount point
LOG.warn("Failed to delete {}: {}", alluxioUriToDelete, e.toString());
failureReason = e.getMessage();
}
}
}
}
if (failureReason == null) {
if (inodeToDelete.isFile()) {
long fileId = inodeToDelete.getId();
// Remove the file from the set of files to persist.
mPersistRequests.remove(fileId);
// Cancel any ongoing jobs.
PersistJob job = mPersistJobs.get(fileId);
if (job != null) {
job.setCancelState(PersistJob.CancelState.TO_BE_CANCELED);
}
}
revisedInodesToDelete.add(new Pair<>(alluxioUriToDelete, inodePairToDelete.getSecond()));
} else {
unsafeInodes.add(inodeToDelete.getId());
// Propagate 'unsafe-ness' to parent as one of its descendants can't be deleted
unsafeInodes.add(inodeToDelete.getParentId());
failedUris.add(new Pair<>(alluxioUriToDelete.toString(), failureReason));
}
}
if (mSyncManager.isSyncPoint(inodePath.getUri())) {
mSyncManager.stopSyncAndJournal(RpcContext.NOOP, inodePath.getUri());
}
// Delete Inodes
for (Pair<AlluxioURI, LockedInodePath> delInodePair : revisedInodesToDelete) {
LockedInodePath tempInodePath = delInodePair.getSecond();
MountTable.Resolution resolution = mMountTable.resolve(tempInodePath.getUri());
mInodeTree.deleteInode(rpcContext, tempInodePath, opTimeMs);
if (deleteContext.getOptions().getAlluxioOnly()) {
Metrics.getUfsOpsSavedCounter(resolution.getUfsMountPointUri(), Metrics.UFSOps.DELETE_FILE).inc();
}
}
if (!failedUris.isEmpty()) {
Collection<String> messages = failedUris.stream().map(pair -> String.format("%s (%s)", pair.getFirst(), pair.getSecond())).collect(Collectors.toList());
throw new FailedPreconditionException(ExceptionMessage.DELETE_FAILED_UFS.getMessage(StringUtils.join(messages, ", ")));
}
}
Metrics.PATHS_DELETED.inc(inodesToDelete.size());
}
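One detail worth noting in deleteInternal is the unsafeInodes bookkeeping: the inode list is walked children-first, and a child that cannot be deleted marks its parent id as unsafe so the parent is skipped instead of being deleted out from under a surviving descendant. The sketch below illustrates only that propagation; the Node class and its deletable flag are hypothetical, not Alluxio's Inode API.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public final class UnsafeDeleteSketch {
  /** Hypothetical stand-in for an inode: an id, a parent id, and whether deletion would succeed. */
  static final class Node {
    final long mId;
    final long mParentId;
    final boolean mDeletable;

    Node(long id, long parentId, boolean deletable) {
      mId = id;
      mParentId = parentId;
      mDeletable = deletable;
    }
  }

  public static void main(String[] args) {
    // Listed root-first, like inodesToDelete; iterate in reverse so children come first.
    List<Node> nodes = new ArrayList<>();
    nodes.add(new Node(1, -1, true));  // root of the sub-tree
    nodes.add(new Node(2, 1, true));   // directory under the root
    nodes.add(new Node(3, 2, false));  // leaf whose deletion fails
    Set<Long> unsafe = new HashSet<>();
    for (int i = nodes.size() - 1; i >= 0; i--) {
      Node n = nodes.get(i);
      if (unsafe.contains(n.mId) || !n.mDeletable) {
        unsafe.add(n.mId);
        unsafe.add(n.mParentId); // propagate "unsafe-ness" to the parent
        System.out.println("skip " + n.mId);
      } else {
        System.out.println("delete " + n.mId);
      }
    }
  }
}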
use of alluxio.exception.status.FailedPreconditionException in project alluxio by Alluxio.
the class AbstractClient method connect.
/**
* Connects with the remote.
*/
@Override
public synchronized void connect() throws AlluxioStatusException {
if (mConnected) {
return;
}
disconnect();
Preconditions.checkState(!mClosed, "Client is closed, will not try to connect.");
IOException lastConnectFailure = null;
RetryPolicy retryPolicy = mRetryPolicySupplier.get();
while (retryPolicy.attempt()) {
if (mClosed) {
throw new FailedPreconditionException("Failed to connect: client has been closed");
}
// Re-query the address on each attempt, since it may have changed (e.g. after a failover).
try {
mAddress = getAddress();
} catch (UnavailableException e) {
LOG.debug("Failed to determine {} rpc address ({}): {}", getServiceName(), retryPolicy.getAttemptCount(), e.toString());
continue;
}
try {
beforeConnect();
LOG.debug("Alluxio client (version {}) is trying to connect with {} @ {}", RuntimeConstants.VERSION, getServiceName(), mAddress);
mChannel = GrpcChannelBuilder.newBuilder(GrpcServerAddress.create(mAddress), mContext.getClusterConf()).setSubject(mContext.getSubject()).setClientType(getServiceName()).build();
// Create stub for version service on host
mVersionService = ServiceVersionClientServiceGrpc.newBlockingStub(mChannel);
mConnected = true;
afterConnect();
checkVersion(getServiceVersion());
LOG.debug("Alluxio client (version {}) is connected with {} @ {}", RuntimeConstants.VERSION, getServiceName(), mAddress);
return;
} catch (IOException e) {
LOG.debug("Failed to connect ({}) with {} @ {}", retryPolicy.getAttemptCount(), getServiceName(), mAddress, e);
lastConnectFailure = e;
if (e instanceof UnauthenticatedException) {
// If there has been a failure in opening GrpcChannel, it's possible because
// the authentication credential has expired. Relogin.
mContext.getUserState().relogin();
}
if (e instanceof NotFoundException) {
// the service is not found on the server, so skip further retries
break;
}
}
}
if (mChannel != null) {
mChannel.shutdown();
}
if (mAddress == null) {
throw new UnavailableException(String.format("Failed to determine address for %s after %s attempts", getServiceName(), retryPolicy.getAttemptCount()));
}
/*
* Throw as-is if {@link UnauthenticatedException} occurred.
*/
if (lastConnectFailure instanceof UnauthenticatedException) {
throw (AlluxioStatusException) lastConnectFailure;
}
if (lastConnectFailure instanceof NotFoundException) {
throw new NotFoundException(lastConnectFailure.getMessage(), new ServiceNotFoundException(lastConnectFailure.getMessage(), lastConnectFailure));
}
throw new UnavailableException(String.format("Failed to connect to master (%s) after %s attempts." + "Please check if Alluxio master is currently running on \"%s\". Service=\"%s\"", mAddress, retryPolicy.getAttemptCount(), mAddress, getServiceName()), lastConnectFailure);
}
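The overall shape of connect() is: re-resolve the server address on every attempt (it can change after a failover), treat a missing service as terminal, and otherwise keep retrying until the policy gives up. Here is a minimal sketch of that loop under stated assumptions: the Resolver and Dialer interfaces and the doubling backoff are hypothetical stand-ins for getAddress(), the gRPC channel setup, and Alluxio's RetryPolicy, and FileNotFoundException stands in for NotFoundException.

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.concurrent.TimeUnit;

public final class ConnectLoopSketch {
  /** Hypothetical address lookup; in the real client this is getAddress(). */
  interface Resolver {
    String resolve() throws IOException;
  }

  /** Hypothetical channel setup; in the real client this is the gRPC channel build. */
  interface Dialer {
    void dial(String address) throws IOException;
  }

  static void connect(Resolver resolver, Dialer dialer, int maxAttempts)
      throws IOException, InterruptedException {
    IOException lastFailure = null;
    long backoffMs = 50;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
      String address;
      try {
        address = resolver.resolve(); // re-resolve each time; it may change after a failover
      } catch (IOException e) {
        continue; // no address yet; count the attempt and try again
      }
      try {
        dialer.dial(address);
        return; // connected
      } catch (FileNotFoundException e) {
        lastFailure = e;
        break; // terminal: the service does not exist, retrying cannot help
      } catch (IOException e) {
        lastFailure = e;
        TimeUnit.MILLISECONDS.sleep(backoffMs);
        backoffMs = Math.min(backoffMs * 2, 1_000); // simple doubling backoff
      }
    }
    throw new IOException("Failed to connect after " + maxAttempts + " attempts", lastFailure);
  }
}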
use of alluxio.exception.status.FailedPreconditionException in project alluxio by Alluxio.
the class JournalBackupIntegrationTest method backupDelegationProtocol.
// Tests various protocols and configurations for backup delegation.
@Test
public void backupDelegationProtocol() throws Exception {
mCluster = MultiProcessCluster.newBuilder(PortCoordination.BACKUP_DELEGATION_PROTOCOL).setClusterName("backupDelegationProtocol").setNumMasters(3).addProperty(PropertyKey.MASTER_JOURNAL_TYPE, JournalType.UFS.toString()).addProperty(PropertyKey.ZOOKEEPER_SESSION_TIMEOUT, "1sec").addProperty(PropertyKey.MASTER_BACKUP_CONNECT_INTERVAL_MIN, "100ms").addProperty(PropertyKey.MASTER_BACKUP_CONNECT_INTERVAL_MAX, "100ms").addProperty(PropertyKey.MASTER_BACKUP_DELEGATION_ENABLED, "true").build();
File backups = AlluxioTestDirectory.createTemporaryDirectory("backups");
mCluster.start();
// Validate backup works with delegation.
waitForBackup(BackupPRequest.newBuilder().setTargetDirectory(backups.getAbsolutePath()).setOptions(BackupPOptions.newBuilder().setLocalFileSystem(false)).build());
// Kill the primary.
int primaryIdx = mCluster.getPrimaryMasterIndex(GET_PRIMARY_INDEX_TIMEOUT_MS);
mCluster.waitForAndKillPrimaryMaster(PRIMARY_KILL_TIMEOUT_MS);
// Validate backup works again after leader fail-over.
waitForBackup(BackupPRequest.newBuilder().setTargetDirectory(backups.getAbsolutePath()).setOptions(BackupPOptions.newBuilder().setLocalFileSystem(false)).build());
// Continue testing with 2 masters...
// Find standby master index.
int newPrimaryIdx = mCluster.getPrimaryMasterIndex(GET_PRIMARY_INDEX_TIMEOUT_MS);
int followerIdx = (newPrimaryIdx + 1) % 2;
if (followerIdx == primaryIdx) {
followerIdx = (followerIdx + 1) % 2;
}
// Kill the follower. (only leader remains).
mCluster.stopMaster(followerIdx);
// Wait a second for the process to terminate properly.
// This is so that backup requests don't get delegated to the follower before termination.
Thread.sleep(1000);
// Validate backup delegation fails.
try {
mCluster.getMetaMasterClient().backup(BackupPRequest.newBuilder().setTargetDirectory(backups.getAbsolutePath()).setOptions(BackupPOptions.newBuilder().setLocalFileSystem(false)).build());
Assert.fail("Cannot delegate backup with no followers.");
} catch (FailedPreconditionException e) {
// Expected to fail since there is only a single master.
}
// Should work with "AllowLeader" backup.
mCluster.getMetaMasterClient().backup(BackupPRequest.newBuilder().setTargetDirectory(backups.getAbsolutePath()).setOptions(BackupPOptions.newBuilder().setLocalFileSystem(false).setAllowLeader(true)).build());
// Restart the follower. (1 leader 1 follower remains).
mCluster.startMaster(followerIdx);
// Validate backup works again.
waitForBackup(BackupPRequest.newBuilder().setTargetDirectory(backups.getAbsolutePath()).setOptions(BackupPOptions.newBuilder().setLocalFileSystem(false)).build());
// Schedule async backup.
UUID backupId = mCluster.getMetaMasterClient().backup(BackupPRequest.newBuilder().setTargetDirectory(backups.getAbsolutePath()).setOptions(BackupPOptions.newBuilder().setLocalFileSystem(false).setRunAsync(true)).build()).getBackupId();
// Wait until backup is complete.
CommonUtils.waitFor("Backup completed.", () -> {
try {
return mCluster.getMetaMasterClient().getBackupStatus(backupId).getState().equals(BackupState.Completed);
} catch (Exception e) {
throw new RuntimeException(String.format("Unexpected error while getting backup status: %s", e.toString()));
}
});
// Schedule a local backup to overwrite the latest backup id in the current leader.
mCluster.getMetaMasterClient().backup(BackupPRequest.newBuilder().setTargetDirectory(backups.getAbsolutePath()).setOptions(BackupPOptions.newBuilder().setLocalFileSystem(false).setAllowLeader(true)).build());
// Validate old backup can still be queried.
mCluster.getMetaMasterClient().getBackupStatus(backupId);
mCluster.notifySuccess();
}
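For the expected-failure step in the middle of this test, JUnit 4.13's Assert.assertThrows expresses the same check more compactly than the try/fail/catch block. A hedged sketch, assuming JUnit 4.13+ is available; doBackup() is a hypothetical helper standing in for the mCluster.getMetaMasterClient().backup(...) call with only one live master.

import static org.junit.Assert.assertThrows;

import alluxio.exception.status.FailedPreconditionException;
import org.junit.Test;

public class BackupDelegationAssertionSketch {
  // Hypothetical helper: in the real test this would issue the backup request against a
  // cluster whose only remaining master is the leader.
  private void doBackup() throws Exception {
    throw new FailedPreconditionException("no standby master available to delegate the backup to");
  }

  @Test
  public void backupFailsWithoutFollowers() {
    // Equivalent to the try { backup(...); Assert.fail(...); } catch (FailedPreconditionException e) pattern.
    assertThrows(FailedPreconditionException.class, this::doBackup);
  }
}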