Search in sources :

Example 1 with NonTransientException

use of com.microsoft.frameworklauncher.common.exceptions.NonTransientException in project pai by Microsoft.

the class RequestManager method pullRequest.

/**
 * Pulls the latest LauncherRequest and AggregatedFrameworkRequest from zkStore and
 * applies them to the local state through the update* helpers, in that order.
 *
 * @throws NonTransientException if the AggregatedFrameworkRequest node no longer exists
 *                               (zkStore signals this with a NoNodeException)
 * @throws Exception             any other failure raised by zkStore or by the update helpers
 */
private void pullRequest() throws Exception {
    // Step 1: LauncherRequest.
    LOGGER.logDebug("Pulling LauncherRequest");
    final LauncherRequest pulledLauncherRequest = zkStore.getLauncherRequest();
    LOGGER.logDebug("Pulled LauncherRequest");
    // The pulled LauncherRequest is always not null, so it is applied without a check.
    updateLauncherRequest(pulledLauncherRequest);

    // Step 2: AggregatedFrameworkRequest for the configured Framework.
    final AggregatedFrameworkRequest pulledAggRequest;
    try {
        LOGGER.logDebug("Pulling AggregatedFrameworkRequest");
        pulledAggRequest = zkStore.getAggregatedFrameworkRequest(conf.getFrameworkName());
        LOGGER.logDebug("Pulled AggregatedFrameworkRequest");
    } catch (NoNodeException e) {
        // Reset the local marker before reporting that the request is gone.
        existsLocalVersionFrameworkRequest = 0;
        throw new NonTransientException("Failed to getAggregatedFrameworkRequest, FrameworkRequest is already deleted on ZK", e);
    }

    // Step 3: apply the pulled FrameworkDescriptor (always not null): check, flatten, then store.
    final FrameworkDescriptor pulledDescriptor = pulledAggRequest.getFrameworkRequest().getFrameworkDescriptor();
    checkFrameworkVersion(pulledDescriptor);
    flattenFrameworkDescriptor(pulledDescriptor);
    updateFrameworkDescriptor(pulledDescriptor);

    // Step 4: apply the remaining parts of the aggregated request.
    updateOverrideApplicationProgressRequest(pulledAggRequest.getOverrideApplicationProgressRequest());
    updateMigrateTaskRequests(pulledAggRequest.getMigrateTaskRequests());
}
Also used : NonTransientException(com.microsoft.frameworklauncher.common.exceptions.NonTransientException) NoNodeException(org.apache.zookeeper.KeeperException.NoNodeException)

Example 2 with NonTransientException

use of com.microsoft.frameworklauncher.common.exceptions.NonTransientException in project pai by Microsoft.

the class StatusManager method recover.

/**
 * Recovers the in-memory status maps from the AggregatedFrameworkStatus stored in zkStore.
 *
 * <p>Phase 1 loads the aggregated status and checks that every TaskRoleStatus and
 * TaskStatuses entry carries the same FrameworkVersion as {@code conf}. Phase 2 copies
 * the loaded entries into {@code taskRoleStatuses} / {@code taskStatuseses}, marks them
 * as unchanged, and registers an extension TaskStatus for every task index.
 *
 * @throws NonTransientException if zkStore reports NoNodeException for the FrameworkStatus
 * @throws KeeperException       rethrown unchanged when raised while loading
 * @throws Exception             anything raised by {@code super.recover()} or by the
 *                               cleanup call {@code zkStore.deleteFrameworkStatus}
 */
@Override
protected void recover() throws Exception {
    super.recover();
    AggregatedFrameworkStatus aggFrameworkStatus;
    try {
        // Phase 1: load the persisted status and validate its FrameworkVersion per TaskRole.
        aggFrameworkStatus = zkStore.getAggregatedFrameworkStatus(conf.getFrameworkName());
        for (Map.Entry<String, AggregatedTaskRoleStatus> aggTaskRoleStatus : aggFrameworkStatus.getAggregatedTaskRoleStatuses().entrySet()) {
            String taskRoleName = aggTaskRoleStatus.getKey();
            TaskRoleStatus taskRoleStatus = aggTaskRoleStatus.getValue().getTaskRoleStatus();
            TaskStatuses taskStatuses = aggTaskRoleStatus.getValue().getTaskStatuses();
            // exit either due to AM RM heartbeat or pushStatus.existsLocalVersionFrameworkRequest.
            // NOTE(review): the comment above looks truncated in this excerpt; confirm against the full source.
            // NOTE(review): the NonTransientExceptions thrown by the two version checks below are
            // raised inside this try block. Unless NonTransientException is a KeeperException
            // (it does not appear to be), they are caught by the generic catch (Exception e)
            // further down, which logs, deletes the FrameworkStatus and continues instead of
            // propagating. Confirm whether that is intended.
            if (!taskRoleStatus.getFrameworkVersion().equals(conf.getFrameworkVersion())) {
                throw new NonTransientException(String.format("[%s]: FrameworkVersion mismatch: Local Version %s, Previous TaskRoleStatus Version %s", taskRoleName, conf.getFrameworkVersion(), taskRoleStatus.getFrameworkVersion()));
            }
            if (!taskStatuses.getFrameworkVersion().equals(conf.getFrameworkVersion())) {
                throw new NonTransientException(String.format("[%s]: FrameworkVersion mismatch: Local Version %s, Previous TaskStatuses Version %s", taskRoleName, conf.getFrameworkVersion(), taskStatuses.getFrameworkVersion()));
            }
        }
    } catch (KeeperException.NoNodeException e) {
        // The status node is gone: reported as non-transient, with the cause preserved.
        throw new NonTransientException("Failed to getAggregatedFrameworkStatus, FrameworkStatus is already deleted on ZK", e);
    } catch (KeeperException e) {
        // Every other KeeperException is passed to the caller untouched.
        throw e;
    } catch (Exception e) {
        // Any other failure: drop the stored FrameworkStatus and skip Phase 2 below.
        LOGGER.logError(e, "Failed to recover %s. Reinitializing all TaskRoleStatuses and TaskStatuseses in the Framework on ZK.", serviceName);
        zkStore.deleteFrameworkStatus(conf.getFrameworkName(), true);
        aggFrameworkStatus = null;
    }
    if (aggFrameworkStatus != null) {
        // Phase 2: populate the in-memory maps from the validated status.
        for (Map.Entry<String, AggregatedTaskRoleStatus> aggTaskRoleStatus : aggFrameworkStatus.getAggregatedTaskRoleStatuses().entrySet()) {
            String taskRoleName = aggTaskRoleStatus.getKey();
            TaskRoleStatus taskRoleStatus = aggTaskRoleStatus.getValue().getTaskRoleStatus();
            TaskStatuses taskStatuses = aggTaskRoleStatus.getValue().getTaskStatuses();
            taskRoleStatuses.put(taskRoleName, taskRoleStatus);
            taskStatuseses.put(taskRoleName, taskStatuses);
            // Freshly loaded entries are flagged as not changed.
            taskRoleStatusesChanged.put(taskRoleName, false);
            taskStatusesesChanged.put(taskRoleName, false);
            List<TaskStatus> taskStatusArray = taskStatuses.getTaskStatusArray();
            // One extension TaskStatus per task index of this TaskRole.
            for (int taskIndex = 0; taskIndex < taskStatusArray.size(); taskIndex++) {
                addExtensionTaskStatus(new TaskStatusLocator(taskRoleName, taskIndex));
            }
        }
        LOGGER.logInfo("Succeeded to recover %s.", serviceName);
    }
// Here ZK and Mem Status is the same.
// Since Request may be ahead of Status even when Running,
// so here the Recovery of AM StatusManager is completed.
}
Also used : NonTransientException(com.microsoft.frameworklauncher.common.exceptions.NonTransientException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) NonTransientException(com.microsoft.frameworklauncher.common.exceptions.NonTransientException) NotAvailableException(com.microsoft.frameworklauncher.common.exceptions.NotAvailableException) KeeperException(org.apache.zookeeper.KeeperException)

Example 3 with NonTransientException

use of com.microsoft.frameworklauncher.common.exceptions.NonTransientException in project pai by Microsoft.

the class HadoopUtils method convertToLocalResource.

/**
 * Builds a YARN LocalResource descriptor for the given HDFS path.
 *
 * <p>The resource type is ARCHIVE when the path ends with {@code .zip}, {@code .tgz},
 * {@code .tar} or {@code .tar.gz} (case-insensitive), and FILE otherwise.
 *
 * @param hdfsPath   path of the resource; trailing HDFS_PATH_SEPARATOR characters are stripped
 * @param visibility visibility passed through to the created LocalResource
 * @return the LocalResource built from the resolved path, its length and modification time
 * @throws NonTransientException if an IllegalArgumentException is raised for the path
 * @throws Exception             any other failure from the file status lookup or path resolution
 */
private static LocalResource convertToLocalResource(String hdfsPath, LocalResourceVisibility visibility) throws Exception {
    // Directory resource path must not end with /, otherwise localization will hang.
    hdfsPath = StringUtils.stripEnd(hdfsPath, HDFS_PATH_SEPARATOR);
    // Match on the path suffix instead of FilenameUtils.getExtension(): that helper returns
    // the text after the LAST dot and WITHOUT the dot ("zip", "gz"), so comparing its result
    // against ".zip" / ".tar.gz" never matched and every resource was typed as FILE.
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
    String lowerCasePath = hdfsPath.toLowerCase(java.util.Locale.ROOT);
    LocalResourceType type;
    if (lowerCasePath.endsWith(".zip")
            || lowerCasePath.endsWith(".tgz")
            || lowerCasePath.endsWith(".tar")
            || lowerCasePath.endsWith(".tar.gz")) {
        type = LocalResourceType.ARCHIVE;
    } else {
        type = LocalResourceType.FILE;
    }
    // Applications' Containers on the same node write the same data in the resource directory.
    try {
        FileStatus fileStatus = getFileStatusInHdfs(hdfsPath);
        FileContext fileContext = FileContext.getFileContext(conf);
        return LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(fileContext.getDefaultFileSystem().resolvePath(fileStatus.getPath())), type, visibility, fileStatus.getLen(), fileStatus.getModificationTime());
    } catch (IllegalArgumentException e) {
        // hdfsPath may be from user, so it may be illegal.
        throw new NonTransientException("Path is illegal.", e);
    }
}
Also used : NonTransientException(com.microsoft.frameworklauncher.common.exceptions.NonTransientException)

Example 4 with NonTransientException

use of com.microsoft.frameworklauncher.common.exceptions.NonTransientException in project pai by Microsoft.

the class HadoopUtils method makeDirInHdfs.

/**
 * Creates the directory {@code hdfsPath} (logged as {@code hadoop fs -mkdir -p}).
 *
 * <p>Succeeds when the hdfsPath and its parent paths are directories.
 * Note if parent directories do not exist, they will be created.
 *
 * @param hdfsPath directory path to create
 * @throws NonTransientException if the failure message contains "not a directory"
 *                               (case-insensitive); the original exception is kept as cause
 * @throws Exception             any other failure, rethrown unchanged
 */
public static void makeDirInHdfs(String hdfsPath) throws Exception {
    try {
        FileSystem fs = FileSystem.get(conf);
        LOGGER.logInfo("[hadoop fs -mkdir -p %s]", hdfsPath);
        fs.mkdirs(new Path(hdfsPath));
    } catch (Exception e) {
        // getMessage() may be null; without this guard a NullPointerException thrown here
        // would replace (and hide) the real failure.
        String message = e.getMessage();
        if (message != null && message.toLowerCase(java.util.Locale.ROOT).contains("not a directory")) {
            throw new NonTransientException("Path is not a directory", e);
        }
        throw e;
    }
}
Also used : NonTransientException(com.microsoft.frameworklauncher.common.exceptions.NonTransientException) UndeclaredThrowableException(java.lang.reflect.UndeclaredThrowableException) FileNotFoundException(java.io.FileNotFoundException) NonTransientException(com.microsoft.frameworklauncher.common.exceptions.NonTransientException) ApplicationNotFoundException(org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException)

Example 5 with NonTransientException

use of com.microsoft.frameworklauncher.common.exceptions.NonTransientException in project pai by Microsoft.

the class Service method handleException.

/**
 * Handles an exception raised in this service by logging it and calling {@code stop}.
 *
 * <p>A NonTransientException stops the service with LAUNCHER_INTERNAL_NON_TRANSIENT_ERROR;
 * every other exception is logged as presumably transient and stops the service with
 * LAUNCHER_INTERNAL_UNKNOWN_ERROR.
 *
 * @param e the exception to handle
 * @return {@code false} when {@code e} is a NonTransientException, {@code true} otherwise
 */
@Override
protected Boolean handleException(Exception e) {
    super.handleException(e);

    // Guard clause: everything that is not explicitly non-transient takes the restart path.
    if (!(e instanceof NonTransientException)) {
        LOGGER.logError(e, "Exception occurred in %1$s. It should be transient. Will restart %1$s inplace.", serviceName);
        // TODO: Only Restart Service instead of exit whole process and Restart by external system.
        int unknownErrorCode = ExitStatusKey.LAUNCHER_INTERNAL_UNKNOWN_ERROR.toInt();
        stop(new StopStatus(unknownErrorCode, false, null, e));
        return true;
    }

    LOGGER.logError(e, "NonTransientException occurred in %1$s. %1$s will be stopped.", serviceName);
    int nonTransientErrorCode = ExitStatusKey.LAUNCHER_INTERNAL_NON_TRANSIENT_ERROR.toInt();
    stop(new StopStatus(nonTransientErrorCode, true, null, e));
    return false;
}
Also used : NonTransientException(com.microsoft.frameworklauncher.common.exceptions.NonTransientException) StopStatus(com.microsoft.frameworklauncher.common.service.StopStatus)

Aggregations

NonTransientException (com.microsoft.frameworklauncher.common.exceptions.NonTransientException)9 FileNotFoundException (java.io.FileNotFoundException)3 StopStatus (com.microsoft.frameworklauncher.common.service.StopStatus)2 UndeclaredThrowableException (java.lang.reflect.UndeclaredThrowableException)2 ApplicationNotFoundException (org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException)2 NoNodeException (org.apache.zookeeper.KeeperException.NoNodeException)2 NotAvailableException (com.microsoft.frameworklauncher.common.exceptions.NotAvailableException)1 IOException (java.io.IOException)1 KeeperException (org.apache.zookeeper.KeeperException)1