Search in sources :

Example 6 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class MasterService method psError.

/**
 * Handles a failure report sent by a parameter server attempt.
 * <p>
 * The reporting attempt is removed from the heartbeat monitor map, its error message is
 * forwarded as a diagnostics update event, and a {@code PA_FAILMSG} event is dispatched
 * for the attempt.
 *
 * @param controller rpc controller of protobuf (not used by this method)
 * @param request carries the parameter server attempt id and the error message
 * @return an empty {@code PSErrorResponse}
 * @throws ServiceException declared by the signature; this body does not throw it explicitly
 */
@SuppressWarnings("unchecked")
@Override
public PSErrorResponse psError(RpcController controller, PSErrorRequest request) throws ServiceException {
    final PSAttemptId failedAttempt = ProtobufUtil.convertToId(request.getPsAttemptId());
    final String errorMsg = request.getMsg();
    LOG.info("error happened in psAttempt " + failedAttempt + " error msg=" + errorMsg);

    // the attempt has failed, so stop tracking its heartbeats
    psLastHeartbeatTS.remove(failedAttempt);

    // publish the diagnostics first, then the failure event itself
    context.getEventHandler().handle(new PSAttemptDiagnosticsUpdateEvent(errorMsg, failedAttempt));
    context.getEventHandler().handle(new PSAttemptEvent(PSAttemptEventType.PA_FAILMSG, failedAttempt));

    return PSErrorResponse.newBuilder().build();
}
Also used : PSAttemptId(com.tencent.angel.ps.PSAttemptId)

Example 7 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class MasterService method psRegister.

/**
 * Answers a register request from a parameter server attempt.
 * <p>
 * If the attempt is present in the heartbeat monitor map, a register event carrying the
 * reported location is dispatched and {@code PSCOMMAND_OK} is returned. Otherwise the
 * currently monitored attempt ids are logged and {@code PSCOMMAND_SHUTDOWN} is returned.
 *
 * @param controller rpc controller of protobuf (not used by this method)
 * @param request register request holding the attempt id and its location
 * @return response whose command is either {@code PSCOMMAND_OK} or {@code PSCOMMAND_SHUTDOWN}
 * @throws ServiceException declared by the signature; this body does not throw it explicitly
 */
@SuppressWarnings("unchecked")
@Override
public PSRegisterResponse psRegister(RpcController controller, PSRegisterRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps register request. request=" + request);
    }

    PSAttemptId registeringAttempt = ProtobufUtil.convertToId(request.getPsAttemptId());
    PSRegisterResponse.Builder response = PSRegisterResponse.newBuilder();

    if (psLastHeartbeatTS.containsKey(registeringAttempt)) {
        // monitored attempt: hand its location over to the attempt via a register event
        Location reportedLocation = new Location(request.getLocation().getIp(), request.getLocation().getPort());
        context.getEventHandler().handle(new PSAttemptRegisterEvent(registeringAttempt, reportedLocation));
        LOG.info(registeringAttempt + " is registered now!");
        response.setPsCommand(PSCommandProto.PSCOMMAND_OK);
    } else {
        // attempt is not in the monitor map: log what is monitored and ask it to shut down
        LOG.info(registeringAttempt + " doesn't exists!");
        for (PSAttemptId monitoredAttempt : psLastHeartbeatTS.keySet()) {
            LOG.info("contains psKey: " + monitoredAttempt);
        }
        response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    }

    LOG.info(registeringAttempt + " register finished!");
    return response.build();
}
Also used : PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 8 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class MasterService method psReport.

/**
 * Answers a heartbeat (report) request from a parameter server attempt.
 * <p>
 * For an attempt found in the heartbeat monitor map the last-heartbeat timestamp is refreshed,
 * a state update event with the reported metrics is dispatched, and the response command is
 * {@code PSCOMMAND_COMMIT} (with the matrices/partitions to save) when the parameter server
 * manager says commit is possible, or {@code PSCOMMAND_OK} otherwise. For an attempt that is
 * not monitored the command is {@code PSCOMMAND_SHUTDOWN}.
 * <p>
 * In both cases the failed-report counters are forwarded to the parameter server manager and
 * the matrix reports are synchronised against the master's matrix metadata; the resulting
 * create / release / recover lists are added to the response.
 *
 * @param controller rpc controller of protobuf (not used by this method)
 * @param request heartbeat request
 * @return response carrying the command and the matrix synchronisation instructions
 * @throws ServiceException declared by the signature; this body does not throw it explicitly
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps heartbeat request. request=" + request);
    }

    // flatten the reported counters into a key -> value map
    Map<String, String> metrics = new HashMap<String, String>();
    for (Pair metric : request.getMetricsList()) {
        metrics.put(metric.getKey(), metric.getValue());
    }

    PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    PSReportResponse.Builder response = PSReportResponse.newBuilder();

    if (psLastHeartbeatTS.containsKey(psAttemptId)) {
        // record this heartbeat
        psLastHeartbeatTS.put(psAttemptId, System.currentTimeMillis());

        // pass the reported metrics on to the PSAttempt
        context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metrics));

        if (!context.getParameterServerManager().psCanCommit()) {
            response.setPsCommand(PSCommandProto.PSCOMMAND_OK);
        } else {
            // commit is possible: tell the ps which matrices and partitions to save
            List<Integer> commitMatrixIds = context.getParameterServerManager().getNeedCommitMatrixIds();
            LOG.info("notify ps" + psAttemptId + " to commit now! commit matrices:" + StringUtils.joinInts(",", commitMatrixIds));
            response.setPsCommand(PSCommandProto.PSCOMMAND_COMMIT);

            NeedSaveMatrixProto.Builder saveBuilder = NeedSaveMatrixProto.newBuilder();
            for (int matrixId : commitMatrixIds) {
                saveBuilder.setMatrixId(matrixId);
                saveBuilder.addAllPartIds(context.getMatrixMetaManager().getMasterPartsInPS(matrixId, psAttemptId.getPsId()));
                response.addNeedSaveMatrices(saveBuilder.build());
                // reuse the same builder for the next matrix
                saveBuilder.clear();
            }
        }
    } else {
        // attempt is not in the monitor map: ask it to shut down
        LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
        response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    }

    // forward the failed-report counters
    context.getParameterServerManager().psFailedReports(ProtobufUtil.convert(request.getPsFailedReports()));

    // Reconcile matrix metadata between master and parameter server:
    //  - matrix known to the master but missing on the ps -> ps must create it
    //  - matrix present on the ps but unknown to the master -> ps must release it
    // syncMatrixInfos fills the three output lists below.
    List<MatrixReport> matrixReports = ProtobufUtil.convertToMatrixReports(request.getMatrixReportsList());
    List<MatrixMeta> needCreateMatrices = new ArrayList<>();
    List<Integer> needReleaseMatrices = new ArrayList<>();
    List<RecoverPartKey> needRecoverParts = new ArrayList<>();
    context.getMatrixMetaManager().syncMatrixInfos(matrixReports, needCreateMatrices, needReleaseMatrices, needRecoverParts, psAttemptId.getPsId());

    for (MatrixMeta matrixToCreate : needCreateMatrices) {
        response.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(matrixToCreate));
    }
    for (Integer matrixIdToRelease : needReleaseMatrices) {
        response.addNeedReleaseMatrixIds(matrixIdToRelease);
    }
    for (RecoverPartKey partToRecover : needRecoverParts) {
        response.addNeedRecoverParts(ProtobufUtil.convert(partToRecover));
    }

    return response.build();
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PSAttemptId(com.tencent.angel.ps.PSAttemptId) MatrixReport(com.tencent.angel.ml.matrix.MatrixReport)

Example 9 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class LocalContainerAllocator method requestContainer.

/**
 * Creates a {@code LocalContainer} for the task named in the event and dispatches the
 * matching "container assigned" event for the id's concrete type (parameter server attempt,
 * ps agent attempt or worker attempt). Ids of any other type produce no event.
 *
 * @param event allocator event carrying the task id that requested a container
 */
@SuppressWarnings("unchecked")
private void requestContainer(ContainerAllocatorEvent event) {
    LocalContainer container = new LocalContainer();
    Id taskId = event.getTaskId();

    if (taskId instanceof PSAttemptId) {
        PSAttemptId psAttemptId = (PSAttemptId) taskId;
        context.getEventHandler().handle(new PSAttemptContainerAssignedEvent(psAttemptId, container));
        return;
    }

    if (taskId instanceof PSAgentAttemptId) {
        PSAgentAttemptId psAgentAttemptId = (PSAgentAttemptId) taskId;
        context.getEventHandler().handle(new PSAgentAttemptContainerAssignedEvent(psAgentAttemptId, container));
        return;
    }

    if (taskId instanceof WorkerAttemptId) {
        WorkerAttemptId workerAttemptId = (WorkerAttemptId) taskId;
        context.getEventHandler().handle(new WorkerAttemptContainerAssignedEvent(workerAttemptId, container));
    }
}
Also used : PSAttemptContainerAssignedEvent(com.tencent.angel.master.ps.attempt.PSAttemptContainerAssignedEvent) PSAttemptId(com.tencent.angel.ps.PSAttemptId) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) PSAgentAttemptContainerAssignedEvent(com.tencent.angel.master.psagent.PSAgentAttemptContainerAssignedEvent) PSAgentAttemptId(com.tencent.angel.psagent.PSAgentAttemptId) Id(com.tencent.angel.common.Id) WorkerAttemptContainerAssignedEvent(com.tencent.angel.master.worker.attempt.WorkerAttemptContainerAssignedEvent)

Example 10 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class LocalContainerLauncher method launch.

/**
 * Launches the local component that corresponds to the id carried by the event.
 * <ul>
 *   <li>Parameter server attempt: builds a {@code LocalPS}, dispatches
 *       {@code PA_CONTAINER_LAUNCHED}, then starts the ps and adds it to the
 *       {@code LocalClusterContext}.</li>
 *   <li>PS agent attempt: only dispatches {@code PSAGENT_ATTEMPT_CONTAINER_LAUNCHED}.</li>
 *   <li>Worker attempt: builds a {@code LocalWorker}, dispatches {@code CONTAINER_LAUNCHED},
 *       then starts the worker and adds it to the {@code LocalClusterContext}.</li>
 * </ul>
 * The "launched" event is dispatched before the start call; if starting (or registering)
 * throws, the error is logged and the corresponding "launch failed" event is dispatched as
 * well. Ids of any other type are ignored.
 *
 * @param event launcher event carrying the id to launch
 */
@SuppressWarnings("unchecked")
private void launch(ContainerLauncherEvent event) {
    Id id = event.getId();

    if (id instanceof PSAttemptId) {
        PSAttemptId psAttemptId = (PSAttemptId) id;
        LocalPS localPs = new LocalPS(psAttemptId, context.getMasterService().getLocation(), context.getConf());
        context.getEventHandler().handle(new PSAttemptEvent(PSAttemptEventType.PA_CONTAINER_LAUNCHED, psAttemptId));
        try {
            localPs.start();
            LocalClusterContext.get().addPS(psAttemptId, localPs);
        } catch (Exception e) {
            LOG.error("launch ps failed.", e);
            context.getEventHandler().handle(new PSAttemptEvent(PSAttemptEventType.PA_CONTAINER_LAUNCH_FAILED, psAttemptId));
        }
        return;
    }

    if (id instanceof PSAgentAttemptId) {
        // nothing is started here for a ps agent; only the launched event is dispatched
        PSAgentAttemptId psAgentAttemptId = (PSAgentAttemptId) id;
        context.getEventHandler().handle(new PSAgentAttemptEvent(PSAgentAttemptEventType.PSAGENT_ATTEMPT_CONTAINER_LAUNCHED, psAgentAttemptId));
        return;
    }

    if (id instanceof WorkerAttemptId) {
        WorkerAttemptId workerAttemptId = (WorkerAttemptId) id;
        LocalWorker localWorker = new LocalWorker(context.getConf(), context.getApplicationId(), context.getUser(), workerAttemptId, context.getMasterService().getLocation(), 0, false);
        context.getEventHandler().handle(new WorkerAttemptEvent(WorkerAttemptEventType.CONTAINER_LAUNCHED, workerAttemptId));
        try {
            localWorker.start();
            LocalClusterContext.get().addWorker(workerAttemptId, localWorker);
        } catch (Exception e) {
            LOG.error("launch worker failed.", e);
            context.getEventHandler().handle(new WorkerAttemptEvent(WorkerAttemptEventType.CONTAINER_LAUNCH_FAILED, workerAttemptId));
        }
    }
}
Also used : PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAgentAttemptEvent(com.tencent.angel.master.psagent.PSAgentAttemptEvent) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) LocalWorker(com.tencent.angel.localcluster.LocalWorker) LocalPS(com.tencent.angel.localcluster.LocalPS) PSAgentAttemptId(com.tencent.angel.psagent.PSAgentAttemptId) WorkerAttemptEvent(com.tencent.angel.master.worker.attempt.WorkerAttemptEvent) Id(com.tencent.angel.common.Id) PSAttemptEvent(com.tencent.angel.master.ps.attempt.PSAttemptEvent)

Aggregations

PSAttemptId (com.tencent.angel.ps.PSAttemptId)27 ParameterServerId (com.tencent.angel.ps.ParameterServerId)15 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)15 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)11 WorkerId (com.tencent.angel.worker.WorkerId)11 Configuration (org.apache.hadoop.conf.Configuration)11 MatrixContext (com.tencent.angel.ml.matrix.MatrixContext)10 CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat)10 TaskId (com.tencent.angel.worker.task.TaskId)8 Before (org.junit.Before)6 Test (org.junit.Test)6 AngelException (com.tencent.angel.exception.AngelException)5 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)5 DummyTask (com.tencent.angel.master.DummyTask)4 PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt)4 ParameterServer (com.tencent.angel.ps.impl.ParameterServer)4 PSAgentAttemptId (com.tencent.angel.psagent.PSAgentAttemptId)4 Id (com.tencent.angel.common.Id)3 Location (com.tencent.angel.common.location.Location)3 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)3