Search in sources :

Example 6 with PartitionLocation

use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.

the class MasterClient method getPartLocation.

/**
 * Ask the master which pss store a matrix partition and where each of those pss is located.
 *
 * @param matrixId matrix id
 * @param partId   partition id
 * @return the partition location: one entry (ps id plus ps location) per ps reported by the master
 * @throws ServiceException propagated from the calls made here (presumably the master rpc)
 */
public PartitionLocation getPartLocation(int matrixId, int partId) throws ServiceException {
    GetPartLocationRequest request = GetPartLocationRequest.newBuilder().setMatrixId(matrixId).setPartId(partId).build();
    List<PSLocationProto> protoLocations = master.getPartLocation(null, request).getLocationsList();
    // Convert every protobuf entry to its in-memory counterpart, keeping the order of the response
    List<PSLocation> locations = new ArrayList<>(protoLocations.size());
    for (PSLocationProto protoLocation : protoLocations) {
        locations.add(new PSLocation(ProtobufUtil.convertToId(protoLocation.getPsId()), ProtobufUtil.convertToLocation(protoLocation)));
    }
    return new PartitionLocation(locations);
}
Also used : PSLocation(com.tencent.angel.ps.server.data.PSLocation) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 7 with PartitionLocation

use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.

the class PSAgentMatrixMetaManager method getPartLocation.

/**
 * Build the location of a partition: the pss that store it, each paired with the location
 * obtained from the location manager.
 * TODO: cache
 *
 * @param matrixId matrix id
 * @param partId partition id
 * @return partition location; it contains no ps when {@code getPss} returns null
 */
public PartitionLocation getPartLocation(int matrixId, int partId) {
    List<ParameterServerId> storedPss = getPss(matrixId, partId);
    if (storedPss == null) {
        return new PartitionLocation(new ArrayList<>());
    }
    // Keep the order returned by getPss
    List<PSLocation> locations = new ArrayList<>(storedPss.size());
    for (ParameterServerId storedPs : storedPss) {
        locations.add(new PSLocation(storedPs, PSAgentContext.get().getLocationManager().getPsLocation(storedPs)));
    }
    return new PartitionLocation(locations);
}
Also used : PSLocation(com.tencent.angel.ps.server.data.PSLocation) ParameterServerId(com.tencent.angel.ps.ParameterServerId) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 8 with PartitionLocation

use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.

the class PeriodPusher method start.

/**
 * Start the pusher: start the parent, then launch the "psha-push-dispatcher" thread.
 *
 * Every {@code pushIntervalMs} milliseconds the dispatcher takes (and clears) the partitions that
 * need recovery, asks the master for the location of each one and, when this ps is the first ps
 * in that location and at least one other ps stores the partition, pushes the partition to every
 * other ps. It then waits for all the pushes of the round.
 *
 * The dispatcher runs until {@code stopped} is set or the thread is interrupted. Any other failure
 * in a round is logged (unless the pusher is stopped) and the next round is attempted.
 */
public void start() {
    super.start();
    dispatcher = new Thread(() -> {
        ParameterServerId psId = context.getPSAttemptId().getPsId();
        while (!stopped.get() && !Thread.interrupted()) {
            try {
                Thread.sleep(pushIntervalMs);
                Map<PartitionKey, Integer> parts = getAndClearAllNeedRecoverParts();
                Map<RecoverPartKey, FutureResult> futures = new HashMap<>(parts.size());
                for (PartitionKey part : parts.keySet()) {
                    PartitionLocation partLoc = context.getMaster().getPartLocation(part.getMatrixId(), part.getPartitionId());
                    // Only the first ps of the location pushes, and only if there are other pss to push to
                    if ((partLoc.psLocs.size() > 1) && psId.equals(partLoc.psLocs.get(0).psId)) {
                        int size = partLoc.psLocs.size();
                        for (int i = 1; i < size; i++) {
                            RecoverPartKey partKey = new RecoverPartKey(part, partLoc.psLocs.get(i));
                            LOG.info("Start to backup partition " + partKey.partKey + " to " + partKey.psLoc);
                            futures.put(partKey, recover(partKey));
                        }
                    }
                }
                waitResults(futures);
            } catch (InterruptedException e) {
                // Throwing InterruptedException clears the interrupt flag, so the loop condition would
                // never see the interrupt: restore the flag and leave the loop explicitly.
                if (!stopped.get()) {
                    LOG.warn("push dispatcher is interrupted, exit ", e);
                }
                Thread.currentThread().interrupt();
                break;
            } catch (Exception e) {
                if (!stopped.get()) {
                    LOG.error("recover parts failed ", e);
                }
            }
        }
    });
    dispatcher.setName("psha-push-dispatcher");
    dispatcher.start();
}
Also used : RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) PartitionKey(com.tencent.angel.PartitionKey) ParameterServerId(com.tencent.angel.ps.ParameterServerId) HashMap(java.util.HashMap) Map(java.util.Map) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 9 with PartitionLocation

use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.

the class WorkerPool method putPartUpdate.

/**
 * Apply an update to a matrix partition and build the serialized response.
 *
 * The response buffer always starts with {@code seqId}, followed by the serialized
 * {@code PutPartitionUpdateResponse}. The request is rejected when the partition does not exist,
 * when the partition is not in READ_AND_WRITE state, when the partition location cannot be
 * obtained, or when the request does not come from a ps and this ps is not the master ps of the
 * partition. After a successful update the request is handed to the PS2PS pusher if the
 * partition is stored on more than one ps.
 *
 * @param seqId rpc request id
 * @param request rpc request
 * @param in serialized request
 * @return serialized rpc response
 */
private ByteBuf putPartUpdate(int seqId, PutPartitionUpdateRequest request, ByteBuf in) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("put update request=" + request + " with seqId=" + seqId);
    }
    final long beginTs = System.currentTimeMillis();
    // The request id is always the first field of the response
    ByteBuf out = ByteBufUtils.newByteBuf(8 + 4);
    out.writeInt(seqId);
    PutPartitionUpdateResponse result;

    // Look up the partition; a missing partition is reported as fatal
    PartitionKey partKey = request.getPartKey();
    ServerPartition part = context.getMatrixStorageManager().getPart(partKey.getMatrixId(), partKey.getPartitionId());
    if (part == null) {
        String msg = "update " + request + " failed. The partition " + partKey + " does not exist";
        LOG.fatal(msg);
        new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FATAL, msg).serialize(out);
        return out;
    }

    // Requests that do not come from a ps are only accepted by a READ_AND_WRITE partition
    PartitionState partState = part.getState();
    if (!request.isComeFromPs() && partState != PartitionState.READ_AND_WRITE) {
        String msg = "update " + request + " failed. The partition " + partKey + " state is " + partState;
        LOG.error(msg);
        new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, msg).serialize(out);
        return out;
    }

    // Find out which pss store this partition
    PartitionLocation partLoc;
    try {
        partLoc = context.getMatrixMetaManager().getPartLocation(request.getPartKey(), disableRouterCache);
    } catch (Throwable e) {
        String msg = "update " + request + " failed, get partition location from master failed " + e.getMessage();
        LOG.error(msg, e);
        new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, msg).serialize(out);
        return out;
    }

    if (request.isComeFromPs() || isPartMasterPs(partLoc)) {
        int clock = request.getClock();
        partKey = request.getPartKey();
        int taskIndex = request.getTaskIndex();
        boolean updateClock = request.isUpdateClock();
        if (LOG.isDebugEnabled()) {
            LOG.debug("seqId = " + seqId + " update split request matrixId = " + partKey.getMatrixId() + ", partId = " + partKey.getPartitionId() + " clock = " + clock + ", taskIndex=" + taskIndex + ", updateClock = " + updateClock);
        }
        try {
            // The state is read again right before the update is applied
            partState = part.getState();
            if (partState != PartitionState.READ_AND_WRITE) {
                String msg = "update " + request + " failed. The partition " + partKey + " state is " + partState;
                LOG.error(msg);
                new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, msg).serialize(out);
                return out;
            }
            part.update(in, rowUpdater);
            if (updateClock) {
                context.getClockVectorManager().updateClock(partKey.getMatrixId(), partKey.getPartitionId(), taskIndex, clock);
            }
            result = new PutPartitionUpdateResponse(ResponseType.SUCCESS);
            // Forward the update (and the clock, if requested) when more than one ps stores the partition
            if (partLoc.psLocs.size() > 1) {
                context.getPS2PSPusher().put(request, in, partLoc);
                if (updateClock) {
                    context.getPS2PSPusher().updateClock(request.getPartKey(), taskIndex, clock, partLoc);
                }
            }
        } catch (Throwable e) {
            String msg = "update " + request + " failed " + e.getMessage();
            LOG.fatal(msg, e);
            result = new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FATAL, msg);
        }
    } else {
        // The request does not come from a ps and this ps is not the master ps of the partition
        String msg = "local ps is " + context.getPSAttemptId().getPsId() + " update " + request + " failed, update to slave ps for partition " + request.getPartKey();
        LOG.error(msg);
        result = new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, msg);
    }

    result.serialize(out);
    if (LOG.isDebugEnabled()) {
        LOG.debug("update partition for request " + request + " use time=" + (System.currentTimeMillis() - beginTs) + ", response buf=" + out);
    }
    return out;
}
Also used : PartitionKey(com.tencent.angel.PartitionKey) ByteBuf(io.netty.buffer.ByteBuf) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 10 with PartitionLocation

use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.

the class WorkerPool method update.

/**
 * Update a partition use PSF and build the serialized response.
 *
 * The response buffer always starts with {@code seqId}, followed by the serialized
 * {@code UpdaterResponse}; this holds for the early failure responses as well as for the final
 * one. The request is rejected when the partition does not exist, when the partition is not in
 * READ_AND_WRITE state, when the partition location cannot be obtained, or when the request does
 * not come from a ps and this ps is not the master ps of the partition. After a successful
 * update the request is handed to the PS2PS pusher if the partition is stored on more than one
 * ps.
 *
 * @param seqId rpc request id
 * @param request rpc request
 * @param in serialized rpc request
 * @return serialized rpc response
 */
private ByteBuf update(int seqId, UpdaterRequest request, ByteBuf in) {
    UpdaterResponse response = null;
    ByteBuf buf = ByteBufUtils.newByteBuf(4 + 8, useDirectorBuffer);
    // Write the request id before anything else so that every return path, including the early
    // failure returns below, produces a response that starts with it
    buf.writeInt(seqId);
    // Get partition and check the partition state
    PartitionKey partKey = request.getPartKey();
    ServerPartition part = context.getMatrixStorageManager().getPart(partKey.getMatrixId(), partKey.getPartitionId());
    if (part == null) {
        String log = "update " + request + " failed. The partition " + partKey + " does not exist";
        LOG.fatal(log);
        response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FATAL, log);
        response.serialize(buf);
        return buf;
    }
    PartitionState state = part.getState();
    if (state != PartitionState.READ_AND_WRITE) {
        String log = "update " + request + " failed. The partition " + partKey + " state is " + state;
        LOG.error(log);
        response = new UpdaterResponse(ResponseType.PARTITION_READ_ONLY, log);
        response.serialize(buf);
        return buf;
    }
    // Get the stored pss for this partition
    PartitionLocation partLoc = null;
    try {
        partLoc = context.getMatrixMetaManager().getPartLocation(request.getPartKey(), disableRouterCache);
    } catch (Throwable x) {
        String log = "update " + request + " failed, get partition location from master failed " + x.getMessage();
        LOG.error(log, x);
        response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FAILED, log);
        response.serialize(buf);
        return buf;
    }
    // Check this ps is the master ps for this location, only master ps can accept the update
    if (!request.isComeFromPs() && !isPartMasterPs(partLoc)) {
        String log = "update " + request + " failed, update to slave ps for partition " + request.getPartKey();
        LOG.error(log);
        response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FAILED, log);
    } else {
        try {
            // asSubclass rejects a class that is not an UpdateFunc before it is instantiated; the
            // resulting ClassCastException is reported by the catch below like any other failure
            Class<? extends UpdateFunc> funcClass = Class.forName(request.getUpdaterFuncClass()).asSubclass(UpdateFunc.class);
            Constructor<? extends UpdateFunc> constructor = funcClass.getConstructor();
            constructor.setAccessible(true);
            UpdateFunc func = constructor.newInstance();
            func.setPsContext(context);
            // Check the partition state again
            state = part.getState();
            if (state != PartitionState.READ_AND_WRITE) {
                String log = "update " + request + " failed. The partition " + partKey + " state is " + state;
                LOG.error(log);
                response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FAILED, log);
                response.serialize(buf);
                return buf;
            }
            part.update(func, request.getPartParam());
            response = new UpdaterResponse();
            response.setResponseType(ResponseType.SUCCESS);
            if (partLoc.psLocs.size() > 1) {
                // Start to put the update to the slave pss
                context.getPS2PSPusher().put(request, in, partLoc);
            }
        } catch (Throwable e) {
            String log = "update " + request + " failed " + e.getMessage();
            LOG.fatal(log, e);
            response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FATAL, log);
        }
    }
    response.serialize(buf);
    return buf;
}
Also used : UpdateFunc(com.tencent.angel.ml.matrix.psf.update.enhance.UpdateFunc) PartitionKey(com.tencent.angel.PartitionKey) ByteBuf(io.netty.buffer.ByteBuf) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Aggregations

PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)10 ParameterServerId (com.tencent.angel.ps.ParameterServerId)5 PSLocation (com.tencent.angel.ps.server.data.PSLocation)5 PartitionKey (com.tencent.angel.PartitionKey)4 ArrayList (java.util.ArrayList)3 PSLocation (com.tencent.angel.ml.matrix.transport.PSLocation)2 RecoverPartKey (com.tencent.angel.ps.recovery.ha.RecoverPartKey)2 ByteBuf (io.netty.buffer.ByteBuf)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 UpdateFunc (com.tencent.angel.ml.matrix.psf.update.enhance.UpdateFunc)1 MLProtos (com.tencent.angel.protobuf.generated.MLProtos)1