Search in sources :

Example 1 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class PS2PSPusherImpl method start.

/**
 * Start the PS client, create the worker pool and launch the background
 * "Recover-checker" thread.
 *
 * <p>Every 30 seconds the checker thread scans {@code failedUpdateCounters} while
 * holding the read lock. For each partition whose failure-counter map is not empty
 * it asks the master for the partition location. If that location lists more than
 * one PS and the first listed PS is this PS (presumably the primary replica), a
 * recover is issued for every other listed PS whose failure counter is positive.
 * After releasing the lock it waits for all issued recovers.
 *
 * <p>The thread exits when {@code stopped} is set or when it is interrupted.
 */
public void start() {
    psClient.start();
    workerPool = Executors.newFixedThreadPool(16);
    recoverChecker = new Thread(() -> {
        while (!stopped.get() && !Thread.interrupted()) {
            try {
                Thread.sleep(30000);
                Map<RecoverPartKey, FutureResult> futures = new HashMap<>();
                // Acquire the lock before entering the try block so that the finally
                // clause never tries to release a lock that was not taken.
                lock.readLock().lock();
                try {
                    for (Map.Entry<PartitionKey, Map<PSLocation, Integer>> partEntry : failedUpdateCounters.entrySet()) {
                        PartitionKey partKey = partEntry.getKey();
                        Map<PSLocation, Integer> failedCounters = partEntry.getValue();
                        if (failedCounters.isEmpty()) {
                            continue;
                        }
                        PartitionLocation partLoc = context.getMaster().getPartLocation(partKey.getMatrixId(), partKey.getPartitionId());
                        // Only the PS listed first for the partition drives the recovery of the others.
                        if (partLoc.psLocs.size() > 1 && partLoc.psLocs.get(0).psId.equals(context.getPSAttemptId().getPsId())) {
                            for (int i = 1; i < partLoc.psLocs.size(); i++) {
                                PSLocation psLoc = partLoc.psLocs.get(i);
                                Integer failedCount = failedCounters.get(psLoc);
                                if (failedCount != null && failedCount > 0) {
                                    RecoverPartKey recoverPartKey = new RecoverPartKey(partKey, psLoc);
                                    futures.put(recoverPartKey, recover(recoverPartKey));
                                }
                            }
                        }
                    }
                } finally {
                    lock.readLock().unlock();
                }
                waitResults(futures);
            } catch (InterruptedException e) {
                // sleep() clears the interrupt flag when it throws; restore it so the
                // loop condition observes the interrupt and the thread terminates.
                Thread.currentThread().interrupt();
            } catch (Throwable e) {
                if (!stopped.get()) {
                    // Keep the checker alive, but do not lose the failure cause.
                    LOG.error("recover checker failed ", e);
                }
            }
        }
    });
    recoverChecker.setName("Recover-checker");
    recoverChecker.start();
}
Also used : PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) PartitionKey(com.tencent.angel.PartitionKey) HashMap(java.util.HashMap) Map(java.util.Map) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 2 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class SyncEventPusher method put.

/**
 * Push a partition update to the other PS instances listed for the partition and
 * wait for each of them to answer.
 *
 * <p>The request is flagged as coming from a PS, the message reader index is reset
 * and the boolean at byte index 8 of the message is set to {@code true}. Nothing
 * more happens when the location lists exactly one PS, or when the first listed PS
 * is not this PS. Otherwise a copy of the message is sent to every PS after the
 * first one, the original message is released, and every non-success response or
 * failed wait increases the failure counter of the corresponding PS.
 *
 * @param request partition request to forward
 * @param msg serialized form of the request
 * @param partLoc PS locations of the partition
 */
@Override
public void put(PartitionRequest request, ByteBuf msg, PartitionLocation partLoc) {
    request.setComeFromPs(true);
    msg.resetReaderIndex();
    msg.setBoolean(8, true);
    PartitionKey partKey = request.getPartKey();

    // A single listed PS means there is nobody to forward to.
    if (partLoc.psLocs.size() == 1) {
        return;
    }
    // Only the PS listed first forwards the update.
    if (!partLoc.psLocs.get(0).psId.equals(context.getPSAttemptId().getPsId())) {
        return;
    }

    int replicaNum = partLoc.psLocs.size();
    List<FutureResult> pending = new ArrayList<>(replicaNum - 1);

    // Send everything first, each target gets its own copy of the buffer.
    for (int replica = 1; replica < replicaNum; replica++) {
        pending.add(psClient.put(partLoc.psLocs.get(replica).psId, partLoc.psLocs.get(replica).loc, request, msg.copy()));
    }
    msg.release();

    // Then collect the answers; pending.get(replica - 1) belongs to psLocs.get(replica).
    for (int replica = 1; replica < replicaNum; replica++) {
        try {
            Response answer = (Response) pending.get(replica - 1).get();
            if (answer.getResponseType() != ResponseType.SUCCESS) {
                increaseFailedCounter(partKey, partLoc.psLocs.get(replica));
            }
        } catch (Exception e) {
            LOG.error("wait for result for sync failed ", e);
            increaseFailedCounter(partKey, partLoc.psLocs.get(replica));
        }
    }
}
Also used : Response(com.tencent.angel.ml.matrix.transport.Response) FutureResult(com.tencent.angel.psagent.matrix.transport.FutureResult) ArrayList(java.util.ArrayList) PartitionKey(com.tencent.angel.PartitionKey)

Example 3 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class PSClient method recoverPart.

/**
 * Send a request that recovers one matrix partition on the given PS.
 *
 * <p>A new sequence id is generated and registered together with the future and the
 * request. The request carries the clock vector of the partition, a fresh
 * {@code PartitionKey} built from the matrix id and partition id, and the partition
 * itself. The serialized message starts with the sequence id and the method id of
 * the request type, followed by the request body.
 *
 * @param serverId dest ps id
 * @param location dest ps location
 * @param part partition that needs to be recovered
 * @return future holding the recover response
 */
public FutureResult<Response> recoverPart(ParameterServerId serverId, Location location, ServerPartition part) {
    // Allocate the sequence id and register the future under it
    int sequenceId = seqIdGen.incrementAndGet();
    FutureResult<Response> future = new FutureResult<>();
    seqIdToResultMap.put(sequenceId, future);

    // Build the recover request for the partition
    PartitionKey sourceKey = part.getPartitionKey();
    RecoverPartRequest recoverRequest = new RecoverPartRequest(
        context.getClockVectorManager().getClockVec(sourceKey.getMatrixId(), sourceKey.getPartitionId()),
        new PartitionKey(sourceKey.getMatrixId(), sourceKey.getPartitionId()),
        part);
    recoverRequest.getContext().setServerId(serverId);
    seqIdToRequestMap.put(sequenceId, recoverRequest);

    // 8 header bytes (sequence id + method id, one int each) precede the request body
    ByteBuf buffer = ByteBufUtils.newByteBuf(8 + recoverRequest.bufferLen(), useDirectBuf);
    buffer.writeInt(sequenceId);
    buffer.writeInt(recoverRequest.getType().getMethodId());
    recoverRequest.serialize(buffer);

    send(serverId, location, sequenceId, recoverRequest, buffer, future);
    return future;
}
Also used : FutureResult(com.tencent.angel.psagent.matrix.transport.FutureResult) PartitionKey(com.tencent.angel.PartitionKey) ByteBuf(io.netty.buffer.ByteBuf)

Example 4 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class ServerMatrix method savePartitions.

/**
 * Write the partitions {@code partitionIds[startPos, endPos)} of this matrix into a
 * single file and record a {@code ModelPartitionMeta} for each of them.
 *
 * <p>The data is first written to a temporary path derived from the destination
 * file and renamed to the destination only after the stream has been flushed and
 * closed. The destination file name is derived from the server id and the
 * partition id at {@code startPos}.
 *
 * @param matrixPath directory of the matrix
 * @param fs file system to write to
 * @param partitionIds ids of the partitions of this matrix
 * @param startPos index in {@code partitionIds} of the first partition to write (inclusive)
 * @param endPos index in {@code partitionIds} of the last partition to write (exclusive)
 * @param serverMatrixMeta meta object that receives the meta of every written partition
 * @throws IOException if creating, writing or renaming the file fails
 */
private void savePartitions(Path matrixPath, FileSystem fs, List<Integer> partitionIds, int startPos, int endPos, PSModelFilesMeta serverMatrixMeta) throws IOException {
    Path destFile = new Path(matrixPath, ModelFilesUtils.fileName(context.getPs().getServerId(), partitionIds.get(startPos)));
    Path tmpDestFile = HdfsUtil.toTmpPath(destFile);
    // try-with-resources guarantees the stream is closed even when saving a partition fails
    try (FSDataOutputStream out = fs.create(tmpDestFile)) {
        for (int i = startPos; i < endPos; i++) {
            Integer partitionId = partitionIds.get(i);
            LOG.info("Write partition " + partitionId + " of matrix " + matrixName + " to " + tmpDestFile);
            // Offset of this partition inside the file
            long streamPos = out.getPos();
            ServerPartition partition = partitionMaps.get(partitionId);
            PartitionKey partKey = partition.getPartitionKey();
            // Length is not known yet; it is filled in after the partition has been written
            ModelPartitionMeta partMeta = new ModelPartitionMeta(partKey.getPartitionId(), partKey.getStartRow(), partKey.getEndRow(), partKey.getStartCol(), partKey.getEndCol(), partition.elementNum(), destFile.getName(), streamPos, 0);
            partition.save(out, partMeta);
            partMeta.setLength(out.getPos() - streamPos);
            serverMatrixMeta.addPartitionMeta(partitionId, partMeta);
        }
        out.flush();
    }
    // The stream is closed at this point, so the temporary file is complete
    HdfsUtil.rename(tmpDestFile, destFile, fs);
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionKey(com.tencent.angel.PartitionKey) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)

Example 5 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class WorkerPool method recoverPart.

/**
 * Handle a recover-partition request.
 *
 * <p>If the request carries a clock vector, it is installed for the partition first.
 * The partition is then looked up in the matrix storage: when it exists it is
 * recovered from the partition carried by the request and a {@code SUCCESS} response
 * is written; when it does not exist a {@code SERVER_HANDLE_FATAL} response with an
 * explanatory message is written instead.
 *
 * @param seqId rpc request id
 * @param request recover request
 * @return buffer holding the sequence id followed by the serialized response
 */
private ByteBuf recoverPart(int seqId, RecoverPartRequest request) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("recover part request=" + request + " with seqId=" + seqId);
    }
    final long beginTs = System.currentTimeMillis();

    // The reply always starts with the sequence id of the request
    ByteBuf reply = ByteBufUtils.newByteBuf(8 + 4);
    reply.writeInt(seqId);

    PartitionKey key = request.getPartKey();
    Int2IntOpenHashMap clocks = request.getTaskIndexToClockMap();
    if (clocks != null) {
        context.getClockVectorManager().setClockVec(key.getMatrixId(), key.getPartitionId(), clocks);
    }

    ServerPartition target = context.getMatrixStorageManager().getPart(key.getMatrixId(), key.getPartitionId());
    if (target != null) {
        target.recover(request.getPart());
        Response ok = new Response(ResponseType.SUCCESS);
        ok.serialize(reply);
        if (LOG.isDebugEnabled()) {
            LOG.debug("recover partition  request " + request + " use time=" + (System.currentTimeMillis() - beginTs));
        }
    } else {
        // Unknown partition: report it to the sender instead of recovering
        String reason = "can not find the partition " + key;
        Response fatal = new Response(ResponseType.SERVER_HANDLE_FATAL, reason);
        fatal.serialize(reply);
    }
    return reply;
}
Also used : PartitionKey(com.tencent.angel.PartitionKey) ByteBuf(io.netty.buffer.ByteBuf) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap)

Aggregations

PartitionKey (com.tencent.angel.PartitionKey): 80, ArrayList (java.util.ArrayList): 17, ByteBuf (io.netty.buffer.ByteBuf): 12, Test (org.junit.Test): 9, PartitionGetResult (com.tencent.angel.ml.matrix.psf.get.base.PartitionGetResult): 8, MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta): 7, PartitionGetParam (com.tencent.angel.ml.matrix.psf.get.base.PartitionGetParam): 7, PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation): 4, ServerRow (com.tencent.angel.ps.impl.matrix.ServerRow): 4, ParameterServerId (com.tencent.angel.ps.ParameterServerId): 3, RecoverPartKey (com.tencent.angel.ps.recovery.ha.RecoverPartKey): 3, FutureResult (com.tencent.angel.psagent.matrix.transport.FutureResult): 3, Map (java.util.Map): 3, Location (com.tencent.angel.common.location.Location): 2, TVector (com.tencent.angel.ml.math.TVector): 2, RowType (com.tencent.angel.ml.matrix.RowType): 2, PSLocation (com.tencent.angel.ml.matrix.transport.PSLocation): 2, MatrixStorageManager (com.tencent.angel.ps.impl.MatrixStorageManager): 2, ClockCache (com.tencent.angel.psagent.clock.ClockCache): 2, Worker (com.tencent.angel.worker.Worker): 2