Search in sources :

Example 61 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class MasterRecoverTest method testMasterRecover.

/**
 * Exercises application-master recovery on the local cluster.
 *
 * <p>Sequence, as performed by the statements below:
 * <ol>
 *   <li>Asserts the running master belongs to application attempt 1.</li>
 *   <li>Reports iterations and per-matrix clocks for two tasks to the master through
 *       {@code MasterProtocol}.</li>
 *   <li>Sleeps for twice the configured state-write interval, then raises an
 *       {@code InternalErrorEvent} on the master and sleeps 15 seconds.</li>
 *   <li>Asserts the current master reports attempt 2 and {@code AppState.RUNNING}, that both
 *       task contexts on the worker carry the reported iterations and clocks, that the worker's
 *       PS agent knows both matrices and maps every partition to {@code psId}, and delegates the
 *       PS-side check to {@code checkMatrixInfo} with the per-matrix minimum clocks.</li>
 *   <li>Raises a second {@code InternalErrorEvent}, sleeps 15 seconds, and asserts the attempt
 *       id is still 2 and the state is {@code AppState.FAILED}.</li>
 * </ol>
 *
 * <p>NOTE(review): the fixed 15 second sleeps assume recovery completes within that window;
 * the limit on attempts that makes the second failure terminal is configured outside this
 * method - confirm against the test setup.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testMasterRecover() throws Exception {
    try {
        ApplicationAttemptId appAttempt1Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 1);
        ApplicationAttemptId appAttempt2Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 2);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        // JUnit's contract is assertEquals(expected, actual); keeping that order makes failure messages accurate.
        assertEquals(appAttempt1Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        // NOTE(review): psManager is never read in this method; the lookup is kept as-is because
        // the getter's behavior is not visible here - confirm it is side-effect free and remove.
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // The value handed to checkMatrixInfo for each matrix is the smaller of the two task clocks.
        int w1Clock = Math.min(task0w1Clock, task1w1Clock);
        int w2Clock = Math.min(task0w2Clock, task1w2Clock);

        // Report task progress to the master before the failure is injected.
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());

        // Wait two write intervals before failing the master.
        int writeIntervalMS = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_AM_WRITE_STATE_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_AM_WRITE_STATE_INTERVAL_MS);
        Thread.sleep(writeIntervalMS * 2);

        // First injected failure: a second attempt is expected to be running afterwards.
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(15000);
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(AppState.RUNNING, angelAppMaster.getAppContext().getApp().getExternAppState());
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());

        PartitionKey w1Part0Key = new PartitionKey(0, w1Id, 0, 0, 1, 50000);
        PartitionKey w1Part1Key = new PartitionKey(1, w1Id, 0, 50000, 1, 100000);
        PartitionKey w2Part0Key = new PartitionKey(0, w2Id, 0, 0, 1, 50000);
        PartitionKey w2Part1Key = new PartitionKey(1, w2Id, 0, 50000, 1, 100000);

        // Worker-side view after recovery: task progress and matrix metadata.
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        LOG.info("worker=" + worker);
        LOG.info("worker.getTaskManager()=" + worker.getTaskManager());
        LOG.info("worker.getTaskManager().getRunningTask()=" + worker.getTaskManager().getRunningTask().size());
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext();
        assertEquals(task0Iteration, task0Context.getEpoch());
        assertEquals(task1Iteration, task1Context.getEpoch());
        assertEquals(task0w1Clock, task0Context.getMatrixClock(w1Id));
        assertEquals(task0w2Clock, task0Context.getMatrixClock(w2Id));
        assertEquals(task1w1Clock, task1Context.getMatrixClock(w1Id));
        assertEquals(task1w2Clock, task1Context.getMatrixClock(w2Id));
        LOG.info("===============worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size()=" + worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size());
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w1Id));
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w2Id));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part1Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part1Key).get(0));

        // PS-side view after recovery.
        ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);

        // Second injected failure: the attempt id must not advance and the application must be FAILED.
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(15000);
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(AppState.FAILED, angelAppMaster.getAppContext().getApp().getExternAppState());
    } catch (Exception x) {
        LOG.error("run testMasterRecover failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) TConnection(com.tencent.angel.ipc.TConnection) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) PartitionKey(com.tencent.angel.PartitionKey) Worker(com.tencent.angel.worker.Worker) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 62 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class CompSparseDoubleLongKeyVectorTest method initVector.

/**
 * Builds a {@code CompSparseLongKeyDoubleVector} backed by four
 * {@code SparseLongKeyDoubleVector} parts whose column ranges together run from
 * {@code Long.MIN_VALUE} to {@code Long.MAX_VALUE}.
 *
 * <p>The interior boundaries are {@code Long.MIN_VALUE + blockCol * k}; {@code blockCol * 3}
 * overflows {@code long}, but the wrapped sum is still the mathematically intended boundary.
 *
 * <p>NOTE(review): the first constructor argument is 0 for every key - presumably the
 * partition id; confirm that sharing it across all four keys is intended.
 *
 * @return the composed vector
 */
private CompSparseLongKeyDoubleVector initVector() {
    final int partCount = 4;
    final long blockCol = Long.MAX_VALUE / 2;
    // Column boundaries: part i covers [colBounds[i], colBounds[i + 1]).
    final long[] colBounds = {
        Long.MIN_VALUE,
        Long.MIN_VALUE + blockCol,
        Long.MIN_VALUE + blockCol * 2,
        Long.MIN_VALUE + blockCol * 3,
        Long.MAX_VALUE
    };

    PartitionKey[] keys = new PartitionKey[partCount];
    for (int i = 0; i < partCount; i++) {
        keys[i] = new PartitionKey(0, 0, 0, colBounds[i], 0, colBounds[i + 1]);
    }

    TLongDoubleVector[] parts = new TLongDoubleVector[partCount];
    for (int i = 0; i < partCount; i++) {
        parts[i] = new SparseLongKeyDoubleVector(-1);
    }

    return new CompSparseLongKeyDoubleVector(0, 0, -1, keys, parts);
}
Also used : PartitionKey(com.tencent.angel.PartitionKey)

Example 63 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class CompSparseDoubleVectorTest method initVector.

/**
 * Builds a {@code CompSparseDoubleVector} of dimension {@code dim} from four
 * {@code SparseDoubleVector} parts. The first three column ranges are {@code dim / 4} wide and
 * the last one ends at {@code dim}.
 *
 * <p>NOTE(review): the first constructor argument is 0 for every key - presumably the
 * partition id; confirm that sharing it across all four keys is intended.
 *
 * @return the composed vector
 */
private CompSparseDoubleVector initVector() {
    final int partCount = 4;
    final long blockCol = dim / 4;
    // Column boundaries: part i covers [colBounds[i], colBounds[i + 1]).
    final long[] colBounds = { 0, blockCol, blockCol * 2, blockCol * 3, dim };

    PartitionKey[] keys = new PartitionKey[partCount];
    for (int i = 0; i < partCount; i++) {
        keys[i] = new PartitionKey(0, 0, 0, colBounds[i], 0, colBounds[i + 1]);
    }

    TIntDoubleVector[] parts = new TIntDoubleVector[partCount];
    for (int i = 0; i < partCount; i++) {
        parts[i] = new SparseDoubleVector(dim);
    }

    return new CompSparseDoubleVector(0, 0, dim, keys, parts);
}
Also used : PartitionKey(com.tencent.angel.PartitionKey)

Example 64 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class CompSparseFloatVectorTest method initVector.

/**
 * Builds a {@code CompSparseFloatVector} of dimension {@code dim} from four
 * {@code SparseFloatVector} parts. The first three column ranges are {@code dim / 4} wide and
 * the last one ends at {@code dim}.
 *
 * <p>NOTE(review): the first constructor argument is 0 for every key - presumably the
 * partition id; confirm that sharing it across all four keys is intended.
 *
 * @return the composed vector
 */
private CompSparseFloatVector initVector() {
    final int partCount = 4;
    final long blockCol = dim / 4;
    // Column boundaries: part i covers [colBounds[i], colBounds[i + 1]).
    final long[] colBounds = { 0, blockCol, blockCol * 2, blockCol * 3, dim };

    PartitionKey[] keys = new PartitionKey[partCount];
    for (int i = 0; i < partCount; i++) {
        keys[i] = new PartitionKey(0, 0, 0, colBounds[i], 0, colBounds[i + 1]);
    }

    TFloatVector[] parts = new TFloatVector[partCount];
    for (int i = 0; i < partCount; i++) {
        parts[i] = new SparseFloatVector(dim);
    }

    return new CompSparseFloatVector(0, 0, dim, keys, parts);
}
Also used : PartitionKey(com.tencent.angel.PartitionKey)

Example 65 with PartitionKey

use of com.tencent.angel.PartitionKey in project angel by Tencent.

the class IndexPartGetResult method deserialize.

/**
 * Replaces {@code partKey} with a fresh {@code PartitionKey} and has it read its own state
 * from the given buffer.
 *
 * @param buf buffer the partition key is read from
 */
@Override
public void deserialize(ByteBuf buf) {
    // Assign first, then populate in place, so the field always refers to the new instance.
    this.partKey = new PartitionKey();
    this.partKey.deserialize(buf);
}
Also used : PartitionKey(com.tencent.angel.PartitionKey)

Aggregations

PartitionKey (com.tencent.angel.PartitionKey): 80, ArrayList (java.util.ArrayList): 17, ByteBuf (io.netty.buffer.ByteBuf): 12, Test (org.junit.Test): 9, PartitionGetResult (com.tencent.angel.ml.matrix.psf.get.base.PartitionGetResult): 8, MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta): 7, PartitionGetParam (com.tencent.angel.ml.matrix.psf.get.base.PartitionGetParam): 7, PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation): 4, ServerRow (com.tencent.angel.ps.impl.matrix.ServerRow): 4, ParameterServerId (com.tencent.angel.ps.ParameterServerId): 3, RecoverPartKey (com.tencent.angel.ps.recovery.ha.RecoverPartKey): 3, FutureResult (com.tencent.angel.psagent.matrix.transport.FutureResult): 3, Map (java.util.Map): 3, Location (com.tencent.angel.common.location.Location): 2, TVector (com.tencent.angel.ml.math.TVector): 2, RowType (com.tencent.angel.ml.matrix.RowType): 2, PSLocation (com.tencent.angel.ml.matrix.transport.PSLocation): 2, MatrixStorageManager (com.tencent.angel.ps.impl.MatrixStorageManager): 2, ClockCache (com.tencent.angel.psagent.clock.ClockCache): 2, Worker (com.tencent.angel.worker.Worker): 2