Search in sources :

Example 11 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

In class MasterRecoverTest, method testMasterRecover:

/**
 * Checks that the application master comes back as a second attempt after an injected internal
 * error, and that the task iterations, matrix clocks and matrix metadata reported before the
 * failure are still visible on the worker and the PS afterwards.
 *
 * <p>Flow: report iterations and per-matrix clocks for two tasks, wait two state-write
 * intervals, inject an {@code InternalErrorEvent}, then verify the new attempt. A second
 * injected error is expected to leave the application in {@code FAILED} state while the
 * attempt id stays at 2.
 *
 * @throws Exception if any cluster call fails or the thread is interrupted while sleeping
 */
@SuppressWarnings("unchecked")
@Test
public void testMasterRecover() throws Exception {
    try {
        // Fixed pause given to the local cluster to react to an injected master error.
        final long recoverWaitMs = 10000L;

        ApplicationAttemptId appAttempt1Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 1);
        ApplicationAttemptId appAttempt2Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 2);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        // JUnit's assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
        assertEquals(appAttempt1Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // The value handed to checkMatrixInfo for each matrix is the smaller of the two task clocks.
        int w1Clock = Math.min(task0w1Clock, task1w1Clock);
        int w2Clock = Math.min(task0w2Clock, task1w2Clock);

        // Report the iteration and the per-matrix clocks of both tasks to the master.
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());

        // Wait two write intervals before failing the master.
        // NOTE(review): presumably this lets the master persist the reported state - confirm.
        int writeIntervalMS = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_AM_WRITE_STATE_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_AM_WRITE_STATE_INTERVAL_MS);
        Thread.sleep(writeIntervalMS * 2);

        // First injected failure: a second attempt is expected to be running afterwards.
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(recoverWaitMs);
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(AppState.RUNNING, angelAppMaster.getAppContext().getApp().getExternAppState());
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());

        PartitionKey w1Part0Key = new PartitionKey(0, w1Id, 0, 0, 1, 50000);
        PartitionKey w1Part1Key = new PartitionKey(1, w1Id, 0, 50000, 1, 100000);
        PartitionKey w2Part0Key = new PartitionKey(0, w2Id, 0, 0, 1, 50000);
        PartitionKey w2Part1Key = new PartitionKey(1, w2Id, 0, 50000, 1, 100000);

        // Worker-side view: task epochs and matrix metadata must match what was reported before the failure.
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        LOG.info("worker=" + worker);
        LOG.info("worker.getTaskManager()=" + worker.getTaskManager());
        LOG.info("worker.getTaskManager().getRunningTask()=" + worker.getTaskManager().getRunningTask().size());
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext();
        assertEquals(task0Iteration, task0Context.getEpoch());
        assertEquals(task1Iteration, task1Context.getEpoch());
        LOG.info("===============worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size()=" + worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size());
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w1Id));
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w2Id));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part1Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part1Key).get(0));

        // PS-side view: re-fetch the PS and check its matrix info against the expected clocks.
        ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);

        // Second injected failure: the attempt id is expected to stay at 2 and the app to be FAILED.
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(recoverWaitMs);
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(AppState.FAILED, angelAppMaster.getAppContext().getApp().getExternAppState());
    } catch (Exception x) {
        // Log with the test name for easier diagnosis, then let JUnit see the original failure.
        LOG.error("run testMasterRecover failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ParameterServer(com.tencent.angel.ps.ParameterServer) TConnection(com.tencent.angel.ipc.TConnection) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) PartitionKey(com.tencent.angel.PartitionKey) Worker(com.tencent.angel.worker.Worker) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 12 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

In class InitNeighborTest, method testCSR:

/**
 * Pushes node neighbor lists (int node ids) to the PS in three batches, samples the neighbors
 * of a fixed node set, takes a checkpoint, then stops the PS, reports a PS error to the master
 * and samples again after a fixed wait.
 *
 * <p>The sampled neighbors are only logged; this method contains no assertions on them.
 *
 * @throws Exception if any PS/master call fails or the thread is interrupted while sleeping
 */
@Test
public void testCSR() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client = worker.getPSAgent().getMatrixClient(SPARSE_INT_MAT, 0);
    int matrixId = client.getMatrixId();
    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
    TConnection connection = TConnectionManager.getConnection(ps.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
    // Init node neighbors: three batches, each awaited before the map is reused for the next one
    Int2ObjectOpenHashMap<int[]> nodeIdToNeighbors = new Int2ObjectOpenHashMap<>();
    nodeIdToNeighbors.put(1, new int[] { 2, 3 });
    nodeIdToNeighbors.put(2, new int[] { 4 });
    InitNeighbor func = new InitNeighbor(new InitNeighborParam(matrixId, nodeIdToNeighbors));
    client.asyncUpdate(func).get();
    nodeIdToNeighbors.clear();
    nodeIdToNeighbors.put(1, new int[] { 4, 5, 6 });
    nodeIdToNeighbors.put(2, new int[] { 5 });
    nodeIdToNeighbors.put(4, new int[] { 5, 6 });
    func = new InitNeighbor(new InitNeighborParam(matrixId, nodeIdToNeighbors));
    client.asyncUpdate(func).get();
    nodeIdToNeighbors.clear();
    nodeIdToNeighbors.put(3, new int[] { 4, 5, 6 });
    nodeIdToNeighbors.put(5, new int[] { 6 });
    nodeIdToNeighbors.put(8, new int[] { 3, 4 });
    func = new InitNeighbor(new InitNeighborParam(matrixId, nodeIdToNeighbors));
    client.asyncUpdate(func).get();
    nodeIdToNeighbors.clear();
    // Signal that neighbor initialization is finished
    client.asyncUpdate(new InitNeighborOver(new InitNeighborOverParam(matrixId))).get();
    // Sample the neighbors
    int[] nodeIds = new int[] { 1, 2, 3, 4, 5, 6, 7, 8 };
    SampleNeighborParam param = new SampleNeighborParam(matrixId, nodeIds, -1);
    Int2ObjectOpenHashMap<int[]> result = ((SampleNeighborResult) (client.get(new SampleNeighbor(param)))).getNodeIdToNeighbors();
    ObjectIterator<Entry<int[]>> iter = result.int2ObjectEntrySet().fastIterator();
    LOG.info("==============================sample neighbors result============================");
    Entry<int[]> entry;
    while (iter.hasNext()) {
        entry = iter.next();
        LOG.info("node id = " + entry.getIntKey() + ", neighbors = " + Arrays.toString(entry.getValue()));
    }
    // Block until the checkpoint request completes so the PS is not stopped while the
    // checkpoint may still be in flight (assumes checkpoint(int) returns a Future).
    client.checkpoint(0).get();
    // Simulate a PS failure: stop the PS and report the error to the master
    ps.stop(-1);
    PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
    master.psError(null, request);
    // Fixed wait before querying again.
    // NOTE(review): presumably gives the PS time to be restarted and recovered - confirm.
    Thread.sleep(10000);
    result = ((SampleNeighborResult) (client.get(new SampleNeighbor(param)))).getNodeIdToNeighbors();
    iter = result.int2ObjectEntrySet().fastIterator();
    LOG.info("==============================sample neighbors result============================");
    while (iter.hasNext()) {
        entry = iter.next();
        LOG.info("node id = " + entry.getIntKey() + ", neighbors = " + Arrays.toString(entry.getValue()));
    }
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) InitNeighborParam(com.tencent.angel.graph.client.initneighbor.InitNeighborParam) SampleNeighborResult(com.tencent.angel.graph.client.sampleneighbor.SampleNeighborResult) SampleNeighbor(com.tencent.angel.graph.client.sampleneighbor.SampleNeighbor) InitNeighbor(com.tencent.angel.graph.client.initneighbor.InitNeighbor) ParameterServer(com.tencent.angel.ps.ParameterServer) Entry(it.unimi.dsi.fastutil.ints.Int2ObjectMap.Entry) TConnection(com.tencent.angel.ipc.TConnection) InitNeighborOverParam(com.tencent.angel.graph.client.initneighbor.InitNeighborOverParam) SampleNeighborParam(com.tencent.angel.graph.client.sampleneighbor.SampleNeighborParam) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) InitNeighborOver(com.tencent.angel.graph.client.initneighbor.InitNeighborOver) MasterProtocol(com.tencent.angel.master.MasterProtocol) PSErrorRequest(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSErrorRequest) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 13 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

In class InitNeighborTest2, method testCSR:

/**
 * Pushes node neighbor lists (long node ids) to the PS in a single batch, samples up to two
 * neighbors per node, waits for a checkpoint, then stops the PS, reports a PS error to the
 * master and, after a fixed wait, samples again with a count of -1.
 *
 * <p>The sampled neighbors are only logged; this method contains no assertions on them.
 *
 * @throws Exception if any PS/master call fails or the thread is interrupted while sleeping
 */
@Test
public void testCSR() throws Exception {
    Worker localWorker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient matrixClient = localWorker.getPSAgent().getMatrixClient(SPARSE_INT_MAT, 0);
    int matrixId = matrixClient.getMatrixId();
    ParameterServer server = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    Location masterLocation = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
    MasterProtocol masterProxy = TConnectionManager.getConnection(server.getConf())
        .getMasterService(masterLocation.getIp(), masterLocation.getPort());

    // Init node neighbors in one batch
    Long2ObjectOpenHashMap<long[]> neighborTable = new Long2ObjectOpenHashMap<>();
    neighborTable.put(1L, new long[] { 2, 3, 4, 5, 6 });
    neighborTable.put(2L, new long[] { 4, 5 });
    neighborTable.put(3L, new long[] { 4, 5, 6 });
    neighborTable.put(4L, new long[] { 5, 6 });
    neighborTable.put(5L, new long[] { 6 });
    neighborTable.put(8L, new long[] { 3, 4 });
    InitNeighborParam initParam = new InitNeighborParam(matrixId, neighborTable);
    matrixClient.asyncUpdate(new InitNeighbor(initParam)).get();
    neighborTable.clear();
    // NOTE: the incremental multi-batch initialization and the InitNeighborOver call are
    // disabled (commented out) in this test.

    // Sample the neighbors, at most two per node
    long[] nodeIds = new long[] { 1, 2, 3, 4, 5, 6, 7, 8 };
    SampleNeighborParam sampleParam = new SampleNeighborParam(matrixId, nodeIds, 2);
    SampleNeighborResult sampled = (SampleNeighborResult) matrixClient.get(new SampleNeighbor(sampleParam));
    Long2ObjectOpenHashMap<long[]> neighborsByNode = sampled.getNodeIdToNeighbors();
    LOG.info("==============================sample neighbors result============================");
    for (ObjectIterator<Long2ObjectMap.Entry<long[]>> it = neighborsByNode.long2ObjectEntrySet().fastIterator(); it.hasNext(); ) {
        Long2ObjectMap.Entry<long[]> current = it.next();
        LOG.info("node id = " + current.getLongKey() + ", neighbors = " + Arrays.toString(current.getValue()));
    }

    // Wait for the checkpoint, then simulate a PS failure and report it to the master
    matrixClient.checkpoint(0).get();
    server.stop(-1);
    PSErrorRequest errorReport = PSErrorRequest.newBuilder()
        .setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id))
        .setMsg("out of memory")
        .build();
    masterProxy.psError(null, errorReport);
    Thread.sleep(10000);

    // Sample again, this time with a count of -1
    sampleParam = new SampleNeighborParam(matrixId, nodeIds, -1);
    sampled = (SampleNeighborResult) matrixClient.get(new SampleNeighbor(sampleParam));
    neighborsByNode = sampled.getNodeIdToNeighbors();
    LOG.info("==============================sample neighbors result============================");
    for (ObjectIterator<Long2ObjectMap.Entry<long[]>> it = neighborsByNode.long2ObjectEntrySet().fastIterator(); it.hasNext(); ) {
        Long2ObjectMap.Entry<long[]> current = it.next();
        LOG.info("node id = " + current.getLongKey() + ", neighbors = " + Arrays.toString(current.getValue()));
    }
}
Also used : InitNeighborParam(com.tencent.angel.graph.client.initneighbor2.InitNeighborParam) SampleNeighborResult(com.tencent.angel.graph.client.sampleneighbor2.SampleNeighborResult) Long2ObjectMap(it.unimi.dsi.fastutil.longs.Long2ObjectMap) SampleNeighbor(com.tencent.angel.graph.client.sampleneighbor2.SampleNeighbor) InitNeighbor(com.tencent.angel.graph.client.initneighbor2.InitNeighbor) Long2ObjectOpenHashMap(it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap) ParameterServer(com.tencent.angel.ps.ParameterServer) TConnection(com.tencent.angel.ipc.TConnection) SampleNeighborParam(com.tencent.angel.graph.client.sampleneighbor2.SampleNeighborParam) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) MasterProtocol(com.tencent.angel.master.MasterProtocol) PSErrorRequest(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSErrorRequest) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Aggregations

TConnection (com.tencent.angel.ipc.TConnection)13 Location (com.tencent.angel.common.location.Location)12 Test (org.junit.Test)9 Worker (com.tencent.angel.worker.Worker)6 ParameterServer (com.tencent.angel.ps.ParameterServer)5 AngelException (com.tencent.angel.exception.AngelException)4 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)4 PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt)3 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)3 MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient)3 IOException (java.io.IOException)3 ServiceException (com.google.protobuf.ServiceException)2 MasterProtocol (com.tencent.angel.master.MasterProtocol)2 InternalErrorEvent (com.tencent.angel.master.app.InternalErrorEvent)2 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)2 Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair)2 PSErrorRequest (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSErrorRequest)2 PSAttemptId (com.tencent.angel.ps.PSAttemptId)2 PartitionKey (com.tencent.angel.PartitionKey)1 InitNeighbor (com.tencent.angel.graph.client.initneighbor.InitNeighbor)1