Use of com.tencent.angel.ps.ParameterServer in the angel project by Tencent.
Class PSManagerTest, method testPSError.
/**
 * Fails the same parameter server four times in a row and checks the master-side bookkeeping
 * after every failure.
 *
 * <p>For each attempt the test stops the running PS, reports "out of memory" through
 * {@code master.psError}, sleeps for two heartbeat intervals and then asserts on the attempt
 * states, the next attempt number, the running attempt id and the accumulated diagnostics.
 * The test first asserts that the maximum attempt count is 4; after attempt 3 fails it expects
 * both the AM parameter server and the application to be FAILED.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testPSError() throws Exception {
try {
// The heartbeat interval is the unit for every "wait for the master to react" sleep below.
int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
AMParameterServer amPs = psManager.getParameterServer(psId);
PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
// Open a direct RPC handle to the master so the test can send task and PS reports itself.
Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
TConnection connection = TConnectionManager.getConnection(ps.getConf());
MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
// Iterations and per-matrix clocks that are reported to the master for task 0 and task 1.
int task0Iteration = 2;
int task1Iteration = 1;
int task0w1Clock = 10;
int task0w2Clock = 20;
int task1w1Clock = 9;
int task1w2Clock = 19;
// Expected matrix clocks: the smaller of the two task clocks for each matrix.
int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
// NOTE(review): task0Context and task1Context are never read again in this method.
TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
// The scenario below relies on exactly four attempts (ids 0..3) being allowed.
assertEquals(amPs.getMaxAttempts(), 4);
PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);
// attempt 0: stop the PS, report the error, then expect attempt 1 to be running
ps.stop(-1);
PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
assertTrue(psAttempt1 != null);
assertEquals(psAttempt0.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 2);
assertEquals(amPs.getRunningAttemptId(), psAttempt1Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 2);
List<String> diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 1);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
// presumably verifies that the new attempt sees the expected clocks for w1 and w2 - confirm against checkMatrixInfo
checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
// Both task clients add 2 to each of the 100000 elements of row 0 of w1, then clock.
MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
int matrixW1Id = w1Task0Client.getMatrixId();
int[] delta = new int[100000];
for (int i = 0; i < 100000; i++) {
delta[i] = 2;
}
IntIntVector deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task0Client.increment(deltaVec);
deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task1Client.increment(deltaVec);
w1Task0Client.clock().get();
w1Task1Client.clock().get();
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
// Wait two backup intervals before killing the PS - presumably so the update is persisted; confirm.
int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
Thread.sleep(snapshotInterval * 2);
// attempt 1: stop the PS, report the error, then expect attempt 2 to be running
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
assertTrue(psAttempt2 != null);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 3);
assertEquals(amPs.getRunningAttemptId(), psAttempt2Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 3);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 2);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
// w1 is expected one clock further on after the two clock() calls above; w2 is unchanged.
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
// 100000 elements * (2 + 2) = 400000: both increments must still be visible on the new attempt.
assertEquals(sum((IntIntVector) w1Task0Client.getRow(0)), 400000);
// attempt 2: stop the PS, report the error, then expect attempt 3 to be running
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
assertTrue(psAttempt3 != null);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertEquals(amPs.getRunningAttemptId(), psAttempt3Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 3);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
// attempt 3: the fourth failure; no further attempt is expected and the whole application must fail
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(amPs.getState(), AMParameterServerState.FAILED);
assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertNull(amPs.getRunningAttemptId());
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 4);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
assertEquals(diagnostics.get(3), psAttempt3Id + " failed due to: out of memory");
} catch (Exception x) {
// Log with the test name before rethrowing so the failure is easy to find in the cluster log.
LOG.error("run testPSError failed ", x);
throw x;
}
}
Use of com.tencent.angel.ps.ParameterServer in the angel project by Tencent.
Class PSManagerTest, method testPSReport.
/**
 * Sends a PS report to the master on behalf of attempt 0 and checks both the response and the
 * metrics stored on the master side, then repeats the report under attempt id 1 and expects
 * the master to answer with a shutdown command.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testPSReport() throws Exception {
  try {
    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    Location masterLoc = ps.getMasterLocation();
    TConnection connection = TConnectionManager.getConnection(ps.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

    // Build a report carrying two metrics for attempt 0.
    PSReportRequest.Builder builder = PSReportRequest.newBuilder();
    builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id));
    Pair.Builder pairBuilder = Pair.newBuilder();
    pairBuilder.setKey("ps_key1");
    pairBuilder.setValue("100");
    builder.addMetrics(pairBuilder.build());
    pairBuilder.setKey("ps_key2");
    pairBuilder.setValue("200");
    builder.addMetrics(pairBuilder.build());

    // Add one matrix report (id and name) for every matrix currently stored on the PS.
    MatrixReportProto.Builder matrixBuilder = MatrixReportProto.newBuilder();
    ConcurrentHashMap<Integer, ServerMatrix> matrixIdMap = ps.getMatrixStorageManager().getMatrices();
    for (Entry<Integer, ServerMatrix> matrixEntry : matrixIdMap.entrySet()) {
      builder.addMatrixReports(
          matrixBuilder.setMatrixId(matrixEntry.getKey()).setMatrixName(matrixEntry.getValue().getName()));
    }

    PSReportResponse response = master.psReport(null, builder.build());
    assertEquals(response.getPsCommand(), PSCommandProto.PSCOMMAND_OK);
    assertEquals(response.getNeedCreateMatricesCount(), 0);
    assertEquals(response.getNeedReleaseMatrixIdsCount(), 0);

    // The reported metrics must now be readable from the master's view of attempt 0.
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
    AMParameterServer amPs = psManager.getParameterServer(psId);
    PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
    Map<String, String> metrices = psAttempt.getMetrices();
    // assertEquals is null-safe and reports both values on mismatch, unlike
    // assertTrue(x.equals(y)), which throws a NullPointerException when the metric is missing.
    assertEquals(metrices.get("ps_key1"), "100");
    assertEquals(metrices.get("ps_key2"), "200");

    // The same report sent under attempt id 1 must be answered with a shutdown command.
    PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
    builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id));
    response = master.psReport(null, builder.build());
    assertEquals(response.getPsCommand(), PSCommandProto.PSCOMMAND_SHUTDOWN);
  } catch (Exception x) {
    LOG.error("run testPSReport failed ", x);
    throw x;
  }
}
Use of com.tencent.angel.ps.ParameterServer in the angel project by Tencent.
Class MatrixMetaManagerTest, method testCreateMatrix.
/**
 * Creates two dense double matrices ("w3" and "w4") at runtime and checks that their metadata
 * is consistent on the worker, on the master and on the parameter server. It then runs five
 * increment/clock rounds on "w4" from two task clients, checks the master's task bookkeeping
 * and the resulting row sums, and finally releases "w3" and checks that it is gone from both
 * the master and the PS storage while "w4" is unaffected.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testCreateMatrix() throws Exception {
  try {
    LOG.info("===========================testCreateMatrix===============================");
    Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    MasterClient masterClient = worker.getPSAgent().getMasterClient();

    // add matrix
    MatrixContext mMatrix = new MatrixContext();
    mMatrix.setName("w3");
    mMatrix.setRowNum(1);
    mMatrix.setColNum(100000);
    mMatrix.setMaxRowNumInBlock(1);
    mMatrix.setMaxColNumInBlock(50000);
    mMatrix.setRowType(RowType.T_DOUBLE_DENSE);
    mMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
    mMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
    mMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
    mMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, RowType.T_DOUBLE_DENSE.name());
    masterClient.createMatrix(mMatrix, 10000);

    // The same context object is reused for w4; every property is set again explicitly.
    mMatrix.setName("w4");
    mMatrix.setRowNum(1);
    mMatrix.setColNum(100000);
    mMatrix.setMaxRowNumInBlock(1);
    mMatrix.setMaxColNumInBlock(50000);
    mMatrix.setRowType(RowType.T_DOUBLE_DENSE);
    mMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
    mMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
    mMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
    mMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, RowType.T_DOUBLE_DENSE.name());
    masterClient.createMatrix(mMatrix, 10000);

    // Worker-side view of the new matrices.
    MatrixMeta w3Meta = worker.getPSAgent().getMatrixMetaManager().getMatrixMeta("w3");
    MatrixMeta w4Meta = worker.getPSAgent().getMatrixMetaManager().getMatrixMeta("w4");
    assertEquals(w3Meta.getRowNum(), 1);
    assertEquals(w3Meta.getColNum(), 100000);
    assertEquals(w3Meta.getRowType(), RowType.T_DOUBLE_DENSE);
    assertEquals(w4Meta.getRowNum(), 1);
    assertEquals(w4Meta.getColNum(), 100000);
    assertEquals(w4Meta.getRowType(), RowType.T_DOUBLE_DENSE);
    int w3Id = w3Meta.getId();
    int w4Id = w4Meta.getId();

    // Master-side view: each matrix is split into two column partitions of 50000 columns.
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertNotNull(angelAppMaster);
    AMMatrixMetaManager matrixMetaManager = angelAppMaster.getAppContext().getMatrixMetaManager();
    MatrixMeta matrixw3Proto = matrixMetaManager.getMatrix("w3");
    MatrixMeta matrixw4Proto = matrixMetaManager.getMatrix("w4");
    assertNotNull(matrixw3Proto);
    assertNotNull(matrixw4Proto);
    assertEquals(matrixw3Proto.getRowNum(), 1);
    assertEquals(matrixw3Proto.getColNum(), 100000);
    assertEquals(matrixw3Proto.getPartitionMetas().size(), 2);

    Map<Integer, PartitionMeta> w3Parts = matrixw3Proto.getPartitionMetas();
    assertEquals(w3Parts.get(0).getPss().get(0), psId);
    assertEquals(w3Parts.get(0).getPartId(), 0);
    assertEquals(w3Parts.get(0).getStartRow(), 0);
    assertEquals(w3Parts.get(0).getEndRow(), 1);
    assertEquals(w3Parts.get(0).getStartCol(), 0);
    assertEquals(w3Parts.get(0).getEndCol(), 50000);
    assertEquals(w3Parts.get(1).getPartId(), 1);
    assertEquals(w3Parts.get(1).getStartRow(), 0);
    assertEquals(w3Parts.get(1).getEndRow(), 1);
    assertEquals(w3Parts.get(1).getStartCol(), 50000);
    assertEquals(w3Parts.get(1).getEndCol(), 100000);

    Map<Integer, PartitionMeta> w4Parts = matrixw4Proto.getPartitionMetas();
    assertEquals(w4Parts.get(0).getPss().get(0), psId);
    assertEquals(w4Parts.get(0).getPartId(), 0);
    assertEquals(w4Parts.get(0).getStartRow(), 0);
    assertEquals(w4Parts.get(0).getEndRow(), 1);
    assertEquals(w4Parts.get(0).getStartCol(), 0);
    assertEquals(w4Parts.get(0).getEndCol(), 50000);
    assertEquals(w4Parts.get(1).getPartId(), 1);
    assertEquals(w4Parts.get(1).getStartRow(), 0);
    assertEquals(w4Parts.get(1).getEndRow(), 1);
    assertEquals(w4Parts.get(1).getStartCol(), 50000);
    assertEquals(w4Parts.get(1).getEndCol(), 100000);

    // PS-side view: the same partition boundaries must be known to the parameter server.
    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    PSMatrixMetaManager matrixPartManager = ps.getMatrixMetaManager();
    PartitionMeta w3Part0 = matrixPartManager.getPartMeta(w3Id, 0);
    PartitionMeta w3Part1 = matrixPartManager.getPartMeta(w3Id, 1);
    assertNotNull(w3Part0);
    assertNotNull(w3Part1);
    assertEquals(w3Part0.getPartitionKey().getStartRow(), 0);
    assertEquals(w3Part0.getPartitionKey().getEndRow(), 1);
    assertEquals(w3Part0.getPartitionKey().getStartCol(), 0);
    assertEquals(w3Part0.getPartitionKey().getEndCol(), 50000);
    assertEquals(w3Part1.getPartitionKey().getStartRow(), 0);
    assertEquals(w3Part1.getPartitionKey().getEndRow(), 1);
    assertEquals(w3Part1.getPartitionKey().getStartCol(), 50000);
    assertEquals(w3Part1.getPartitionKey().getEndCol(), 100000);

    PartitionMeta w4Part0 = matrixPartManager.getPartMeta(w4Id, 0);
    PartitionMeta w4Part1 = matrixPartManager.getPartMeta(w4Id, 1);
    assertNotNull(w4Part0);
    assertNotNull(w4Part1);
    assertEquals(w4Part0.getPartitionKey().getStartRow(), 0);
    assertEquals(w4Part0.getPartitionKey().getEndRow(), 1);
    assertEquals(w4Part0.getPartitionKey().getStartCol(), 0);
    assertEquals(w4Part0.getPartitionKey().getEndCol(), 50000);
    assertEquals(w4Part1.getPartitionKey().getStartRow(), 0);
    assertEquals(w4Part1.getPartitionKey().getEndRow(), 1);
    assertEquals(w4Part1.getPartitionKey().getStartCol(), 50000);
    assertEquals(w4Part1.getPartitionKey().getEndCol(), 100000);

    MatrixClient w4ClientForTask0 = worker.getPSAgent().getMatrixClient("w4", 0);
    MatrixClient w4ClientForTask1 = worker.getPSAgent().getMatrixClient("w4", 1);
    TaskContext task0Context = w4ClientForTask0.getTaskContext();
    TaskContext task1Context = w4ClientForTask1.getTaskContext();

    // Every round, each task adds 1.0 to all 100000 elements of row 0 of w4.
    double[] delta = new double[100000];
    for (int i = 0; i < delta.length; i++) {
      delta[i] = 1.0;
    }

    for (int iterIndex = 0; iterIndex < 5; iterIndex++) {
      IntDoubleVector row1 = (IntDoubleVector) w4ClientForTask0.getRow(0);
      double sum1 = sum(row1.getStorage().getValues());
      LOG.info("taskid=" + task0Context.getIndex() + ", matrixId=" + w4ClientForTask0.getMatrixId() + ", rowIndex=0, local row sum=" + sum1);
      IntDoubleVector deltaRow1 = new IntDoubleVector(delta.length, new IntDoubleDenseVectorStorage(delta));
      deltaRow1.setMatrixId(w4ClientForTask0.getMatrixId());
      deltaRow1.setRowId(0);
      w4ClientForTask0.increment(deltaRow1);
      w4ClientForTask0.clock().get();
      task0Context.increaseEpoch();

      IntDoubleVector row2 = (IntDoubleVector) w4ClientForTask1.getRow(0);
      double sum2 = sum(row2.getStorage().getValues());
      // Task 1 also reads row 0 (the matrix has a single row), so the log reports rowIndex=0.
      LOG.info("taskid=" + task1Context.getIndex() + ", matrixId=" + w4ClientForTask1.getMatrixId() + ", rowIndex=0, local row sum=" + sum2);
      IntDoubleVector deltaRow2 = new IntDoubleVector(delta.length, new IntDoubleDenseVectorStorage(delta));
      deltaRow2.setMatrixId(w4ClientForTask1.getMatrixId());
      deltaRow2.setRowId(0);
      w4ClientForTask1.increment(deltaRow2);
      w4ClientForTask1.clock().get();
      task1Context.increaseEpoch();
    }

    // Master bookkeeping: five iterations and a w4 clock of 5 for each task.
    AMTaskManager amTaskManager = angelAppMaster.getAppContext().getTaskManager();
    AMTask amTask0 = amTaskManager.getTask(task0Id);
    AMTask amTask1 = amTaskManager.getTask(task1Id);
    assertEquals(amTask0.getIteration(), 5);
    assertEquals(amTask1.getIteration(), 5);
    Int2IntOpenHashMap task0MatrixClocks = amTask0.getMatrixClocks();
    assertEquals(task0MatrixClocks.size(), 1);
    assertEquals(task0MatrixClocks.get(w4Id), 5);
    Int2IntOpenHashMap task1MatrixClocks = amTask1.getMatrixClocks();
    assertEquals(task1MatrixClocks.size(), 1);
    assertEquals(task1MatrixClocks.get(w4Id), 5);

    // 100000 elements * 5 rounds * 2 tasks * 1.0 = 1000000.0
    IntDoubleVector row1 = (IntDoubleVector) w4ClientForTask0.getRow(0);
    double sum1 = sum(row1.getStorage().getValues());
    assertEquals(sum1, 1000000.0, 0.000001);
    IntDoubleVector row2 = (IntDoubleVector) w4ClientForTask1.getRow(0);
    double sum2 = sum(row2.getStorage().getValues());
    assertEquals(sum2, 1000000.0, 0.000001);

    // Release w3 and give the cluster time to propagate the removal.
    masterClient.releaseMatrix(w3Meta.getName());
    Thread.sleep(10000);
    matrixw3Proto = matrixMetaManager.getMatrix("w3");
    assertTrue(matrixw3Proto == null);
    MatrixStorageManager matrixStorageManager = LocalClusterContext.get().getPS(psAttempt0Id).getPS().getMatrixStorageManager();
    ServerMatrix sw3 = matrixStorageManager.getMatrix(w3Id);
    assertTrue(sw3 == null);

    // w4 must be unaffected by the release of w3.
    w4ClientForTask0.clock().get();
    w4ClientForTask1.clock().get();
    row1 = (IntDoubleVector) w4ClientForTask0.getRow(0);
    sum1 = sum(row1.getStorage().getValues());
    assertEquals(sum1, 1000000.0, 0.000001);
    row2 = (IntDoubleVector) w4ClientForTask1.getRow(0);
    sum2 = sum(row2.getStorage().getValues());
    assertEquals(sum2, 1000000.0, 0.000001);
  } catch (Exception x) {
    LOG.error("run testCreateMatrix failed ", x);
    throw x;
  }
}
Use of com.tencent.angel.ps.ParameterServer in the angel project by Tencent.
Class MasterRecoverTest, method testMasterRecover.
/**
 * Kills the application master twice via an {@code InternalErrorEvent} and checks the outcome.
 *
 * <p>Before the first failure the test reports task iterations and matrix clocks to the master
 * and waits two state-write intervals. After the first failure it expects application attempt 2
 * to be RUNNING with the task epochs, matrix metadata and clocks still in place. After the
 * second failure it expects the application to be FAILED while still on attempt 2.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testMasterRecover() throws Exception {
  try {
    ApplicationAttemptId appAttempt1Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 1);
    ApplicationAttemptId appAttempt2Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 2);
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
    assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt1Id);

    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
    int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();

    // Open a direct RPC handle to the master so the test can send the task reports itself.
    Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
    TConnection connection = TConnectionManager.getConnection(ps.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

    int task0Iteration = 2;
    int task1Iteration = 1;
    int task0w1Clock = 10;
    int task0w2Clock = 20;
    int task1w1Clock = 9;
    int task1w2Clock = 19;
    // Expected matrix clocks: the smaller of the two task clocks for each matrix.
    int w1Clock = Math.min(task0w1Clock, task1w1Clock);
    int w2Clock = Math.min(task0w2Clock, task1w2Clock);

    master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
    master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());

    // Wait two write intervals before killing the master - presumably so the reported state
    // has been persisted; confirm against the AM state writer.
    int writeIntervalMS = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_AM_WRITE_STATE_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_AM_WRITE_STATE_INTERVAL_MS);
    Thread.sleep(writeIntervalMS * 2);

    // First master failure: a second application attempt is expected to take over.
    angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
    Thread.sleep(10000);
    angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.RUNNING);
    LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
    assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt2Id);

    PartitionKey w1Part0Key = new PartitionKey(0, w1Id, 0, 0, 1, 50000);
    PartitionKey w1Part1Key = new PartitionKey(1, w1Id, 0, 50000, 1, 100000);
    PartitionKey w2Part0Key = new PartitionKey(0, w2Id, 0, 0, 1, 50000);
    PartitionKey w2Part1Key = new PartitionKey(1, w2Id, 0, 50000, 1, 100000);

    // Worker-side state after the restart: task epochs and matrix routing must match.
    Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    LOG.info("worker=" + worker);
    LOG.info("worker.getTaskManager()=" + worker.getTaskManager());
    LOG.info("worker.getTaskManager().getRunningTask()=" + worker.getTaskManager().getRunningTask().size());
    TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
    TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext();
    assertEquals(task0Context.getEpoch(), task0Iteration);
    assertEquals(task1Context.getEpoch(), task1Iteration);
    LOG.info("===============worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size()=" + worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size());
    assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w1Id));
    assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w2Id));
    assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w1Part0Key).get(0), psId);
    assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w1Part1Key).get(0), psId);
    assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w2Part0Key).get(0), psId);
    assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w2Part1Key).get(0), psId);

    ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);

    // Second master failure: the attempt id stays at 2 and the application ends up FAILED.
    angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
    Thread.sleep(10000);
    angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt2Id);
    assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
  } catch (Exception x) {
    LOG.error("run testMasterRecover failed ", x);
    throw x;
  }
}
Use of com.tencent.angel.ps.ParameterServer in the angel project by Tencent.
Class InitNeighborTest, method testCSR.
/**
 * Pushes node-neighbor lists to the sparse int matrix in three batches, finalizes them with
 * {@code InitNeighborOver}, samples and logs the neighbors of nodes 1..8, then checkpoints,
 * stops the parameter server, reports the failure to the master and samples again after a
 * ten second wait.
 *
 * @throws Exception if any cluster call fails or the sleep is interrupted
 */
@Test
public void testCSR() throws Exception {
  Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
  MatrixClient matrixClient = worker.getPSAgent().getMatrixClient(SPARSE_INT_MAT, 0);
  int matrixId = matrixClient.getMatrixId();
  ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
  Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
  TConnection connection = TConnectionManager.getConnection(ps.getConf());
  MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

  // Init node neighbors: the same map is refilled for each of the three batches
  Int2ObjectOpenHashMap<int[]> batch = new Int2ObjectOpenHashMap<>();

  batch.put(1, new int[] { 2, 3 });
  batch.put(2, new int[] { 4 });
  InitNeighbor initFunc = new InitNeighbor(new InitNeighborParam(matrixId, batch));
  matrixClient.asyncUpdate(initFunc).get();
  batch.clear();

  batch.put(1, new int[] { 4, 5, 6 });
  batch.put(2, new int[] { 5 });
  batch.put(4, new int[] { 5, 6 });
  initFunc = new InitNeighbor(new InitNeighborParam(matrixId, batch));
  matrixClient.asyncUpdate(initFunc).get();
  batch.clear();

  batch.put(3, new int[] { 4, 5, 6 });
  batch.put(5, new int[] { 6 });
  batch.put(8, new int[] { 3, 4 });
  initFunc = new InitNeighbor(new InitNeighborParam(matrixId, batch));
  matrixClient.asyncUpdate(initFunc).get();
  batch.clear();

  InitNeighborOver finishFunc = new InitNeighborOver(new InitNeighborOverParam(matrixId));
  matrixClient.asyncUpdate(finishFunc).get();

  // Sample the neighbors
  int[] nodeIds = new int[] { 1, 2, 3, 4, 5, 6, 7, 8 };
  SampleNeighborParam sampleParam = new SampleNeighborParam(matrixId, nodeIds, -1);
  SampleNeighborResult firstSample = (SampleNeighborResult) matrixClient.get(new SampleNeighbor(sampleParam));
  logSampledNeighbors(firstSample.getNodeIdToNeighbors());

  // Checkpoint, kill the PS and report the failure, then sample again after the wait.
  matrixClient.checkpoint(0);
  ps.stop(-1);
  PSErrorRequest errorRequest = PSErrorRequest.newBuilder()
      .setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id))
      .setMsg("out of memory")
      .build();
  master.psError(null, errorRequest);
  Thread.sleep(10000);

  SampleNeighborResult secondSample = (SampleNeighborResult) matrixClient.get(new SampleNeighbor(sampleParam));
  logSampledNeighbors(secondSample.getNodeIdToNeighbors());
}

/**
 * Logs a banner line followed by one line per sampled node with its neighbor array.
 *
 * @param sampled map from node id to the neighbor ids returned by the sampling call
 */
private void logSampledNeighbors(Int2ObjectOpenHashMap<int[]> sampled) {
  ObjectIterator<Entry<int[]>> cursor = sampled.int2ObjectEntrySet().fastIterator();
  LOG.info("==============================sample neighbors result============================");
  while (cursor.hasNext()) {
    Entry<int[]> nodeEntry = cursor.next();
    LOG.info("node id = " + nodeEntry.getIntKey() + ", neighbors = " + Arrays.toString(nodeEntry.getValue()));
  }
}
Aggregations