use of com.tencent.angel.ml.math2.vector.IntIntVector in project angel by Tencent.
the class PSManagerTest method testPSError.
@Test
public void testPSError() throws Exception {
try {
int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
AMParameterServer amPs = psManager.getParameterServer(psId);
PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
TConnection connection = TConnectionManager.getConnection(ps.getConf());
MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
int task0Iteration = 2;
int task1Iteration = 1;
int task0w1Clock = 10;
int task0w2Clock = 20;
int task1w1Clock = 9;
int task1w2Clock = 19;
int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
assertEquals(amPs.getMaxAttempts(), 4);
PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);
// attempt 0
ps.stop(-1);
PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
assertTrue(psAttempt1 != null);
assertEquals(psAttempt0.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 2);
assertEquals(amPs.getRunningAttemptId(), psAttempt1Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 2);
List<String> diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 1);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
int matrixW1Id = w1Task0Client.getMatrixId();
int[] delta = new int[100000];
for (int i = 0; i < 100000; i++) {
delta[i] = 2;
}
IntIntVector deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task0Client.increment(deltaVec);
deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task1Client.increment(deltaVec);
w1Task0Client.clock().get();
w1Task1Client.clock().get();
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
Thread.sleep(snapshotInterval * 2);
// attempt1
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
assertTrue(psAttempt2 != null);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 3);
assertEquals(amPs.getRunningAttemptId(), psAttempt2Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 3);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 2);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
assertEquals(sum((IntIntVector) w1Task0Client.getRow(0)), 400000);
// attempt1
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
assertTrue(psAttempt3 != null);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertEquals(amPs.getRunningAttemptId(), psAttempt3Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 3);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(amPs.getState(), AMParameterServerState.FAILED);
assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertNull(amPs.getRunningAttemptId());
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 4);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
assertEquals(diagnostics.get(3), psAttempt3Id + " failed due to: out of memory");
} catch (Exception x) {
LOG.error("run testPSError failed ", x);
throw x;
}
}
use of com.tencent.angel.ml.math2.vector.IntIntVector in project angel by Tencent.
the class UpdatePSFTest method testSparseIntUDF.
public void testSparseIntUDF() throws Exception {
Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
MatrixClient client1 = worker.getPSAgent().getMatrixClient(SPARSE_INT_MAT, 0);
int matrixW1Id = client1.getMatrixId();
int[] index = genIndexs(feaNum, nnz);
IntIntVector deltaVec = new IntIntVector(feaNum, new IntIntSparseVectorStorage(feaNum, nnz));
for (int i = 0; i < index.length; i++) {
deltaVec.set(index[i], index[i]);
}
// for (int i = 0; i < feaNum; i++) {
// deltaVec.set(i, i);
// }
deltaVec.setRowId(0);
Vector[] updates = new Vector[1];
updates[0] = deltaVec;
client1.asyncUpdate(new IncrementRows(new IncrementRowsParam(matrixW1Id, updates))).get();
IntIntVector row = (IntIntVector) client1.getRow(0);
for (int id : index) {
// System.out.println("id=" + id + ", value=" + row.get(id));
Assert.assertTrue(row.get(id) == deltaVec.get(id));
}
Assert.assertTrue(index.length == row.size());
}
use of com.tencent.angel.ml.math2.vector.IntIntVector in project angel by Tencent.
the class UpdatePSFTest method testDenseIntUDF.
public void testDenseIntUDF() throws Exception {
Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
MatrixClient client1 = worker.getPSAgent().getMatrixClient(DENSE_INT_MAT, 0);
int matrixW1Id = client1.getMatrixId();
int[] index = genIndexs(feaNum, nnz);
IntIntVector deltaVec = new IntIntVector(feaNum, new IntIntDenseVectorStorage(feaNum));
for (int i = 0; i < feaNum; i++) {
deltaVec.set(i, i);
}
deltaVec.setRowId(0);
Vector[] updates = new Vector[1];
updates[0] = deltaVec;
client1.asyncUpdate(new IncrementRows(new IncrementRowsParam(matrixW1Id, updates))).get();
IntIntVector row = (IntIntVector) client1.getRow(0);
for (int id : index) {
Assert.assertTrue(row.get(id) == deltaVec.get(id));
}
Assert.assertTrue(feaNum == row.size());
}
use of com.tencent.angel.ml.math2.vector.IntIntVector in project angel by Tencent.
the class SnapshotFormat method save.
private void save(ServerLongIntRow row, PSMatrixSaveContext saveContext, MatrixPartitionMeta meta, DataOutputStream out) throws IOException {
long startCol = meta.getStartCol();
if (ServerRowUtils.getVector(row) instanceof IntIntVector) {
IntIntVector vector = (IntIntVector) ServerRowUtils.getVector(row);
if (vector.isDense()) {
int[] data = vector.getStorage().getValues();
for (int i = 0; i < data.length; i++) {
out.writeInt(data[i]);
}
} else if (vector.isSorted()) {
int[] indices = vector.getStorage().getIndices();
int[] values = vector.getStorage().getValues();
for (int i = 0; i < indices.length; i++) {
out.writeLong(indices[i] + startCol);
out.writeInt(values[i]);
}
} else {
ObjectIterator<Int2IntMap.Entry> iter = vector.getStorage().entryIterator();
Int2IntMap.Entry entry;
while (iter.hasNext()) {
entry = iter.next();
out.writeLong(entry.getIntKey() + startCol);
out.writeInt(entry.getIntValue());
}
}
} else {
LongIntVector vector = (LongIntVector) ServerRowUtils.getVector(row);
if (vector.isSorted()) {
long[] indices = vector.getStorage().getIndices();
int[] values = vector.getStorage().getValues();
for (int i = 0; i < indices.length; i++) {
out.writeLong(indices[i] + startCol);
out.writeInt(values[i]);
}
} else {
ObjectIterator<Long2IntMap.Entry> iter = vector.getStorage().entryIterator();
Long2IntMap.Entry entry;
while (iter.hasNext()) {
entry = iter.next();
out.writeLong(entry.getLongKey() + startCol);
out.writeInt(entry.getIntValue());
}
}
}
}
use of com.tencent.angel.ml.math2.vector.IntIntVector in project angel by Tencent.
the class HashRouterUtils method split.
/**
* Split keys by matrix partition
*
* @param matrixMeta matrix meta data
* @param vector Matrix vector
* @return partition key to key partition map
*/
public static KeyValuePart[] split(MatrixMeta matrixMeta, Vector vector) {
KeyHash hasher = HasherFactory.getHasher(matrixMeta.getRouterHash());
PartitionKey[] matrixParts = matrixMeta.getPartitionKeys();
KeyValuePart[] dataParts = new KeyValuePart[matrixParts.length];
int estSize = (int) (vector.getSize() / matrixMeta.getPartitionNum());
for (int i = 0; i < dataParts.length; i++) {
dataParts[i] = generateDataPart(vector.getRowId(), vector.getType(), estSize);
}
switch(vector.getType()) {
case T_DOUBLE_DENSE:
case T_DOUBLE_SPARSE:
{
splitIntDoubleVector(hasher, matrixMeta, (IntDoubleVector) vector, dataParts);
break;
}
case T_FLOAT_DENSE:
case T_FLOAT_SPARSE:
{
splitIntFloatVector(hasher, matrixMeta, (IntFloatVector) vector, dataParts);
break;
}
case T_INT_DENSE:
case T_INT_SPARSE:
{
splitIntIntVector(hasher, matrixMeta, (IntIntVector) vector, dataParts);
break;
}
case T_LONG_DENSE:
case T_LONG_SPARSE:
{
splitIntLongVector(hasher, matrixMeta, (IntLongVector) vector, dataParts);
break;
}
case T_DOUBLE_SPARSE_LONGKEY:
{
splitLongDoubleVector(hasher, matrixMeta, (LongDoubleVector) vector, dataParts);
break;
}
case T_FLOAT_SPARSE_LONGKEY:
{
splitLongFloatVector(hasher, matrixMeta, (LongFloatVector) vector, dataParts);
break;
}
case T_INT_SPARSE_LONGKEY:
{
splitLongIntVector(hasher, matrixMeta, (LongIntVector) vector, dataParts);
break;
}
case T_LONG_SPARSE_LONGKEY:
{
splitLongLongVector(hasher, matrixMeta, (LongLongVector) vector, dataParts);
break;
}
default:
{
throw new UnsupportedOperationException("Unsupport vector type " + vector.getType());
}
}
for (int i = 0; i < dataParts.length; i++) {
if (dataParts[i] != null) {
dataParts[i].setRowId(vector.getRowId());
}
}
return dataParts;
}
Aggregations