use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class YarnContainerAllocator method handleFinishContainers.
@SuppressWarnings("unchecked")
private void handleFinishContainers(List<ContainerStatus> finishedContainers) {
for (ContainerStatus cont : finishedContainers) {
LOG.info("Received completed container:" + cont);
Id id = assignedContainerToIDMap.get(cont.getContainerId());
if (id == null) {
LOG.error("Container complete event for unknown container id " + cont.getContainerId());
} else {
assignedContainerToIDMap.remove(cont.getContainerId());
idToContainerMap.remove(id);
// dispatch container exit message to corresponding components
String diagnostics = StringInterner.weakIntern(cont.getDiagnostics());
if (id instanceof PSAttemptId) {
context.getEventHandler().handle(new PSAttemptDiagnosticsUpdateEvent(diagnostics, (PSAttemptId) id));
context.getEventHandler().handle(createContainerFinishedEvent(cont, (PSAttemptId) id));
} else if (id instanceof PSAgentAttemptId) {
context.getEventHandler().handle(new PSAgentAttemptDiagnosticsUpdateEvent((PSAgentAttemptId) id, diagnostics));
context.getEventHandler().handle(createContainerFinishedEvent(cont, (PSAgentAttemptId) id));
} else if (id instanceof WorkerAttemptId) {
context.getEventHandler().handle(new WorkerAttemptDiagnosticsUpdateEvent((WorkerAttemptId) id, diagnostics));
context.getEventHandler().handle(createContainerFinishedEvent(cont, (WorkerAttemptId) id));
}
}
}
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class AppStateStorage method writePSMeta.
/**
* write ps meta to file
* @param psManager ps meta storage
* @throws IOException
*/
public void writePSMeta(ParameterServerManager psManager) throws IOException {
try {
psMetaLock.lock();
// generate a temporary file
String psMetaFile = getPsMetaFile();
String tmpFile = getPSMetaTmpeFile(psMetaFile);
Path tmpPath = new Path(writeDir, tmpFile);
FSDataOutputStream outputStream = fs.create(tmpPath);
// write ps meta to the temporary file first.
Map<ParameterServerId, AMParameterServer> psMap = psManager.getParameterServerMap();
outputStream.writeInt(psMap.size());
PSAttemptId attemptId = null;
int nextAttemptIndex = 0;
for (Entry<ParameterServerId, AMParameterServer> entry : psMap.entrySet()) {
outputStream.writeInt(entry.getKey().getIndex());
attemptId = entry.getValue().getRunningAttemptId();
nextAttemptIndex = entry.getValue().getNextAttemptNumber();
if (attemptId != null) {
nextAttemptIndex = attemptId.getIndex();
}
outputStream.writeInt(nextAttemptIndex);
}
outputStream.close();
// rename the temporary file to the final file
Path psMetaFilePath = new Path(writeDir, psMetaFile);
HdfsUtil.rename(tmpPath, psMetaFilePath, fs);
// if the old final file exist, just remove it
if (lastPsMetaFilePath != null) {
fs.delete(lastPsMetaFilePath, false);
}
lastPsMetaFilePath = psMetaFilePath;
} finally {
psMetaLock.unlock();
}
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class MasterServiceTest method setup.
@Before
public void setup() throws Exception {
try {
// set basic configuration keys
Configuration conf = new Configuration();
conf.setBoolean("mapred.mapper.new-api", true);
conf.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());
// use local deploy mode and dummy dataspliter
conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
conf.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
conf.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
conf.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
conf.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
conf.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
conf.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 2);
// get a angel client
angelClient = AngelClientFactory.get(conf);
// add matrix
MatrixContext mMatrix = new MatrixContext();
mMatrix.setName("w1");
mMatrix.setRowNum(1);
mMatrix.setColNum(100000);
mMatrix.setMaxRowNumInBlock(1);
mMatrix.setMaxColNumInBlock(50000);
mMatrix.setRowType(RowType.T_INT_DENSE);
mMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
mMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
mMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
mMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_INT");
angelClient.addMatrix(mMatrix);
MatrixContext mMatrix2 = new MatrixContext();
mMatrix2.setName("w2");
mMatrix2.setRowNum(1);
mMatrix2.setColNum(100000);
mMatrix2.setMaxRowNumInBlock(1);
mMatrix2.setMaxColNumInBlock(50000);
mMatrix2.setRowType(RowType.T_DOUBLE_DENSE);
mMatrix2.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
mMatrix2.set(MatrixConf.MATRIX_HOGWILD, "false");
mMatrix2.set(MatrixConf.MATRIX_AVERAGE, "false");
mMatrix2.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_DOUBLE");
angelClient.addMatrix(mMatrix2);
angelClient.startPSServer();
angelClient.run();
Thread.sleep(5000);
group0Id = new WorkerGroupId(0);
worker0Id = new WorkerId(group0Id, 0);
worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
task0Id = new TaskId(0);
task1Id = new TaskId(1);
psId = new ParameterServerId(0);
psAttempt0Id = new PSAttemptId(psId, 0);
} catch (Exception x) {
LOG.error("setup failed ", x);
throw x;
}
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class PSFailedReportTest method testPSFailedReport.
@Test
public void testPSFailedReport() throws Exception {
ParameterServerId ps1Id = new ParameterServerId(0);
final ParameterServerId ps2Id = new ParameterServerId(1);
PSAttemptId ps1Attempt0Id = new PSAttemptId(ps1Id, 0);
PSAttemptId ps2Attempt0Id = new PSAttemptId(ps2Id, 0);
PSAttemptId ps2Attempt1Id = new PSAttemptId(ps2Id, 1);
ParameterServer ps1Attempt0 = LocalClusterContext.get().getPS(ps1Attempt0Id).getPS();
ParameterServer ps2Attempt0 = LocalClusterContext.get().getPS(ps2Attempt0Id).getPS();
WorkerId worker0Id = new WorkerId(new WorkerGroupId(0), 0);
WorkerAttemptId worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
TaskContext task0Context = worker0.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
MatrixClient matrixClient = task0Context.getMatrix("w1");
int iterNum = 20;
for (int i = 0; i < iterNum; i++) {
DenseIntVector update = new DenseIntVector(dim);
for (int j = 0; j < dim; j++) {
update.set(j, 1);
}
update.setMatrixId(matrixClient.getMatrixId());
update.setRowId(0);
matrixClient.increment(update);
matrixClient.clock().get();
Thread.sleep(1000);
MatrixStorageManager ps1Storage = ps1Attempt0.getMatrixStorageManager();
ServerMatrix ps1w1 = ps1Storage.getMatrix(matrixClient.getMatrixId());
assertNotNull(ps1w1.getPartition(0));
assertNotNull(ps1w1.getPartition(1));
IntBuffer row0Part0 = ((ServerDenseIntRow) ps1w1.getRow(0, 0)).getData();
int part0Size = ps1w1.getRow(0, 0).size();
IntBuffer row0Part1 = ((ServerDenseIntRow) ps1w1.getRow(1, 0)).getData();
int part1Size = ps1w1.getRow(1, 0).size();
assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
MatrixStorageManager ps2Storage = ps2Attempt0.getMatrixStorageManager();
ServerMatrix ps2w1 = ps2Storage.getMatrix(matrixClient.getMatrixId());
assertNotNull(ps2w1.getPartition(0));
assertNotNull(ps2w1.getPartition(1));
row0Part0 = ((ServerDenseIntRow) ps2w1.getRow(0, 0)).getData();
part0Size = ps2w1.getRow(0, 0).size();
row0Part1 = ((ServerDenseIntRow) ps2w1.getRow(1, 0)).getData();
part1Size = ps2w1.getRow(1, 0).size();
assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
}
LOG.info("===================================================================ps2 failed");
HashMap<PSLocation, Integer> failedCounters = new HashMap<>();
PSLocation psLoc = new PSLocation(ps2Id, ps2Attempt0.getLocationManager().getPsLocation(ps2Id));
failedCounters.put(psLoc, 10000);
worker0.getPSAgent().getMasterClient().psFailedReport(failedCounters);
Thread.sleep(20000);
for (int i = iterNum; i < 2 * iterNum; i++) {
DenseIntVector update = new DenseIntVector(dim);
for (int j = 0; j < dim; j++) {
update.set(j, 1);
}
update.setMatrixId(matrixClient.getMatrixId());
update.setRowId(0);
matrixClient.increment(update);
matrixClient.clock().get();
Thread.sleep(1000);
MatrixStorageManager ps1Storage = ps1Attempt0.getMatrixStorageManager();
ServerMatrix ps1w1 = ps1Storage.getMatrix(matrixClient.getMatrixId());
assertNotNull(ps1w1.getPartition(0));
assertNotNull(ps1w1.getPartition(1));
IntBuffer row0Part0 = ((ServerDenseIntRow) ps1w1.getRow(0, 0)).getData();
int part0Size = ps1w1.getRow(0, 0).size();
IntBuffer row0Part1 = ((ServerDenseIntRow) ps1w1.getRow(1, 0)).getData();
int part1Size = ps1w1.getRow(1, 0).size();
assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
}
ParameterServer ps2Attempt = LocalClusterContext.get().getPS(ps2Attempt1Id).getPS();
for (int i = iterNum * 2; i < 3 * iterNum; i++) {
DenseIntVector update = new DenseIntVector(dim);
for (int j = 0; j < dim; j++) {
update.set(j, 1);
}
update.setMatrixId(matrixClient.getMatrixId());
update.setRowId(0);
matrixClient.increment(update);
matrixClient.clock().get();
Thread.sleep(1000);
MatrixStorageManager ps1Storage = ps1Attempt0.getMatrixStorageManager();
ServerMatrix ps1w1 = ps1Storage.getMatrix(matrixClient.getMatrixId());
assertNotNull(ps1w1.getPartition(0));
assertNotNull(ps1w1.getPartition(1));
IntBuffer row0Part0 = ((ServerDenseIntRow) ps1w1.getRow(0, 0)).getData();
int part0Size = ps1w1.getRow(0, 0).size();
IntBuffer row0Part1 = ((ServerDenseIntRow) ps1w1.getRow(1, 0)).getData();
int part1Size = ps1w1.getRow(1, 0).size();
assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
MatrixStorageManager ps2Storage = ps2Attempt.getMatrixStorageManager();
ServerMatrix ps2w1 = ps2Storage.getMatrix(matrixClient.getMatrixId());
assertNotNull(ps2w1.getPartition(0));
assertNotNull(ps2w1.getPartition(1));
row0Part0 = ((ServerDenseIntRow) ps2w1.getRow(0, 0)).getData();
part0Size = ps2w1.getRow(0, 0).size();
row0Part1 = ((ServerDenseIntRow) ps2w1.getRow(1, 0)).getData();
part1Size = ps2w1.getRow(1, 0).size();
assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
}
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class PSManagerTest method setup.
@Before
public void setup() throws Exception {
try {
// set basic configuration keys
Configuration conf = new Configuration();
conf.setBoolean("mapred.mapper.new-api", true);
conf.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());
// use local deploy mode and dummy dataspliter
conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
conf.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
conf.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
conf.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
conf.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
conf.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
conf.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 2);
conf.setInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, 5000);
// get a angel client
angelClient = AngelClientFactory.get(conf);
// add matrix
MatrixContext mMatrix = new MatrixContext();
mMatrix.setName("w1");
mMatrix.setRowNum(1);
mMatrix.setColNum(100000);
mMatrix.setMaxRowNumInBlock(1);
mMatrix.setMaxColNumInBlock(50000);
mMatrix.setRowType(RowType.T_INT_DENSE);
mMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
mMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
mMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
mMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_INT");
angelClient.addMatrix(mMatrix);
MatrixContext mMatrix2 = new MatrixContext();
mMatrix2.setName("w2");
mMatrix2.setRowNum(1);
mMatrix2.setColNum(100000);
mMatrix2.setMaxRowNumInBlock(1);
mMatrix2.setMaxColNumInBlock(50000);
mMatrix2.setRowType(RowType.T_DOUBLE_DENSE);
mMatrix2.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
mMatrix2.set(MatrixConf.MATRIX_HOGWILD, "false");
mMatrix2.set(MatrixConf.MATRIX_AVERAGE, "false");
mMatrix2.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_DOUBLE");
angelClient.addMatrix(mMatrix2);
angelClient.startPSServer();
angelClient.run();
Thread.sleep(5000);
group0Id = new WorkerGroupId(0);
worker0Id = new WorkerId(group0Id, 0);
worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
task0Id = new TaskId(0);
task1Id = new TaskId(1);
psId = new ParameterServerId(0);
psAttempt0Id = new PSAttemptId(psId, 0);
} catch (Exception x) {
LOG.error("setup failed ", x);
throw x;
}
}
Aggregations