Search in sources :

Example 36 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class PSAgentTest method testLocationCache.

@Test
public void testLocationCache() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        assertTrue(taskManager != null);
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        PSAgent psAgent = worker.getPSAgent();
        assertTrue(psAgent != null);
        PSAgentLocationManager locationCache = psAgent.getLocationManager();
        assertTrue(locationCache != null);
        // test master location
        Location masterLoc = locationCache.getMasterLocation();
        String ipRegex = "(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})";
        Pattern pattern = Pattern.compile(ipRegex);
        Matcher matcher = pattern.matcher(masterLoc.getIp());
        assertTrue(matcher.matches());
        assertTrue(masterLoc.getPort() >= 1 && masterLoc.getPort() <= 65535);
        // test ps location
        Location psLoc = locationCache.getPsLocation(psId);
        matcher = pattern.matcher(psLoc.getIp());
        assertTrue(matcher.matches());
        assertTrue(psLoc.getPort() >= 1 && psLoc.getPort() <= 65535);
        // assertEquals(psLoc, locationCache.updateAndGetPSLocation(psId));
        // test all ps ids
        ParameterServerId[] allPSIds = locationCache.getPsIds();
        assertEquals(allPSIds.length, 1);
        assertEquals(allPSIds[0], psId);
    } catch (Exception x) {
        LOG.error("run testLocationCache failed ", x);
        throw x;
    }
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) PSAgentLocationManager(com.tencent.angel.psagent.matrix.PSAgentLocationManager) Worker(com.tencent.angel.worker.Worker) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 37 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class IncrementRowHashTest method setup.

@Before
public void setup() throws Exception {
    // set basic configuration keys
    Configuration conf = new Configuration();
    conf.setBoolean("mapred.mapper.new-api", true);
    conf.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
    conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());
    // use local deploy mode and dummy dataspliter
    conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
    conf.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
    conf.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
    conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
    conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
    conf.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
    conf.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_MODEL_PARTITIONER_PARTITION_SIZE, 1000);
    conf.setInt(AngelConf.ANGEL_PSAGENT_CACHE_SYNC_TIMEINTERVAL_MS, 10);
    conf.setInt(AngelConf.ANGEL_WORKER_HEARTBEAT_INTERVAL_MS, 1000);
    conf.setInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, 1000);
    conf.setBoolean("use.new.split", true);
    conf.setInt(AngelConf.ANGEL_WORKER_MAX_ATTEMPTS, 1);
    conf.setInt(AngelConf.ANGEL_PS_MAX_ATTEMPTS, 1);
    // get a angel client
    angelClient = AngelClientFactory.get(conf);
    // add sparse double matrix
    MatrixContext sMat = new MatrixContext();
    sMat.setName(SPARSE_DOUBLE_MAT);
    sMat.setRowNum(1);
    sMat.setPartitionNum(partNum);
    sMat.setPartitionerClass(HashPartitioner.class);
    sMat.setRowType(RowType.T_DOUBLE_SPARSE);
    angelClient.addMatrix(sMat);
    // add sparse float matrix
    MatrixContext sfMat = new MatrixContext();
    sfMat.setName(SPARSE_FLOAT_MAT);
    sfMat.setRowNum(1);
    sfMat.setPartitionNum(partNum);
    sfMat.setPartitionerClass(HashPartitioner.class);
    sfMat.setRowType(RowType.T_FLOAT_SPARSE);
    angelClient.addMatrix(sfMat);
    // add sparse float matrix
    MatrixContext siMat = new MatrixContext();
    siMat.setName(SPARSE_INT_MAT);
    siMat.setRowNum(1);
    siMat.setPartitionNum(partNum);
    siMat.setPartitionerClass(HashPartitioner.class);
    siMat.setRowType(RowType.T_INT_SPARSE);
    angelClient.addMatrix(siMat);
    // add sparse long matrix
    MatrixContext slMat = new MatrixContext();
    slMat.setName(SPARSE_LONG_MAT);
    slMat.setRowNum(1);
    slMat.setPartitionNum(partNum);
    slMat.setPartitionerClass(HashPartitioner.class);
    slMat.setRowType(RowType.T_LONG_SPARSE);
    angelClient.addMatrix(slMat);
    // add sparse long-key double matrix
    MatrixContext dLongKeysMatrix = new MatrixContext();
    dLongKeysMatrix.setName(SPARSE_DOUBLE_LONG_MAT);
    dLongKeysMatrix.setRowNum(1);
    dLongKeysMatrix.setPartitionNum(partNum);
    dLongKeysMatrix.setPartitionerClass(HashPartitioner.class);
    dLongKeysMatrix.setRowType(RowType.T_DOUBLE_SPARSE_LONGKEY);
    angelClient.addMatrix(dLongKeysMatrix);
    // add sparse long-key float matrix
    MatrixContext slfMatrix = new MatrixContext();
    slfMatrix.setName(SPARSE_FLOAT_LONG_MAT);
    slfMatrix.setRowNum(1);
    slfMatrix.setPartitionNum(partNum);
    slfMatrix.setPartitionerClass(HashPartitioner.class);
    slfMatrix.setRowType(RowType.T_FLOAT_SPARSE_LONGKEY);
    angelClient.addMatrix(slfMatrix);
    // add sparse long-key int matrix
    MatrixContext sliMatrix = new MatrixContext();
    sliMatrix.setName(SPARSE_INT_LONG_MAT);
    sliMatrix.setRowNum(1);
    sliMatrix.setPartitionNum(partNum);
    sliMatrix.setPartitionerClass(HashPartitioner.class);
    sliMatrix.setRowType(RowType.T_INT_SPARSE_LONGKEY);
    angelClient.addMatrix(sliMatrix);
    // add sparse long-key long matrix
    MatrixContext sllMatrix = new MatrixContext();
    sllMatrix.setName(SPARSE_LONG_LONG_MAT);
    sllMatrix.setRowNum(1);
    sllMatrix.setPartitionNum(partNum);
    sllMatrix.setPartitionerClass(HashPartitioner.class);
    sllMatrix.setRowType(RowType.T_LONG_SPARSE_LONGKEY);
    angelClient.addMatrix(sllMatrix);
    // Start PS
    angelClient.startPSServer();
    // Start to run application
    angelClient.run();
    Thread.sleep(5000);
    psId = new ParameterServerId(0);
    psAttempt0Id = new PSAttemptId(psId, 0);
    WorkerGroupId workerGroupId = new WorkerGroupId(0);
    workerId = new WorkerId(workerGroupId, 0);
    workerAttempt0Id = new WorkerAttemptId(workerId, 0);
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) MatrixContext(com.tencent.angel.ml.matrix.MatrixContext) Configuration(org.apache.hadoop.conf.Configuration) PSAttemptId(com.tencent.angel.ps.PSAttemptId) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ParameterServerId(com.tencent.angel.ps.ParameterServerId) WorkerId(com.tencent.angel.worker.WorkerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) Before(org.junit.Before)

Example 38 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class IndexGetRowHashTest method setup.

@Before
public void setup() throws Exception {
    // set basic configuration keys
    Configuration conf = new Configuration();
    conf.setBoolean("mapred.mapper.new-api", true);
    conf.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
    conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());
    // use local deploy mode and dummy dataspliter
    conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
    conf.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
    conf.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
    conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
    conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
    conf.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
    conf.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_MODEL_PARTITIONER_PARTITION_SIZE, 100000);
    conf.setInt(AngelConf.ANGEL_PSAGENT_CACHE_SYNC_TIMEINTERVAL_MS, 10);
    conf.setInt(AngelConf.ANGEL_WORKER_HEARTBEAT_INTERVAL_MS, 1000);
    conf.setInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, 1000);
    conf.setInt(AngelConf.ANGEL_WORKER_MAX_ATTEMPTS, 1);
    conf.setInt(AngelConf.ANGEL_PS_MAX_ATTEMPTS, 1);
    // get a angel client
    angelClient = AngelClientFactory.get(conf);
    // add sparse double matrix
    MatrixContext sMat = new MatrixContext();
    sMat.setName(SPARSE_DOUBLE_MAT);
    sMat.setRowNum(1);
    sMat.setRowType(RowType.T_DOUBLE_SPARSE);
    sMat.setPartitionNum(partNum);
    sMat.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(sMat);
    // add sparse float matrix
    MatrixContext sfMat = new MatrixContext();
    sfMat.setName(SPARSE_FLOAT_MAT);
    sfMat.setRowNum(1);
    sfMat.setRowType(RowType.T_FLOAT_SPARSE);
    sfMat.setValidIndexNum(modelSize);
    sfMat.setPartitionNum(partNum);
    sfMat.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(sfMat);
    // add sparse float matrix
    MatrixContext siMat = new MatrixContext();
    siMat.setName(SPARSE_INT_MAT);
    siMat.setRowNum(1);
    siMat.setRowType(RowType.T_INT_SPARSE);
    siMat.setValidIndexNum(modelSize);
    siMat.setPartitionNum(partNum);
    siMat.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(siMat);
    // add sparse long matrix
    MatrixContext slMat = new MatrixContext();
    slMat.setName(SPARSE_LONG_MAT);
    slMat.setRowNum(1);
    slMat.setRowType(RowType.T_LONG_SPARSE);
    slMat.setValidIndexNum(modelSize);
    slMat.setPartitionNum(partNum);
    slMat.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(slMat);
    // add sparse long-key float matrix
    MatrixContext sldMatrix = new MatrixContext();
    sldMatrix.setName(SPARSE_DOUBLE_LONG_MAT);
    sldMatrix.setRowNum(1);
    sldMatrix.setRowType(RowType.T_DOUBLE_SPARSE_LONGKEY);
    sldMatrix.setValidIndexNum(modelSize);
    sldMatrix.setPartitionNum(partNum);
    sldMatrix.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(sldMatrix);
    // add sparse long-key float matrix
    MatrixContext slfMatrix = new MatrixContext();
    slfMatrix.setName(SPARSE_FLOAT_LONG_MAT);
    slfMatrix.setRowNum(1);
    slfMatrix.setRowType(RowType.T_FLOAT_SPARSE_LONGKEY);
    slfMatrix.setValidIndexNum(modelSize);
    slfMatrix.setPartitionNum(partNum);
    slfMatrix.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(slfMatrix);
    // add sparse long-key int matrix
    MatrixContext sliMatrix = new MatrixContext();
    sliMatrix.setName(SPARSE_INT_LONG_MAT);
    sliMatrix.setRowNum(1);
    sliMatrix.setRowType(RowType.T_INT_SPARSE_LONGKEY);
    sliMatrix.setValidIndexNum(modelSize);
    sliMatrix.setPartitionNum(partNum);
    sliMatrix.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(sliMatrix);
    // add sparse long-key long matrix
    MatrixContext sllMatrix = new MatrixContext();
    sllMatrix.setName(SPARSE_LONG_LONG_MAT);
    sllMatrix.setRowNum(1);
    sllMatrix.setRowType(RowType.T_LONG_SPARSE_LONGKEY);
    sllMatrix.setValidIndexNum(modelSize);
    sllMatrix.setPartitionNum(partNum);
    sllMatrix.setPartitionerClass(HashPartitioner.class);
    angelClient.addMatrix(sllMatrix);
    // Start PS
    angelClient.startPSServer();
    // Start to run application
    angelClient.run();
    Thread.sleep(5000);
    psId = new ParameterServerId(0);
    psAttempt0Id = new PSAttemptId(psId, 0);
    WorkerGroupId workerGroupId = new WorkerGroupId(0);
    workerId = new WorkerId(workerGroupId, 0);
    workerAttempt0Id = new WorkerAttemptId(workerId, 0);
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) MatrixContext(com.tencent.angel.ml.matrix.MatrixContext) Configuration(org.apache.hadoop.conf.Configuration) PSAttemptId(com.tencent.angel.ps.PSAttemptId) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ParameterServerId(com.tencent.angel.ps.ParameterServerId) WorkerId(com.tencent.angel.worker.WorkerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) Before(org.junit.Before)

Example 39 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class MasterService method getAllPSLocation.

/**
 * get all parameter server locations.
 *
 * @param controller rpc controller of protobuf
 */
@Override
public GetAllPSLocationResponse getAllPSLocation(RpcController controller, GetAllPSLocationRequest request) {
    GetAllPSLocationResponse.Builder resBuilder = GetAllPSLocationResponse.newBuilder();
    LocationManager locationManager = context.getLocationManager();
    ParameterServerId[] psIds = locationManager.getPsIds();
    for (int i = 0; i < psIds.length; i++) {
        resBuilder.addPsLocations(ProtobufUtil.convertToPSLocProto(psIds[i], locationManager.getPsLocation(psIds[i])));
    }
    return resBuilder.build();
}
Also used : LocationManager(com.tencent.angel.common.location.LocationManager) ParameterServerId(com.tencent.angel.ps.ParameterServerId) GetAllPSLocationResponse(com.tencent.angel.protobuf.generated.MLProtos.GetAllPSLocationResponse)

Example 40 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class MasterService method killPS.

@Override
public KillPSResponse killPS(RpcController controller, KillPSRequest request) throws ServiceException {
    PSAttemptId psAttemptId = new PSAttemptId(new ParameterServerId(request.getPsId()), request.getAttemptIndex());
    // LOG.info("error happened in psAttempt " + psAttemptId + " error msg=" + request.getMsg());
    // remove this parameter server attempt from monitor set
    context.getParameterServerManager().unRegister(psAttemptId);
    context.getEventHandler().handle(new PSAttemptDiagnosticsUpdateEvent("kill by client", psAttemptId));
    context.getEventHandler().handle(new PSAttemptEvent(PSAttemptEventType.PA_FAILMSG, psAttemptId));
    return KillPSResponse.getDefaultInstance();
}
Also used : PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAttemptDiagnosticsUpdateEvent(com.tencent.angel.master.ps.attempt.PSAttemptDiagnosticsUpdateEvent) ParameterServerId(com.tencent.angel.ps.ParameterServerId) PSAttemptEvent(com.tencent.angel.master.ps.attempt.PSAttemptEvent)

Aggregations

ParameterServerId (com.tencent.angel.ps.ParameterServerId)65 PSAttemptId (com.tencent.angel.ps.PSAttemptId)33 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)28 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)28 WorkerId (com.tencent.angel.worker.WorkerId)28 Configuration (org.apache.hadoop.conf.Configuration)28 MatrixContext (com.tencent.angel.ml.matrix.MatrixContext)27 CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat)27 Before (org.junit.Before)23 TaskId (com.tencent.angel.worker.task.TaskId)9 PSLocation (com.tencent.angel.ps.server.data.PSLocation)6 HashMap (java.util.HashMap)6 Location (com.tencent.angel.common.location.Location)5 MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)5 PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)5 ArrayList (java.util.ArrayList)5 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)5 Path (org.apache.hadoop.fs.Path)5 Test (org.junit.Test)5 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)4