Search in sources :

Example 21 with TaskId

use of com.tencent.angel.worker.task.TaskId in project angel by Tencent.

the class AMTaskManager method deserialize.

/**
 * Reads the state of all tasks from an input stream.
 *
 * <p>The stream is read as an int task count followed, for each task, by an int that is passed to
 * the {@code TaskId} constructor and then whatever {@code AMTask.deserialize} consumes. Every
 * task read this way is stored in {@code idToTaskMap} under its id. The whole read is performed
 * while holding {@code writeLock}.
 *
 * @param input the input stream
 * @throws IOException if reading from the stream fails
 */
public void deserialize(DataInputStream input) throws IOException {
    // Acquire the lock BEFORE the try block: if lock() itself fails, the finally clause must not
    // run unlock() on a lock this thread never obtained.
    writeLock.lock();
    try {
        int size = input.readInt();
        for (int i = 0; i < size; i++) {
            TaskId taskId = new TaskId(input.readInt());
            // The task is created with a null second constructor argument; the rest of its state
            // is filled in by AMTask.deserialize.
            AMTask task = new AMTask(taskId, null);
            task.deserialize(input);
            LOG.info("task deserialize=" + task);
            idToTaskMap.put(taskId, task);
        }
    } finally {
        writeLock.unlock();
    }
}
Also used : TaskId(com.tencent.angel.worker.task.TaskId)

Example 22 with TaskId

use of com.tencent.angel.worker.task.TaskId in project angel by Tencent.

the class TaskCalPerfChecker method check.

/**
 * Finds the workers that run tasks whose calculation rate is too slow compared with the average.
 *
 * <p>For every task that reports both {@code TOTAL_CALCULATE_SAMPLES} and
 * {@code TOTAL_CALCULATE_TIME_MS}, the samples and time are added to the global totals. Only tasks
 * with more than 5,000,000 samples get an individual rate (calculation time in ms per 10,000
 * samples). A task is considered slow when its rate exceeds
 * {@code averageRate / slowestDiscount}; the worker returned by
 * {@code WorkerManager.getWorker} for that task, if any, is then reported.
 *
 * @param context the application master context providing configuration, task manager and
 *     worker manager
 * @return the ids of the slow workers, without duplicates; empty if none were found
 */
@Override
public List<Id> check(AMContext context) {
    double slowestDiscount = context.getConf().getDouble(AngelConf.ANGEL_AM_TASK_SLOWEST_DISCOUNT, AngelConf.DEFAULT_ANGEL_AM_TASK_SLOWEST_DISCOUNT);
    LOG.info("start to check slow workers use TaskCalPerfChecker policy, slowestDiscount = " + slowestDiscount);
    Set<Id> slowWorkers = new HashSet<Id>();
    AMTaskManager taskManage = context.getTaskManager();
    WorkerManager workerManager = context.getWorkerManager();
    Collection<AMTask> tasks = taskManage.getTasks();
    long totalSamples = 0;
    long totalCalTimeMs = 0;
    double averageRate = 0.0;
    Map<TaskId, Double> taskIdToRateMap = new HashMap<TaskId, Double>(tasks.size());
    for (AMTask task : tasks) {
        // Tasks that have not reported both counters yet are ignored entirely.
        if (task.getMetrics().containsKey(TaskCounter.TOTAL_CALCULATE_SAMPLES) && task.getMetrics().containsKey(TaskCounter.TOTAL_CALCULATE_TIME_MS)) {
            long sampleNum = Long.valueOf(task.getMetrics().get(TaskCounter.TOTAL_CALCULATE_SAMPLES));
            double calTimeMs = Long.valueOf(task.getMetrics().get(TaskCounter.TOTAL_CALCULATE_TIME_MS));
            LOG.info("for task " + task.getTaskId() + ", sampleNum = " + sampleNum + ", calTimeMs = " + calTimeMs);
            totalSamples += sampleNum;
            totalCalTimeMs += calTimeMs;
            // Tasks with few samples still count toward the average but are never rated on their
            // own.
            if (sampleNum > 5000000) {
                double rate = calTimeMs * 10000 / sampleNum;
                LOG.info("task " + task.getTaskId() + " calculate rate = " + rate);
                taskIdToRateMap.put(task.getTaskId(), rate);
            }
        }
    }
    // Guard against division by zero when no task has reported any samples.
    if (totalSamples != 0) {
        averageRate = (double) totalCalTimeMs * 10000 / totalSamples;
    }
    LOG.info("totalSamples = " + totalSamples + ", totalCalTimeMs = " + totalCalTimeMs + ", average calulate time for 10000 samples = " + averageRate + ", the maximum calulate time for 10000 sample = " + averageRate / slowestDiscount);
    for (Map.Entry<TaskId, Double> rateEntry : taskIdToRateMap.entrySet()) {
        // Equivalent to: rate > averageRate / slowestDiscount (the "maximum" logged above).
        if (averageRate < rateEntry.getValue() * slowestDiscount) {
            // The message now states the comparison that actually triggered this branch; it
            // previously claimed the rate was "< averageRate * slowestDiscount".
            LOG.info("task " + rateEntry.getKey() + " rate = " + rateEntry.getValue() + " is > " + averageRate / slowestDiscount);
            AMWorker worker = workerManager.getWorker(rateEntry.getKey());
            if (worker != null) {
                LOG.info("put worker " + worker.getId() + " to slow worker list");
                slowWorkers.add(worker.getId());
            }
        }
    }
    // The set removed duplicates (several slow tasks may map to one worker); return it as a list.
    return new ArrayList<Id>(slowWorkers);
}
Also used : TaskId(com.tencent.angel.worker.task.TaskId) WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) Id(com.tencent.angel.common.Id) TaskId(com.tencent.angel.worker.task.TaskId) AMTask(com.tencent.angel.master.task.AMTask)

Example 23 with TaskId

use of com.tencent.angel.worker.task.TaskId in project angel by Tencent.

the class ServerPartitionTest method testWriteTo.

/**
 * Saves {@code serverPartition} to a local file after updating row 6, loads the file into a
 * freshly created partition and checks that row 6 of both partitions matches afterwards (and did
 * not match before loading).
 *
 * <p>The file streams are closed via try-with-resources and the Angel client is stopped in a
 * {@code finally} block, so a failing step or assertion does not leak open files or leave the
 * client running.
 *
 * @throws Exception if the client, the file I/O or the partition operations fail
 */
@Test
public void testWriteTo() throws Exception {
    // set basic configuration keys
    conf = new Configuration();
    conf.setBoolean("mapred.mapper.new-api", true);
    conf.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
    conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());
    // use local deploy mode and dummy dataspliter
    conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
    conf.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
    conf.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
    conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
    conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
    conf.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
    conf.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 2);
    // get a angel client
    angelClient = AngelClientFactory.get(conf);
    try {
        // add matrix
        MatrixContext mMatrix = new MatrixContext();
        mMatrix.setName("w1");
        mMatrix.setRowNum(1);
        mMatrix.setColNum(100000);
        mMatrix.setMaxRowNumInBlock(1);
        mMatrix.setMaxColNumInBlock(50000);
        mMatrix.setRowType(RowType.T_INT_DENSE);
        mMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        mMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
        mMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
        mMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_INT");
        angelClient.addMatrix(mMatrix);
        angelClient.startPSServer();
        angelClient.runTask(DummyTask.class);
        // NOTE(review): presumably gives the local cluster time to come up - confirm.
        Thread.sleep(5000);
        group0Id = new WorkerGroupId(0);
        worker0Id = new WorkerId(group0Id, 0);
        worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
        task0Id = new TaskId(0);
        task1Id = new TaskId(1);
        psId = new ParameterServerId(0);
        psAttempt0Id = new PSAttemptId(psId, 0);
        // Update payload: eight doubles applied to row 6.
        // NOTE(review): the buffer is created with capacity 16 but 64 bytes are written; this
        // assumes the buffer grows on write - confirm against the Netty version in use.
        ByteBuf buf = Unpooled.buffer(16);
        buf.writeDouble(0.00);
        buf.writeDouble(1.00);
        buf.writeDouble(-1.00);
        buf.writeDouble(-2.00);
        buf.writeDouble(-5.00);
        buf.writeDouble(-6.00);
        buf.writeDouble(-7.00);
        buf.writeDouble(-8.00);
        // Write the updated partition to the file; the stream is closed even if save() throws.
        try (DataOutputStream out = new DataOutputStream(new FileOutputStream("data"))) {
            serverPartition.getRow(6).update(RowType.T_DOUBLE_DENSE, buf, 8);
            serverPartition.save(out);
        }
        PartitionKey partitionKeyNew = new PartitionKey(2, 1, 1, 2, 8, 10);
        ServerPartition serverPartitionNew = new ServerPartition(partitionKeyNew, RowType.T_DOUBLE_DENSE);
        serverPartitionNew.init();
        // Before loading, the new partition must not yet hold the saved row data.
        assertNotEquals(((ServerDenseDoubleRow) serverPartition.getRow(6)).getData(), ((ServerDenseDoubleRow) serverPartitionNew.getRow(6)).getData());
        // Read the file back into the new partition; the stream is closed even if load() throws.
        try (DataInputStream in = new DataInputStream(new FileInputStream("data"))) {
            serverPartitionNew.load(in);
        }
        assertEquals(((ServerDenseDoubleRow) serverPartition.getRow(6)).getData(), ((ServerDenseDoubleRow) serverPartitionNew.getRow(6)).getData());
    } finally {
        // Always stop the client, also when a step above or an assertion fails.
        angelClient.stop();
    }
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) DummyTask(com.tencent.angel.master.DummyTask) TaskId(com.tencent.angel.worker.task.TaskId) Configuration(org.apache.hadoop.conf.Configuration) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) WorkerId(com.tencent.angel.worker.WorkerId) ByteBuf(io.netty.buffer.ByteBuf) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) MatrixContext(com.tencent.angel.ml.matrix.MatrixContext) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PartitionKey(com.tencent.angel.PartitionKey) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Test(org.junit.Test)

Example 24 with TaskId

use of com.tencent.angel.worker.task.TaskId in project angel by Tencent.

the class AlgoMetricsTest method setup.

/**
 * Prepares the test fixture: builds a local-mode configuration, creates the Angel client,
 * registers matrix "w1", starts the parameter server, runs {@code MetricTestTask} and creates the
 * two task ids used by the tests.
 *
 * @throws Exception any failure during setup, logged before being rethrown
 */
@Before
public void setup() throws Exception {
    try {
        Configuration configuration = new Configuration();

        // General job switches; local deploy mode with the dummy data splitter.
        configuration.setBoolean("mapred.mapper.new-api", true);
        configuration.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
        configuration.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
        configuration.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
        configuration.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());

        // Model, training-data and log locations under the temporary path.
        configuration.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
        configuration.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
        configuration.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");

        // One worker group, one parameter server, two tasks per worker.
        configuration.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
        configuration.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
        configuration.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 2);

        angelClient = AngelClientFactory.get(configuration);

        // Matrix "w1": 1 x 100000, split into blocks of at most 1 x 50000.
        MatrixContext matrix = new MatrixContext();
        matrix.setName("w1");
        matrix.setRowNum(1);
        matrix.setColNum(100000);
        matrix.setMaxRowNumInBlock(1);
        matrix.setMaxColNumInBlock(50000);
        matrix.setRowType(RowType.T_INT_DENSE);
        matrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        matrix.set(MatrixConf.MATRIX_HOGWILD, "true");
        matrix.set(MatrixConf.MATRIX_AVERAGE, "false");
        matrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_INT");
        angelClient.addMatrix(matrix);

        // Start the parameter server and the task, then pause five seconds before continuing.
        angelClient.startPSServer();
        angelClient.runTask(MetricTestTask.class);
        Thread.sleep(5000);

        task0Id = new TaskId(0);
        task1Id = new TaskId(1);
    } catch (Exception failure) {
        LOG.error("setup failed ", failure);
        throw failure;
    }
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) MatrixContext(com.tencent.angel.ml.matrix.MatrixContext) TaskId(com.tencent.angel.worker.task.TaskId) Configuration(org.apache.hadoop.conf.Configuration) Before(org.junit.Before)

Example 25 with TaskId

use of com.tencent.angel.worker.task.TaskId in project angel by Tencent.

the class PeriodHATest method setup.

/**
 * Prepares the test fixture: builds a local-mode configuration with two parameter servers and a
 * replication number of 2, creates the Angel client, registers matrix "w1", starts the parameter
 * servers, runs the application and creates the task id used by the tests.
 *
 * @throws Exception any failure during setup, logged before being rethrown
 */
@Before
public void setup() throws Exception {
    try {
        Configuration configuration = new Configuration();

        // General job switches; local deploy mode with the dummy data splitter.
        configuration.setBoolean("mapred.mapper.new-api", true);
        configuration.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
        configuration.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
        configuration.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());
        configuration.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
        configuration.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());

        // Model, training-data and log locations under the temporary path.
        configuration.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
        configuration.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
        configuration.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");

        // One worker group with one task per worker, two parameter servers.
        configuration.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
        configuration.setInt(AngelConf.ANGEL_PS_NUMBER, 2);
        configuration.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1);

        // Parameter-server HA: replication number 2, push interval 1000 ms.
        configuration.setInt(AngelConf.ANGEL_PS_HA_REPLICATION_NUMBER, 2);
        configuration.setInt(AngelConf.ANGEL_PS_HA_PUSH_INTERVAL_MS, 1000);
        // The following two options are left disabled:
        // configuration.setBoolean(AngelConf.ANGEL_PS_HA_USE_EVENT_PUSH, true);
        // configuration.setBoolean(AngelConf.ANGEL_PS_HA_PUSH_SYNC, true);

        angelClient = AngelClientFactory.get(configuration);

        // Matrix "w1": 1 x dim, split into blocks of at most 1 x (dim / 2).
        MatrixContext matrix = new MatrixContext();
        matrix.setName("w1");
        matrix.setRowNum(1);
        matrix.setColNum(dim);
        matrix.setMaxRowNumInBlock(1);
        matrix.setMaxColNumInBlock(dim / 2);
        matrix.setRowType(RowType.T_INT_DENSE);
        matrix.set(MatrixConf.MATRIX_HOGWILD, "true");
        matrix.set(MatrixConf.MATRIX_AVERAGE, "false");
        angelClient.addMatrix(matrix);

        // Start the parameter servers and the application, then pause five seconds.
        angelClient.startPSServer();
        angelClient.run();
        Thread.sleep(5000);

        task0Id = new TaskId(0);
    } catch (Exception failure) {
        LOG.error("setup failed ", failure);
        throw failure;
    }
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) MatrixContext(com.tencent.angel.ml.matrix.MatrixContext) TaskId(com.tencent.angel.worker.task.TaskId) Configuration(org.apache.hadoop.conf.Configuration) Before(org.junit.Before)

Aggregations

TaskId (com.tencent.angel.worker.task.TaskId): 25; Configuration (org.apache.hadoop.conf.Configuration): 15; MatrixContext (com.tencent.angel.ml.matrix.MatrixContext): 14; CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat): 14; WorkerGroupId (com.tencent.angel.worker.WorkerGroupId): 10; WorkerId (com.tencent.angel.worker.WorkerId): 10; PSAttemptId (com.tencent.angel.ps.PSAttemptId): 9; ParameterServerId (com.tencent.angel.ps.ParameterServerId): 9; WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId): 9; Before (org.junit.Before): 8; AMTask (com.tencent.angel.master.task.AMTask): 5; BeforeClass (org.junit.BeforeClass): 4; AngelException (com.tencent.angel.exception.AngelException): 3; DummyTask (com.tencent.angel.master.DummyTask): 3; IOException (java.io.IOException): 3; WorkerAttempt (com.tencent.angel.master.worker.attempt.WorkerAttempt): 2; AMWorker (com.tencent.angel.master.worker.worker.AMWorker): 2; ServiceException (com.google.protobuf.ServiceException): 1; PartitionKey (com.tencent.angel.PartitionKey): 1; Id (com.tencent.angel.common.Id): 1