Search in sources :

Example 1 with InternalErrorEvent

use of com.tencent.angel.master.app.InternalErrorEvent in project angel by Tencent.

the class YarnContainerAllocator method startEventHandlerThread.

/**
 * Starts the background thread that takes events from {@code eventQueue} and passes each one to
 * {@code handleEvent}.
 *
 * <p>The loop runs until {@code stopped} is set or the thread is interrupted. An interrupt received
 * while {@code stopped} is still false, or any {@link Throwable} thrown by {@code handleEvent}, is
 * logged as fatal and reported to the application event handler as an {@link InternalErrorEvent};
 * in both cases the thread then exits.
 */
private void startEventHandlerThread() {
    this.eventHandlingThread = new Thread() {

        @SuppressWarnings("unchecked")
        @Override
        public void run() {
            YarnContainerAllocatorEvent event;
            while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
                try {
                    event = (YarnContainerAllocatorEvent) eventQueue.take();
                } catch (InterruptedException e) {
                    // When already stopped, the interrupt is not reported; the thread just exits.
                    if (!stopped.get()) {
                        LOG.fatal("yarn allocator event handler is interrupted. ", e);
                        context.getEventHandler().handle(new InternalErrorEvent(context.getApplicationId(), "yarn allocator event handler is interrupted. " + e.getMessage()));
                    }
                    return;
                }
                try {
                    handleEvent(event);
                } catch (Throwable t) {
                    // Build the message once so the log entry and the reported diagnostic agree.
                    String errorMsg = "Error in handling event type " + event.getType() + " to the ContainerAllocator";
                    LOG.fatal(errorMsg, t);
                    // Separate the description from the cause text; they were previously run together.
                    context.getEventHandler().handle(new InternalErrorEvent(context.getApplicationId(), errorMsg + ": " + t.getMessage()));
                    return;
                }
            }
        }
    };
    this.eventHandlingThread.start();
}
Also used : InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent)

Example 2 with InternalErrorEvent

use of com.tencent.angel.master.app.InternalErrorEvent in project angel by Tencent.

the class YarnContainerLauncher method serviceStart.

/**
 * Creates the container-launcher thread pool and starts the thread that dispatches launcher events
 * to it, then delegates to {@code super.serviceStart()}.
 *
 * <p>The dispatcher thread takes events from {@code eventQueue}, records the container-manager
 * address of every event it has seen, grows the pool's core size towards {@code limitOnPoolSize}
 * as the number of distinct addresses increases, and submits one processor task per event.
 *
 * @throws Exception if {@code super.serviceStart()} fails
 */
protected void serviceStart() throws Exception {
    ThreadFactory tf = new ThreadFactoryBuilder().setNameFormat("ContainerLauncher #%d").setDaemon(true).build();
    // start a thread pool to startup the container
    launcherPool = new ThreadPoolExecutor(INITIAL_POOL_SIZE, Integer.MAX_VALUE, 1, TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>(), tf);
    eventHandlingThread = new Thread() {

        @SuppressWarnings("unchecked")
        @Override
        public void run() {
            YarnContainerLauncherEvent event = null;
            // Distinct container-manager addresses seen so far; drives the pool sizing below.
            Set<String> allNodes = new HashSet<String>();
            while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
                try {
                    event = (YarnContainerLauncherEvent) eventQueue.take();
                } catch (InterruptedException e) {
                    // When already stopped, the interrupt is not reported; the thread just exits.
                    if (!stopped.get()) {
                        // Pass the exception as the throwable so its stack trace is logged.
                        LOG.fatal("yarn container launch event handler is interrupted. ", e);
                        context.getEventHandler().handle(new InternalErrorEvent(context.getApplicationId(), "yarn container launch event handler is interrupted. " + e.getMessage()));
                    }
                    return;
                }
                allNodes.add(event.getContainerMgrAddress());
                int poolSize = launcherPool.getCorePoolSize();
                // See if we need to bump up the pool size; only do so if we haven't reached the
                // maximum limit yet.
                if (poolSize != limitOnPoolSize) {
                    // Number of nodes where containers will run at *this* point of time. This is
                    // *not* the cluster size and doesn't need to be.
                    int numNodes = allNodes.size();
                    int idealPoolSize = Math.min(limitOnPoolSize, numNodes);
                    if (poolSize < idealPoolSize) {
                        // Bump up the pool size to idealPoolSize+INITIAL_POOL_SIZE, the
                        // later is just a buffer so we are not always increasing the
                        // pool-size
                        int newPoolSize = Math.min(limitOnPoolSize, idealPoolSize + INITIAL_POOL_SIZE);
                        LOG.info("Setting ContainerLauncher pool size to " + newPoolSize + " as number-of-nodes to talk to is " + numNodes);
                        launcherPool.setCorePoolSize(newPoolSize);
                    }
                }
                // the events from the queue are handled in parallel
                // using a thread pool
                launcherPool.execute(createEventProcessor(event));
            }
        }
    };
    eventHandlingThread.setName("ContainerLauncher Event Handler");
    eventHandlingThread.start();
    super.serviceStart();
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder)

Example 3 with InternalErrorEvent

use of com.tencent.angel.master.app.InternalErrorEvent in project angel by Tencent.

the class ParameterServerManager method psFailed.

/**
 * Reports a parameter-server failure to the application event handler.
 *
 * <p>The diagnostics of the parameter server named by the event are joined with
 * {@code StringUtils.join("\n", ...)} and sent as the message of an {@link InternalErrorEvent}.
 *
 * @param event the manager event; only its {@code getPsId()} value is read here
 */
@SuppressWarnings("unchecked")
private void psFailed(ParameterServerManagerEvent event) {
    List<String> psDiagnostics = context.getParameterServerManager().getParameterServer(event.getPsId()).getDiagnostics();
    // String.valueOf yields the same text the StringBuilder round-trip produced, including
    // the literal "null" should the join ever return null.
    String errorMessage = String.valueOf(StringUtils.join("\n", psDiagnostics));
    InternalErrorEvent errorEvent = new InternalErrorEvent(context.getApplicationId(), errorMessage);
    context.getEventHandler().handle(errorEvent);
}
Also used : InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent)

Example 4 with InternalErrorEvent

use of com.tencent.angel.master.app.InternalErrorEvent in project angel by Tencent.

the class AppTest method testGetJobReport.

/**
 * Checks the job report served by the master before and after an internal error is injected.
 *
 * <p>Two task iterations are reported first; the job report is then expected to be in state
 * {@code J_RUNNING} with the smaller of the two iterations. After an {@link InternalErrorEvent}
 * with the message {@code "failed"} is handled, the report is expected to be {@code J_FAILED}
 * with the same iteration and that message as diagnostics, both shortly after the error and
 * again after a further wait.
 *
 * @throws Exception if any call fails; the failure is logged and rethrown
 */
@SuppressWarnings("unchecked")
@Test
public void testGetJobReport() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(angelAppMaster.getConfig());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int task0Iteration = 2;
        int task1Iteration = 1;
        // The test expects the job iteration to equal the smallest task iteration.
        int jobIteration = Math.min(task0Iteration, task1Iteration);
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        Thread.sleep(1000);
        GetJobReportRequest request = GetJobReportRequest.newBuilder().setAppId(LocalClusterContext.get().getAppId().toString()).build();
        GetJobReportResponse response = master.getJobReport(null, request);
        // assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
        assertEquals(JobStateProto.J_RUNNING, response.getJobReport().getJobState());
        assertEquals(jobIteration, response.getJobReport().getCurIteration());
        // Inject an internal error and give the master time to react.
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed"));
        Thread.sleep(5000);
        response = master.getJobReport(null, request);
        assertEquals(JobStateProto.J_FAILED, response.getJobReport().getJobState());
        assertEquals(jobIteration, response.getJobReport().getCurIteration());
        assertEquals("failed", response.getJobReport().getDiagnostics());
        Thread.sleep(10000);
        try {
            response = master.getJobReport(null, request);
        } catch (Exception x) {
            // If the master call fails at this point, use the report from tryGetResponseFromFile instead.
            response = tryGetResponseFromFile(true);
        }
        assertEquals(JobStateProto.J_FAILED, response.getJobReport().getJobState());
        assertEquals(jobIteration, response.getJobReport().getCurIteration());
        assertEquals("failed", response.getJobReport().getDiagnostics());
    } catch (Exception x) {
        LOG.error("run testGetJobReport failed ", x);
        throw x;
    }
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) GetJobReportRequest(com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportRequest) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location) GetJobReportResponse(com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportResponse) Test(org.junit.Test)

Example 5 with InternalErrorEvent

use of com.tencent.angel.master.app.InternalErrorEvent in project angel by Tencent.

the class MasterRecoverTest method testMasterRecover.

/**
 * Checks that task iterations and matrix clocks reported to the master are still visible after
 * the master moves to a second application attempt, and that a second error fails the app.
 *
 * <p>The test reports iterations and clocks for two tasks, waits two state-write intervals, then
 * injects an {@link InternalErrorEvent}. It expects application attempt 2 in state
 * {@code RUNNING}, with the worker's task contexts, matrix metadata and the parameter server
 * showing the previously reported values. A second injected error is expected to leave the
 * attempt id at 2 and the application state at {@code FAILED}.
 *
 * @throws Exception if any call fails; the failure is logged and rethrown
 */
@SuppressWarnings("unchecked")
@Test
public void testMasterRecover() throws Exception {
    try {
        ApplicationAttemptId appAttempt1Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 1);
        ApplicationAttemptId appAttempt2Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 2);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        // assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
        assertEquals(appAttempt1Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // The per-matrix clock checked on the parameter server is the smaller of the two task clocks.
        int w1Clock = Math.min(task0w1Clock, task1w1Clock);
        int w2Clock = Math.min(task0w2Clock, task1w2Clock);
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
        // Wait two write intervals before injecting the failure.
        int writeIntervalMS = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_AM_WRITE_STATE_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_AM_WRITE_STATE_INTERVAL_MS);
        Thread.sleep(writeIntervalMS * 2);
        // NOTE(review): the third constructor argument presumably marks the error as retryable -
        // confirm against InternalErrorEvent.
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(15000);
        // Re-read the master: the test expects a second attempt to be running now.
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(AppState.RUNNING, angelAppMaster.getAppContext().getApp().getExternAppState());
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        PartitionKey w1Part0Key = new PartitionKey(0, w1Id, 0, 0, 1, 50000);
        PartitionKey w1Part1Key = new PartitionKey(1, w1Id, 0, 50000, 1, 100000);
        PartitionKey w2Part0Key = new PartitionKey(0, w2Id, 0, 0, 1, 50000);
        PartitionKey w2Part1Key = new PartitionKey(1, w2Id, 0, 50000, 1, 100000);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        LOG.info("worker=" + worker);
        LOG.info("worker.getTaskManager()=" + worker.getTaskManager());
        LOG.info("worker.getTaskManager().getRunningTask()=" + worker.getTaskManager().getRunningTask().size());
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext();
        // The worker's task contexts must show the iterations and clocks reported earlier.
        assertEquals(task0Iteration, task0Context.getEpoch());
        assertEquals(task1Iteration, task1Context.getEpoch());
        assertEquals(task0w1Clock, task0Context.getMatrixClock(w1Id));
        assertEquals(task0w2Clock, task0Context.getMatrixClock(w2Id));
        assertEquals(task1w1Clock, task1Context.getMatrixClock(w1Id));
        assertEquals(task1w2Clock, task1Context.getMatrixClock(w2Id));
        LOG.info("===============worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size()=" + worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size());
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w1Id));
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w2Id));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part1Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part1Key).get(0));
        ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
        // A second error: the test expects the attempt id to stay at 2 and the app to end up FAILED.
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(15000);
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(AppState.FAILED, angelAppMaster.getAppContext().getApp().getExternAppState());
    } catch (Exception x) {
        LOG.error("run testMasterRecover failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) TConnection(com.tencent.angel.ipc.TConnection) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) PartitionKey(com.tencent.angel.PartitionKey) Worker(com.tencent.angel.worker.Worker) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Aggregations

InternalErrorEvent (com.tencent.angel.master.app.InternalErrorEvent): 6, Location (com.tencent.angel.common.location.Location): 2, TConnection (com.tencent.angel.ipc.TConnection): 2, Test (org.junit.Test): 2, ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 1, PartitionKey (com.tencent.angel.PartitionKey): 1, ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager): 1, GetJobReportRequest (com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportRequest): 1, GetJobReportResponse (com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportResponse): 1, ParameterServer (com.tencent.angel.ps.impl.ParameterServer): 1, Worker (com.tencent.angel.worker.Worker): 1, TaskContext (com.tencent.angel.worker.task.TaskContext): 1, IOException (java.io.IOException): 1, HashSet (java.util.HashSet): 1, Set (java.util.Set): 1, ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 1