Search in sources :

Example 16 with SplitLogTask

use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.

In class ZKSplitLogManagerCoordination, method removeStaleRecoveringRegions.

/**
 * ZooKeeper implementation of
 * {@link SplitLogManagerCoordination#removeStaleRecoveringRegions(Set)}.
 * <p>
 * Phase one walks the children of the split-log znode. Every task, except those whose data
 * parses to a task that is already done, contributes the server name derived from its file name
 * to {@code knownFailedServers}; the set is modified in place. Phase two walks the children of
 * the recovering-regions znode and deletes each region node that either has no children or has
 * no child contained in {@code knownFailedServers}.
 *
 * @param knownFailedServers server names considered failed; extended by this method
 * @throws InterruptedIOException if reading the data of a task znode is interrupted
 * @throws IOException if a ZooKeeper call fails with a {@link KeeperException}
 */
@Override
public void removeStaleRecoveringRegions(final Set<String> knownFailedServers) throws IOException, InterruptedIOException {
    try {
        List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode);
        if (tasks != null) {
            for (String task : tasks) {
                byte[] data;
                try {
                    data = ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, task));
                } catch (InterruptedException e) {
                    // Keep the original interruption as the cause instead of discarding it.
                    InterruptedIOException iioe = new InterruptedIOException("Interrupted while reading data of znode " + task);
                    iioe.initCause(e);
                    throw iioe;
                }
                if (data != null) {
                    SplitLogTask slt = null;
                    try {
                        slt = SplitLogTask.parseFrom(data);
                    } catch (DeserializationException e) {
                        // An unparsable task is not skipped: it falls through and its server is
                        // still recorded below.
                        LOG.warn("Failed parse data for znode " + task, e);
                    }
                    if (slt != null && slt.isDone()) {
                        continue;
                    }
                }
                // decode the file name
                String fileName = ZKSplitLog.getFileName(task);
                ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(new Path(fileName));
                if (serverName != null) {
                    knownFailedServers.add(serverName.getServerName());
                } else {
                    LOG.warn("Found invalid WAL log file name:" + fileName);
                }
            }
        }
        // remove recovering regions which don't have any known failed RS associated with them
        List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode);
        if (regions != null) {
            for (String region : regions) {
                String nodePath = ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, region);
                List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
                if (regionFailedServers == null || regionFailedServers.isEmpty()) {
                    ZKUtil.deleteNode(watcher, nodePath);
                    continue;
                }
                boolean needMoreRecovery = false;
                for (String failedServer : regionFailedServers) {
                    if (knownFailedServers.contains(failedServer)) {
                        needMoreRecovery = true;
                        break;
                    }
                }
                if (!needMoreRecovery) {
                    ZKUtil.deleteNodeRecursively(watcher, nodePath);
                }
            }
        }
    } catch (KeeperException e) {
        throw new IOException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) InterruptedIOException(java.io.InterruptedIOException) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) DeserializationException(org.apache.hadoop.hbase.exceptions.DeserializationException) ServerName(org.apache.hadoop.hbase.ServerName) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) KeeperException(org.apache.zookeeper.KeeperException)

Example 17 with SplitLogTask

use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.

In class ZKSplitLogManagerCoordination, method setRecoveryMode.

/**
 * Sets the recovery mode, either from work that is still outstanding in ZooKeeper (recovering
 * regions or split log tasks) or from the current configuration setting.
 * <p>
 * The call is a no-op once draining is done. When there is no watcher, the mode is fixed to
 * {@code LOG_SPLITTING} and draining is marked done.
 *
 * @param isForInitialization when {@code true}, the mode may be taken from the outstanding split
 *          log tasks; when {@code false} and work is still outstanding, the existing recovery
 *          mode is left untouched
 * @throws InterruptedIOException if reading the data of a task znode is interrupted
 * @throws IOException if a ZooKeeper call fails with a {@link KeeperException}
 */
@Override
public void setRecoveryMode(boolean isForInitialization) throws IOException {
    synchronized (this) {
        if (this.isDrainingDone) {
            // draining already finished, so the recovery mode is already up to date
            return;
        }
    }
    if (this.watcher == null) {
        // when watcher is null(testing code) and recovery mode can only be LOG_SPLITTING
        synchronized (this) {
            this.isDrainingDone = true;
            this.recoveryMode = RecoveryMode.LOG_SPLITTING;
        }
        return;
    }
    boolean hasSplitLogTask = false;
    boolean hasRecoveringRegions = false;
    RecoveryMode previousRecoveryMode = RecoveryMode.UNKNOWN;
    RecoveryMode recoveryModeInConfig = (isDistributedLogReplay(conf)) ? RecoveryMode.LOG_REPLAY : RecoveryMode.LOG_SPLITTING;
    // Firstly check if there are outstanding recovering regions
    try {
        List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode);
        if (regions != null && !regions.isEmpty()) {
            hasRecoveringRegions = true;
            previousRecoveryMode = RecoveryMode.LOG_REPLAY;
        }
        if (previousRecoveryMode == RecoveryMode.UNKNOWN) {
            // Secondly check if there are outstanding split log task
            List<String> tasks = listSplitLogTasks();
            if (!tasks.isEmpty()) {
                hasSplitLogTask = true;
                if (isForInitialization) {
                    // during initialization, try to get recovery mode from splitlogtask;
                    // the first task whose data can be read and parsed decides
                    for (String task : tasks) {
                        try {
                            byte[] data = ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, task));
                            if (data == null) {
                                continue;
                            }
                            SplitLogTask slt = SplitLogTask.parseFrom(data);
                            previousRecoveryMode = slt.getMode();
                            if (previousRecoveryMode == RecoveryMode.UNKNOWN) {
                                // created by old code base where we don't set recovery mode in splitlogtask
                                // we can safely set to LOG_SPLITTING because we're in master initialization code
                                // before SSH is enabled & there is no outstanding recovering regions
                                previousRecoveryMode = RecoveryMode.LOG_SPLITTING;
                            }
                            break;
                        } catch (DeserializationException e) {
                            // unparsable task: try the next one
                            LOG.warn("Failed parse data for znode " + task, e);
                        } catch (InterruptedException e) {
                            // Keep the original interruption as the cause instead of discarding it.
                            InterruptedIOException iioe = new InterruptedIOException("Interrupted while reading data of znode " + task);
                            iioe.initCause(e);
                            throw iioe;
                        }
                    }
                }
            }
        }
    } catch (KeeperException e) {
        throw new IOException(e);
    }
    synchronized (this) {
        // re-check: the lock was released while ZooKeeper was being queried
        if (this.isDrainingDone) {
            return;
        }
        if (!hasSplitLogTask && !hasRecoveringRegions) {
            this.isDrainingDone = true;
            this.recoveryMode = recoveryModeInConfig;
            return;
        } else if (!isForInitialization) {
            // splitlogtask hasn't drained yet, keep existing recovery mode
            return;
        }
        if (previousRecoveryMode != RecoveryMode.UNKNOWN) {
            // draining is only done when the outstanding work already uses the configured mode
            this.isDrainingDone = (previousRecoveryMode == recoveryModeInConfig);
            this.recoveryMode = previousRecoveryMode;
        } else {
            this.recoveryMode = recoveryModeInConfig;
        }
    }
}
Also used : InterruptedIOException(java.io.InterruptedIOException) RecoveryMode(org.apache.hadoop.hbase.shaded.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) DeserializationException(org.apache.hadoop.hbase.exceptions.DeserializationException) KeeperException(org.apache.zookeeper.KeeperException)

Example 18 with SplitLogTask

use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.

In class ZkSplitLogWorkerCoordination, method grabTask.

/**
 * Tries to take the 'lock' on a task zk node so that this worker owns and executes the task.
 * <p>
 * @param path zk node for the task
 * @return {@code true} only when the task was acquired and submitted; {@code false} in every
 *         other case, including a rescan node, which is acquired and immediately ended
 */
private boolean grabTask(String path) {
    Stat nodeStat = new Stat();
    synchronized (grabTaskLock) {
        currentTask = path;
        workerInGrabTask = true;
        // NOTE(review): this return happens before the try/finally below, so workerInGrabTask
        // is left true on this path -- confirm that is intended.
        if (Thread.interrupted()) {
            return false;
        }
    }
    try {
        byte[] taskData;
        try {
            taskData = ZKUtil.getDataNoWatch(watcher, path, nodeStat);
        } catch (KeeperException e) {
            LOG.warn("Failed to get data for znode " + path, e);
            SplitLogCounters.tot_wkr_failed_to_grab_task_exception.increment();
            return false;
        }
        if (taskData == null) {
            SplitLogCounters.tot_wkr_failed_to_grab_task_no_data.increment();
            return false;
        }

        SplitLogTask task;
        try {
            task = SplitLogTask.parseFrom(taskData);
        } catch (DeserializationException e) {
            LOG.warn("Failed parse data for znode " + path, e);
            SplitLogCounters.tot_wkr_failed_to_grab_task_exception.increment();
            return false;
        }
        // only an unassigned task may be grabbed
        if (!task.isUnassigned()) {
            SplitLogCounters.tot_wkr_failed_to_grab_task_owned.increment();
            return false;
        }

        // a negative version is counted as having lost the race for the task
        currentVersion = attemptToOwnTask(true, watcher, server.getServerName(), path, nodeStat.getVersion());
        if (currentVersion < 0) {
            SplitLogCounters.tot_wkr_failed_to_grab_task_lost_race.increment();
            return false;
        }

        // a rescan node is ended as Done right away instead of being submitted
        if (ZKSplitLog.isRescanNode(watcher, currentTask)) {
            ZkSplitLogWorkerCoordination.ZkSplitTaskDetails rescanDetails = new ZkSplitLogWorkerCoordination.ZkSplitTaskDetails();
            rescanDetails.setTaskNode(currentTask);
            rescanDetails.setCurTaskZKVersion(new MutableInt(currentVersion));
            endTask(new SplitLogTask.Done(server.getServerName()), SplitLogCounters.tot_wkr_task_acquired_rescan, rescanDetails);
            return false;
        }

        LOG.info("worker " + server.getServerName() + " acquired task " + path);
        SplitLogCounters.tot_wkr_task_acquired.increment();
        getDataSetWatchAsync();
        submitTask(path, currentVersion, reportPeriod);

        // once the submit went through, pause briefly so other RSs can grab the remaining tasks
        try {
            Thread.sleep(RandomUtils.nextInt(0, 500) + 500);
        } catch (InterruptedException e) {
            LOG.warn("Interrupted while yielding for other region servers", e);
            Thread.currentThread().interrupt();
        }
        return true;
    } finally {
        synchronized (grabTaskLock) {
            workerInGrabTask = false;
            // drop any interrupt raised by stopTask() so that it does not leak into the
            // next task
            Thread.interrupted();
        }
    }
}
Also used : Stat(org.apache.zookeeper.data.Stat) MutableInt(org.apache.commons.lang3.mutable.MutableInt) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) KeeperException(org.apache.zookeeper.KeeperException) DeserializationException(org.apache.hadoop.hbase.exceptions.DeserializationException)

Example 19 with SplitLogTask

use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.

In class ZKSplitLogManagerCoordination, method getDataSetWatchSuccess.

/**
 * Handles the data read back from a task znode and dispatches on the state of the parsed
 * {@link SplitLogTask}.
 *
 * @param path znode path of the task
 * @param data raw znode content; {@code null} is handled as a special case
 * @param version znode version; {@code Integer.MIN_VALUE} together with {@code null} data is
 *          treated as "znode disappeared, task done"
 * @throws DeserializationException if the znode content cannot be parsed into a task
 */
private void getDataSetWatchSuccess(String path, byte[] data, int version) throws DeserializationException {
    if (data == null) {
        if (version != Integer.MIN_VALUE) {
            SplitLogCounters.tot_mgr_null_data.increment();
            LOG.error(HBaseMarkers.FATAL, "logic error - got null data " + path);
            setDone(path, FAILURE);
        } else {
            // the task znode vanished all of a sudden; assume the work is all done
            setDone(path, SUCCESS);
        }
        return;
    }

    final byte[] payload = ZKMetadata.removeMetaData(data);
    final SplitLogTask task = SplitLogTask.parseFrom(payload);

    if (task.isUnassigned()) {
        LOG.debug("Task not yet acquired " + path + ", ver=" + version);
        handleUnassignedTask(path);
        return;
    }
    if (task.isOwned()) {
        heartbeat(path, version, task.getServerName());
        return;
    }
    if (task.isResigned()) {
        LOG.info("Task " + path + " entered state=" + task.toString());
        resubmitOrFail(path, FORCE);
        return;
    }
    if (task.isDone()) {
        LOG.info("Task " + path + " entered state=" + task.toString());
        // the finisher is bypassed when there is none, or when the path is a rescan node
        final boolean bypassFinisher = taskFinisher == null || ZKSplitLog.isRescanNode(watcher, path);
        if (bypassFinisher || taskFinisher.finish(task.getServerName(), ZKSplitLog.getFileName(path)) == Status.DONE) {
            setDone(path, SUCCESS);
        } else {
            resubmitOrFail(path, CHECK);
        }
        return;
    }
    if (task.isErr()) {
        LOG.info("Task " + path + " entered state=" + task.toString());
        resubmitOrFail(path, CHECK);
        return;
    }

    // none of the known states matched
    LOG.error(HBaseMarkers.FATAL, "logic error - unexpected zk state for path = " + path + " data = " + task.toString());
    setDone(path, FAILURE);
}
Also used : SplitLogTask(org.apache.hadoop.hbase.SplitLogTask)

Example 20 with SplitLogTask

use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.

In class TestSplitLogWorker, method testPreemptTask.

/**
 * Starts a worker, lets it acquire a freshly created task, then rewrites the task znode as
 * owned by MANAGER and waits for the worker's preempt counter to go from 0 to 1.
 */
@Test
public void testPreemptTask() throws Exception {
    LOG.info("testPreemptTask");
    SplitLogCounters.resetCounters();
    final ServerName workerServer = ServerName.valueOf("tpt_svr,1,1");
    final String taskPath = ZKSplitLog.getEncodedNodeName(zkw, "tpt_task");
    RegionServerServices regionServer = getRegionServer(workerServer);
    SplitLogWorker worker = new SplitLogWorker(ds, TEST_UTIL.getConfiguration(), regionServer, neverEndingTask);
    worker.start();
    try {
        // give the worker thread a chance to come up
        Thread.yield();
        Thread.sleep(1000);
        waitForCounter(SplitLogCounters.tot_wkr_task_grabing, 0, 1, WAIT_TIME);

        // the task node is created only now, after the splitLogWorker has been started
        zkw.getRecoverableZooKeeper().create(taskPath, new SplitLogTask.Unassigned(MANAGER).toByteArray(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        waitForCounter(SplitLogCounters.tot_wkr_task_acquired, 0, 1, WAIT_TIME);
        assertEquals(1, worker.getTaskReadySeq());

        // the znode must now show the worker's server as owner
        SplitLogTask current = SplitLogTask.parseFrom(ZKUtil.getData(zkw, taskPath));
        assertTrue(current.isOwned(workerServer));

        // hand ownership to MANAGER and expect the worker to register a preemption
        ZKUtil.setData(zkw, taskPath, new SplitLogTask.Owned(MANAGER).toByteArray());
        waitForCounter(SplitLogCounters.tot_wkr_preempt_task, 0, 1, WAIT_TIME);
    } finally {
        stopSplitLogWorker(worker);
    }
}
Also used : ServerName(org.apache.hadoop.hbase.ServerName) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) Test(org.junit.Test)

Aggregations

SplitLogTask (org.apache.hadoop.hbase.SplitLogTask)28 Test (org.junit.Test)19 ServerName (org.apache.hadoop.hbase.ServerName)17 TaskBatch (org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch)9 KeeperException (org.apache.zookeeper.KeeperException)6 DeserializationException (org.apache.hadoop.hbase.exceptions.DeserializationException)4 Stat (org.apache.zookeeper.data.Stat)3 IOException (java.io.IOException)2 InterruptedIOException (java.io.InterruptedIOException)2 Configuration (org.apache.hadoop.conf.Configuration)2 Path (org.apache.hadoop.fs.Path)2 HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration)2 Task (org.apache.hadoop.hbase.master.SplitLogManager.Task)2 MutableInt (org.apache.commons.lang3.mutable.MutableInt)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 RecoveryMode (org.apache.hadoop.hbase.shaded.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode)1