Search in sources :

Example 1 with CheckContext

use of org.apache.hadoop.hdfs.server.datanode.StorageLocation.CheckContext in project hadoop by apache.

the class StorageLocationChecker method check.

/**
   * Initiate a check of the supplied storage volumes and return
   * a list of failed volumes.
   *
   * StorageLocations are returned in the same order as the input
   * for compatibility with existing unit tests.
   *
   * @param conf HDFS configuration.
   * @param dataDirs list of volumes to check.
   * @return returns a list of failed volumes. Returns the empty list if
   *         there are no failed volumes.
   *
   * @throws InterruptedException if the check was interrupted.
   * @throws IOException if the number of failed volumes exceeds the
   *                     maximum allowed or if there are no good
   *                     volumes.
   */
public List<StorageLocation> check(final Configuration conf, final Collection<StorageLocation> dataDirs) throws InterruptedException, IOException {
    final HashMap<StorageLocation, Boolean> goodLocations = new LinkedHashMap<>();
    final Set<StorageLocation> failedLocations = new HashSet<>();
    final Map<StorageLocation, ListenableFuture<VolumeCheckResult>> futures = Maps.newHashMap();
    final LocalFileSystem localFS = FileSystem.getLocal(conf);
    final CheckContext context = new CheckContext(localFS, expectedPermission);
    // Start parallel disk check operations on all StorageLocations.
    for (StorageLocation location : dataDirs) {
        goodLocations.put(location, true);
        Optional<ListenableFuture<VolumeCheckResult>> olf = delegateChecker.schedule(location, context);
        if (olf.isPresent()) {
            futures.put(location, olf.get());
        }
    }
    if (maxVolumeFailuresTolerated >= dataDirs.size()) {
        throw new DiskErrorException("Invalid value configured for " + DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY + " - " + maxVolumeFailuresTolerated + ". Value configured is >= " + "to the number of configured volumes (" + dataDirs.size() + ").");
    }
    final long checkStartTimeMs = timer.monotonicNow();
    // Retrieve the results of the disk checks.
    for (Map.Entry<StorageLocation, ListenableFuture<VolumeCheckResult>> entry : futures.entrySet()) {
        // Determine how much time we can allow for this check to complete.
        // The cumulative wait time cannot exceed maxAllowedTimeForCheck.
        final long waitSoFarMs = (timer.monotonicNow() - checkStartTimeMs);
        final long timeLeftMs = Math.max(0, maxAllowedTimeForCheckMs - waitSoFarMs);
        final StorageLocation location = entry.getKey();
        try {
            final VolumeCheckResult result = entry.getValue().get(timeLeftMs, TimeUnit.MILLISECONDS);
            switch(result) {
                case HEALTHY:
                    break;
                case DEGRADED:
                    LOG.warn("StorageLocation {} appears to be degraded.", location);
                    break;
                case FAILED:
                    LOG.warn("StorageLocation {} detected as failed.", location);
                    failedLocations.add(location);
                    goodLocations.remove(location);
                    break;
                default:
                    LOG.error("Unexpected health check result {} for StorageLocation {}", result, location);
            }
        } catch (ExecutionException | TimeoutException e) {
            LOG.warn("Exception checking StorageLocation " + location, e.getCause());
            failedLocations.add(location);
            goodLocations.remove(location);
        }
    }
    if (failedLocations.size() > maxVolumeFailuresTolerated) {
        throw new DiskErrorException("Too many failed volumes - " + "current valid volumes: " + goodLocations.size() + ", volumes configured: " + dataDirs.size() + ", volumes failed: " + failedLocations.size() + ", volume failures tolerated: " + maxVolumeFailuresTolerated);
    }
    if (goodLocations.size() == 0) {
        throw new DiskErrorException("All directories in " + DFS_DATANODE_DATA_DIR_KEY + " are invalid: " + failedLocations);
    }
    return new ArrayList<>(goodLocations.keySet());
}
Also used : CheckContext(org.apache.hadoop.hdfs.server.datanode.StorageLocation.CheckContext) DiskErrorException(org.apache.hadoop.util.DiskChecker.DiskErrorException) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) StorageLocation(org.apache.hadoop.hdfs.server.datanode.StorageLocation) ExecutionException(java.util.concurrent.ExecutionException) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) HashSet(java.util.HashSet) TimeoutException(java.util.concurrent.TimeoutException)

Aggregations

ListenableFuture (com.google.common.util.concurrent.ListenableFuture)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 LinkedHashMap (java.util.LinkedHashMap)1 Map (java.util.Map)1 ExecutionException (java.util.concurrent.ExecutionException)1 TimeoutException (java.util.concurrent.TimeoutException)1 LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem)1 StorageLocation (org.apache.hadoop.hdfs.server.datanode.StorageLocation)1 CheckContext (org.apache.hadoop.hdfs.server.datanode.StorageLocation.CheckContext)1 DiskErrorException (org.apache.hadoop.util.DiskChecker.DiskErrorException)1