Use of org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi in project hadoop by apache.
The class FsDatasetImpl, method getBlockReports.
@Override
public Map<DatanodeStorage, BlockListAsLongs> getBlockReports(String bpid) {
  Map<DatanodeStorage, BlockListAsLongs> blockReportsMap =
      new HashMap<DatanodeStorage, BlockListAsLongs>();
  Map<String, BlockListAsLongs.Builder> builders =
      new HashMap<String, BlockListAsLongs.Builder>();
  List<FsVolumeImpl> curVolumes = null;
  try (AutoCloseableLock lock = datasetLock.acquire()) {
    curVolumes = volumes.getVolumes();
    for (FsVolumeSpi v : curVolumes) {
      builders.put(v.getStorageID(), BlockListAsLongs.builder(maxDataLength));
    }
    Set<String> missingVolumesReported = new HashSet<>();
    for (ReplicaInfo b : volumeMap.replicas(bpid)) {
      String volStorageID = b.getVolume().getStorageID();
      if (!builders.containsKey(volStorageID)) {
        if (!missingVolumesReported.contains(volStorageID)) {
          LOG.warn("Storage volume: " + volStorageID + " missing for the"
              + " replica block: " + b + ". Probably being removed!");
          missingVolumesReported.add(volStorageID);
        }
        continue;
      }
      switch (b.getState()) {
        case FINALIZED:
        case RBW:
        case RWR:
          builders.get(b.getVolume().getStorageID()).add(b);
          break;
        case RUR:
          ReplicaInfo orig = b.getOriginalReplica();
          builders.get(b.getVolume().getStorageID()).add(orig);
          break;
        case TEMPORARY:
          break;
        default:
          assert false : "Illegal ReplicaInfo state.";
      }
    }
  }
  for (FsVolumeImpl v : curVolumes) {
    blockReportsMap.put(v.toDatanodeStorage(),
        builders.get(v.getStorageID()).build());
  }
  return blockReportsMap;
}
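The method builds one BlockListAsLongs per volume, keyed by the volume's DatanodeStorage, which is the per-storage shape the DataNode's block report expects. Below is a minimal sketch of how a caller might consume the result; the dataset, bpid, and LOG names are assumptions for illustration and are not part of the snippet above.

// Sketch only: iterate the per-storage block reports returned above.
// "dataset" (an FsDatasetSpi), "bpid" (a block pool id) and an SLF4J-style
// LOG are assumed to exist in the caller.
Map<DatanodeStorage, BlockListAsLongs> reports = dataset.getBlockReports(bpid);
for (Map.Entry<DatanodeStorage, BlockListAsLongs> e : reports.entrySet()) {
  // One encoded block list per storage, i.e. per FsVolumeSpi.
  LOG.info("Storage {} reports {} blocks",
      e.getKey().getStorageID(), e.getValue().getNumberOfBlocks());
}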
Use of org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi in project hadoop by apache.
The class FsVolumeList, method handleVolumeFailures.
/**
 * Removes the given failed volumes from the volume list after recording
 * the failure, then waits for in-flight volume references to be released.
 *
 * Uses {@link #checkDirsLock} so that only one failure-handling pass runs
 * at a time.
 *
 * @param failedVolumes the volumes that have been detected as failed
 */
void handleVolumeFailures(Set<FsVolumeSpi> failedVolumes) {
  try (AutoCloseableLock lock = checkDirsLock.acquire()) {
    for (FsVolumeSpi vol : failedVolumes) {
      FsVolumeImpl fsv = (FsVolumeImpl) vol;
      try (FsVolumeReference ref = fsv.obtainReference()) {
        addVolumeFailureInfo(fsv);
        removeVolume(fsv);
      } catch (ClosedChannelException e) {
        FsDatasetImpl.LOG.debug("Caught exception when obtaining "
            + "reference count on closed volume", e);
      } catch (IOException e) {
        FsDatasetImpl.LOG.error("Unexpected IOException", e);
      }
    }
    waitVolumeRemoved(5000, checkDirsLockCondition);
  }
}
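In the DataNode this runs when a disk check flags volumes as failed. A minimal sketch of such a call, with hypothetical caller-side names (volumeList, unhealthy) rather than the actual FsDatasetImpl code path:

// Sketch only: hand volumes flagged by a health check to the volume list.
// "volumeList" (an FsVolumeList) and "unhealthy" are assumed to exist in a
// caller within the same package (the method is package-private).
Set<FsVolumeSpi> unhealthy = new HashSet<>();
// ... filled in by whatever disk check the caller performs ...
if (!unhealthy.isEmpty()) {
  volumeList.handleVolumeFailures(unhealthy);
}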
Use of org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi in project hadoop by apache.
The class DatasetVolumeChecker, method checkAllVolumes.
/**
 * Run checks against all volumes of a dataset.
 *
 * This check may be performed at service startup and subsequently at
 * regular intervals to detect and handle failed volumes.
 *
 * @param dataset - FsDatasetSpi to be checked.
 * @return set of failed volumes.
 */
public Set<FsVolumeSpi> checkAllVolumes(
    final FsDatasetSpi<? extends FsVolumeSpi> dataset)
    throws InterruptedException {
  final long gap = timer.monotonicNow() - lastAllVolumesCheck;
  if (gap < minDiskCheckGapMs) {
    numSkippedChecks.incrementAndGet();
    LOG.trace("Skipped checking all volumes, time since last check {} is less "
        + "than the minimum gap between checks ({} ms).",
        gap, minDiskCheckGapMs);
    return Collections.emptySet();
  }
  final FsDatasetSpi.FsVolumeReferences references =
      dataset.getFsVolumeReferences();
  if (references.size() == 0) {
    LOG.warn("checkAllVolumes - no volumes can be referenced");
    return Collections.emptySet();
  }
  lastAllVolumesCheck = timer.monotonicNow();
  final Set<FsVolumeSpi> healthyVolumes = new HashSet<>();
  final Set<FsVolumeSpi> failedVolumes = new HashSet<>();
  final Set<FsVolumeSpi> allVolumes = new HashSet<>();
  final AtomicLong numVolumes = new AtomicLong(references.size());
  final CountDownLatch latch = new CountDownLatch(1);
  for (int i = 0; i < references.size(); ++i) {
    final FsVolumeReference reference = references.getReference(i);
    Optional<ListenableFuture<VolumeCheckResult>> olf =
        delegateChecker.schedule(reference.getVolume(), IGNORED_CONTEXT);
    LOG.info("Scheduled health check for volume {}", reference.getVolume());
    if (olf.isPresent()) {
      allVolumes.add(reference.getVolume());
      Futures.addCallback(olf.get(),
          new ResultHandler(reference, healthyVolumes, failedVolumes,
              numVolumes, new Callback() {
            @Override
            public void call(Set<FsVolumeSpi> ignored1,
                Set<FsVolumeSpi> ignored2) {
              latch.countDown();
            }
          }));
    } else {
      IOUtils.cleanup(null, reference);
      if (numVolumes.decrementAndGet() == 0) {
        latch.countDown();
      }
    }
  }
  // Wait until our timeout elapses, after which we give up on
  // the remaining volumes.
  if (!latch.await(maxAllowedTimeForCheckMs, TimeUnit.MILLISECONDS)) {
    LOG.warn("checkAllVolumes timed out after {} ms", maxAllowedTimeForCheckMs);
  }
  numSyncDatasetChecks.incrementAndGet();
  synchronized (this) {
    // Make a copy under the mutex as Sets.difference() returns a view
    // of a potentially changing set.
    return new HashSet<>(Sets.difference(allVolumes, healthyVolumes));
  }
}
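A minimal sketch of the calling side, assuming a DatasetVolumeChecker named checker and an FsDatasetSpi named data; handleFailedVolumes is a hypothetical placeholder for whatever the caller does with the result, not the actual DataNode code path.

// Sketch only: run a synchronous check at startup and react to failures.
// "checker", "data", LOG and handleFailedVolumes() are assumed names.
Set<FsVolumeSpi> unhealthy = checker.checkAllVolumes(data);
if (!unhealthy.isEmpty()) {
  LOG.warn("Detected {} failed volume(s)", unhealthy.size());
  handleFailedVolumes(unhealthy);
}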
Use of org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi in project hadoop by apache.
The class TestStorageMover, method setVolumeFull.
private void setVolumeFull(DataNode dn, StorageType type) {
  try (FsDatasetSpi.FsVolumeReferences refs =
      dn.getFSDataset().getFsVolumeReferences()) {
    for (FsVolumeSpi fvs : refs) {
      FsVolumeImpl volume = (FsVolumeImpl) fvs;
      if (volume.getStorageType() == type) {
        LOG.info("setCapacity to 0 for [" + volume.getStorageType() + "]"
            + volume.getStorageID());
        volume.setCapacityForTesting(0);
      }
    }
  } catch (IOException e) {
    LOG.error("Unexpected exception while closing FsVolumeReference", e);
  }
}
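A minimal usage sketch, assuming a running MiniDFSCluster named cluster as in the other tests here; the DataNode index and storage type are illustrative.

// Sketch only: mark every DISK volume on the first DataNode as full so that
// later writes are forced onto other storage types or other nodes.
DataNode dn = cluster.getDataNodes().get(0);
setVolumeFull(dn, StorageType.DISK);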
Use of org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi in project hadoop by apache.
The class TestNameNodePrunesMissingStorages, method testRemovingStorageDoesNotProduceZombies.
/**
 * Regression test for HDFS-7960.<p/>
 *
 * Shutting down a datanode, removing a storage directory, and restarting
 * the DataNode should not produce zombie storages.
 */
@Test(timeout = 300000)
public void testRemovingStorageDoesNotProduceZombies() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
  conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 1000);
  final int NUM_STORAGES_PER_DN = 2;
  final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(3)
      .storagesPerDatanode(NUM_STORAGES_PER_DN)
      .build();
  try {
    cluster.waitActive();
    for (DataNode dn : cluster.getDataNodes()) {
      assertEquals(NUM_STORAGES_PER_DN,
          cluster.getNamesystem().getBlockManager().getDatanodeManager()
              .getDatanode(dn.getDatanodeId()).getStorageInfos().length);
    }
    // Create a file which will end up on all 3 datanodes.
    final Path TEST_PATH = new Path("/foo1");
    DistributedFileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, TEST_PATH, 1024, (short) 3, 0xcafecafe);
    for (DataNode dn : cluster.getDataNodes()) {
      DataNodeTestUtils.triggerBlockReport(dn);
    }
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, new Path("/foo1"));
    cluster.getNamesystem().writeLock();
    final String storageIdToRemove;
    String datanodeUuid;
    // Find the first storage which this block is in.
    try {
      BlockInfo storedBlock = cluster.getNamesystem().getBlockManager()
          .getStoredBlock(block.getLocalBlock());
      Iterator<DatanodeStorageInfo> storageInfoIter =
          cluster.getNamesystem().getBlockManager().blocksMap
              .getStorages(storedBlock).iterator();
      assertTrue(storageInfoIter.hasNext());
      DatanodeStorageInfo info = storageInfoIter.next();
      storageIdToRemove = info.getStorageID();
      datanodeUuid = info.getDatanodeDescriptor().getDatanodeUuid();
    } finally {
      cluster.getNamesystem().writeUnlock();
    }
    // Find the DataNode which holds that first storage.
    final DataNode datanodeToRemoveStorageFrom;
    int datanodeToRemoveStorageFromIdx = 0;
    while (true) {
      if (datanodeToRemoveStorageFromIdx >= cluster.getDataNodes().size()) {
        Assert.fail("failed to find datanode with uuid " + datanodeUuid);
        datanodeToRemoveStorageFrom = null;
        break;
      }
      DataNode dn = cluster.getDataNodes().get(datanodeToRemoveStorageFromIdx);
      if (dn.getDatanodeUuid().equals(datanodeUuid)) {
        datanodeToRemoveStorageFrom = dn;
        break;
      }
      datanodeToRemoveStorageFromIdx++;
    }
    // Find the volume within the datanode which holds that first storage.
    StorageLocation volumeLocationToRemove = null;
    try (FsVolumeReferences volumes =
        datanodeToRemoveStorageFrom.getFSDataset().getFsVolumeReferences()) {
      assertEquals(NUM_STORAGES_PER_DN, volumes.size());
      for (FsVolumeSpi volume : volumes) {
        if (volume.getStorageID().equals(storageIdToRemove)) {
          volumeLocationToRemove = volume.getStorageLocation();
        }
      }
    }
    // Shut down the datanode and remove the volume.
    // Replace the volume directory with a regular file, which will
    // cause a volume failure. (If we merely removed the directory,
    // it would be re-initialized with a new storage ID.)
    assertNotNull(volumeLocationToRemove);
    datanodeToRemoveStorageFrom.shutdown();
    FileUtil.fullyDelete(new File(volumeLocationToRemove.getUri()));
    FileOutputStream fos =
        new FileOutputStream(new File(volumeLocationToRemove.getUri()));
    try {
      fos.write(1);
    } finally {
      fos.close();
    }
    cluster.restartDataNode(datanodeToRemoveStorageFromIdx);
    // Wait for the NameNode to remove the storage.
    LOG.info("waiting for the datanode to remove " + storageIdToRemove);
    GenericTestUtils.waitFor(new Supplier<Boolean>() {
      @Override
      public Boolean get() {
        final DatanodeDescriptor dnDescriptor =
            cluster.getNamesystem().getBlockManager().getDatanodeManager()
                .getDatanode(datanodeToRemoveStorageFrom.getDatanodeUuid());
        assertNotNull(dnDescriptor);
        DatanodeStorageInfo[] infos = dnDescriptor.getStorageInfos();
        for (DatanodeStorageInfo info : infos) {
          if (info.getStorageID().equals(storageIdToRemove)) {
            LOG.info("Still found storage " + storageIdToRemove
                + " on " + info + ".");
            return false;
          }
        }
        assertEquals(NUM_STORAGES_PER_DN - 1, infos.length);
        return true;
      }
    }, 1000, 30000);
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
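A pattern common to these examples is iterating volumes through FsDatasetSpi.FsVolumeReferences inside try-with-resources, so the per-volume reference counts are released even if the body throws. A minimal sketch using only methods that appear in the snippets above, assuming a DataNode named dn:

// Sketch only: reference-counted volume iteration.
try (FsDatasetSpi.FsVolumeReferences refs =
    dn.getFSDataset().getFsVolumeReferences()) {
  for (FsVolumeSpi vol : refs) {
    // getStorageID() and getStorageLocation() are used the same way in the
    // tests above.
    System.out.println(vol.getStorageID() + " -> " + vol.getStorageLocation());
  }
} catch (IOException e) {
  // Closing the references can throw IOException, as TestStorageMover shows.
  e.printStackTrace();
}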