use of java.util.concurrent.atomic.AtomicReference in project hadoop by apache.
the class TestDistributedFileSystem method testConcurrentStatistics.
@SuppressWarnings("ThrowableResultOfMethodCallIgnored")
@Test(timeout = 180000)
public void testConcurrentStatistics() throws IOException, InterruptedException {
FileSystem.getStatistics(HdfsConstants.HDFS_URI_SCHEME, DistributedFileSystem.class).reset();
final MiniDFSCluster cluster = new MiniDFSCluster.Builder(new Configuration()).build();
cluster.waitActive();
final FileSystem fs = cluster.getFileSystem();
final int numThreads = 5;
final ExecutorService threadPool = HadoopExecutors.newFixedThreadPool(numThreads);
try {
final CountDownLatch allExecutorThreadsReady = new CountDownLatch(numThreads);
final CountDownLatch startBlocker = new CountDownLatch(1);
final CountDownLatch allDone = new CountDownLatch(numThreads);
final AtomicReference<Throwable> childError = new AtomicReference<>();
for (int i = 0; i < numThreads; i++) {
threadPool.submit(new Runnable() {
@Override
public void run() {
allExecutorThreadsReady.countDown();
try {
startBlocker.await();
final FileSystem fs = cluster.getFileSystem();
fs.mkdirs(new Path("/testStatisticsParallelChild"));
} catch (Throwable t) {
LOG.error("Child failed when calling mkdir", t);
childError.compareAndSet(null, t);
} finally {
allDone.countDown();
}
}
});
}
final long oldMkdirOpCount = getOpStatistics(OpType.MKDIRS);
// wait until all threads are ready
allExecutorThreadsReady.await();
// all threads start making directories
startBlocker.countDown();
// wait until all threads are done
allDone.await();
assertNull("Child failed with exception " + childError.get(), childError.get());
checkStatistics(fs, 0, numThreads, 0);
// check the single operation count stat
checkOpStatistics(OpType.MKDIRS, numThreads + oldMkdirOpCount);
// iterate all the operation counts
for (Iterator<LongStatistic> opCountIter = FileSystem.getGlobalStorageStatistics().get(DFSOpsCountStatistics.NAME).getLongStatistics(); opCountIter.hasNext(); ) {
final LongStatistic opCount = opCountIter.next();
if (OpType.MKDIRS.getSymbol().equals(opCount.getName())) {
assertEquals("Unexpected op count from iterator!", numThreads + oldMkdirOpCount, opCount.getValue());
}
LOG.info(opCount.getName() + "\t" + opCount.getValue());
}
} finally {
threadPool.shutdownNow();
cluster.shutdown();
}
}
use of java.util.concurrent.atomic.AtomicReference in project hadoop by apache.
the class TestBlockRecovery method testStopWorker.
/**
* Test that an FsDatasetImpl operation does not hold the lock for an
* unreasonable amount of time if a writer is taking a long time to stop.
*/
private void testStopWorker(final TestStopWorkerRunnable tswr) throws Exception {
LOG.debug("Running " + currentTestName.getMethodName());
// We need a long value for the data xceiver stop timeout.
// Otherwise the timeout will trigger, and we will not have tested that
// thread join was done locklessly.
Assert.assertEquals(TEST_STOP_WORKER_XCEIVER_STOP_TIMEOUT_MILLIS, dn.getDnConf().getXceiverStopTimeout());
final TestStopWorkerSemaphore progressParent = new TestStopWorkerSemaphore();
final TestStopWorkerSemaphore terminateSlowWriter = new TestStopWorkerSemaphore();
final AtomicReference<String> failure = new AtomicReference<String>(null);
Collection<RecoveringBlock> recoveringBlocks = initRecoveringBlocks();
final RecoveringBlock recoveringBlock = Iterators.get(recoveringBlocks.iterator(), 0);
final ExtendedBlock block = recoveringBlock.getBlock();
Thread slowWriterThread = new Thread(new Runnable() {
@Override
public void run() {
try {
// Register this thread as the writer for the recoveringBlock.
LOG.debug("slowWriter creating rbw");
ReplicaHandler replicaHandler = spyDN.data.createRbw(StorageType.DISK, block, false);
replicaHandler.close();
LOG.debug("slowWriter created rbw");
// Tell the parent thread to start progressing.
progressParent.sem.release();
terminateSlowWriter.uninterruptiblyAcquire(60000);
LOG.debug("slowWriter exiting");
} catch (Throwable t) {
LOG.error("slowWriter got exception", t);
failure.compareAndSet(null, "slowWriter got exception " + t.getMessage());
}
}
});
// Start the slow worker thread and wait for it to take ownership of the
// ReplicaInPipeline
slowWriterThread.start();
progressParent.uninterruptiblyAcquire(60000);
// Start a worker thread which will attempt to stop the writer.
Thread stopWriterThread = new Thread(new Runnable() {
@Override
public void run() {
try {
LOG.debug("initiating " + tswr.opName());
tswr.run(recoveringBlock);
LOG.debug("finished " + tswr.opName());
} catch (Throwable t) {
LOG.error("stopWriterThread got unexpected exception for " + tswr.opName(), t);
failure.compareAndSet(null, "stopWriterThread got unexpected " + "exception for " + tswr.opName() + ": " + t.getMessage());
}
}
});
stopWriterThread.start();
while (!terminateSlowWriter.gotInterruption.get()) {
// Wait until stopWriterThread attempts to stop our slow writer by sending
// it an InterruptedException.
Thread.sleep(1);
}
// We know that stopWriterThread is in the process of joining our slow
// writer. It must not hold the lock during this operation.
// In order to test that it does not, we attempt to do an operation that
// requires the lock-- getReplicaString.
spyDN.getFSDataset().getReplicaString(recoveringBlock.getBlock().getBlockPoolId(), recoveringBlock.getBlock().getBlockId());
// Tell the slow writer to exit, and then wait for all threads to join.
terminateSlowWriter.sem.release();
slowWriterThread.join();
stopWriterThread.join();
// Check that our worker threads exited cleanly. This is not checked by the
// unit test framework, so we have to do it manually here.
String failureReason = failure.get();
if (failureReason != null) {
Assert.fail("Thread failure: " + failureReason);
}
}
use of java.util.concurrent.atomic.AtomicReference in project hadoop by apache.
the class TestEditLogRace method testSaveImageWhileSyncInProgress.
/**
* The logSync() method in FSEditLog is unsynchronized whiel syncing
* so that other threads can concurrently enqueue edits while the prior
* sync is ongoing. This test checks that the log is saved correctly
* if the saveImage occurs while the syncing thread is in the unsynchronized middle section.
*
* This replicates the following manual test proposed by Konstantin:
* I start the name-node in debugger.
* I do -mkdir and stop the debugger in logSync() just before it does flush.
* Then I enter safe mode with another client
* I start saveNamepsace and stop the debugger in
* FSImage.saveFSImage() -> FSEditLog.createEditLogFile()
* -> EditLogFileOutputStream.create() ->
* after truncating the file but before writing LAYOUT_VERSION into it.
* Then I let logSync() run.
* Then I terminate the name-node.
* After that the name-node wont start, since the edits file is broken.
*/
@Test
public void testSaveImageWhileSyncInProgress() throws Exception {
Configuration conf = getConf();
NameNode.initMetrics(conf, NamenodeRole.NAMENODE);
DFSTestUtil.formatNameNode(conf);
final FSNamesystem namesystem = FSNamesystem.loadFromDisk(conf);
try {
FSImage fsimage = namesystem.getFSImage();
FSEditLog editLog = fsimage.getEditLog();
JournalAndStream jas = editLog.getJournals().get(0);
EditLogFileOutputStream spyElos = spy((EditLogFileOutputStream) jas.getCurrentStream());
jas.setCurrentStreamForTests(spyElos);
final AtomicReference<Throwable> deferredException = new AtomicReference<Throwable>();
final CountDownLatch waitToEnterFlush = new CountDownLatch(1);
final Thread doAnEditThread = new Thread() {
@Override
public void run() {
try {
LOG.info("Starting mkdirs");
namesystem.mkdirs("/test", new PermissionStatus("test", "test", new FsPermission((short) 00755)), true);
LOG.info("mkdirs complete");
} catch (Throwable ioe) {
LOG.fatal("Got exception", ioe);
deferredException.set(ioe);
waitToEnterFlush.countDown();
}
}
};
Answer<Void> blockingFlush = new Answer<Void>() {
@Override
public Void answer(InvocationOnMock invocation) throws Throwable {
LOG.info("Flush called");
if (useAsyncEditLog || Thread.currentThread() == doAnEditThread) {
LOG.info("edit thread: Telling main thread we made it to flush section...");
// Signal to main thread that the edit thread is in the racy section
waitToEnterFlush.countDown();
LOG.info("edit thread: sleeping for " + BLOCK_TIME + "secs");
Thread.sleep(BLOCK_TIME * 1000);
LOG.info("Going through to flush. This will allow the main thread to continue.");
}
invocation.callRealMethod();
LOG.info("Flush complete");
return null;
}
};
doAnswer(blockingFlush).when(spyElos).flush();
doAnEditThread.start();
// Wait for the edit thread to get to the logsync unsynchronized section
LOG.info("Main thread: waiting to enter flush...");
waitToEnterFlush.await();
assertNull(deferredException.get());
LOG.info("Main thread: detected that logSync is in unsynchronized section.");
LOG.info("Trying to enter safe mode.");
LOG.info("This should block for " + BLOCK_TIME + "sec, since flush will sleep that long");
long st = Time.now();
namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
long et = Time.now();
LOG.info("Entered safe mode");
// Make sure we really waited for the flush to complete!
assertTrue(et - st > (BLOCK_TIME - 1) * 1000);
// Once we're in safe mode, save namespace.
namesystem.saveNamespace(0, 0);
LOG.info("Joining on edit thread...");
doAnEditThread.join();
assertNull(deferredException.get());
// We did 3 edits: begin, txn, and end
assertEquals(3, verifyEditLogs(namesystem, fsimage, NNStorage.getFinalizedEditsFileName(1, 3), 1));
// after the save, just the one "begin"
assertEquals(1, verifyEditLogs(namesystem, fsimage, NNStorage.getInProgressEditsFileName(4), 4));
} finally {
LOG.info("Closing nn");
if (namesystem != null)
namesystem.close();
}
}
use of java.util.concurrent.atomic.AtomicReference in project hadoop by apache.
the class TestEditLogRace method testEditLogRolling.
/**
* Tests rolling edit logs while transactions are ongoing.
*/
@Test
public void testEditLogRolling() throws Exception {
// start a cluster
Configuration conf = getConf();
final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATA_NODES).build();
FileSystem fileSys = null;
AtomicReference<Throwable> caughtErr = new AtomicReference<Throwable>();
try {
cluster.waitActive();
fileSys = cluster.getFileSystem();
final NamenodeProtocols nn = cluster.getNameNode().getRpcServer();
FSImage fsimage = cluster.getNamesystem().getFSImage();
StorageDirectory sd = fsimage.getStorage().getStorageDir(0);
startTransactionWorkers(cluster, caughtErr);
long previousLogTxId = 1;
for (int i = 0; i < NUM_ROLLS && caughtErr.get() == null; i++) {
try {
Thread.sleep(20);
} catch (InterruptedException e) {
}
LOG.info("Starting roll " + i + ".");
CheckpointSignature sig = nn.rollEditLog();
long nextLog = sig.curSegmentTxId;
String logFileName = NNStorage.getFinalizedEditsFileName(previousLogTxId, nextLog - 1);
previousLogTxId += verifyEditLogs(cluster.getNamesystem(), fsimage, logFileName, previousLogTxId);
assertEquals(previousLogTxId, nextLog);
File expectedLog = NNStorage.getInProgressEditsFile(sd, previousLogTxId);
assertTrue("Expect " + expectedLog + " to exist", expectedLog.exists());
}
} finally {
stopTransactionWorkers();
if (caughtErr.get() != null) {
throw new RuntimeException(caughtErr.get());
}
if (fileSys != null)
fileSys.close();
if (cluster != null)
cluster.shutdown();
}
}
use of java.util.concurrent.atomic.AtomicReference in project hadoop by apache.
the class TestEditLogRace method testSaveNamespace.
/**
* Tests saving fs image while transactions are ongoing.
*/
@Test
public void testSaveNamespace() throws Exception {
// start a cluster
Configuration conf = getConf();
MiniDFSCluster cluster = null;
FileSystem fileSys = null;
AtomicReference<Throwable> caughtErr = new AtomicReference<Throwable>();
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATA_NODES).build();
cluster.waitActive();
fileSys = cluster.getFileSystem();
final FSNamesystem namesystem = cluster.getNamesystem();
FSImage fsimage = namesystem.getFSImage();
FSEditLog editLog = fsimage.getEditLog();
startTransactionWorkers(cluster, caughtErr);
for (int i = 0; i < NUM_SAVE_IMAGE && caughtErr.get() == null; i++) {
try {
Thread.sleep(20);
} catch (InterruptedException ignored) {
}
LOG.info("Save " + i + ": entering safe mode");
namesystem.enterSafeMode(false);
// Verify edit logs before the save
// They should start with the first edit after the checkpoint
long logStartTxId = fsimage.getStorage().getMostRecentCheckpointTxId() + 1;
verifyEditLogs(namesystem, fsimage, NNStorage.getInProgressEditsFileName(logStartTxId), logStartTxId);
LOG.info("Save " + i + ": saving namespace");
namesystem.saveNamespace(0, 0);
LOG.info("Save " + i + ": leaving safemode");
long savedImageTxId = fsimage.getStorage().getMostRecentCheckpointTxId();
// Verify that edit logs post save got finalized and aren't corrupt
verifyEditLogs(namesystem, fsimage, NNStorage.getFinalizedEditsFileName(logStartTxId, savedImageTxId), logStartTxId);
// The checkpoint id should be 1 less than the last written ID, since
// the log roll writes the "BEGIN" transaction to the new log.
assertEquals(fsimage.getStorage().getMostRecentCheckpointTxId(), editLog.getLastWrittenTxId() - 1);
namesystem.leaveSafeMode(false);
LOG.info("Save " + i + ": complete");
}
} finally {
stopTransactionWorkers();
if (caughtErr.get() != null) {
throw new RuntimeException(caughtErr.get());
}
if (fileSys != null)
fileSys.close();
if (cluster != null)
cluster.shutdown();
}
}
Aggregations