Search in sources :

Example 31 with MasterWorkerInfo

use of alluxio.master.block.meta.MasterWorkerInfo in project alluxio by Alluxio.

the class BlockMasterRegisterStreamIntegrationTest method verifyWorkerCanReregister.

/**
 * Replays a register stream for an already-known worker and checks the resulting
 * master-side state.
 *
 * <p>The check passes when the stream produced no captured errors, the worker holds
 * {@code expectedBlockCount} blocks with nothing pending removal, and the master
 * reports exactly one worker.
 *
 * @param workerId the ID of the worker that re-registers
 * @param requestChunks the register stream requests to send to the master
 * @param expectedBlockCount the number of blocks the worker should hold afterwards
 */
private void verifyWorkerCanReregister(long workerId, List<RegisterWorkerPRequest> requestChunks, int expectedBlockCount) throws Exception {
    // Errors reported through the response observer are collected here
    Queue<Throwable> capturedErrors = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(
        requestChunks,
        RegisterStreamTestUtils.getErrorCapturingResponseObserver(capturedErrors));
    assertEquals(0, capturedErrors.size());

    // Inspect the worker metadata kept by the master
    MasterWorkerInfo registeredWorker = mBlockMaster.getWorker(workerId);
    assertEquals(expectedBlockCount, registeredWorker.getBlockCount());
    assertEquals(0, registeredWorker.getToRemoveBlockCount());
    assertEquals(1, mBlockMaster.getWorkerCount());
}
Also used : MasterWorkerInfo(alluxio.master.block.meta.MasterWorkerInfo) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue)

Example 32 with MasterWorkerInfo

use of alluxio.master.block.meta.MasterWorkerInfo in project alluxio by Alluxio.

the class BlockMasterRegisterStreamIntegrationTest method registerLostWorker.

/**
 * Verifies that a worker the master has marked as lost can register again.
 *
 * <p>The worker registers once, the test clock is advanced past the worker timeout and
 * the lost-worker detector is run so the master forgets the worker, and then the same
 * register stream is sent a second time.
 */
@Test
public void registerLostWorker() throws Exception {
    long workerId = mBlockMaster.getWorkerId(NET_ADDRESS_1);
    // The worker registers to the master
    List<RegisterWorkerPRequest> requestChunks = RegisterStreamTestUtils.generateRegisterStreamForWorker(workerId);
    prepareBlocksOnMaster(requestChunks);
    Queue<Throwable> errorQueue = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(requestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(errorQueue));
    // Verify the worker has been registered
    assertEquals(0, errorQueue.size());
    assertEquals(1, mBlockMaster.getWorkerCount());
    // The worker has lost heartbeat and been forgotten
    MasterWorkerInfo worker = mBlockMaster.getWorker(workerId);
    long newTimeMs = worker.getLastUpdatedTimeMs() + MASTER_WORKER_TIMEOUT + 1;
    mClock.setTimeMs(newTimeMs);
    DefaultBlockMaster.LostWorkerDetectionHeartbeatExecutor lostWorkerDetector = ((DefaultBlockMaster) mBlockMaster).new LostWorkerDetectionHeartbeatExecutor();
    lostWorkerDetector.heartbeat();
    // Verify the worker has been forgotten
    assertEquals(0, mBlockMaster.getWorkerCount());
    // Register again
    Queue<Throwable> newErrorQueue = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(requestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(newErrorQueue));
    // Verify the worker is registered again.
    // Check the queue attached to the second stream; the first queue was already
    // verified above and receives nothing from this registration.
    assertEquals(0, newErrorQueue.size());
    MasterWorkerInfo updatedWorker = mBlockMaster.getWorker(workerId);
    assertEquals(TIER_BLOCK_TOTAL, updatedWorker.getBlockCount());
    assertEquals(0, updatedWorker.getToRemoveBlockCount());
    assertEquals(1, mBlockMaster.getWorkerCount());
    // Verify the worker is readable and writable
    verifyWorkerWritable(workerId);
}
Also used : DefaultBlockMaster(alluxio.master.block.DefaultBlockMaster) MasterWorkerInfo(alluxio.master.block.meta.MasterWorkerInfo) RegisterWorkerPRequest(alluxio.grpc.RegisterWorkerPRequest) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) Test(org.junit.Test)

Example 33 with MasterWorkerInfo

use of alluxio.master.block.meta.MasterWorkerInfo in project alluxio by Alluxio.

the class BlockMasterRegisterStreamIntegrationTest method reregisterWithDelete.

/**
 * Tests below cover the race conditions during concurrent executions.
 *
 * When the worker registers for the 1st time, no clients should know this worker.
 * Therefore there are no concurrent client-incurred write operations on this worker.
 * The races typically happen when the worker re-registers with the master,
 * where some clients already know this worker and can directly invoke writes on the worker.
 *
 * Tests here verify the integrity of the master-side metadata.
 * In other words, we assume those writes succeed on the worker, and the subsequent
 * update on the master-side metadata should also succeed and be correct.
 *
 * This test re-registers a worker on a background thread and removes one of its blocks
 * from the master while the register stream is in flight.
 */
@Test
public void reregisterWithDelete() throws Exception {
    // Register the worker so the worker is marked active in master
    long workerId = mBlockMaster.getWorkerId(NET_ADDRESS_1);
    List<RegisterWorkerPRequest> requestChunks = RegisterStreamTestUtils.generateRegisterStreamForWorker(workerId);
    prepareBlocksOnMaster(requestChunks);
    Queue<Throwable> errorQueue = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(requestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(errorQueue));
    assertEquals(0, errorQueue.size());
    assertEquals(1, mBlockMaster.getWorkerCount());
    // Find a block to remove
    long blockToRemove = RegisterStreamTestUtils.findFirstBlock(requestChunks);
    // Register again
    CountDownLatch latch = new CountDownLatch(1);
    // NOTE(review): errors captured here are never asserted on in this test —
    // confirm whether the second registration is expected to be error-free.
    Queue<Throwable> newErrorQueue = new ConcurrentLinkedQueue<>();
    // Wildcard type instead of a raw Future; the task's result is not used
    Future<?> f = mExecutorService.submit(() -> {
        sendStreamToMasterAndSignal(requestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(newErrorQueue), latch);
    });
    // During the register stream, trigger a delete on worker
    latch.await();
    mBlockMaster.removeBlocks(ImmutableList.of(blockToRemove), true);
    // Wait for the register to finish
    f.get();
    assertThrows(BlockInfoException.class, () -> {
        mBlockMaster.getBlockInfo(blockToRemove);
    });
    MasterWorkerInfo worker = mBlockMaster.getWorker(workerId);
    assertEquals(1, mBlockMaster.getWorkerCount());
    assertEquals(TIER_BLOCK_TOTAL - 1, worker.getBlockCount());
    // BlockMaster.removeBlocks() will first remove the block from master metadata
    // (with block lock) then update the block locations (with worker lock).
    // The worker lock is being held by the registering worker, but the 1st part
    // will likely succeed.
    // So during registration when checking on the block, the block is not recognized
    // any more and will remain in MasterWorkerInfo.mToRemoveBlocks.
    // In the next heartbeat the master will issue a command to remove the block
    // from the worker.
    // Even if the block is already removed on the worker it is fine,
    // because deletion of a not-found block is a noop.
    Command command = sendHeartbeatToMaster(workerId);
    assertEquals(Command.newBuilder().addData(blockToRemove).setCommandType(CommandType.Free).build(), command);
}
Also used : Command(alluxio.grpc.Command) MasterWorkerInfo(alluxio.master.block.meta.MasterWorkerInfo) Future(java.util.concurrent.Future) RegisterWorkerPRequest(alluxio.grpc.RegisterWorkerPRequest) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) CountDownLatch(java.util.concurrent.CountDownLatch) Test(org.junit.Test)

Example 34 with MasterWorkerInfo

use of alluxio.master.block.meta.MasterWorkerInfo in project alluxio by Alluxio.

the class BlockMasterRegisterStreamIntegrationTest method registerExistingWorkerBlocksLost.

/**
 * Verifies re-registration of an existing worker that reports fewer blocks than before.
 *
 * <p>The worker first registers with the full block set. A second register stream is then
 * built from a regenerated block map with some blocks removed, and the test checks the
 * worker's block count, its to-remove count, and the command returned by the next heartbeat.
 */
@Test
public void registerExistingWorkerBlocksLost() throws Exception {
    long workerId = mBlockMaster.getWorkerId(NET_ADDRESS_1);
    // Register the worker for the 1st time
    List<RegisterWorkerPRequest> requestChunks = RegisterStreamTestUtils.generateRegisterStreamForWorker(workerId);
    prepareBlocksOnMaster(requestChunks);
    Queue<Throwable> errorQueue = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(requestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(errorQueue));
    assertEquals(0, errorQueue.size());
    // Verify the worker has registered
    assertEquals(1, mBlockMaster.getWorkerCount());
    MasterWorkerInfo worker = mBlockMaster.getWorker(workerId);
    assertEquals(TIER_BLOCK_TOTAL, worker.getBlockCount());
    assertEquals(0, worker.getToRemoveBlockCount());
    // Manually generate the blocks again and remove some
    List<String> tierAliases = getTierAliases(parseTierConfig(TIER_CONFIG));
    Map<BlockStoreLocation, List<Long>> blockMap = RpcBenchPreparationUtils.generateBlockIdOnTiers(parseTierConfig(TIER_CONFIG));
    Set<Long> lostBlocks = removeSomeBlocks(blockMap);
    // Regenerate the requests
    RegisterStreamer newRegisterStreamer = new RegisterStreamer(null, workerId, tierAliases, CAPACITY_MAP, USAGE_MAP, blockMap, LOST_STORAGE, EMPTY_CONFIG);
    List<RegisterWorkerPRequest> newRequestChunks = ImmutableList.copyOf(newRegisterStreamer);
    // Fewer blocks means the stream may need fewer batches
    int newExpectedBatchCount = (int) Math.ceil((TIER_BLOCK_TOTAL - lostBlocks.size()) / (double) BATCH_SIZE);
    assertEquals(newExpectedBatchCount, newRequestChunks.size());
    // Register again with the updated stream
    Queue<Throwable> newErrorQueue = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(newRequestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(newErrorQueue));
    assertEquals(0, newErrorQueue.size());
    // Verify the worker is registered
    assertEquals(1, mBlockMaster.getWorkerCount());
    MasterWorkerInfo updatedWorker = mBlockMaster.getWorker(workerId);
    assertEquals(TIER_BLOCK_TOTAL - lostBlocks.size(), updatedWorker.getBlockCount());
    // The master will mark the lost blocks as to be removed
    // This is to ensure the unrecognized blocks do not live on the worker anymore
    assertEquals(lostBlocks.size(), updatedWorker.getToRemoveBlockCount());
    // The next heartbeat response carries a Free command listing exactly the lost blocks
    Command command = sendHeartbeatToMaster(workerId);
    assertEquals(CommandType.Free, command.getCommandType());
    assertEquals(lostBlocks, new HashSet<>(command.getDataList()));
    // Verify the worker is readable and writable
    verifyWorkerWritable(workerId);
}
Also used : RegisterWorkerPRequest(alluxio.grpc.RegisterWorkerPRequest) RegisterStreamer(alluxio.worker.block.RegisterStreamer) Command(alluxio.grpc.Command) MasterWorkerInfo(alluxio.master.block.meta.MasterWorkerInfo) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) StorageList(alluxio.grpc.StorageList) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) BlockStoreLocation(alluxio.worker.block.BlockStoreLocation) Test(org.junit.Test)

Aggregations

MasterWorkerInfo (alluxio.master.block.meta.MasterWorkerInfo)34 RegisterWorkerPRequest (alluxio.grpc.RegisterWorkerPRequest)10 ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue)9 LockResource (alluxio.resource.LockResource)8 ArrayList (java.util.ArrayList)8 Test (org.junit.Test)8 StorageList (alluxio.grpc.StorageList)7 List (java.util.List)7 ConcurrentHashSet (alluxio.collections.ConcurrentHashSet)6 Command (alluxio.grpc.Command)6 HashMap (java.util.HashMap)6 HashSet (java.util.HashSet)6 Map (java.util.Map)6 BlockLocation (alluxio.proto.meta.Block.BlockLocation)5 WorkerInfo (alluxio.wire.WorkerInfo)5 NotFoundException (alluxio.exception.status.NotFoundException)4 UnavailableException (alluxio.exception.status.UnavailableException)4 Address (alluxio.wire.Address)4 BlockInfo (alluxio.wire.BlockInfo)4 WorkerNetAddress (alluxio.wire.WorkerNetAddress)4