Search in sources :

Example 21 with Command

use of alluxio.grpc.Command in project alluxio by Alluxio.

the class BlockMasterSync method heartbeat.

/**
 * Sends a single heartbeat to the master and executes the command the master replies with.
 *
 * The heartbeat carries the worker's per-tier capacity and usage, the blocks removed and added
 * since the last report, any lost storage, and the current worker metrics.
 *
 * If the call or the command handling fails with an {@code IOException} or a
 * {@code ConnectionFailedException}, the failure is logged and the master client is
 * disconnected. When a positive heartbeat timeout is configured and at least that long has
 * passed since the last successful heartbeat, the failure is escalated: a
 * {@code RuntimeException} is thrown in test mode, otherwise
 * {@code ProcessUtils.fatalError} is called.
 */
@Override
public void heartbeat() {
    // Gather everything that goes into this heartbeat before talking to the master
    BlockHeartbeatReport report = mBlockWorker.getReport();
    BlockStoreMeta meta = mBlockWorker.getStoreMeta();
    List<alluxio.grpc.Metric> workerMetrics = MetricsSystem.reportWorkerMetrics();
    // Remains null until the master has replied, which lets the error path tell a failed
    // call apart from a failure while handling the returned command
    Command masterCmd = null;
    try {
        masterCmd = mMasterClient.heartbeat(
            mWorkerId.get(),
            meta.getCapacityBytesOnTiers(),
            meta.getUsedBytesOnTiers(),
            report.getRemovedBlocks(),
            report.getAddedBlocks(),
            report.getLostStorage(),
            workerMetrics);
        handleMasterCommand(masterCmd);
        mLastSuccessfulHeartbeatMs = System.currentTimeMillis();
    } catch (IOException | ConnectionFailedException e) {
        // Log the failure; it is only escalated below once the heartbeat timeout has elapsed
        if (masterCmd != null) {
            LOG.error("Failed to receive or execute master heartbeat command: {}", masterCmd.toString(), e);
        } else {
            LOG.error("Failed to receive master heartbeat command.", e);
        }
        mMasterClient.disconnect();
        // A non-positive timeout disables the escalation entirely
        boolean timeoutEnabled = mHeartbeatTimeoutMs > 0;
        if (timeoutEnabled
            && System.currentTimeMillis() - mLastSuccessfulHeartbeatMs >= mHeartbeatTimeoutMs) {
            if (ServerConfiguration.getBoolean(PropertyKey.TEST_MODE)) {
                throw new RuntimeException("Master heartbeat timeout exceeded: " + mHeartbeatTimeoutMs);
            }
            // TODO(andrew): Propagate the exception to the main thread and exit there.
            ProcessUtils.fatalError(LOG, "Master heartbeat timeout exceeded: %d", mHeartbeatTimeoutMs);
        }
    }
}
Also used : Command(alluxio.grpc.Command) IOException(java.io.IOException) ConnectionFailedException(alluxio.exception.ConnectionFailedException)

Example 22 with Command

use of alluxio.grpc.Command in project alluxio by Alluxio.

the class BlockMasterRegisterStreamIntegrationTest method reregisterWithDelete.

/**
 * Tests below cover the race conditions during concurrent executions.
 *
 * When the worker registers for the 1st time, no clients should know this worker.
 * Therefore there are no concurrent client-incurred write operations on this worker.
 * The races typically happen when the worker re-registers with the master,
 * where some clients already know this worker and can directly invoke writes on the worker.
 *
 * Tests here verify the integrity of the master-side metadata.
 * In other words, we assume those writers succeed on the worker, and the subsequent
 * update on the master-side metadata should also succeed and be correct.
 */
@Test
public void reregisterWithDelete() throws Exception {
    // Scenario: a block is removed on the master while the worker that holds it is in the
    // middle of re-registering. Afterwards the block must be gone from the master metadata,
    // the worker must report one block fewer, and the next heartbeat must return a Free
    // command for the removed block.

    // Register the worker so the worker is marked active in master
    long workerId = mBlockMaster.getWorkerId(NET_ADDRESS_1);
    List<RegisterWorkerPRequest> requestChunks = RegisterStreamTestUtils.generateRegisterStreamForWorker(workerId);
    prepareBlocksOnMaster(requestChunks);
    Queue<Throwable> errorQueue = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(requestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(errorQueue));
    assertEquals(0, errorQueue.size());
    assertEquals(1, mBlockMaster.getWorkerCount());
    // Find a block to remove
    long blockToRemove = RegisterStreamTestUtils.findFirstBlock(requestChunks);
    // Register again, on a background thread so the delete below can run concurrently
    CountDownLatch latch = new CountDownLatch(1);
    Queue<Throwable> newErrorQueue = new ConcurrentLinkedQueue<>();
    // The task is a Runnable, so the future carries no result; it is only used to wait for
    // completion. Declared as Future<?> rather than the raw type.
    Future<?> f = mExecutorService.submit(() -> {
        sendStreamToMasterAndSignal(requestChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(newErrorQueue), latch);
    });
    // During the register stream, trigger a delete on worker
    latch.await();
    mBlockMaster.removeBlocks(ImmutableList.of(blockToRemove), true);
    // Wait for the register to finish
    f.get();
    // The block metadata has been deleted, so looking the block up must fail
    assertThrows(BlockInfoException.class, () -> {
        mBlockMaster.getBlockInfo(blockToRemove);
    });
    MasterWorkerInfo worker = mBlockMaster.getWorker(workerId);
    assertEquals(1, mBlockMaster.getWorkerCount());
    assertEquals(TIER_BLOCK_TOTAL - 1, worker.getBlockCount());
    // BlockMaster.removeBlocks() will first remove the block from master metadata
    // (with block lock) then update the block locations (with worker lock).
    // The worker lock is being held by the registering worker, but the 1st part
    // will likely succeed.
    // So during registration when checking on the block, the block is not recognized
    // any more and will remain in MasterWorkerInfo.mToRemoveBlocks.
    // In the next heartbeat the master will issue a command to remove the block
    // from the worker.
    // Even if the block is already removed on the worker it is fine,
    // because deletion of a not-found block is a noop.
    Command command = sendHeartbeatToMaster(workerId);
    assertEquals(Command.newBuilder().addData(blockToRemove).setCommandType(CommandType.Free).build(), command);
}
Also used : Command(alluxio.grpc.Command) MasterWorkerInfo(alluxio.master.block.meta.MasterWorkerInfo) Future(java.util.concurrent.Future) RegisterWorkerPRequest(alluxio.grpc.RegisterWorkerPRequest) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) CountDownLatch(java.util.concurrent.CountDownLatch) Test(org.junit.Test)

Example 23 with Command

use of alluxio.grpc.Command in project alluxio by Alluxio.

the class BlockMasterRegisterStreamIntegrationTest method registerExistingWorkerBlocksLost.

@Test
public void registerExistingWorkerBlocksLost() throws Exception {
    // Scenario: a worker that is already registered registers a second time with a block
    // list that is missing some of the blocks it reported the first time.
    long workerId = mBlockMaster.getWorkerId(NET_ADDRESS_1);

    // First registration: stream the full block list to the master
    List<RegisterWorkerPRequest> initialChunks = RegisterStreamTestUtils.generateRegisterStreamForWorker(workerId);
    prepareBlocksOnMaster(initialChunks);
    Queue<Throwable> initialErrors = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(initialChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(initialErrors));
    assertEquals(0, initialErrors.size());

    // The worker is known to the master with all blocks and nothing pending removal
    assertEquals(1, mBlockMaster.getWorkerCount());
    MasterWorkerInfo registeredWorker = mBlockMaster.getWorker(workerId);
    assertEquals(TIER_BLOCK_TOTAL, registeredWorker.getBlockCount());
    assertEquals(0, registeredWorker.getToRemoveBlockCount());

    // Rebuild the block layout from the same tier config, then drop a few blocks from it
    List<String> aliases = getTierAliases(parseTierConfig(TIER_CONFIG));
    Map<BlockStoreLocation, List<Long>> blocksByLocation = RpcBenchPreparationUtils.generateBlockIdOnTiers(parseTierConfig(TIER_CONFIG));
    Set<Long> missingBlocks = removeSomeBlocks(blocksByLocation);

    // Build a fresh register stream from the reduced block layout
    RegisterStreamer reducedStreamer = new RegisterStreamer(null, workerId, aliases, CAPACITY_MAP, USAGE_MAP, blocksByLocation, LOST_STORAGE, EMPTY_CONFIG);
    List<RegisterWorkerPRequest> reducedChunks = ImmutableList.copyOf(reducedStreamer);
    // Fewer blocks means the stream should be split into correspondingly fewer batches
    double remainingBlocks = TIER_BLOCK_TOTAL - missingBlocks.size();
    int expectedChunkCount = (int) Math.ceil(remainingBlocks / BATCH_SIZE);
    assertEquals(expectedChunkCount, reducedChunks.size());

    // Second registration with the reduced stream
    Queue<Throwable> reregisterErrors = new ConcurrentLinkedQueue<>();
    sendStreamToMaster(reducedChunks, RegisterStreamTestUtils.getErrorCapturingResponseObserver(reregisterErrors));
    assertEquals(0, reregisterErrors.size());

    // Still exactly one worker, now holding only the blocks it re-reported
    assertEquals(1, mBlockMaster.getWorkerCount());
    MasterWorkerInfo reregisteredWorker = mBlockMaster.getWorker(workerId);
    assertEquals(TIER_BLOCK_TOTAL - missingBlocks.size(), reregisteredWorker.getBlockCount());
    // The blocks that were not re-reported are tracked as to-be-removed,
    // so that unrecognized blocks do not live on the worker anymore
    assertEquals(missingBlocks.size(), reregisteredWorker.getToRemoveBlockCount());

    // The next heartbeat is expected to return a Free command listing exactly those blocks
    Command heartbeatCmd = sendHeartbeatToMaster(workerId);
    assertEquals(CommandType.Free, heartbeatCmd.getCommandType());
    assertEquals(missingBlocks, new HashSet<>(heartbeatCmd.getDataList()));

    // Verify the worker is readable and writable
    verifyWorkerWritable(workerId);
}
Also used : RegisterWorkerPRequest(alluxio.grpc.RegisterWorkerPRequest) RegisterStreamer(alluxio.worker.block.RegisterStreamer) Command(alluxio.grpc.Command) MasterWorkerInfo(alluxio.master.block.meta.MasterWorkerInfo) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) StorageList(alluxio.grpc.StorageList) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) BlockStoreLocation(alluxio.worker.block.BlockStoreLocation) Test(org.junit.Test)

Example 24 with Command

use of alluxio.grpc.Command in project alluxio by Alluxio.

the class ConcurrentBlockMasterTest method concurrentRemoveWithSameWorkerHeartbeatDifferentBlock.

/**
 * Runs a block removal (W1, removing block 1) concurrently with a heartbeat from the same
 * worker that reports a different block (block 2) as removed (W2), then verifies the worker
 * usage, the block metadata, and that a free command for block 1 is eventually returned to
 * the worker.
 */
@Test
public void concurrentRemoveWithSameWorkerHeartbeatDifferentBlock() throws Exception {
    // NOTE(review): only deleteMetadata == true is iterated here, so the else branch of the
    // deleteMetadata check in the verifier below is never reached - confirm whether the
    // false case was left out on purpose.
    for (boolean deleteMetadata : ImmutableList.of(true)) {
        // Prepare block 1 and 2 on the worker
        long worker1 = registerEmptyWorker(NET_ADDRESS_1);
        mBlockMaster.commitBlock(worker1, BLOCK1_LENGTH, "MEM", "MEM", BLOCK1_ID, BLOCK1_LENGTH);
        mBlockMaster.commitBlock(worker1, BLOCK1_LENGTH + BLOCK2_LENGTH, "MEM", "MEM", BLOCK2_ID, BLOCK2_LENGTH);
        // The same latch is handed to the master and to the concurrency harness below;
        // presumably it is what orders W1 relative to W2 - verify against setLatch and
        // concurrentWriterWithWriter.
        CountDownLatch w1Latch = new CountDownLatch(1);
        mBlockMaster.setLatch(w1Latch);
        // Set by W2 if its heartbeat already returned the free command for block 1,
        // so the verifier knows whether it still has to expect that command
        AtomicBoolean freeCommandSeen = new AtomicBoolean(false);
        concurrentWriterWithWriter(w1Latch, // the next argument is W1: removes block 1
        () -> {
            mBlockMaster.removeBlocks(ImmutableList.of(BLOCK1_ID), deleteMetadata);
            return null;
        }, // the next argument is W2: heartbeat from the same worker
        () -> {
            // A different block is removed on the same worker
            // This should contend on the worker metadata
            Command cmd = mBlockMaster.workerHeartbeat(worker1, MEM_CAPACITY, // next argument: block 2 is removed but 1 is still on the worker
            ImmutableMap.of("MEM", BLOCK1_LENGTH), // next argument: list of removed blockIds
            ImmutableList.of(BLOCK2_ID), ImmutableMap.of(), NO_LOST_STORAGE, ImmutableList.of());
            // Two outcomes are accepted, depending on how the heartbeat interleaves with W1:
            // either the heartbeat already returns the free command for block 1, or the
            // returned command is empty (presumably because the removal had not yet added
            // block 1 to the worker's to-be-removed list - confirm against removeBlocks).
            if (cmd.equals(FREE_BLOCK1_CMD)) {
                freeCommandSeen.set(true);
            } else {
                assertEquals(EMPTY_CMD, cmd);
            }
            return null;
        }, // the next argument is the verifier
        () -> {
            // After heartbeat, verify the worker info
            List<WorkerInfo> workerInfoList = mBlockMaster.getWorkerReport(GetWorkerReportOptions.defaults());
            assertEquals(1, workerInfoList.size());
            WorkerInfo worker1Info = findWorkerInfo(workerInfoList, worker1);
            // Only block 1 is still counted in the worker's used bytes
            assertEquals(BLOCK1_LENGTH, worker1Info.getUsedBytes());
            if (deleteMetadata) {
                verifyBlockNotExisting(mBlockMaster, BLOCK1_ID);
            } else {
                // All locations of block 1 are freed in metadata
                verifyBlockOnWorkers(mBlockMaster, BLOCK1_ID, BLOCK1_LENGTH, workerInfoList);
            }
            // Block 2 was reported as removed by the heartbeat, so it has no locations left
            verifyBlockOnWorkers(mBlockMaster, BLOCK2_ID, BLOCK2_LENGTH, ImmutableList.of());
            // If W2 did not already receive the free command for block 1, one more
            // heartbeat from the worker must return it
            if (!freeCommandSeen.get()) {
                Command cmd = mBlockMaster.workerHeartbeat(worker1, MEM_CAPACITY, // next argument: block 2 is removed but 1 is still on the worker
                ImmutableMap.of("MEM", BLOCK1_LENGTH), // next argument: list of removed blockIds
                ImmutableList.of(BLOCK2_ID), ImmutableMap.of(), NO_LOST_STORAGE, ImmutableList.of());
                assertEquals(FREE_BLOCK1_CMD, cmd);
            }
            return null;
        });
    }
}
Also used : AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Command(alluxio.grpc.Command) WorkerInfo(alluxio.wire.WorkerInfo) BlockMasterTestUtils.findWorkerInfo(alluxio.master.block.BlockMasterTestUtils.findWorkerInfo) CountDownLatch(java.util.concurrent.CountDownLatch) Test(org.junit.Test)

Example 25 with Command

use of alluxio.grpc.Command in project alluxio by Alluxio.

the class ConcurrentBlockMasterTest method concurrentRemoveWithRegisterNewWorkerSameBlock.

/**
 * Runs a removal of block 1 (W1) concurrently with the registration of a new worker that
 * reports the same block (W2), for both values of deleteMetadata, and verifies the worker
 * usage, the block metadata, and the commands returned by the following heartbeats.
 */
@Test
public void concurrentRemoveWithRegisterNewWorkerSameBlock() throws Exception {
    for (boolean deleteMetadata : ImmutableList.of(true, false)) {
        // The existing worker holds block 1
        long existingWorker = registerEmptyWorker(NET_ADDRESS_1);
        mBlockMaster.commitBlock(existingWorker, BLOCK1_LENGTH, "MEM", "MEM", BLOCK1_ID, BLOCK1_LENGTH);
        CountDownLatch removalLatch = new CountDownLatch(1);
        mBlockMaster.setLatch(removalLatch);
        // The worker that W2 will register; it only has an id so far
        long newWorker = mBlockMaster.getWorkerId(NET_ADDRESS_2);
        concurrentWriterWithWriter(
            removalLatch,
            // W1: remove block 1
            () -> {
                mBlockMaster.removeBlocks(ImmutableList.of(BLOCK1_ID), deleteMetadata);
                return null;
            },
            // W2: register the new worker, which also reports block 1
            () -> {
                // W1 will remove the block exclusively before the new worker registers with
                // the same block, so by the time the new worker comes in the block should be
                // removed already and its copy on the new worker should be ignored
                mBlockMaster.workerRegister(
                    newWorker,
                    Arrays.asList("MEM"),
                    MEM_CAPACITY,
                    ImmutableMap.of("MEM", BLOCK1_LENGTH),
                    ImmutableMap.of(newBlockLocationOnWorkerMemTier(newWorker), ImmutableList.of(BLOCK1_ID)),
                    NO_LOST_STORAGE,
                    RegisterWorkerPOptions.getDefaultInstance());
                return null;
            },
            // Verifier
            () -> {
                // After registration both workers are reported, each using one block's worth
                List<WorkerInfo> report = mBlockMaster.getWorkerReport(GetWorkerReportOptions.defaults());
                assertEquals(2, report.size());
                assertEquals(BLOCK1_LENGTH, findWorkerInfo(report, existingWorker).getUsedBytes());
                assertEquals(BLOCK1_LENGTH, findWorkerInfo(report, newWorker).getUsedBytes());

                // Verify the block metadata
                if (!deleteMetadata) {
                    // The master will issue commands to remove blocks on the next heartbeat
                    // So now the locations are still there
                    verifyBlockOnWorkers(mBlockMaster, BLOCK1_ID, BLOCK1_LENGTH, report);
                } else {
                    // If the block metadata has been removed, getting that will get an exception
                    assertThrows(BlockInfoException.class, () -> mBlockMaster.getBlockInfo(BLOCK1_ID));
                }

                // A heartbeat from the existing worker must get a command to remove the block
                Command existingWorkerCmd = mBlockMaster.workerHeartbeat(
                    existingWorker,
                    MEM_CAPACITY,
                    // the block has not yet been removed
                    ImmutableMap.of("MEM", BLOCK1_LENGTH),
                    // an empty list of removed blockIds
                    ImmutableList.of(),
                    ImmutableMap.of(),
                    NO_LOST_STORAGE,
                    ImmutableList.of());
                assertEquals(FREE_BLOCK1_CMD, existingWorkerCmd);

                // The new worker sends the same kind of heartbeat in both cases;
                // only the expected reply differs
                Command newWorkerCmd = mBlockMaster.workerHeartbeat(
                    newWorker,
                    MEM_CAPACITY,
                    // the block has not yet been removed
                    ImmutableMap.of("MEM", BLOCK1_LENGTH),
                    // an empty list of removed blockIds
                    ImmutableList.of(),
                    ImmutableMap.of(),
                    NO_LOST_STORAGE,
                    ImmutableList.of());
                if (deleteMetadata) {
                    // Block on the new worker will be freed because the block is already removed
                    // Unrecognized blocks will be freed
                    assertEquals(FREE_BLOCK1_CMD, newWorkerCmd);
                } else {
                    // Two cases can happen:
                    // 1. The new worker registers before the free operation checks the block
                    //    locations. In this case the block on the new worker will be freed
                    // 2. The new worker registers after the free operation is complete
                    //    In this case the block on the new worker will not be freed
                    assertTrue(newWorkerCmd.equals(FREE_BLOCK1_CMD) || newWorkerCmd.equals(EMPTY_CMD));
                }
                return null;
            });
    }
}
Also used : Command(alluxio.grpc.Command) WorkerInfo(alluxio.wire.WorkerInfo) BlockMasterTestUtils.findWorkerInfo(alluxio.master.block.BlockMasterTestUtils.findWorkerInfo) CountDownLatch(java.util.concurrent.CountDownLatch) Test(org.junit.Test)

Aggregations

Command (alluxio.grpc.Command)27 Test (org.junit.Test)23 CountDownLatch (java.util.concurrent.CountDownLatch)12 BlockMasterTestUtils.findWorkerInfo (alluxio.master.block.BlockMasterTestUtils.findWorkerInfo)10 WorkerInfo (alluxio.wire.WorkerInfo)10 FileSystemCommand (alluxio.wire.FileSystemCommand)9 RegisterWorkerPRequest (alluxio.grpc.RegisterWorkerPRequest)4 MasterWorkerInfo (alluxio.master.block.meta.MasterWorkerInfo)4 StorageList (alluxio.grpc.StorageList)2 CreateDirectoryContext (alluxio.master.file.contexts.CreateDirectoryContext)2 IOException (java.io.IOException)2 ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue)2 AbstractMasterClient (alluxio.AbstractMasterClient)1 AlluxioURI (alluxio.AlluxioURI)1 Constants (alluxio.Constants)1 PropertyKey (alluxio.conf.PropertyKey)1 BlockInfoException (alluxio.exception.BlockInfoException)1 ConnectionFailedException (alluxio.exception.ConnectionFailedException)1 FailedToAcquireRegisterLeaseException (alluxio.exception.FailedToAcquireRegisterLeaseException)1 InvalidPathException (alluxio.exception.InvalidPathException)1