use of alluxio.grpc.Command in project alluxio by Alluxio.
the class BlockMasterSync method heartbeat.
/**
 * Reports the worker's current block and storage state to the master and executes the
 * command the master sends back.
 *
 * <p>On success the time of the last successful heartbeat is refreshed. If sending the
 * heartbeat or executing the returned command fails with an {@link IOException} or a
 * {@link ConnectionFailedException}, the failure is logged and the master client is
 * disconnected. When a positive heartbeat timeout is configured and at least that much time
 * has passed since the last successful heartbeat, the failure is escalated: a
 * {@link RuntimeException} is thrown in test mode, otherwise
 * {@code ProcessUtils.fatalError} is invoked.
 */
@Override
public void heartbeat() {
  // Gather everything that goes into this heartbeat before talking to the master.
  BlockHeartbeatReport report = mBlockWorker.getReport();
  BlockStoreMeta meta = mBlockWorker.getStoreMeta();
  // Stays null until the master has actually answered; used to pick the log message below.
  Command masterCommand = null;
  List<alluxio.grpc.Metric> workerMetrics = MetricsSystem.reportWorkerMetrics();
  try {
    masterCommand = mMasterClient.heartbeat(
        mWorkerId.get(),
        meta.getCapacityBytesOnTiers(),
        meta.getUsedBytesOnTiers(),
        report.getRemovedBlocks(),
        report.getAddedBlocks(),
        report.getLostStorage(),
        workerMetrics);
    handleMasterCommand(masterCommand);
    mLastSuccessfulHeartbeatMs = System.currentTimeMillis();
  } catch (IOException | ConnectionFailedException e) {
    // A non-null command means the RPC returned and the failure came from executing it.
    if (masterCommand != null) {
      LOG.error("Failed to receive or execute master heartbeat command: {}",
          masterCommand.toString(), e);
    } else {
      LOG.error("Failed to receive master heartbeat command.", e);
    }
    mMasterClient.disconnect();

    // A non-positive timeout disables escalation: the error is only logged.
    if (mHeartbeatTimeoutMs <= 0) {
      return;
    }
    // Still within the allowed window since the last successful heartbeat.
    if (System.currentTimeMillis() - mLastSuccessfulHeartbeatMs < mHeartbeatTimeoutMs) {
      return;
    }
    if (ServerConfiguration.getBoolean(PropertyKey.TEST_MODE)) {
      throw new RuntimeException("Master heartbeat timeout exceeded: " + mHeartbeatTimeoutMs);
    }
    // TODO(andrew): Propagate the exception to the main thread and exit there.
    ProcessUtils.fatalError(LOG, "Master heartbeat timeout exceeded: %d", mHeartbeatTimeoutMs);
  }
}
use of alluxio.grpc.Command in project alluxio by Alluxio.
the class BlockMasterRegisterStreamIntegrationTest method reregisterWithDelete.
/**
 * Tests below cover the race conditions during concurrent executions.
 *
 * When the worker registers for the 1st time, no clients should know this worker.
 * Therefore there are no concurrent client-incurred write operations on this worker.
 * The races happen typically when the worker re-registers with the master,
 * where some clients already know this worker and can directly invoke writes on the worker.
 *
 * Tests here verify the integrity of the master-side metadata.
 * In other words, we assume those writers succeed on the worker, and the subsequent
 * update on the master-side metadata should also succeed and be correct.
 *
 * <p>This test re-registers an already registered worker and, while the register stream is
 * in flight, removes one of the worker's blocks on the master. It then checks that the block
 * metadata is gone, that the worker reports one block fewer, and that the next heartbeat
 * returns a {@code Free} command for the removed block.
 */
@Test
public void reregisterWithDelete() throws Exception {
  // Register the worker so the worker is marked active in master
  long workerId = mBlockMaster.getWorkerId(NET_ADDRESS_1);
  List<RegisterWorkerPRequest> requestChunks =
      RegisterStreamTestUtils.generateRegisterStreamForWorker(workerId);
  prepareBlocksOnMaster(requestChunks);
  Queue<Throwable> errorQueue = new ConcurrentLinkedQueue<>();
  sendStreamToMaster(requestChunks,
      RegisterStreamTestUtils.getErrorCapturingResponseObserver(errorQueue));
  assertEquals(0, errorQueue.size());
  assertEquals(1, mBlockMaster.getWorkerCount());

  // Find a block to remove
  long blockToRemove = RegisterStreamTestUtils.findFirstBlock(requestChunks);

  // Register again, on a background thread, so the delete below can run concurrently.
  CountDownLatch latch = new CountDownLatch(1);
  // NOTE(review): errors captured here are never asserted on; confirm whether the
  // re-registration is expected to complete without errors and assert accordingly.
  Queue<Throwable> newErrorQueue = new ConcurrentLinkedQueue<>();
  // The task produces no value, so the future is typed with a wildcard instead of being raw.
  Future<?> f = mExecutorService.submit(() -> {
    sendStreamToMasterAndSignal(requestChunks,
        RegisterStreamTestUtils.getErrorCapturingResponseObserver(newErrorQueue), latch);
  });

  // During the register stream, trigger a delete on worker
  latch.await();
  mBlockMaster.removeBlocks(ImmutableList.of(blockToRemove), true);

  // Wait for the register to finish
  f.get();

  assertThrows(BlockInfoException.class, () -> {
    mBlockMaster.getBlockInfo(blockToRemove);
  });
  MasterWorkerInfo worker = mBlockMaster.getWorker(workerId);
  assertEquals(1, mBlockMaster.getWorkerCount());
  assertEquals(TIER_BLOCK_TOTAL - 1, worker.getBlockCount());

  // BlockMaster.removeBlocks() will first remove the block from master metadata
  // (with block lock) then update the block locations (with worker lock).
  // The worker lock is being held by the registering worker, but the 1st part
  // will likely succeed.
  // So during registration when checking on the block, the block is not recognized
  // any more and will remain in MasterWorkerInfo.mToRemoveBlocks.
  // In the next heartbeat the master will issue a command to remove the block
  // from the worker.
  // Even if the block is already removed on the worker it is fine,
  // because deletion of a not-found block is a noop.
  Command command = sendHeartbeatToMaster(workerId);
  assertEquals(
      Command.newBuilder().addData(blockToRemove).setCommandType(CommandType.Free).build(),
      command);
}
use of alluxio.grpc.Command in project alluxio by Alluxio.
the class BlockMasterRegisterStreamIntegrationTest method registerExistingWorkerBlocksLost.
/**
 * Registers a worker, then registers it again with a block list from which some blocks have
 * been removed, and checks how the master accounts for the missing blocks.
 *
 * <p>After the second registration the worker's block count must shrink by the number of
 * lost blocks, the same number of blocks must be pending removal, and the next heartbeat
 * must return a {@code Free} command listing exactly the lost blocks.
 */
@Test
public void registerExistingWorkerBlocksLost() throws Exception {
  long workerId = mBlockMaster.getWorkerId(NET_ADDRESS_1);

  // --- First registration: the worker reports its full block list ---
  List<RegisterWorkerPRequest> initialChunks =
      RegisterStreamTestUtils.generateRegisterStreamForWorker(workerId);
  prepareBlocksOnMaster(initialChunks);
  Queue<Throwable> initialErrors = new ConcurrentLinkedQueue<>();
  sendStreamToMaster(initialChunks,
      RegisterStreamTestUtils.getErrorCapturingResponseObserver(initialErrors));
  assertEquals(0, initialErrors.size());

  // The master now knows one worker holding every generated block, with nothing to remove.
  assertEquals(1, mBlockMaster.getWorkerCount());
  MasterWorkerInfo registeredWorker = mBlockMaster.getWorker(workerId);
  assertEquals(TIER_BLOCK_TOTAL, registeredWorker.getBlockCount());
  assertEquals(0, registeredWorker.getToRemoveBlockCount());

  // --- Build a second register stream with some blocks dropped ---
  List<String> tierAliases = getTierAliases(parseTierConfig(TIER_CONFIG));
  Map<BlockStoreLocation, List<Long>> blocksByLocation =
      RpcBenchPreparationUtils.generateBlockIdOnTiers(parseTierConfig(TIER_CONFIG));
  Set<Long> lostBlocks = removeSomeBlocks(blocksByLocation);
  RegisterStreamer reducedStreamer = new RegisterStreamer(null, workerId, tierAliases,
      CAPACITY_MAP, USAGE_MAP, blocksByLocation, LOST_STORAGE, EMPTY_CONFIG);
  List<RegisterWorkerPRequest> reducedChunks = ImmutableList.copyOf(reducedStreamer);
  // Fewer blocks means the stream is split into correspondingly fewer batches.
  int expectedChunkCount =
      (int) Math.ceil((TIER_BLOCK_TOTAL - lostBlocks.size()) / (double) BATCH_SIZE);
  assertEquals(expectedChunkCount, reducedChunks.size());

  // --- Second registration with the reduced stream ---
  Queue<Throwable> reregisterErrors = new ConcurrentLinkedQueue<>();
  sendStreamToMaster(reducedChunks,
      RegisterStreamTestUtils.getErrorCapturingResponseObserver(reregisterErrors));
  assertEquals(0, reregisterErrors.size());

  // Still a single worker, now holding only the blocks that were not lost.
  assertEquals(1, mBlockMaster.getWorkerCount());
  MasterWorkerInfo reregisteredWorker = mBlockMaster.getWorker(workerId);
  assertEquals(TIER_BLOCK_TOTAL - lostBlocks.size(), reregisteredWorker.getBlockCount());
  // The master marks the lost blocks as to-be-removed,
  // so that blocks it no longer recognizes do not live on the worker anymore.
  assertEquals(lostBlocks.size(), reregisteredWorker.getToRemoveBlockCount());

  // The next heartbeat therefore carries a Free command for exactly the lost blocks.
  Command heartbeatCommand = sendHeartbeatToMaster(workerId);
  assertEquals(CommandType.Free, heartbeatCommand.getCommandType());
  Set<Long> blocksToFree = new HashSet<>(heartbeatCommand.getDataList());
  assertEquals(lostBlocks, blocksToFree);

  // Verify the worker is readable and writable
  verifyWorkerWritable(workerId);
}
use of alluxio.grpc.Command in project alluxio by Alluxio.
the class ConcurrentBlockMasterTest method concurrentRemoveWithSameWorkerHeartbeatDifferentBlock.
/**
 * Runs a block removal (W1) concurrently with a heartbeat from the same worker that reports
 * a different block as removed (W2), then verifies the resulting master-side state.
 *
 * <p>W1 removes block 1 on the master; W2 heartbeats from the worker reporting block 2 as
 * removed. The heartbeat response is either the free command for block 1 or the empty
 * command. If the free command was not observed by W2, the verifier sends one more
 * heartbeat and expects the free command then.
 */
@Test
public void concurrentRemoveWithSameWorkerHeartbeatDifferentBlock() throws Exception {
  // NOTE(review): only deleteMetadata=true is iterated here, so the non-delete branch in
  // the verifier below is not exercised; confirm whether false should be covered too.
  for (boolean deleteMetadata : ImmutableList.of(true)) {
    // Prepare block 1 and 2 on the worker
    long workerId = registerEmptyWorker(NET_ADDRESS_1);
    mBlockMaster.commitBlock(workerId, BLOCK1_LENGTH, "MEM", "MEM", BLOCK1_ID, BLOCK1_LENGTH);
    mBlockMaster.commitBlock(workerId, BLOCK1_LENGTH + BLOCK2_LENGTH, "MEM", "MEM",
        BLOCK2_ID, BLOCK2_LENGTH);

    CountDownLatch w1Latch = new CountDownLatch(1);
    mBlockMaster.setLatch(w1Latch);
    // Records whether W2's heartbeat already received the free command for block 1.
    AtomicBoolean sawFreeCommand = new AtomicBoolean(false);

    concurrentWriterWithWriter(w1Latch,
        // W1: remove block 1 on the master
        () -> {
          mBlockMaster.removeBlocks(ImmutableList.of(BLOCK1_ID), deleteMetadata);
          return null;
        },
        // W2: a different block is removed on the same worker.
        // This should contend on the worker metadata.
        () -> {
          Command response = mBlockMaster.workerHeartbeat(
              workerId,
              MEM_CAPACITY,
              // Block 2 is removed but 1 is still on the worker
              ImmutableMap.of("MEM", BLOCK1_LENGTH),
              // list of removed blockIds
              ImmutableList.of(BLOCK2_ID),
              ImmutableMap.of(),
              NO_LOST_STORAGE,
              ImmutableList.of());
          // The response is either the free command for block 1 or nothing at all;
          // remember which, so the verifier knows whether a follow-up heartbeat is needed.
          if (!response.equals(FREE_BLOCK1_CMD)) {
            assertEquals(EMPTY_CMD, response);
          } else {
            sawFreeCommand.set(true);
          }
          return null;
        },
        // Verifier
        () -> {
          // After heartbeat, verify the worker info
          List<WorkerInfo> workerReport =
              mBlockMaster.getWorkerReport(GetWorkerReportOptions.defaults());
          assertEquals(1, workerReport.size());
          WorkerInfo reportedWorker = findWorkerInfo(workerReport, workerId);
          assertEquals(BLOCK1_LENGTH, reportedWorker.getUsedBytes());

          if (deleteMetadata) {
            verifyBlockNotExisting(mBlockMaster, BLOCK1_ID);
          } else {
            // Block 1 is expected to be reported on the workers in the report.
            verifyBlockOnWorkers(mBlockMaster, BLOCK1_ID, BLOCK1_LENGTH, workerReport);
          }
          // Block 2 was reported as removed, so it has no locations left.
          verifyBlockOnWorkers(mBlockMaster, BLOCK2_ID, BLOCK2_LENGTH, ImmutableList.of());

          // If W2 did not get the free command, the to-be-removed list was updated
          // afterwards and the next heartbeat must deliver it.
          if (!sawFreeCommand.get()) {
            Command followUp = mBlockMaster.workerHeartbeat(
                workerId,
                MEM_CAPACITY,
                // Block 2 is removed but 1 is still on the worker
                ImmutableMap.of("MEM", BLOCK1_LENGTH),
                // list of removed blockIds
                ImmutableList.of(BLOCK2_ID),
                ImmutableMap.of(),
                NO_LOST_STORAGE,
                ImmutableList.of());
            assertEquals(FREE_BLOCK1_CMD, followUp);
          }
          return null;
        });
  }
}
use of alluxio.grpc.Command in project alluxio by Alluxio.
the class ConcurrentBlockMasterTest method concurrentRemoveWithRegisterNewWorkerSameBlock.
/**
 * Runs a block removal (W1) concurrently with the registration of a new worker that reports
 * the same block (W2), for both values of {@code deleteMetadata}.
 *
 * <p>The verifier checks the reported usage of both workers, the block metadata, and the
 * commands returned by a subsequent heartbeat from each worker: worker 1 must be told to
 * free the block; worker 2 must be told to free it when the metadata was deleted, and may
 * get either the free command or the empty command otherwise.
 */
@Test
public void concurrentRemoveWithRegisterNewWorkerSameBlock() throws Exception {
  for (boolean deleteMetadata : ImmutableList.of(true, false)) {
    // Prepare the first worker and commit the block on it
    long firstWorker = registerEmptyWorker(NET_ADDRESS_1);
    mBlockMaster.commitBlock(firstWorker, BLOCK1_LENGTH, "MEM", "MEM", BLOCK1_ID,
        BLOCK1_LENGTH);

    CountDownLatch w1Latch = new CountDownLatch(1);
    mBlockMaster.setLatch(w1Latch);

    // A new worker as the W2
    long secondWorker = mBlockMaster.getWorkerId(NET_ADDRESS_2);

    concurrentWriterWithWriter(w1Latch,
        // W1: remove the block on the master
        () -> {
          mBlockMaster.removeBlocks(ImmutableList.of(BLOCK1_ID), deleteMetadata);
          return null;
        },
        // W2: the new worker registers and reports that it holds the block.
        // W1 will remove the block exclusively before worker 2 registers with the same
        // block, so when worker 2 comes in the block should be removed already and the
        // block on worker 2 should be ignored.
        () -> {
          mBlockMaster.workerRegister(
              secondWorker,
              Arrays.asList("MEM"),
              MEM_CAPACITY,
              ImmutableMap.of("MEM", BLOCK1_LENGTH),
              ImmutableMap.of(newBlockLocationOnWorkerMemTier(secondWorker),
                  ImmutableList.of(BLOCK1_ID)),
              NO_LOST_STORAGE,
              RegisterWorkerPOptions.getDefaultInstance());
          return null;
        },
        // Verifier
        () -> {
          // After registration, verify the worker info
          List<WorkerInfo> workerReport =
              mBlockMaster.getWorkerReport(GetWorkerReportOptions.defaults());
          assertEquals(2, workerReport.size());
          WorkerInfo firstWorkerInfo = findWorkerInfo(workerReport, firstWorker);
          assertEquals(BLOCK1_LENGTH, firstWorkerInfo.getUsedBytes());
          WorkerInfo secondWorkerInfo = findWorkerInfo(workerReport, secondWorker);
          assertEquals(BLOCK1_LENGTH, secondWorkerInfo.getUsedBytes());

          // Verify the block metadata
          if (deleteMetadata) {
            // If the block metadata has been removed, getting that will get an exception
            assertThrows(BlockInfoException.class, () -> mBlockMaster.getBlockInfo(BLOCK1_ID));
          } else {
            // The master will issue commands to remove blocks on the next heartbeat
            // So now the locations are still there
            verifyBlockOnWorkers(mBlockMaster, BLOCK1_ID, BLOCK1_LENGTH, workerReport);
          }

          // Verify the heartbeat from worker 1 will get a command to remove the block
          Command firstWorkerCmd = mBlockMaster.workerHeartbeat(
              firstWorker,
              MEM_CAPACITY,
              // the block has not yet been removed
              ImmutableMap.of("MEM", BLOCK1_LENGTH),
              // an empty list of removed blockIds
              ImmutableList.of(),
              ImmutableMap.of(),
              NO_LOST_STORAGE,
              ImmutableList.of());
          assertEquals(FREE_BLOCK1_CMD, firstWorkerCmd);

          // Worker 2 sends the same kind of heartbeat in both modes; only the expected
          // response differs.
          Command secondWorkerCmd = mBlockMaster.workerHeartbeat(
              secondWorker,
              MEM_CAPACITY,
              // the block has not yet been removed
              ImmutableMap.of("MEM", BLOCK1_LENGTH),
              // an empty list of removed blockIds
              ImmutableList.of(),
              ImmutableMap.of(),
              NO_LOST_STORAGE,
              ImmutableList.of());
          if (deleteMetadata) {
            // Block on worker 2 will be freed because the block is already removed:
            // unrecognized blocks will be freed
            assertEquals(FREE_BLOCK1_CMD, secondWorkerCmd);
          } else {
            // Two cases can happen:
            // 1. Worker 2 registers before the free operation checks the block locations
            //    In this case the block on worker 2 will be freed
            // 2. Worker 2 registers after the free operation is complete
            //    In this case the block on worker 2 will not be freed
            assertTrue(secondWorkerCmd.equals(FREE_BLOCK1_CMD)
                || secondWorkerCmd.equals(EMPTY_CMD));
          }
          return null;
        });
  }
}
Aggregations