Use of org.apache.accumulo.server.zookeeper.DistributedWorkQueue in project accumulo by apache.
From the class UnorderedWorkAssignerIT, method createWorkForFilesNeedingIt.
@Test
public void createWorkForFilesNeedingIt() throws Exception {
    ReplicationTarget target1 = new ReplicationTarget("cluster1", "table1", Table.ID.of("1"));
    ReplicationTarget target2 = new ReplicationTarget("cluster1", "table2", Table.ID.of("2"));
    Text serializedTarget1 = target1.toText();
    Text serializedTarget2 = target2.toText();
    String keyTarget1 = target1.getPeerName() + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target1.getRemoteIdentifier()
        + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target1.getSourceTableId();
    String keyTarget2 = target2.getPeerName() + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target2.getRemoteIdentifier()
        + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target2.getSourceTableId();

    Status.Builder builder = Status.newBuilder().setBegin(0).setEnd(0).setInfiniteEnd(true).setClosed(false).setCreatedTime(5L);
    Status status1 = builder.build();
    builder.setCreatedTime(10L);
    Status status2 = builder.build();

    // Create two mutations, both of which need replication work done
    BatchWriter bw = ReplicationTable.getBatchWriter(conn);
    String filename1 = UUID.randomUUID().toString();
    String filename2 = UUID.randomUUID().toString();
    String file1 = "/accumulo/wal/tserver+port/" + filename1;
    String file2 = "/accumulo/wal/tserver+port/" + filename2;

    Mutation m = new Mutation(file1);
    WorkSection.add(m, serializedTarget1, ProtobufUtil.toValue(status1));
    bw.addMutation(m);

    m = OrderSection.createMutation(file1, status1.getCreatedTime());
    OrderSection.add(m, target1.getSourceTableId(), ProtobufUtil.toValue(status1));
    bw.addMutation(m);

    m = new Mutation(file2);
    WorkSection.add(m, serializedTarget2, ProtobufUtil.toValue(status2));
    bw.addMutation(m);

    m = OrderSection.createMutation(file2, status2.getCreatedTime());
    OrderSection.add(m, target2.getSourceTableId(), ProtobufUtil.toValue(status2));
    bw.addMutation(m);
    bw.close();

    DistributedWorkQueue workQueue = createMock(DistributedWorkQueue.class);
    HashSet<String> queuedWork = new HashSet<>();
    assigner.setQueuedWork(queuedWork);
    assigner.setWorkQueue(workQueue);
    assigner.setMaxQueueSize(Integer.MAX_VALUE);

    // Make sure we expect the invocations in the order they were created
    String key = filename1 + "|" + keyTarget1;
    workQueue.addWork(key, file1);
    expectLastCall().once();

    key = filename2 + "|" + keyTarget2;
    workQueue.addWork(key, file2);
    expectLastCall().once();

    replay(workQueue);
    assigner.createWork();
    verify(workQueue);
}
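For reference, the queue key this test expects is the WAL file name, the peer name, the remote identifier, and the source table id, all joined by KEY_SEPARATOR. A minimal sketch of that layout, derived from the assertions above (the name queueKey is hypothetical; it is not the actual DistributedWorkQueueWorkAssignerHelper implementation):

// Sketch only: shows the key layout the test's expectations encode.
static String queueKey(String filename, ReplicationTarget target) {
    return filename + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target.getPeerName()
        + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target.getRemoteIdentifier()
        + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target.getSourceTableId();
}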
Use of org.apache.accumulo.server.zookeeper.DistributedWorkQueue in project accumulo by apache.
From the class SequentialWorkAssignerTest, method basicZooKeeperCleanup.
@Test
public void basicZooKeeperCleanup() throws Exception {
    DistributedWorkQueue workQueue = createMock(DistributedWorkQueue.class);
    ZooCache zooCache = createMock(ZooCache.class);
    Instance inst = createMock(Instance.class);

    Map<String, Map<Table.ID, String>> queuedWork = new TreeMap<>();
    Map<Table.ID, String> cluster1Work = new TreeMap<>();
    // Two files for cluster1, one for table '1' and another for table '2', for which we have assigned work
    cluster1Work.put(Table.ID.of("1"), DistributedWorkQueueWorkAssignerHelper.getQueueKey("file1", new ReplicationTarget("cluster1", "1", Table.ID.of("1"))));
    cluster1Work.put(Table.ID.of("2"), DistributedWorkQueueWorkAssignerHelper.getQueueKey("file2", new ReplicationTarget("cluster1", "2", Table.ID.of("2"))));
    queuedWork.put("cluster1", cluster1Work);

    assigner.setConnector(conn);
    assigner.setZooCache(zooCache);
    assigner.setWorkQueue(workQueue);
    assigner.setQueuedWork(queuedWork);

    expect(conn.getInstance()).andReturn(inst);
    expect(inst.getInstanceID()).andReturn("instance");
    // file1 replicated
    expect(zooCache.get(ZooUtil.getRoot("instance") + ReplicationConstants.ZOO_WORK_QUEUE + "/"
        + DistributedWorkQueueWorkAssignerHelper.getQueueKey("file1", new ReplicationTarget("cluster1", "1", Table.ID.of("1"))))).andReturn(null);
    // file2 still needs to replicate
    expect(zooCache.get(ZooUtil.getRoot("instance") + ReplicationConstants.ZOO_WORK_QUEUE + "/"
        + DistributedWorkQueueWorkAssignerHelper.getQueueKey("file2", new ReplicationTarget("cluster1", "2", Table.ID.of("2"))))).andReturn(new byte[0]);

    replay(workQueue, zooCache, conn, inst);
    assigner.cleanupFinishedWork();
    verify(workQueue, zooCache, conn, inst);

    Assert.assertEquals(1, cluster1Work.size());
    Assert.assertEquals(DistributedWorkQueueWorkAssignerHelper.getQueueKey("file2", new ReplicationTarget("cluster1", "2", Table.ID.of("2"))),
        cluster1Work.get(Table.ID.of("2")));
}
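The test pins down the cleanup contract: an entry whose ZooKeeper work node has disappeared is treated as finished and dropped from the in-memory map, while an entry whose node still exists is kept. A minimal sketch of that contract (illustrative only, based on what the mocks above verify; the real SequentialWorkAssigner.cleanupFinishedWork may differ in detail):

// Hypothetical sketch: drop queued work whose ZooKeeper node is gone (zooCache.get(...) == null).
void cleanupFinishedWorkSketch(Map<String, Map<Table.ID, String>> queuedWork, ZooCache zooCache, String instanceId) {
    String workRoot = ZooUtil.getRoot(instanceId) + ReplicationConstants.ZOO_WORK_QUEUE + "/";
    for (Map<Table.ID, String> clusterWork : queuedWork.values()) {
        // removing from the values view also removes the backing map entries
        clusterWork.values().removeIf(queueKey -> zooCache.get(workRoot + queueKey) == null);
    }
}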
Use of org.apache.accumulo.server.zookeeper.DistributedWorkQueue in project accumulo by apache.
From the class UnorderedWorkAssignerTest, method workQueuedUsingFileName.
@Test
public void workQueuedUsingFileName() throws Exception {
    ReplicationTarget target = new ReplicationTarget("cluster1", "table1", Table.ID.of("1"));
    DistributedWorkQueue workQueue = createMock(DistributedWorkQueue.class);
    Set<String> queuedWork = new HashSet<>();
    assigner.setQueuedWork(queuedWork);
    assigner.setWorkQueue(workQueue);

    Path p = new Path("/accumulo/wal/tserver+port/" + UUID.randomUUID());
    String expectedQueueKey = p.getName() + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target.getPeerName()
        + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target.getRemoteIdentifier()
        + DistributedWorkQueueWorkAssignerHelper.KEY_SEPARATOR + target.getSourceTableId();

    workQueue.addWork(expectedQueueKey, p.toString());
    expectLastCall().once();

    replay(workQueue);
    assigner.queueWork(p, target);

    Assert.assertEquals(1, queuedWork.size());
    Assert.assertEquals(expectedQueueKey, queuedWork.iterator().next());
}
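Read together with the mock expectations, queueWork's observable behavior is: derive the queue key from the file name and the target, submit it to the DistributedWorkQueue, and remember the key in the queuedWork set. A rough sketch of that behavior (an assumption based on what the test verifies, not the production code; the duplicate check is inferred from the Set semantics):

// Illustrative sketch of the behavior under test.
void queueWorkSketch(Path path, ReplicationTarget target) throws Exception {
    String queueKey = DistributedWorkQueueWorkAssignerHelper.getQueueKey(path.getName(), target);
    if (queuedWork.add(queueKey)) {
        // only submit work we have not already queued for this file
        workQueue.addWork(queueKey, path.toString());
    }
}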
Use of org.apache.accumulo.server.zookeeper.DistributedWorkQueue in project accumulo by apache.
From the class CopyFailed, method call.
@Override
public Repo<Master> call(long tid, Master master) throws Exception {
    // This needs to execute after the arbiter is stopped
    master.updateBulkImportStatus(source, BulkImportState.COPY_FILES);

    VolumeManager fs = master.getFileSystem();
    if (!fs.exists(new Path(error, BulkImport.FAILURES_TXT)))
        return new CleanUpBulkImport(tableId, source, bulk, error);

    HashMap<FileRef, String> failures = new HashMap<>();
    HashMap<FileRef, String> loadedFailures = new HashMap<>();

    try (BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(error, BulkImport.FAILURES_TXT)), UTF_8))) {
        String line = null;
        while ((line = in.readLine()) != null) {
            Path path = new Path(line);
            if (!fs.exists(new Path(error, path.getName())))
                failures.put(new FileRef(line, path), line);
        }
    }

    /*
     * I thought I could move files that have no file references in the table. However, it's possible a clone references a file. Therefore, only move files
     * that have no loaded markers.
     */

    // determine which failed files were loaded
    Connector conn = master.getConnector();
    try (Scanner mscanner = new IsolatedScanner(conn.createScanner(MetadataTable.NAME, Authorizations.EMPTY))) {
        mscanner.setRange(new KeyExtent(tableId, null, null).toMetadataRange());
        mscanner.fetchColumnFamily(TabletsSection.BulkFileColumnFamily.NAME);
        for (Entry<Key, Value> entry : mscanner) {
            if (Long.parseLong(entry.getValue().toString()) == tid) {
                FileRef loadedFile = new FileRef(fs, entry.getKey());
                String absPath = failures.remove(loadedFile);
                if (absPath != null) {
                    loadedFailures.put(loadedFile, absPath);
                }
            }
        }
    }

    // move failed files that were not loaded
    for (String failure : failures.values()) {
        Path orig = new Path(failure);
        Path dest = new Path(error, orig.getName());
        fs.rename(orig, dest);
        log.debug("tid " + tid + " renamed " + orig + " to " + dest + ": import failed");
    }

    if (loadedFailures.size() > 0) {
        DistributedWorkQueue bifCopyQueue = new DistributedWorkQueue(Constants.ZROOT + "/" + master.getInstance().getInstanceID() + Constants.ZBULK_FAILED_COPYQ,
            master.getConfiguration());
        HashSet<String> workIds = new HashSet<>();
        for (String failure : loadedFailures.values()) {
            Path orig = new Path(failure);
            Path dest = new Path(error, orig.getName());
            if (fs.exists(dest))
                continue;
            bifCopyQueue.addWork(orig.getName(), (failure + "," + dest).getBytes(UTF_8));
            workIds.add(orig.getName());
            log.debug("tid " + tid + " added to copyq: " + orig + " to " + dest + ": failed");
        }
        bifCopyQueue.waitUntilDone(workIds);
    }

    fs.deleteRecursively(new Path(error, BulkImport.FAILURES_TXT));
    return new CleanUpBulkImport(tableId, source, bulk, error);
}
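Each queued payload is the source and destination path joined by a comma, so a consumer can recover both with a single split. A sketch of how a processor might unpack it (an assumption about what BulkFailedCopyProcessor does with the payload queued above; the real class is not shown here, and the actual copy logic is omitted):

// Hypothetical processor sketch; DistributedWorkQueue.Processor is the callback interface
// used by startProcessing (see the TabletServer.run example below).
class BulkCopyProcessorSketch implements DistributedWorkQueue.Processor {
    @Override
    public DistributedWorkQueue.Processor newProcessor() {
        return new BulkCopyProcessorSketch();
    }

    @Override
    public void process(String workId, byte[] data) {
        String[] paths = new String(data, UTF_8).split(",");
        String source = paths[0], dest = paths[1];
        // copy source to dest here; once the work item completes,
        // the master's waitUntilDone() call can return
    }
}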
Use of org.apache.accumulo.server.zookeeper.DistributedWorkQueue in project accumulo by apache.
From the class TabletServer, method run.
// main loop listens for client requests
@Override
public void run() {
    SecurityUtil.serverLogin(SiteConfiguration.getInstance());

    // We can just create the ZooKeeper paths before we try to use them.
    try {
        ZooKeeperInitialization.ensureZooKeeperInitialized(ZooReaderWriter.getInstance(), ZooUtil.getRoot(getInstance()));
    } catch (KeeperException | InterruptedException e) {
        log.error("Could not ensure that ZooKeeper is properly initialized", e);
        throw new RuntimeException(e);
    }

    Metrics tserverMetrics = metricsFactory.createTabletServerMetrics(this);
    // Register MBeans
    try {
        tserverMetrics.register();
        mincMetrics.register();
        scanMetrics.register();
        updateMetrics.register();
    } catch (Exception e) {
        log.error("Error registering with JMX", e);
    }

    if (null != authKeyWatcher) {
        log.info("Seeding ZooKeeper watcher for authentication keys");
        try {
            authKeyWatcher.updateAuthKeys();
        } catch (KeeperException | InterruptedException e) {
            // TODO Does there need to be a better check? What are the error conditions that we'd fall out here? AUTH_FAILURE?
            // If we get the error, do we just put it on a timer and retry the exists(String, Watcher) call?
            log.error("Failed to perform initial check for authentication tokens in ZooKeeper. Delegation token authentication will be unavailable.", e);
        }
    }

    try {
        clientAddress = startTabletClientService();
    } catch (UnknownHostException e1) {
        throw new RuntimeException("Failed to start the tablet client service", e1);
    }
    announceExistence();

    try {
        walMarker.initWalMarker(getTabletSession());
    } catch (Exception e) {
        log.error("Unable to create WAL marker node in zookeeper", e);
        throw new RuntimeException(e);
    }

    ThreadPoolExecutor distWorkQThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.TSERV_WORKQ_THREADS), "distributed work queue");
    bulkFailedCopyQ = new DistributedWorkQueue(ZooUtil.getRoot(getInstance()) + Constants.ZBULK_FAILED_COPYQ, getConfiguration());
    try {
        bulkFailedCopyQ.startProcessing(new BulkFailedCopyProcessor(), distWorkQThreadPool);
    } catch (Exception e1) {
        throw new RuntimeException("Failed to start distributed work queue for copying ", e1);
    }

    try {
        logSorter.startWatchingForRecoveryLogs(distWorkQThreadPool);
    } catch (Exception ex) {
        log.error("Error setting watches for recoveries");
        throw new RuntimeException(ex);
    }

    // Start the thrift service listening for incoming replication requests
    try {
        replicationAddress = startReplicationService();
    } catch (UnknownHostException e) {
        throw new RuntimeException("Failed to start replication service", e);
    }

    // Start the pool to handle outgoing replications
    final ThreadPoolExecutor replicationThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.REPLICATION_WORKER_THREADS), "replication task");
    replWorker.setExecutor(replicationThreadPool);
    replWorker.run();

    // Check the configuration value for the size of the pool and, if it changed, resize the pool (after a 10 second delay, then every 30 seconds)
    final AccumuloConfiguration aconf = getConfiguration();
    Runnable replicationWorkThreadPoolResizer = new Runnable() {
        @Override
        public void run() {
            int maxPoolSize = aconf.getCount(Property.REPLICATION_WORKER_THREADS);
            if (replicationThreadPool.getMaximumPoolSize() != maxPoolSize) {
                log.info("Resizing thread pool for sending replication work from {} to {}", replicationThreadPool.getMaximumPoolSize(), maxPoolSize);
                replicationThreadPool.setMaximumPoolSize(maxPoolSize);
            }
        }
    };
    SimpleTimer.getInstance(aconf).schedule(replicationWorkThreadPoolResizer, 10000, 30000);

    final long CLEANUP_BULK_LOADED_CACHE_MILLIS = 15 * 60 * 1000;
    SimpleTimer.getInstance(aconf).schedule(new BulkImportCacheCleaner(this), CLEANUP_BULK_LOADED_CACHE_MILLIS, CLEANUP_BULK_LOADED_CACHE_MILLIS);

    HostAndPort masterHost;
    while (!serverStopRequested) {
        // send all of the pending messages
        try {
            MasterMessage mm = null;
            MasterClientService.Client iface = null;
            try {
                // wait until a message is ready to send, or a server stop was requested
                while (mm == null && !serverStopRequested) {
                    mm = masterMessages.poll(1000, TimeUnit.MILLISECONDS);
                }
                // have a message to send to the master, so grab a connection
                masterHost = getMasterAddress();
                iface = masterConnection(masterHost);
                TServiceClient client = iface;
                // if the send loop does not execute at all, the finally block should place mm back on the queue
                while (!serverStopRequested && mm != null && client != null && client.getOutputProtocol() != null
                    && client.getOutputProtocol().getTransport() != null && client.getOutputProtocol().getTransport().isOpen()) {
                    try {
                        mm.send(rpcCreds(), getClientAddressString(), iface);
                        mm = null;
                    } catch (TException ex) {
                        log.warn("Error sending message: queuing message again");
                        masterMessages.putFirst(mm);
                        mm = null;
                        throw ex;
                    }
                    // if any messages are immediately available, grab them and send them
                    mm = masterMessages.poll();
                }
            } finally {
                if (mm != null) {
                    masterMessages.putFirst(mm);
                }
                returnMasterConnection(iface);
                sleepUninterruptibly(1, TimeUnit.SECONDS);
            }
        } catch (InterruptedException e) {
            log.info("Interrupt Exception received, shutting down");
            serverStopRequested = true;
        } catch (Exception e) {
            // may have lost connection with master; loop back to the beginning and wait for a new one.
            // this way we survive master failures
            log.error(getClientAddressString() + ": TServerInfo: Exception. Master down?", e);
        }
    }

    // wait for shutdown to complete so that objects don't get prematurely finalized
    synchronized (this) {
        while (!shutdownComplete) {
            try {
                this.wait(1000);
            } catch (InterruptedException e) {
                log.error(e.toString());
            }
        }
    }

    log.debug("Stopping Replication Server");
    TServerUtils.stopTServer(this.replServer);
    log.debug("Stopping Thrift Servers");
    TServerUtils.stopTServer(server);

    try {
        log.debug("Closing filesystem");
        fs.close();
    } catch (IOException e) {
        log.warn("Failed to close filesystem : {}", e.getMessage(), e);
    }

    gcLogger.logGCInfo(getConfiguration());
    log.info("TServerInfo: stop requested. exiting ... ");

    try {
        tabletServerLock.unlock();
    } catch (Exception e) {
        log.warn("Failed to release tablet server lock", e);
    }
}
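Taken together with CopyFailed above, this shows both halves of the queue: the master enqueues file-copy work under ZBULK_FAILED_COPYQ and blocks on waitUntilDone, while each tablet server consumes it via startProcessing. A condensed sketch of the pairing (illustrative only; instance, conf, and pool are stand-ins for the values obtained in the surrounding code):

// Producer side (CopyFailed on the master): enqueue and block until tservers finish.
String queuePath = ZooUtil.getRoot(instance) + Constants.ZBULK_FAILED_COPYQ;
DistributedWorkQueue copyQueue = new DistributedWorkQueue(queuePath, conf);
copyQueue.addWork("fileName", "sourcePath,destPath".getBytes(UTF_8));
copyQueue.waitUntilDone(Collections.singleton("fileName"));

// Consumer side (TabletServer.run above): process items as they appear in ZooKeeper.
new DistributedWorkQueue(queuePath, conf).startProcessing(new BulkFailedCopyProcessor(), pool);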