use of org.apache.zookeeper.KeeperException.BadVersionException in project accumulo by apache.
the class ZooReaderWriterTest method testMutateWithBadVersion.
@Test
public void testMutateWithBadVersion() throws Exception {
  final String path = "/foo";
  final byte[] value = new byte[] { 0 };
  final List<ACL> acls = Collections.emptyList();
  final byte[] mutatedBytes = new byte[] { 1 };
  Mutator mutator = new Mutator() {

    @Override
    public byte[] mutate(byte[] currentValue) throws Exception {
      return mutatedBytes;
    }
  };
  Method getDataMethod = ZooReaderWriter.class.getMethod("getData", String.class, boolean.class, Stat.class);
  zrw = EasyMock.createMockBuilder(ZooReaderWriter.class)
      .addMockedMethods("getRetryFactory", "getZooKeeper")
      .addMockedMethod(getDataMethod)
      .createMock();
  EasyMock.expect(zrw.getRetryFactory()).andReturn(retryFactory).anyTimes();
  EasyMock.expect(zrw.getZooKeeper()).andReturn(zk).anyTimes();
  Stat stat = new Stat();
  zk.create(path, value, acls, CreateMode.PERSISTENT);
  EasyMock.expectLastCall().andThrow(new NodeExistsException()).once();
  EasyMock.expect(zrw.getData(path, false, stat)).andReturn(new byte[] { 3 }).times(2);
  // BadVersionException should retry
  EasyMock.expect(zk.setData(path, mutatedBytes, 0)).andThrow(new BadVersionException());
  // Let 2nd setData succeed
  EasyMock.expect(zk.setData(path, mutatedBytes, 0)).andReturn(null);
  EasyMock.replay(zk, zrw, retryFactory, retry);
  Assert.assertArrayEquals(new byte[] { 1 }, zrw.mutate(path, value, acls, mutator));
  EasyMock.verify(zk, zrw, retryFactory, retry);
}
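The behavior being tested is an optimistic read-modify-write loop: read the data together with its Stat, apply the Mutator, and write back with the observed version, retrying when another writer wins the race. Below is a minimal sketch of that pattern against the raw ZooKeeper API; the helper name, the unbounded loop, and the local Mutator interface are illustrative, not Accumulo's actual ZooReaderWriter implementation.

import org.apache.zookeeper.KeeperException.BadVersionException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;

final class OptimisticMutate {

  /** Illustrative stand-in for the Mutator callback the test supplies. */
  interface Mutator {
    byte[] mutate(byte[] currentValue) throws Exception;
  }

  /**
   * Read the node together with its Stat, apply the mutator, and write back with the
   * observed version. A BadVersionException means another writer updated the node in the
   * meantime, so the loop re-reads and tries again.
   */
  static byte[] mutateWithRetry(ZooKeeper zk, String path, Mutator mutator) throws Exception {
    while (true) {
      Stat stat = new Stat();
      byte[] current = zk.getData(path, false, stat);
      byte[] mutated = mutator.mutate(current);
      try {
        zk.setData(path, mutated, stat.getVersion());
        return mutated;
      } catch (BadVersionException e) {
        // Lost the race; loop, re-read the current value and version, and try again.
      }
    }
  }
}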
use of org.apache.zookeeper.KeeperException.BadVersionException in project commons by twitter.
the class ZooKeeperUtilsTest method testMagicVersionNumberAllowsUnconditionalUpdate.
@Test
public void testMagicVersionNumberAllowsUnconditionalUpdate() throws Exception {
  String nodePath = "/foo";
  ZooKeeperClient zkClient = createZkClient();
  zkClient.get().create(nodePath, "init".getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
  Stat initStat = new Stat();
  byte[] initialData = zkClient.get().getData(nodePath, false, initStat);
  assertArrayEquals("init".getBytes(), initialData);
  // bump the version
  Stat rev1Stat = zkClient.get().setData(nodePath, "rev1".getBytes(), initStat.getVersion());
  try {
    zkClient.get().setData(nodePath, "rev2".getBytes(), initStat.getVersion());
    fail("expected correct version to be required");
  } catch (BadVersionException e) {
    // expected
  }
  // expect using the correct version to work
  Stat rev2Stat = zkClient.get().setData(nodePath, "rev2".getBytes(), rev1Stat.getVersion());
  assertNotEqual(ZooKeeperUtils.ANY_VERSION, rev2Stat.getVersion());
  zkClient.get().setData(nodePath, "force-write".getBytes(), ZooKeeperUtils.ANY_VERSION);
  Stat forceWriteStat = new Stat();
  byte[] forceWriteData = zkClient.get().getData(nodePath, false, forceWriteStat);
  assertArrayEquals("force-write".getBytes(), forceWriteData);
  assertTrue(forceWriteStat.getVersion() > rev2Stat.getVersion());
  assertNotEqual(ZooKeeperUtils.ANY_VERSION, forceWriteStat.getVersion());
}
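The "magic version number" this test exercises is ZooKeeper's convention that a version argument of -1 skips the compare-and-set check on setData; ZooKeeperUtils.ANY_VERSION is assumed here to wrap that wire value. A small sketch of both write modes, with hypothetical helper names:

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;

final class VersionedWrites {

  // ZooKeeper's wire-level "match any version" value; ZooKeeperUtils.ANY_VERSION is assumed to wrap it.
  static final int ANY_VERSION = -1;

  /**
   * Conditional write: succeeds only while the node is still at expectedVersion; otherwise
   * the server rejects it with KeeperException.BadVersionException.
   */
  static Stat compareAndSet(ZooKeeper zk, String path, byte[] data, int expectedVersion)
      throws KeeperException, InterruptedException {
    return zk.setData(path, data, expectedVersion);
  }

  /** Unconditional write: version -1 tells the server to skip the version check entirely. */
  static Stat forceSet(ZooKeeper zk, String path, byte[] data)
      throws KeeperException, InterruptedException {
    return zk.setData(path, data, ANY_VERSION);
  }
}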
use of org.apache.zookeeper.KeeperException.BadVersionException in project helios by spotify.
the class ZooKeeperMasterModel method updateDeploymentGroupHosts.
@Override
public void updateDeploymentGroupHosts(final String groupName, final List<String> hosts)
    throws DeploymentGroupDoesNotExistException {
  log.debug("updating deployment-group hosts: name={}", groupName);
  final ZooKeeperClient client = provider.get("updateDeploymentGroupHosts");
  try {
    final DeploymentGroupStatus status = getDeploymentGroupStatus(groupName);
    if (!allowHostChange(status)) {
      return;
    }
    // statusDeploymentGroupRemovedHosts may not exist for deployment groups created before it was
    // introduced.
    client.ensurePathAndSetData(Paths.statusDeploymentGroupRemovedHosts(groupName),
        Json.asBytesUnchecked(emptyList()));
    final List<String> curHosts = getHosts(client, Paths.statusDeploymentGroupHosts(groupName));
    final List<String> previouslyRemovedHosts =
        getHosts(client, Paths.statusDeploymentGroupRemovedHosts(groupName));
    final List<String> removedHosts = removedHosts(curHosts, hosts, previouslyRemovedHosts);
    if (hosts.equals(curHosts) && removedHosts.equals(previouslyRemovedHosts)) {
      return;
    }
    log.info("for deployment-group name={}, curHosts={}, new hosts={}, "
             + "previouslyRemovedHosts={}, derived removedHosts={}",
        groupName, curHosts, hosts, previouslyRemovedHosts, removedHosts);
    final List<ZooKeeperOperation> ops = Lists.newArrayList();
    ops.add(set(Paths.statusDeploymentGroupHosts(groupName), Json.asBytes(hosts)));
    ops.add(set(Paths.statusDeploymentGroupRemovedHosts(groupName), Json.asBytes(removedHosts)));
    final Node dgn = client.getNode(Paths.configDeploymentGroup(groupName));
    final Integer deploymentGroupVersion = dgn.getStat().getVersion();
    DeploymentGroup deploymentGroup = Json.read(dgn.getBytes(), DeploymentGroup.class);
    List<Map<String, Object>> events = ImmutableList.of();
    if (deploymentGroup.getJobId() != null && updateOnHostChange(deploymentGroup, status)) {
      deploymentGroup = deploymentGroup.toBuilder().setRollingUpdateReason(HOSTS_CHANGED).build();
      // Fail the transaction if the deployment group has been updated elsewhere.
      ops.add(check(Paths.configDeploymentGroup(groupName), deploymentGroupVersion));
      // NOTE: If the DG was removed, this set() causes the transaction to fail, because
      // removing the DG removes this node. It's *important* that there's an operation that
      // causes the transaction to fail if the DG was removed, or we'll end up with
      // inconsistent state.
      ops.add(set(Paths.configDeploymentGroup(deploymentGroup.getName()), deploymentGroup));
      final RollingUpdateOp op = getInitRollingUpdateOps(deploymentGroup, hosts, removedHosts, client);
      ops.addAll(op.operations());
      events = op.events();
    }
    log.info("starting zookeeper transaction for updateDeploymentGroupHosts on deployment-group: "
             + "name={} jobId={} operations={}", groupName, deploymentGroup.getJobId(), ops);
    client.transaction(ops);
    emitEvents(deploymentGroupEventTopic, events);
  } catch (BadVersionException e) {
    // Some other master beat us in processing this host update; not exceptional.
    // Ideally we would check the path in the exception, but Curator doesn't provide a path
    // for exceptions thrown as part of a transaction.
    log.info("zookeeper transaction for updateDeploymentGroupHosts on deployment-group was "
             + "processed by another master: name={}", groupName);
  } catch (NoNodeException e) {
    throw new DeploymentGroupDoesNotExistException(groupName, e);
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("updating deployment group hosts failed", e);
  }
}
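The essential trick above is adding a check operation on the observed version of the deployment-group config node to the same transaction as the writes, so that a concurrent update (or removal) by another master fails the whole batch, which is then treated as a benign race. A sketch of that guard using ZooKeeper's raw multi API rather than Helios's ZooKeeperOperation helpers; the class, method, and parameter names are illustrative, and the assumption is that a version mismatch on the check op surfaces as BadVersionException:

import java.util.List;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.BadVersionException;
import org.apache.zookeeper.Op;
import org.apache.zookeeper.ZooKeeper;

final class GuardedHostUpdate {

  /**
   * Write the new host list only if the config node is still at the version we observed.
   * If another master changed or removed the config node first, the multi fails; a version
   * mismatch on the check op is expected to surface as BadVersionException, handled here as
   * a benign race rather than an error.
   */
  static boolean updateIfUnchanged(ZooKeeper zk, String hostsPath, byte[] hostsJson,
      String configPath, int observedConfigVersion) throws KeeperException, InterruptedException {
    List<Op> ops = List.of(
        Op.check(configPath, observedConfigVersion),
        Op.setData(hostsPath, hostsJson, -1));
    try {
      zk.multi(ops);
      return true;
    } catch (BadVersionException e) {
      // Another writer got there first; the caller can simply skip or re-read and retry.
      return false;
    }
  }
}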
use of org.apache.zookeeper.KeeperException.BadVersionException in project helios by spotify.
the class ZooKeeperMasterModel method rollingUpdateStep.
@Override
public void rollingUpdateStep() {
  final ZooKeeperClient client = provider.get("rollingUpdateStep");
  final Map<String, VersionedValue<DeploymentGroupTasks>> tasksMap = getDeploymentGroupTasks(client);
  for (final Map.Entry<String, VersionedValue<DeploymentGroupTasks>> entry : tasksMap.entrySet()) {
    final String deploymentGroupName = entry.getKey();
    final VersionedValue<DeploymentGroupTasks> versionedTasks = entry.getValue();
    final DeploymentGroupTasks tasks = versionedTasks.value();
    final int taskIndex = tasks.getTaskIndex();
    log.info("rolling-update step on deployment-group {}. Doing taskIndex {} of {}: {}. ",
        deploymentGroupName, taskIndex, tasks.getRolloutTasks().size(), tasks.getRolloutTasks().get(taskIndex));
    try {
      final RollingUpdateOpFactory opFactory =
          new RollingUpdateOpFactory(tasks, DEPLOYMENT_GROUP_EVENT_FACTORY);
      final RolloutTask task = tasks.getRolloutTasks().get(taskIndex);
      final RollingUpdateOp op = processRollingUpdateTask(client, opFactory, task, tasks.getDeploymentGroup());
      if (!op.operations().isEmpty()) {
        final List<ZooKeeperOperation> ops = Lists.newArrayList();
        ops.add(check(Paths.statusDeploymentGroupTasks(deploymentGroupName), versionedTasks.version()));
        ops.addAll(op.operations());
        log.info("rolling-update step on deployment-group: name={}, zookeeper operations={}",
            deploymentGroupName, ops);
        try {
          client.transaction(ops);
          emitEvents(deploymentGroupEventTopic, op.events());
        } catch (BadVersionException e) {
          // Some other master beat us in processing this rolling update step; not exceptional.
          // Ideally we would check the path in the exception, but Curator doesn't provide a path
          // for exceptions thrown as part of a transaction.
          log.info("rolling-update step on deployment-group was processed by another master"
                   + ": name={}, zookeeper operations={}", deploymentGroupName, ops);
        } catch (KeeperException e) {
          log.error("rolling-update on deployment-group {} failed. {}", deploymentGroupName, e.getMessage(), e);
        }
      }
    } catch (final Exception e) {
      log.error("error processing rolling update step for {}", deploymentGroupName, e);
    }
  }
}
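rollingUpdateStep uses the same guard: the tasks node is read as a VersionedValue, and the per-step transaction includes a check on that version, so if another master has already advanced the rollout the transaction fails with a BadVersionException and this master just logs and moves on. A minimal sketch of the read-value-with-version half, using the raw ZooKeeper API; the record name is illustrative, not Helios's VersionedValue:

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;

/**
 * A value paired with the node version it was read at. The version is later passed to a
 * check operation (as in the previous sketch) so that a concurrent writer makes the
 * transaction fail with BadVersionException instead of silently being overwritten.
 */
record VersionedBytes(byte[] value, int version) {

  static VersionedBytes read(ZooKeeper zk, String path) throws KeeperException, InterruptedException {
    Stat stat = new Stat();
    byte[] data = zk.getData(path, false, stat);
    return new VersionedBytes(data, stat.getVersion());
  }
}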
use of org.apache.zookeeper.KeeperException.BadVersionException in project Saturn by vipshop.
the class ShardingService method shardingIfNecessary.
/**
 * If resharding is necessary and the current node is the leader, reshard the job.
 */
public synchronized void shardingIfNecessary() throws JobShuttingDownException {
  if (isShutdown) {
    return;
  }
  GetDataStat getDataStat = null;
  if (getJobNodeStorage().isJobNodeExisted(ShardingNode.NECESSARY)) {
    getDataStat = getNecessaryDataStat();
  }
  // If the sharding/necessary content is empty or is "0", return; otherwise sharding is required.
  if (getDataStat == null || SHARDING_UN_NECESSARY.equals(getDataStat.getData())) {
    return;
  }
  // If this node is not the leader, wait for the leader to finish (this loops until either:
  // 1. the service is shut down, or 2. sharding is no longer necessary and not in the processing state).
  if (blockUntilShardingComplatedIfNotLeader()) {
    return;
  }
  // If any sharding item of the job is still running, wait (indefinitely).
  waitingOtherJobCompleted();
  // Create an ephemeral node to mark that sharding is in progress.
  getJobNodeStorage().fillEphemeralJobNode(ShardingNode.PROCESSING, "");
  try {
    // Delete the sharding nodes of every JobServer under this job.
    clearShardingInfo();
    int maxRetryTime = 3;
    int retryCount = 0;
    while (!isShutdown) {
      int version = getDataStat.getVersion();
      // First try to get the shard content from the job/leader/sharding/necessary node;
      // if that fails, fall back to $SaturnExecutors/sharding/content.
      // key is executor, value is sharding items
      Map<String, List<Integer>> shardingItems =
          namespaceShardingContentService.getShardContent(jobName, getDataStat.getData());
      try {
        // The (check + create) of every JobServer node, plus resetting the sharding/necessary
        // content to 0, are committed as a single transaction.
        CuratorTransactionFinal curatorTransactionFinal =
            getJobNodeStorage().getClient().inTransaction().check().forPath("/").and();
        for (Entry<String, List<Integer>> entry : shardingItems.entrySet()) {
          curatorTransactionFinal.create()
              .forPath(JobNodePath.getNodeFullPath(jobName, ShardingNode.getShardingNode(entry.getKey())),
                  ItemUtils.toItemsString(entry.getValue()).getBytes(StandardCharsets.UTF_8))
              .and();
        }
        curatorTransactionFinal.setData().withVersion(version)
            .forPath(JobNodePath.getNodeFullPath(jobName, ShardingNode.NECESSARY),
                SHARDING_UN_NECESSARY.getBytes(StandardCharsets.UTF_8))
            .and();
        curatorTransactionFinal.commit();
        break;
      } catch (BadVersionException e) {
        LogUtils.warn(log, jobName, "zookeeper bad version exception happens", e);
        if (++retryCount <= maxRetryTime) {
          LogUtils.info(log, jobName,
              "bad version because of concurrency, will retry to get shards from sharding/necessary later");
          // NOSONAR
          Thread.sleep(200L);
          getDataStat = getNecessaryDataStat();
        }
      } catch (Exception e) {
        LogUtils.warn(log, jobName, "commit shards failed", e);
        /**
         * Known scenario:
         * The exception is a NoNodeException, seen in namespaces with many jobs while business
         * containers come and go. A large number of sharding tasks makes the computed result lag
         * behind, and some servers are deleted in the meantime, so the commit fails with a
         * NoNode exception.
         *
         * Whether to retry:
         * If the job stays enabled, the necessary node will eventually be updated correctly, so
         * there is no need to retry proactively; retrying may just fetch the data early and fetch
         * it again later, which does little harm.
         * If the job is disabled in the meantime, the necessary node will not be updated; the data
         * read from it is stale and the commit will keep failing, so the retry must fetch data
         * from the content node.
         * If the commit fails for some other, unknown reason, retrying with data from the content
         * node is also safe.
         * To be safe, therefore, always retry with data fetched from the content node.
         */
        if (++retryCount <= maxRetryTime) {
          LogUtils.info(log, jobName, "unexpected error, will retry to get shards from sharding/content later");
          // Sleep briefly; there is no need to retry immediately, and it reduces pressure on zk.
          // NOSONAR
          Thread.sleep(500L);
          /**
           * Note:
           * With data set to GET_SHARD_FROM_CONTENT_NODE_FLAG, the shard content will be fetched
           * from sharding/content. The version reuses the version of the necessary node.
           */
          getDataStat = new GetDataStat(NamespaceShardingContentService.GET_SHARD_FROM_CONTENT_NODE_FLAG, version);
        }
      }
      if (retryCount > maxRetryTime) {
        LogUtils.warn(log, jobName, "retry time exceed {}, will give up to get shards", maxRetryTime);
        break;
      }
    }
  } catch (Exception e) {
    LogUtils.error(log, jobName, e.getMessage(), e);
  } finally {
    getJobNodeStorage().removeJobNodeIfExisted(ShardingNode.PROCESSING);
  }
}
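The Saturn loop combines a versioned setData inside a Curator transaction with a bounded retry: a BadVersionException means another process touched the necessary node, so the version is re-read and the commit retried, while other failures fall back to reading from the content node. A condensed sketch of the version-guarded commit, assuming the Curator 2.x inTransaction() fluent API used above; the class name, payload map, and retry handling are illustrative:

import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.api.transaction.CuratorTransactionFinal;
import org.apache.zookeeper.KeeperException.BadVersionException;
import org.apache.zookeeper.data.Stat;

final class ShardCommitSketch {

  /**
   * Create the per-executor shard nodes and reset the "necessary" flag to "0" in one
   * transaction, guarded by the version of the necessary node. A BadVersionException means a
   * concurrent update, so the version is re-read and the commit retried a bounded number of times.
   */
  static boolean commitShards(CuratorFramework client, String necessaryPath,
      Map<String, byte[]> shardNodePayloads, int maxRetries) throws Exception {
    for (int attempt = 0; attempt <= maxRetries; attempt++) {
      Stat stat = client.checkExists().forPath(necessaryPath);
      if (stat == null) {
        return false; // the necessary node is gone; nothing to guard the commit against
      }
      try {
        CuratorTransactionFinal tx = client.inTransaction().check().forPath("/").and();
        for (Map.Entry<String, byte[]> entry : shardNodePayloads.entrySet()) {
          tx = tx.create().forPath(entry.getKey(), entry.getValue()).and();
        }
        tx.setData().withVersion(stat.getVersion())
            .forPath(necessaryPath, "0".getBytes(StandardCharsets.UTF_8)).and()
            .commit();
        return true;
      } catch (BadVersionException e) {
        // Another process updated the necessary node between the read and the commit; retry.
        Thread.sleep(200L);
      }
    }
    return false;
  }
}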