use of org.apache.hadoop.ipc.RemoteException in project hbase by apache.
the class TestFanOutOneBlockAsyncDFSOutput method testCreateParentFailed.
/**
 * This is important for fencing when recovering from an RS crash.
 */
@Test
public void testCreateParentFailed() throws IOException {
  Path f = new Path("/" + name.getMethodName() + "/test");
  EventLoop eventLoop = EVENT_LOOP_GROUP.next();
  try {
    FanOutOneBlockAsyncDFSOutputHelper.createOutput(FS, f, true, false, (short) 3, FS.getDefaultBlockSize(), eventLoop);
    fail("should fail with parent does not exist");
  } catch (RemoteException e) {
    LOG.info("expected exception caught", e);
    assertTrue(e.unwrapRemoteException() instanceof FileNotFoundException);
  }
}
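RemoteException carries the class name of the exception raised on the server side, and unwrapRemoteException() reconstructs that exception locally when its class is on the classpath, which is what the assertion above relies on. A minimal sketch of the same unwrapping outside a test (the helper and the getFileStatus call are illustrative, not taken from the HBase code):

import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RemoteException;

public class UnwrapExample {
  // Hypothetical helper: surface the NameNode-side failure as its original
  // exception type instead of the RemoteException wrapper.
  static long lengthOf(FileSystem fs, Path file) throws IOException {
    try {
      return fs.getFileStatus(file).getLen();
    } catch (RemoteException re) {
      // If the named remote class (here FileNotFoundException) is available locally,
      // unwrapRemoteException reconstructs and returns it; otherwise it returns re itself.
      throw re.unwrapRemoteException(FileNotFoundException.class);
    }
  }
}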
use of org.apache.hadoop.ipc.RemoteException in project hive by apache.
the class LlapTaskCommunicator method registerRunningTaskAttempt.
@Override
public void registerRunningTaskAttempt(final ContainerId containerId, final TaskSpec taskSpec, Map<String, LocalResource> additionalResources, Credentials credentials, boolean credentialsChanged, int priority) {
  super.registerRunningTaskAttempt(containerId, taskSpec, additionalResources, credentials, credentialsChanged, priority);
  int dagId = taskSpec.getTaskAttemptID().getTaskID().getVertexID().getDAGId().getId();
  if (currentQueryIdentifierProto == null || (dagId != currentQueryIdentifierProto.getDagIndex())) {
    // TODO HiveQueryId extraction by parsing the Processor payload is ugly. This can be improved
    // once TEZ-2672 is fixed.
    String hiveQueryId;
    try {
      hiveQueryId = extractQueryId(taskSpec);
    } catch (IOException e) {
      throw new RuntimeException("Failed to extract query id from task spec: " + taskSpec, e);
    }
    Preconditions.checkNotNull(hiveQueryId, "Unexpected null query id");
    resetCurrentDag(dagId, hiveQueryId);
  }
  ContainerInfo containerInfo = getContainerInfo(containerId);
  String host;
  int port;
  if (containerInfo != null) {
    synchronized (containerInfo) {
      host = containerInfo.host;
      port = containerInfo.port;
    }
  } else {
    // TODO Handle this properly
    throw new RuntimeException("ContainerInfo not found for container: " + containerId + ", while trying to launch task: " + taskSpec.getTaskAttemptID());
  }
  LlapNodeId nodeId = LlapNodeId.getInstance(host, port);
  registerKnownNode(nodeId);
  entityTracker.registerTaskAttempt(containerId, taskSpec.getTaskAttemptID(), host, port);
  nodesForQuery.add(nodeId);
  sourceStateTracker.registerTaskForStateUpdates(host, port, taskSpec.getInputs());
  FragmentRuntimeInfo fragmentRuntimeInfo;
  try {
    fragmentRuntimeInfo = sourceStateTracker.getFragmentRuntimeInfo(taskSpec.getVertexName(), taskSpec.getTaskAttemptID().getTaskID().getId(), priority);
  } catch (Exception e) {
    LOG.error("Error while trying to get runtimeFragmentInfo for fragmentId={}, containerId={}, currentQI={}, currentQueryId={}", taskSpec.getTaskAttemptID(), containerId, currentQueryIdentifierProto, currentHiveQueryId, e);
    if (e instanceof RuntimeException) {
      throw (RuntimeException) e;
    } else {
      throw new RuntimeException(e);
    }
  }
  SubmitWorkRequestProto requestProto;
  try {
    requestProto = constructSubmitWorkRequest(containerId, taskSpec, fragmentRuntimeInfo, currentHiveQueryId);
  } catch (IOException e) {
    throw new RuntimeException("Failed to construct request", e);
  }
  // Have to register this up front right now. Otherwise, it's possible for the task to start
  // sending out status/DONE/KILLED/FAILED messages before TAImpl knows how to handle them.
  getContext().taskStartedRemotely(taskSpec.getTaskAttemptID(), containerId);
  communicator.sendSubmitWork(requestProto, host, port, new LlapProtocolClientProxy.ExecuteRequestCallback<SubmitWorkResponseProto>() {

    @Override
    public void setResponse(SubmitWorkResponseProto response) {
      if (response.hasSubmissionState()) {
        LlapDaemonProtocolProtos.SubmissionStateProto ss = response.getSubmissionState();
        if (ss.equals(LlapDaemonProtocolProtos.SubmissionStateProto.REJECTED)) {
          LOG.info("Unable to run task: " + taskSpec.getTaskAttemptID() + " on containerId: " + containerId + ", Service Busy");
          getContext().taskKilled(taskSpec.getTaskAttemptID(), TaskAttemptEndReason.EXECUTOR_BUSY, "Service Busy");
          return;
        }
      } else {
        // This should never happen, as the server always returns a valid status on success.
        throw new RuntimeException("SubmissionState in response is expected!");
      }
      if (response.hasUniqueNodeId()) {
        entityTracker.registerTaskSubmittedToNode(taskSpec.getTaskAttemptID(), response.getUniqueNodeId());
      }
      LOG.info("Successfully launched task: " + taskSpec.getTaskAttemptID());
    }

    @Override
    public void indicateError(Throwable t) {
      Throwable originalError = t;
      if (t instanceof ServiceException) {
        ServiceException se = (ServiceException) t;
        t = se.getCause();
      }
      if (t instanceof RemoteException) {
        // All other errors from the remote service cause the task to FAIL.
        LOG.info("Failed to run task: " + taskSpec.getTaskAttemptID() + " on containerId: " + containerId, t);
        processSendError(originalError);
        getContext().taskFailed(taskSpec.getTaskAttemptID(), TaskFailureType.NON_FATAL, TaskAttemptEndReason.OTHER, t.toString());
      } else {
        // Exception from the RPC layer - communication failure; treat as KILLED / service down.
        if (t instanceof IOException) {
          LOG.info("Unable to run task: " + taskSpec.getTaskAttemptID() + " on containerId: " + containerId + ", Communication Error");
          processSendError(originalError);
          getContext().taskKilled(taskSpec.getTaskAttemptID(), TaskAttemptEndReason.COMMUNICATION_ERROR, "Communication Error");
        } else {
          // Anything else is a FAIL.
          LOG.info("Failed to run task: " + taskSpec.getTaskAttemptID() + " on containerId: " + containerId, t);
          processSendError(originalError);
          getContext().taskFailed(taskSpec.getTaskAttemptID(), TaskFailureType.NON_FATAL, TaskAttemptEndReason.OTHER, t.getMessage());
        }
      }
    }
  });
}
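The indicateError callback above encodes a simple policy: a RemoteException means the LLAP daemon received the request and failed it, so the attempt is FAILED; a plain IOException means the RPC itself did not get through, so the attempt is KILLED and may be rescheduled. A condensed sketch of that classification, with a hypothetical Verdict enum standing in for the scheduler callbacks (assuming the protobuf ServiceException used by the LLAP client proxy):

import java.io.IOException;
import org.apache.hadoop.ipc.RemoteException;
import com.google.protobuf.ServiceException;

public class SubmitErrorClassifier {
  // Hypothetical stand-in for the taskFailed()/taskKilled() callbacks.
  enum Verdict { FAIL, KILL }

  // Classify a submit-work error the same way indicateError() above does.
  static Verdict classify(Throwable t) {
    if (t instanceof ServiceException && t.getCause() != null) {
      t = t.getCause(); // the protobuf RPC proxy wraps the real cause
    }
    if (t instanceof RemoteException) {
      return Verdict.FAIL; // the daemon received the request and threw; FAIL the attempt
    }
    if (t instanceof IOException) {
      return Verdict.KILL; // transport error; the attempt can be rescheduled elsewhere
    }
    return Verdict.FAIL; // anything else is treated as a failure
  }
}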
use of org.apache.hadoop.ipc.RemoteException in project lucene-solr by apache.
the class HdfsLockFactory method obtainLock.
@Override
public Lock obtainLock(Directory dir, String lockName) throws IOException {
  if (!(dir instanceof HdfsDirectory)) {
    throw new UnsupportedOperationException("HdfsLockFactory can only be used with HdfsDirectory subclasses, got: " + dir);
  }
  final HdfsDirectory hdfsDir = (HdfsDirectory) dir;
  final Configuration conf = hdfsDir.getConfiguration();
  final Path lockPath = hdfsDir.getHdfsDirPath();
  final Path lockFile = new Path(lockPath, lockName);
  FSDataOutputStream file = null;
  final FileSystem fs = FileSystem.get(lockPath.toUri(), conf);
  while (true) {
    try {
      if (!fs.exists(lockPath)) {
        boolean success = fs.mkdirs(lockPath);
        if (!success) {
          throw new RuntimeException("Could not create directory: " + lockPath);
        }
      } else {
        // just to check for safe mode
        fs.mkdirs(lockPath);
      }
      file = fs.create(lockFile, false);
      break;
    } catch (FileAlreadyExistsException e) {
      throw new LockObtainFailedException("Cannot obtain lock file: " + lockFile, e);
    } catch (RemoteException e) {
      if (e.getClassName().equals("org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
        log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
        try {
          Thread.sleep(5000);
        } catch (InterruptedException e1) {
          Thread.interrupted();
        }
        continue;
      }
      throw new LockObtainFailedException("Cannot obtain lock file: " + lockFile, e);
    } catch (IOException e) {
      throw new LockObtainFailedException("Cannot obtain lock file: " + lockFile, e);
    } finally {
      IOUtils.closeQuietly(file);
    }
  }
  return new HdfsLock(conf, lockFile);
}
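Both this method and HdfsUpdateLog.init below recognize the NameNode's SafeModeException by comparing RemoteException.getClassName() against the server-side class name, since the HDFS server exception classes are not unwrapped on the client. A small sketch of that check factored into a retrying helper (the class and method names are illustrative, not from Solr):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RemoteException;

public final class SafeModeRetry {
  private static final String SAFE_MODE_EXCEPTION =
      "org.apache.hadoop.hdfs.server.namenode.SafeModeException";

  // Illustrative helper: retry mkdirs while the NameNode reports safe mode,
  // mirroring the loop structure used by HdfsLockFactory and HdfsUpdateLog.
  public static void mkdirsWithSafeModeRetry(FileSystem fs, Path dir, long sleepMs)
      throws IOException, InterruptedException {
    while (true) {
      try {
        fs.mkdirs(dir);
        return;
      } catch (RemoteException e) {
        if (!SAFE_MODE_EXCEPTION.equals(e.getClassName())) {
          throw e; // not a safe-mode problem; let the caller handle it
        }
        Thread.sleep(sleepMs); // wait for the NameNode to leave safe mode, then retry
      }
    }
  }

  private SafeModeRetry() {}
}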
use of org.apache.hadoop.ipc.RemoteException in project lucene-solr by apache.
the class HdfsUpdateLog method init.
@Override
public void init(UpdateHandler uhandler, SolrCore core) {
  // ulogDir from CoreDescriptor overrides
  String ulogDir = core.getCoreDescriptor().getUlogDir();
  this.uhandler = uhandler;
  synchronized (fsLock) {
    // if fs is already set this is a reopen; the tlog dir is not moved on reload
    if (fs == null) {
      if (ulogDir != null) {
        dataDir = ulogDir;
      }
      if (dataDir == null || dataDir.length() == 0) {
        dataDir = core.getDataDir();
      }
      if (!core.getDirectoryFactory().isAbsolute(dataDir)) {
        try {
          dataDir = core.getDirectoryFactory().getDataHome(core.getCoreDescriptor());
        } catch (IOException e) {
          throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
      }
      try {
        fs = FileSystem.get(new Path(dataDir).toUri(), getConf());
      } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      }
    } else {
      if (debug) {
        log.debug("UpdateHandler init: tlogDir=" + tlogDir + ", next id=" + id + " this is a reopen or double init ... nothing else to do.");
      }
      versionInfo.reload();
      return;
    }
  }
  tlogDir = new Path(dataDir, TLOG_NAME);
  while (true) {
    try {
      if (!fs.exists(tlogDir)) {
        boolean success = fs.mkdirs(tlogDir);
        if (!success) {
          throw new RuntimeException("Could not create directory:" + tlogDir);
        }
      } else {
        // To check for safe mode
        fs.mkdirs(tlogDir);
      }
      break;
    } catch (RemoteException e) {
      if (e.getClassName().equals("org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
        log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
        try {
          Thread.sleep(5000);
        } catch (InterruptedException e1) {
          Thread.interrupted();
        }
        continue;
      }
      throw new RuntimeException("Problem creating directory: " + tlogDir, e);
    } catch (IOException e) {
      throw new RuntimeException("Problem creating directory: " + tlogDir, e);
    }
  }
  tlogFiles = getLogList(fs, tlogDir);
  id = getLastLogId() + 1; // add 1 since we will create a new log for the next update
  if (debug) {
    log.debug("UpdateHandler init: tlogDir=" + tlogDir + ", existing tlogs=" + Arrays.asList(tlogFiles) + ", next id=" + id);
  }
  TransactionLog oldLog = null;
  for (String oldLogName : tlogFiles) {
    Path f = new Path(tlogDir, oldLogName);
    try {
      oldLog = new HdfsTransactionLog(fs, f, null, true, tlogDfsReplication);
      // don't remove old logs on startup since more than one may be uncapped
      addOldLog(oldLog, false);
    } catch (Exception e) {
      INIT_FAILED_LOGS_COUNT.incrementAndGet();
      SolrException.log(log, "Failure to open existing log file (non fatal) " + f, e);
      try {
        fs.delete(f, false);
      } catch (IOException e1) {
        throw new RuntimeException(e1);
      }
    }
  }
  // Keep the two newest logs at startup for potential tlog recovery; close output for older ones.
  for (TransactionLog ll : logs) {
    if (newestLogsOnStartup.size() < 2) {
      newestLogsOnStartup.addFirst(ll);
    } else {
      // We're never going to modify old non-recovery logs - no need to hold their output open
      log.info("Closing output for old non-recovery log " + ll);
      ll.closeOutput();
    }
  }
  try {
    versionInfo = new VersionInfo(this, numVersionBuckets);
  } catch (SolrException e) {
    log.error("Unable to use updateLog: " + e.getMessage(), e);
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to use updateLog: " + e.getMessage(), e);
  }
  // Seed starting versions and recent delete tracking from the newest tlogs.
  try (RecentUpdates startingUpdates = getRecentUpdates()) {
    startingVersions = startingUpdates.getVersions(getNumRecordsToKeep());
    startingOperation = startingUpdates.getLatestOperation();
    // populate recent deletes list (since we can't get that info from the index)
    for (int i = startingUpdates.deleteList.size() - 1; i >= 0; i--) {
      DeleteUpdate du = startingUpdates.deleteList.get(i);
      oldDeletes.put(new BytesRef(du.id), new LogPtr(-1, du.version));
    }
    // populate recent deleteByQuery commands
    for (int i = startingUpdates.deleteByQueryList.size() - 1; i >= 0; i--) {
      Update update = startingUpdates.deleteByQueryList.get(i);
      List<Object> dbq = (List<Object>) update.log.lookup(update.pointer);
      long version = (Long) dbq.get(1);
      String q = (String) dbq.get(2);
      trackDeleteByQuery(q, version);
    }
  }
  // initialize metrics
  core.getCoreMetricManager().registerMetricProducer(SolrInfoBean.Category.TLOG.toString(), this);
}
use of org.apache.hadoop.ipc.RemoteException in project hbase by apache.
the class AssignmentManager method retrySendRegionClose.
/**
 * At master failover, for a region in pending_close state, make sure the
 * sendRegionClose RPC call is sent to the target regionserver.
 */
private void retrySendRegionClose(final RegionState regionState) {
  this.executorService.submit(new EventHandler(server, EventType.M_MASTER_RECOVERY) {
    @Override
    public void process() throws IOException {
      HRegionInfo hri = regionState.getRegion();
      ServerName serverName = regionState.getServerName();
      ReentrantLock lock = locker.acquireLock(hri.getEncodedName());
      try {
        for (int i = 1; i <= maximumAttempts; i++) {
          if (!serverManager.isServerOnline(serverName) || server.isStopped() || server.isAborted()) {
            // No need any more
            return;
          }
          try {
            if (!regionState.equals(regionStates.getRegionState(hri))) {
              // Region is not in the expected state any more
              return;
            }
            serverManager.sendRegionClose(serverName, hri, null);
            // Done.
            return;
          } catch (Throwable t) {
            if (t instanceof RemoteException) {
              t = ((RemoteException) t).unwrapRemoteException();
            }
            if (t instanceof FailedServerException && i < maximumAttempts) {
              // The server is on the failed server list; don't retry too soon.
              // Retry after the failed_server_expiry time.
              try {
                Configuration conf = this.server.getConfiguration();
                long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
                if (LOG.isDebugEnabled()) {
                  LOG.debug(serverName + " is on failed server list; waiting " + sleepTime + "ms", t);
                }
                Thread.sleep(sleepTime);
                continue;
              } catch (InterruptedException ie) {
                LOG.warn("Failed to unassign " + hri.getRegionNameAsString() + " since interrupted", ie);
                regionStates.updateRegionState(hri, RegionState.State.FAILED_CLOSE);
                Thread.currentThread().interrupt();
                return;
              }
            }
            if (serverManager.isServerOnline(serverName) && t instanceof java.net.SocketTimeoutException) {
              // reset the try count
              i--;
            } else {
              LOG.info("Got exception in retrying sendRegionClose for " + regionState + "; try=" + i + " of " + maximumAttempts, t);
            }
            Threads.sleep(100);
          }
        }
        // Ran out of attempts
        regionStates.updateRegionState(hri, State.FAILED_CLOSE);
      } finally {
        lock.unlock();
      }
    }
  });
}
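The loop above unwraps the RemoteException before deciding how to back off: a FailedServerException waits out the failed_server_expiry period, a socket timeout against a still-online server does not consume an attempt, and anything else sleeps briefly and uses up one attempt. A condensed sketch of that classification as a hypothetical helper (not AssignmentManager's actual API):

import java.net.SocketTimeoutException;
import org.apache.hadoop.hbase.ipc.FailedServerException;
import org.apache.hadoop.ipc.RemoteException;

public final class CloseRetryPolicy {
  // Hypothetical classification of a failed sendRegionClose attempt.
  enum Backoff {
    FAILED_SERVER_EXPIRY, // server is on the failed-server list; wait the expiry period
    FREE_RETRY,           // socket timeout against a live server; don't consume an attempt
    SHORT_SLEEP           // anything else; sleep briefly and use up one attempt
  }

  static Backoff classify(Throwable t, boolean serverStillOnline) {
    if (t instanceof RemoteException) {
      // Recover the original server-side exception type before classifying it.
      t = ((RemoteException) t).unwrapRemoteException();
    }
    if (t instanceof FailedServerException) {
      return Backoff.FAILED_SERVER_EXPIRY;
    }
    if (serverStillOnline && t instanceof SocketTimeoutException) {
      return Backoff.FREE_RETRY;
    }
    return Backoff.SHORT_SLEEP;
  }

  private CloseRetryPolicy() {}
}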