use of org.apache.thrift.transport.TTransportException in project accumulo by apache.
the class AccumuloReplicaSystem method replicateLogs.
/**
 * Replicates the write-ahead log at {@code p} to a peer tablet server, one batch at a time.
 *
 * <p>
 * The WAL is opened and {@code consumeWalPrefix} is run over it to collect the tablet ids used by later
 * batches. Batches are then sent with {@code ReplicationClient.executeServicerWithReturn}. After every batch
 * that changes the status, the new status is written with {@code helper.recordNewStatus} (inside
 * {@code accumuloUgi.doAs} when a UGI is supplied). The loop ends when {@code StatusUtil.isWorkRequired}
 * reports that no work remains, or when a batch leaves the status unchanged.
 *
 * @param peerContext
 *          client context for the peer instance
 * @param peerTserver
 *          address of the peer tablet server that receives the data
 * @param target
 *          replication target, passed through to the WAL reader and to the status update
 * @param p
 *          path of the WAL to replicate
 * @param status
 *          replication status for {@code p} at the time of the call
 * @param sizeLimit
 *          batch size limit handed to the WAL reader (recorded in the trace as bytes)
 * @param remoteTableId
 *          table id on the peer
 * @param tcreds
 *          credentials handed to the batch sender
 * @param helper
 *          used to record updated statuses
 * @param accumuloUgi
 *          when non-null, status updates are performed inside {@code doAs} on this UGI
 * @param timeout
 *          timeout handed to {@code ReplicationClient.executeServicerWithReturn}
 * @return the last recorded status once no more work is required; the unmodified {@code status} when the WAL
 *         prefix could not be consumed, the stream could not be created, or a batch made no progress; or a
 *         status whose begin was moved to the end when the WAL header was incomplete
 * @throws TTransportException
 *           propagated from sending a batch
 * @throws AccumuloException
 *           propagated from sending a batch or recording a status
 * @throws AccumuloSecurityException
 *           propagated from sending a batch or recording a status
 * @throws RuntimeException
 *           when the replication table does not exist, or when the status update run through
 *           {@code accumuloUgi} fails with an unexpected exception type
 */
protected Status replicateLogs(ClientContext peerContext, final HostAndPort peerTserver, final ReplicationTarget target, final Path p, final Status status, final long sizeLimit, final String remoteTableId, final TCredentials tcreds, final ReplicaSystemHelper helper, final UserGroupInformation accumuloUgi, long timeout) throws TTransportException, AccumuloException, AccumuloSecurityException {
  log.debug("Replication WAL to peer tserver");
  final Set<Integer> tids;
  try (final FSDataInputStream fsinput = fs.open(p);
      final DataInputStream input = getWalStream(p, fsinput)) {
    log.debug("Skipping unwanted data in WAL");
    Span span = Trace.start("Consume WAL prefix");
    span.data("file", p.toString());
    try {
      // We want to read all records in the WAL up to the "begin" offset contained in the Status message,
      // building a Set of tids from DEFINE_TABLET events which correspond to table ids for future mutations
      tids = consumeWalPrefix(target, input, p, status, sizeLimit);
    } catch (IOException e) {
      // Keep the cause in the log; the caller only sees the unchanged status
      log.warn("Unexpected error consuming file.", e);
      return status;
    } finally {
      span.stop();
    }

    log.debug("Sending batches of data to peer tserver");
    Status lastStatus = status, currentStatus = status;
    // Carries any exception thrown inside the doAs() action back out to this thread
    final AtomicReference<Exception> exceptionRef = new AtomicReference<>();
    while (true) {
      // Set some trace info
      span = Trace.start("Replicate WAL batch");
      span.data("Batch size (bytes)", Long.toString(sizeLimit));
      span.data("File", p.toString());
      span.data("Peer instance name", peerContext.getInstance().getInstanceName());
      span.data("Peer tserver", peerTserver.toString());
      span.data("Remote table ID", remoteTableId);

      ReplicationStats replResult;
      try {
        // Read and send a batch of mutations
        replResult = ReplicationClient.executeServicerWithReturn(peerContext, peerTserver, new WalClientExecReturn(target, input, p, currentStatus, sizeLimit, remoteTableId, tcreds, tids), timeout);
      } catch (Exception e) {
        log.error("Caught exception replicating data to {} at {}", peerContext.getInstance().getInstanceName(), peerTserver, e);
        throw e;
      } finally {
        span.stop();
      }

      // Catch the overflow
      long newBegin = currentStatus.getBegin() + replResult.entriesConsumed;
      if (newBegin < 0) {
        newBegin = Long.MAX_VALUE;
      }
      currentStatus = Status.newBuilder(currentStatus).setBegin(newBegin).build();
      log.debug("Sent batch for replication of {} to {}, with new Status {}", p, target, ProtobufUtil.toString(currentStatus));

      // If we got a different status
      if (!currentStatus.equals(lastStatus)) {
        span = Trace.start("Update replication table");
        try {
          if (null != accumuloUgi) {
            final Status copy = currentStatus;
            accumuloUgi.doAs(new PrivilegedAction<Void>() {
              @Override
              public Void run() {
                try {
                  helper.recordNewStatus(p, copy, target);
                } catch (Exception e) {
                  exceptionRef.set(e);
                }
                return null;
              }
            });
            // Re-throw whatever the action captured, keeping the declared exception types intact
            Exception e = exceptionRef.get();
            if (null != e) {
              if (e instanceof TableNotFoundException) {
                throw (TableNotFoundException) e;
              } else if (e instanceof AccumuloSecurityException) {
                throw (AccumuloSecurityException) e;
              } else if (e instanceof AccumuloException) {
                throw (AccumuloException) e;
              } else {
                throw new RuntimeException("Received unexpected exception", e);
              }
            }
          } else {
            helper.recordNewStatus(p, currentStatus, target);
          }
        } catch (TableNotFoundException e) {
          log.error("Tried to update status in replication table for {} as {}, but the table did not exist", p, ProtobufUtil.toString(currentStatus), e);
          throw new RuntimeException("Replication table did not exist, will retry", e);
        } finally {
          span.stop();
        }

        log.debug("Recorded updated status for {}: {}", p, ProtobufUtil.toString(currentStatus));

        // If we don't have any more work, just quit
        if (!StatusUtil.isWorkRequired(currentStatus)) {
          return currentStatus;
        } else {
          // Otherwise, let it loop and replicate some more data
          lastStatus = currentStatus;
        }
      } else {
        log.debug("Did not replicate any new data for {} to {}, (state was {})", p, target, ProtobufUtil.toString(lastStatus));
        // we can just not record any updates, and it will be picked up again by the work assigner
        return status;
      }
    }
  } catch (LogHeaderIncompleteException e) {
    log.warn("Could not read header from {}, assuming that there is no data present in the WAL, therefore replication is complete", p);
    Status newStatus;
    // Bump up the begin to the (infinite) end, trying to be accurate
    if (status.getInfiniteEnd()) {
      newStatus = Status.newBuilder(status).setBegin(Long.MAX_VALUE).build();
    } else {
      newStatus = Status.newBuilder(status).setBegin(status.getEnd()).build();
    }
    Span span = Trace.start("Update replication table");
    try {
      helper.recordNewStatus(p, newStatus, target);
    } catch (TableNotFoundException tnfe) {
      // Report the missing-table failure itself, not the header exception that led here
      log.error("Tried to update status in replication table for {} as {}, but the table did not exist", p, ProtobufUtil.toString(newStatus), tnfe);
      throw new RuntimeException("Replication table did not exist, will retry", tnfe);
    } finally {
      span.stop();
    }
    return newStatus;
  } catch (IOException e) {
    log.error("Could not create stream for WAL", e);
    // No data sent (bytes nor records) and no progress made
    return status;
  }
}
use of org.apache.thrift.transport.TTransportException in project accumulo by apache.
the class TableOperationsImpl method addSplits.
/**
 * Asks the hosting tablet server to split the table at every key in {@code partitionKeys}.
 *
 * <p>
 * Each split point is retried until the split RPC completes. Every retry after the first attempt is preceded
 * by a 100 ms sleep, and the locator cache is invalidated after transport, not-serving and generic thrift
 * failures so the next attempt looks the tablet up again.
 *
 * @param tableName
 *          table name, used for error reporting and logging
 * @param partitionKeys
 *          split points to add
 * @param tableId
 *          id of the table being split
 * @throws TableNotFoundException
 *           if the table no longer exists when a tablet cannot be located or a security error is returned
 * @throws AccumuloSecurityException
 *           if the server rejects the request and the table still exists
 * @throws AccumuloServerException
 *           if the server reports a {@code TApplicationException}
 * @throws AccumuloException
 *           declared by the signature; not thrown directly by the statements in this method
 */
private void addSplits(String tableName, SortedSet<Text> partitionKeys, Table.ID tableId) throws AccumuloException, AccumuloSecurityException, TableNotFoundException, AccumuloServerException {
  TabletLocator locator = TabletLocator.getLocator(context, tableId);

  for (Text split : partitionKeys) {
    int attempt = 0;
    long locationFailures = 0;

    // Keep trying this split point until the RPC goes through
    while (true) {
      // Pause before every retry, never before the first attempt
      if (attempt++ > 0) {
        sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
      }

      TabletLocation located = locator.locateTablet(context, split, false, false);
      if (located == null) {
        // No location: fail fast when the table is gone or offline, otherwise try again
        if (!Tables.exists(context.getInstance(), tableId)) {
          throw new TableNotFoundException(tableId.canonicalID(), tableName, null);
        }
        if (Tables.getTableState(context.getInstance(), tableId) == TableState.OFFLINE) {
          throw new TableOfflineException(context.getInstance(), tableId.canonicalID());
        }
        continue;
      }

      HostAndPort address = HostAndPort.fromString(located.tablet_location);
      try {
        TabletClientService.Client client = ThriftUtil.getTServerClient(address, context);
        try {
          OpTimer timer = null;
          if (log.isTraceEnabled()) {
            log.trace("tid={} Splitting tablet {} on {} at {}", Thread.currentThread().getId(), located.tablet_extent, address, split);
            timer = new OpTimer().start();
          }

          client.splitTablet(Tracer.traceInfo(), context.rpcCreds(), located.tablet_extent.toThrift(), TextUtil.getByteBuffer(split));

          // The tablet was just split, so the cached extent is stale
          locator.invalidateCache(located.tablet_extent);

          if (timer != null) {
            timer.stop();
            log.trace("Split tablet in {}", String.format("%.3f secs", timer.scale(TimeUnit.SECONDS)));
          }
        } finally {
          ThriftUtil.returnClient(client);
        }
      } catch (TApplicationException tae) {
        throw new AccumuloServerException(address.toString(), tae);
      } catch (TTransportException e) {
        // Connection trouble: forget everything cached for that server and retry
        locator.invalidateCache(context.getInstance(), located.tablet_location);
        continue;
      } catch (ThriftSecurityException e) {
        Tables.clearCache(context.getInstance());
        if (!Tables.exists(context.getInstance(), tableId)) {
          throw new TableNotFoundException(tableId.canonicalID(), tableName, null);
        }
        throw new AccumuloSecurityException(e.user, e.code, e);
      } catch (NotServingTabletException e) {
        // Do not silently spin when we repeatedly fail to get the location for a tablet
        locationFailures++;
        if (locationFailures == 5 || locationFailures % 50 == 0) {
          log.warn("Having difficulty locating hosting tabletserver for split {} on table {}. Seen {} failures.", split, tableName, locationFailures);
        }
        locator.invalidateCache(located.tablet_extent);
        continue;
      } catch (TException e) {
        locator.invalidateCache(context.getInstance(), located.tablet_location);
        continue;
      }

      // Split succeeded; move on to the next split point
      break;
    }
  }
}
use of org.apache.thrift.transport.TTransportException in project accumulo by apache.
the class TabletServerBatchReaderIterator method doLookup.
/**
 * Runs a multi-scan for {@code requested} against one tablet server and streams the results to
 * {@code receiver}.
 *
 * <p>
 * All requested ranges are first copied into {@code unscanned}; {@code trackScanning} is then called for each
 * batch of results with {@code failures} and {@code unscanned}. The scan is started with
 * {@code startMultiScan}, continued with {@code continueMultiScan} while the server reports more results,
 * and finally closed with {@code closeMultiScan}.
 *
 * @param context
 *          client context used for the connection and credentials
 * @param server
 *          address of the tablet server, parsed with {@code HostAndPort.fromString}
 * @param requested
 *          extents and ranges to scan; nothing is done when empty
 * @param failures
 *          passed to {@code trackScanning} for every batch
 * @param unscanned
 *          populated with a copy of {@code requested}, then passed to {@code trackScanning}
 * @param receiver
 *          receives every non-empty batch of key/value entries
 * @param columns
 *          columns to fetch
 * @param options
 *          scanner options (iterators, sampler configuration, batch timeout, class loader context)
 * @param authorizations
 *          authorizations sent with the scan
 * @param timeoutTracker
 *          notified of scan start, progress and errors, and checked before each continuation
 * @throws IOException
 *           on transport errors, unknown scan ids, and other thrift failures
 * @throws AccumuloSecurityException
 *           when the server reports a security failure
 * @throws AccumuloServerException
 *           when the server reports a {@code TApplicationException}
 */
static void doLookup(ClientContext context, String server, Map<KeyExtent, List<Range>> requested, Map<KeyExtent, List<Range>> failures, Map<KeyExtent, List<Range>> unscanned, ResultReceiver receiver, List<Column> columns, ScannerOptions options, Authorizations authorizations, TimeoutTracker timeoutTracker) throws IOException, AccumuloSecurityException, AccumuloServerException {
  if (requested.size() == 0) {
    return;
  }

  // copy requested to unscanned map. we will remove ranges as they are scanned in trackScanning()
  for (Entry<KeyExtent, List<Range>> entry : requested.entrySet()) {
    ArrayList<Range> ranges = new ArrayList<>();
    for (Range range : entry.getValue()) {
      ranges.add(new Range(range));
    }
    unscanned.put(new KeyExtent(entry.getKey()), ranges);
  }

  timeoutTracker.startingScan();
  // NOTE(review): transport is never assigned in this method, so the returnTransport() call in the
  // finally block below always receives null. Confirm whether both can be removed.
  TTransport transport = null;
  try {
    final HostAndPort parsedServer = HostAndPort.fromString(server);
    final TabletClientService.Client client;
    // Use the tracker's timeout for the connection only when it is shorter than the client default
    if (timeoutTracker.getTimeOut() < context.getClientTimeoutInMillis()) {
      client = ThriftUtil.getTServerClient(parsedServer, context, timeoutTracker.getTimeOut());
    } else {
      client = ThriftUtil.getTServerClient(parsedServer, context);
    }

    try {
      OpTimer timer = null;
      if (log.isTraceEnabled()) {
        log.trace("tid={} Starting multi scan, tserver={} #tablets={} #ranges={} ssil={} ssio={}", Thread.currentThread().getId(), server, requested.size(), sumSizes(requested.values()), options.serverSideIteratorList, options.serverSideIteratorOptions);
        timer = new OpTimer().start();
      }

      TabletType ttype = TabletType.type(requested.keySet());
      // Only the first scan against a server for this tablet type asks it to wait for writes
      boolean waitForWrites = !ThriftScanner.serversWaitedForWrites.get(ttype).contains(server);

      Map<TKeyExtent, List<TRange>> thriftTabletRanges = Translator.translate(requested, Translators.KET, new Translator.ListTranslator<>(Translators.RT));
      InitialMultiScan imsr = client.startMultiScan(Tracer.traceInfo(), context.rpcCreds(), thriftTabletRanges, Translator.translate(columns, Translators.CT), options.serverSideIteratorList, options.serverSideIteratorOptions, ByteBufferUtil.toByteBuffers(authorizations.getAuthorizations()), waitForWrites, SamplerConfigurationImpl.toThrift(options.getSamplerConfiguration()), options.batchTimeOut, options.classLoaderContext);
      if (waitForWrites) {
        ThriftScanner.serversWaitedForWrites.get(ttype).add(server);
      }

      MultiScanResult scanResult = imsr.result;
      if (timer != null) {
        timer.stop();
        log.trace("tid={} Got 1st multi scan results, #results={} {} in {}", Thread.currentThread().getId(), scanResult.results.size(), (scanResult.more ? "scanID=" + imsr.scanID : ""), String.format("%.3f secs", timer.scale(TimeUnit.SECONDS)));
      }

      AtomicLong nextOpid = new AtomicLong();
      while (true) {
        // Deliver the current batch (the initial one on the first pass) and update the bookkeeping
        ArrayList<Entry<Key, Value>> entries = new ArrayList<>(scanResult.results.size());
        for (TKeyValue kv : scanResult.results) {
          entries.add(new SimpleImmutableEntry<>(new Key(kv.key), new Value(kv.value)));
        }
        if (entries.size() > 0) {
          receiver.receive(entries);
        }
        if (entries.size() > 0 || scanResult.fullScans.size() > 0) {
          timeoutTracker.madeProgress();
        }
        trackScanning(failures, unscanned, scanResult);

        if (!scanResult.more) {
          break;
        }

        // The server has more results: check the timeout, then fetch the next batch
        timeoutTracker.check();
        if (timer != null) {
          log.trace("tid={} oid={} Continuing multi scan, scanid={}", Thread.currentThread().getId(), nextOpid.get(), imsr.scanID);
          timer.reset().start();
        }

        scanResult = client.continueMultiScan(Tracer.traceInfo(), imsr.scanID);

        if (timer != null) {
          timer.stop();
          log.trace("tid={} oid={} Got more multi scan results, #results={} {} in {}", Thread.currentThread().getId(), nextOpid.getAndIncrement(), scanResult.results.size(), (scanResult.more ? " scanID=" + imsr.scanID : ""), String.format("%.3f secs", timer.scale(TimeUnit.SECONDS)));
        }
      }

      client.closeMultiScan(Tracer.traceInfo(), imsr.scanID);
    } finally {
      ThriftUtil.returnClient(client);
    }
  } catch (TTransportException e) {
    log.debug("Server : {} msg : {}", server, e.getMessage());
    timeoutTracker.errorOccured(e);
    throw new IOException(e);
  } catch (ThriftSecurityException e) {
    log.debug("Server : {} msg : {}", server, e.getMessage(), e);
    throw new AccumuloSecurityException(e.user, e.code, e);
  } catch (TApplicationException e) {
    log.debug("Server : {} msg : {}", server, e.getMessage(), e);
    throw new AccumuloServerException(server, e);
  } catch (NoSuchScanIDException e) {
    log.debug("Server : {} msg : {}", server, e.getMessage(), e);
    throw new IOException(e);
  } catch (TSampleNotPresentException e) {
    log.debug("Server : {} msg : {}", server, e.getMessage(), e);
    String tableInfo = "?";
    if (e.getExtent() != null) {
      Table.ID tableId = new KeyExtent(e.getExtent()).getTableId();
      tableInfo = Tables.getPrintableTableInfoFromId(context.getInstance(), tableId);
    }
    String message = "Table " + tableInfo + " does not have sampling configured or built";
    throw new SampleNotPresentException(message, e);
  } catch (TException e) {
    log.debug("Server : {} msg : {}", server, e.getMessage(), e);
    timeoutTracker.errorOccured(e);
    throw new IOException(e);
  } finally {
    ThriftTransportPool.getInstance().returnTransport(transport);
  }
}
use of org.apache.thrift.transport.TTransportException in project accumulo by apache.
the class ReplicationClient method getCoordinatorConnection.
/**
 * Opens a thrift client to the replication coordinator of the instance behind {@code context}.
 *
 * <p>
 * The coordinator address is read from ZooKeeper under
 * {@code Constants.ZMASTER_REPLICATION_COORDINATOR_ADDR} and the client is created without a timeout.
 *
 * @param context
 *          client context for the instance whose coordinator should be contacted
 * @return a connected client, or {@code null} when no master is listed, the first listed master address
 *         ends in {@code ":0"}, the address cannot be read from ZooKeeper, or the connection attempt fails
 */
public static ReplicationCoordinator.Client getCoordinatorConnection(ClientContext context) {
  final Instance instance = context.getInstance();

  final List<String> masters = instance.getMasterLocations();
  if (masters.isEmpty()) {
    log.debug("No masters for replication to instance {}", instance.getInstanceName());
    return null;
  }

  // Only the first listed master is considered; a ":0" suffix means it has no usable address yet
  final String masterThriftService = masters.get(0);
  if (masterThriftService.endsWith(":0")) {
    log.warn("Master found for {} did not have real location {}", instance.getInstanceName(), masterThriftService);
    return null;
  }

  final String zkPath = ZooUtil.getRoot(instance) + Constants.ZMASTER_REPLICATION_COORDINATOR_ADDR;
  log.debug("Using ZooKeeper quorum at {} with path {} to find peer Master information", instance.getZooKeepers(), zkPath);

  // Look up where the coordinator service of that master is listening
  String replCoordinatorAddr;
  try {
    ZooReader zooReader = new ZooReader(instance.getZooKeepers(), instance.getZooKeepersSessionTimeOut());
    byte[] rawAddr = zooReader.getData(zkPath, null);
    replCoordinatorAddr = new String(rawAddr, UTF_8);
  } catch (KeeperException | InterruptedException e) {
    log.error("Could not fetch remote coordinator port", e);
    return null;
  }

  // Round-trip through HostAndPort to normalize the host and port
  final HostAndPort coordinatorAddr = HostAndPort.fromString(replCoordinatorAddr);
  log.debug("Connecting to master at {}", coordinatorAddr);

  try {
    // Master requests can take a long time, so the client is created without a timeout
    return ThriftUtil.getClientNoTimeout(new ReplicationCoordinator.Client.Factory(), coordinatorAddr, context);
  } catch (TTransportException tte) {
    log.debug("Failed to connect to master coordinator service ({})", coordinatorAddr, tte);
    return null;
  }
}
use of org.apache.thrift.transport.TTransportException in project accumulo by apache.
the class TableOperationsImpl method _flush.
/**
 * Initiates a flush of the table through the master and then waits on it.
 *
 * <p>
 * Both master calls ({@code initiateFlush} and {@code waitForFlush}) are retried every 100 ms for as long as
 * they fail with a transport error or reach a master that is no longer active. A fresh master connection is
 * obtained for every attempt and closed afterwards.
 *
 * @param tableId
 *          id of the table to flush
 * @param start
 *          start row handed to {@code waitForFlush}
 * @param end
 *          end row handed to {@code waitForFlush}
 * @param wait
 *          when true {@code Long.MAX_VALUE} is passed as the last argument of {@code waitForFlush}, otherwise
 *          {@code 1} (presumably a bound on how long the master waits; confirm against the master service)
 * @throws TableNotFoundException
 *           when the master reports that the table does not exist
 * @throws AccumuloSecurityException
 *           for any other security failure reported by the master
 * @throws AccumuloException
 *           for any other table operation failure or unexpected exception
 */
private void _flush(Table.ID tableId, Text start, Text end, boolean wait) throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
  try {
    long flushID;

    // Step 1: ask the master to start the flush, retrying until a call succeeds
    while (true) {
      MasterClientService.Iface client = null;
      try {
        client = MasterClient.getConnectionWithRetry(context);
        flushID = client.initiateFlush(Tracer.traceInfo(), context.rpcCreds(), tableId.canonicalID());
        break;
      } catch (TTransportException tte) {
        log.debug("Failed to call initiateFlush, retrying ... ", tte);
        sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
      } catch (ThriftNotActiveServiceException e) {
        // Let it loop, fetching a new location
        log.debug("Contacted a Master which is no longer active, retrying");
        sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
      } finally {
        MasterClient.close(client);
      }
    }

    // Step 2: wait on the flush identified by flushID, with the same retry policy
    while (true) {
      MasterClientService.Iface client = null;
      try {
        client = MasterClient.getConnectionWithRetry(context);
        client.waitForFlush(Tracer.traceInfo(), context.rpcCreds(), tableId.canonicalID(), TextUtil.getByteBuffer(start), TextUtil.getByteBuffer(end), flushID, wait ? Long.MAX_VALUE : 1);
        break;
      } catch (TTransportException tte) {
        // Name the RPC that actually failed so the two retry loops can be told apart in the logs
        log.debug("Failed to call waitForFlush, retrying ... ", tte);
        sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
      } catch (ThriftNotActiveServiceException e) {
        // Let it loop, fetching a new location
        log.debug("Contacted a Master which is no longer active, retrying");
        sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
      } finally {
        MasterClient.close(client);
      }
    }
  } catch (ThriftSecurityException e) {
    switch (e.getCode()) {
      case TABLE_DOESNT_EXIST:
        throw new TableNotFoundException(tableId.canonicalID(), null, e.getMessage(), e);
      default:
        log.debug("flush security exception on table id {}", tableId);
        throw new AccumuloSecurityException(e.user, e.code, e);
    }
  } catch (ThriftTableOperationException e) {
    switch (e.getType()) {
      case NOTFOUND:
        throw new TableNotFoundException(e);
      default:
        throw new AccumuloException(e.description, e);
    }
  } catch (Exception e) {
    throw new AccumuloException(e);
  }
}
Aggregations