
Example 26 with Status

Use of org.apache.accumulo.server.replication.proto.Replication.Status in project accumulo by apache.

The class CloseWriteAheadLogReferencesIT, method partiallyReplicatedReferencedWalsAreNotClosed.

@Test
public void partiallyReplicatedReferencedWalsAreNotClosed() throws Exception {
    String file = "file:/accumulo/wal/tserver+port/12345";
    Set<String> wals = Collections.singleton(file);
    BatchWriter bw = ReplicationTable.getBatchWriter(conn);
    Mutation m = new Mutation(file);
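    // An "ingested until" status marks this WAL as partially replicated and still open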
    StatusSection.add(m, Table.ID.of("1"), ProtobufUtil.toValue(StatusUtil.ingestedUntil(1000)));
    bw.addMutation(m);
    bw.close();
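    // Run the utility; a referenced, partially replicated WAL must be left open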
    refs.updateReplicationEntries(conn, wals);
    try (Scanner s = ReplicationTable.getScanner(conn)) {
        Entry<Key, Value> entry = Iterables.getOnlyElement(s);
        Status status = Status.parseFrom(entry.getValue().get());
        Assert.assertFalse(status.getClosed());
    }
}
Also used: Status(org.apache.accumulo.server.replication.proto.Replication.Status) Scanner(org.apache.accumulo.core.client.Scanner) Value(org.apache.accumulo.core.data.Value) BatchWriter(org.apache.accumulo.core.client.BatchWriter) Mutation(org.apache.accumulo.core.data.Mutation) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
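
For orientation, here is a minimal sketch of the Status round-trip the assertion above relies on, built only from helpers that already appear in these examples (StatusUtil, ProtobufUtil, Status.parseFrom); the offset 1000 is purely illustrative:

Status original = StatusUtil.ingestedUntil(1000);
// Serialize the protobuf into an Accumulo Value, as StatusSection.add expects
Value value = ProtobufUtil.toValue(original);
// Parse it back out, exactly as the scanner loop above does
Status parsed = Status.parseFrom(value.get());
// parsed.getClosed() is false: an ingested-until status is still open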

Example 27 with Status

Use of org.apache.accumulo.server.replication.proto.Replication.Status in project accumulo by apache.

The class CloseWriteAheadLogReferencesIT, method closedWalsUpdateStatus.

@Test
public void closedWalsUpdateStatus() throws Exception {
    String file = "file:/accumulo/wal/tserver+port/12345";
    Set<String> wals = Collections.singleton(file);
    BatchWriter bw = conn.createBatchWriter(MetadataTable.NAME, new BatchWriterConfig());
    Mutation m = new Mutation(ReplicationSection.getRowPrefix() + file);
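    // A "file created" value records the WAL as newly created (open) in the metadata table's replication section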
    m.put(ReplicationSection.COLF, new Text("1"), StatusUtil.fileCreatedValue(System.currentTimeMillis()));
    bw.addMutation(m);
    bw.close();
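    // Run the utility; the metadata replication entry for this WAL should now be marked closed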
    refs.updateReplicationEntries(conn, wals);
    try (Scanner s = conn.createScanner(MetadataTable.NAME, Authorizations.EMPTY)) {
        s.fetchColumnFamily(ReplicationSection.COLF);
        Entry<Key, Value> entry = Iterables.getOnlyElement(s);
        Status status = Status.parseFrom(entry.getValue().get());
        Assert.assertTrue(status.getClosed());
    }
}
Also used: Status(org.apache.accumulo.server.replication.proto.Replication.Status) Scanner(org.apache.accumulo.core.client.Scanner) Value(org.apache.accumulo.core.data.Value) BatchWriterConfig(org.apache.accumulo.core.client.BatchWriterConfig) Text(org.apache.hadoop.io.Text) BatchWriter(org.apache.accumulo.core.client.BatchWriter) Mutation(org.apache.accumulo.core.data.Mutation) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
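
The contrast with Example 26 is the point of this pair of tests: an open status in the replication table is left alone, while this newly created entry in the metadata table's ReplicationSection gets marked closed by updateReplicationEntries. As a small hedged sketch, closing a status by hand looks like this with the generated protobuf builder (setClosed is the standard builder setter implied by getClosed above):

Status open = StatusUtil.ingestedUntil(1000);
// Protobuf messages are immutable; "closing" means building a modified copy
Status closed = Status.newBuilder(open).setClosed(true).build();
// closed.getClosed() is now true; begin and end are carried over unchanged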

Example 28 with Status

Use of org.apache.accumulo.server.replication.proto.Replication.Status in project accumulo by apache.

The class AccumuloReplicaSystem, method replicate.

@Override
public Status replicate(final Path p, final Status status, final ReplicationTarget target, final ReplicaSystemHelper helper) {
    final Instance localInstance = HdfsZooInstance.getInstance();
    final AccumuloConfiguration localConf = new ServerConfigurationFactory(localInstance).getSystemConfiguration();
    log.debug("Replication RPC timeout is {}", localConf.get(Property.REPLICATION_RPC_TIMEOUT.getKey()));
    final String principal = getPrincipal(localConf, target);
    final File keytab;
    final String password;
    if (localConf.getBoolean(Property.INSTANCE_RPC_SASL_ENABLED)) {
        String keytabPath = getKeytab(localConf, target);
        keytab = new File(keytabPath);
        if (!keytab.exists() || !keytab.isFile()) {
            log.error("{} is not a regular file. Cannot login to replicate", keytabPath);
            return status;
        }
        password = null;
    } else {
        keytab = null;
        password = getPassword(localConf, target);
    }
    if (null != keytab) {
        try {
            final UserGroupInformation accumuloUgi = UserGroupInformation.getCurrentUser();
            // Get a UGI with the principal + keytab
            UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab.getAbsolutePath());
            // Run inside a doAs to avoid nuking the Tserver's user
            return ugi.doAs(new PrivilegedAction<Status>() {

                @Override
                public Status run() {
                    KerberosToken token;
                    try {
                        // Do *not* replace the current user
                        token = new KerberosToken(principal, keytab);
                    } catch (IOException e) {
                        log.error("Failed to create KerberosToken", e);
                        return status;
                    }
                    ClientContext peerContext = getContextForPeer(localConf, target, principal, token);
                    return _replicate(p, status, target, helper, localConf, peerContext, accumuloUgi);
                }
            });
        } catch (IOException e) {
            // Can't log in, can't replicate
            log.error("Failed to perform local login", e);
            return status;
        }
    } else {
        // Simple case: make a password token, context and then replicate
        PasswordToken token = new PasswordToken(password);
        ClientContext peerContext = getContextForPeer(localConf, target, principal, token);
        return _replicate(p, status, target, helper, localConf, peerContext, null);
    }
}
Also used: Status(org.apache.accumulo.server.replication.proto.Replication.Status) Instance(org.apache.accumulo.core.client.Instance) ZooKeeperInstance(org.apache.accumulo.core.client.ZooKeeperInstance) HdfsZooInstance(org.apache.accumulo.server.client.HdfsZooInstance) KerberosToken(org.apache.accumulo.core.client.security.tokens.KerberosToken) ClientContext(org.apache.accumulo.core.client.impl.ClientContext) ServerConfigurationFactory(org.apache.accumulo.server.conf.ServerConfigurationFactory) IOException(java.io.IOException) PasswordToken(org.apache.accumulo.core.client.security.tokens.PasswordToken) File(java.io.File) RFile(org.apache.accumulo.core.file.rfile.RFile) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)
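
The SASL branch above follows the standard Hadoop UGI pattern: log in from a keytab to obtain a fresh UGI, then run the work inside doAs so the tserver's own login user is never replaced. A minimal standalone sketch of that pattern, with a hypothetical principal and keytab path:

static void runAsReplicationPrincipal() throws IOException {
    // Hypothetical principal and keytab path; substitute real values
    UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(
        "accumulo/host@EXAMPLE.COM", "/etc/security/keytabs/accumulo.keytab");
    ugi.doAs(new PrivilegedAction<Void>() {

        @Override
        public Void run() {
            // Everything here executes as the keytab principal, not the process login user
            return null;
        }
    });
}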

Example 29 with Status

Use of org.apache.accumulo.server.replication.proto.Replication.Status in project accumulo by apache.

The class AccumuloReplicaSystem, method _replicate.

/**
 * Perform replication, retrying a handful of times when an attempt fails with an exception.
 *
 * @param p
 *          Path of WAL to replicate
 * @param status
 *          Current status for the WAL
 * @param target
 *          Where we're replicating to
 * @param helper
 *          A helper for replication
 * @param localConf
 *          The local instance's configuration
 * @param peerContext
 *          The ClientContext to connect to the peer
 * @param accumuloUgi
 *          The local Accumulo user's UGI, used when recording status updates; null when SASL is not enabled
 * @return The new (or unchanged) Status for the WAL
 */
private Status _replicate(final Path p, final Status status, final ReplicationTarget target, final ReplicaSystemHelper helper, final AccumuloConfiguration localConf, final ClientContext peerContext, final UserGroupInformation accumuloUgi) {
    try {
        double tracePercent = localConf.getFraction(Property.REPLICATION_TRACE_PERCENT);
        ProbabilitySampler sampler = new ProbabilitySampler(tracePercent);
        Trace.on("AccumuloReplicaSystem", sampler);
        // Remote identifier is an integer (table id) in this case.
        final String remoteTableId = target.getRemoteIdentifier();
        // Attempt the replication of this status a number of times before giving up and
        // trying to replicate it again later some other time.
        int numAttempts = localConf.getCount(Property.REPLICATION_WORK_ATTEMPTS);
        for (int i = 0; i < numAttempts; i++) {
            log.debug("Attempt {}", i);
            String peerTserverStr;
            log.debug("Fetching peer tserver address");
            Span span = Trace.start("Fetch peer tserver");
            try {
                // Ask the master on the remote what TServer we should talk with to replicate the data
                peerTserverStr = ReplicationClient.executeCoordinatorWithReturn(peerContext, new ClientExecReturn<String, ReplicationCoordinator.Client>() {

                    @Override
                    public String execute(ReplicationCoordinator.Client client) throws Exception {
                        return client.getServicerAddress(remoteTableId, peerContext.rpcCreds());
                    }
                });
            } catch (AccumuloException | AccumuloSecurityException e) {
                // No progress is made
                log.error("Could not connect to master at {}, cannot proceed with replication. Will retry", target, e);
                continue;
            } finally {
                span.stop();
            }
            if (null == peerTserverStr) {
                // Something went wrong, and we didn't get a valid tserver from the remote for some reason
                log.warn("Did not receive tserver from master at {}, cannot proceed with replication. Will retry.", target);
                continue;
            }
            final HostAndPort peerTserver = HostAndPort.fromString(peerTserverStr);
            final long timeout = localConf.getTimeInMillis(Property.REPLICATION_RPC_TIMEOUT);
            // We have a tserver on the remote -- send the data its way.
            Status finalStatus;
            final long sizeLimit = conf.getAsBytes(Property.REPLICATION_MAX_UNIT_SIZE);
            try {
                if (p.getName().endsWith(RFILE_SUFFIX)) {
                    span = Trace.start("RFile replication");
                    try {
                        finalStatus = replicateRFiles(peerContext, peerTserver, target, p, status, sizeLimit, remoteTableId, peerContext.rpcCreds(), helper, timeout);
                    } finally {
                        span.stop();
                    }
                } else {
                    span = Trace.start("WAL replication");
                    try {
                        finalStatus = replicateLogs(peerContext, peerTserver, target, p, status, sizeLimit, remoteTableId, peerContext.rpcCreds(), helper, accumuloUgi, timeout);
                    } finally {
                        span.stop();
                    }
                }
                log.debug("New status for {} after replicating to {} is {}", p, peerContext.getInstance(), ProtobufUtil.toString(finalStatus));
                return finalStatus;
            } catch (TTransportException | AccumuloException | AccumuloSecurityException e) {
                log.warn("Could not connect to remote server {}, will retry", peerTserverStr, e);
                sleepUninterruptibly(1, TimeUnit.SECONDS);
            }
        }
        log.info("No progress was made after {} attempts to replicate {}, returning so file can be re-queued", numAttempts, p);
        // We made no progress; punt for now and let the file re-queue itself for work
        return status;
    } finally {
        Trace.off();
    }
}
Also used: ProbabilitySampler(org.apache.accumulo.core.trace.ProbabilitySampler) Status(org.apache.accumulo.server.replication.proto.Replication.Status) AccumuloException(org.apache.accumulo.core.client.AccumuloException) ClientExecReturn(org.apache.accumulo.core.client.impl.ClientExecReturn) TTransportException(org.apache.thrift.transport.TTransportException) ReplicationCoordinator(org.apache.accumulo.core.replication.thrift.ReplicationCoordinator) Span(org.apache.accumulo.core.trace.Span) HostAndPort(org.apache.accumulo.core.util.HostAndPort) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) Client(org.apache.accumulo.core.replication.thrift.ReplicationServicer.Client) ReplicationClient(org.apache.accumulo.core.client.impl.ReplicationClient)
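
Stripped of the tracing spans, _replicate is a bounded retry loop: ask the peer's master for a servicer address, retry on transient failures after a short sleep, and hand the unchanged status back once the attempts run out so the file re-queues. A hedged sketch of just that control flow, where fetchServicerAddress and sendBatches are hypothetical stand-ins for the RPCs above:

for (int i = 0; i < numAttempts; i++) {
    try {
        String tserver = fetchServicerAddress();  // hypothetical coordinator RPC
        if (tserver == null) {
            continue;  // no usable answer: burn an attempt and try again
        }
        return sendBatches(tserver);  // hypothetical; success ends the loop
    } catch (IOException e) {
        log.warn("Transient failure talking to peer, will retry", e);
        // Guava's Uninterruptibles mirrors the sleepUninterruptibly used above
        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
    }
}
return status;  // no progress after numAttempts; the work assigner re-queues the file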

Example 30 with Status

Use of org.apache.accumulo.server.replication.proto.Replication.Status in project accumulo by apache.

The class AccumuloReplicaSystem, method replicateLogs.

protected Status replicateLogs(ClientContext peerContext, final HostAndPort peerTserver, final ReplicationTarget target, final Path p, final Status status, final long sizeLimit, final String remoteTableId, final TCredentials tcreds, final ReplicaSystemHelper helper, final UserGroupInformation accumuloUgi, long timeout) throws TTransportException, AccumuloException, AccumuloSecurityException {
    log.debug("Replication WAL to peer tserver");
    final Set<Integer> tids;
    try (final FSDataInputStream fsinput = fs.open(p);
        final DataInputStream input = getWalStream(p, fsinput)) {
        log.debug("Skipping unwanted data in WAL");
        Span span = Trace.start("Consume WAL prefix");
        span.data("file", p.toString());
        try {
            // We want to read all records in the WAL up to the "begin" offset contained in the Status message,
            // building a Set of tids from DEFINE_TABLET events which correspond to table ids for future mutations
            tids = consumeWalPrefix(target, input, p, status, sizeLimit);
        } catch (IOException e) {
            log.warn("Unexpected error consuming file.");
            return status;
        } finally {
            span.stop();
        }
        log.debug("Sending batches of data to peer tserver");
        Status lastStatus = status, currentStatus = status;
        final AtomicReference<Exception> exceptionRef = new AtomicReference<>();
        while (true) {
            // Set some trace info
            span = Trace.start("Replicate WAL batch");
            span.data("Batch size (bytes)", Long.toString(sizeLimit));
            span.data("File", p.toString());
            span.data("Peer instance name", peerContext.getInstance().getInstanceName());
            span.data("Peer tserver", peerTserver.toString());
            span.data("Remote table ID", remoteTableId);
            ReplicationStats replResult;
            try {
                // Read and send a batch of mutations
                replResult = ReplicationClient.executeServicerWithReturn(peerContext, peerTserver, new WalClientExecReturn(target, input, p, currentStatus, sizeLimit, remoteTableId, tcreds, tids), timeout);
            } catch (Exception e) {
                log.error("Caught exception replicating data to {} at {}", peerContext.getInstance().getInstanceName(), peerTserver, e);
                throw e;
            } finally {
                span.stop();
            }
            // Catch the overflow
            long newBegin = currentStatus.getBegin() + replResult.entriesConsumed;
            if (newBegin < 0) {
                newBegin = Long.MAX_VALUE;
            }
            currentStatus = Status.newBuilder(currentStatus).setBegin(newBegin).build();
            log.debug("Sent batch for replication of {} to {}, with new Status {}", p, target, ProtobufUtil.toString(currentStatus));
            // If we got a different status
            if (!currentStatus.equals(lastStatus)) {
                span = Trace.start("Update replication table");
                try {
                    if (null != accumuloUgi) {
                        final Status copy = currentStatus;
                        accumuloUgi.doAs(new PrivilegedAction<Void>() {

                            @Override
                            public Void run() {
                                try {
                                    helper.recordNewStatus(p, copy, target);
                                } catch (Exception e) {
                                    exceptionRef.set(e);
                                }
                                return null;
                            }
                        });
                        Exception e = exceptionRef.get();
                        if (null != e) {
                            if (e instanceof TableNotFoundException) {
                                throw (TableNotFoundException) e;
                            } else if (e instanceof AccumuloSecurityException) {
                                throw (AccumuloSecurityException) e;
                            } else if (e instanceof AccumuloException) {
                                throw (AccumuloException) e;
                            } else {
                                throw new RuntimeException("Received unexpected exception", e);
                            }
                        }
                    } else {
                        helper.recordNewStatus(p, currentStatus, target);
                    }
                } catch (TableNotFoundException e) {
                    log.error("Tried to update status in replication table for {} as {}, but the table did not exist", p, ProtobufUtil.toString(currentStatus), e);
                    throw new RuntimeException("Replication table did not exist, will retry", e);
                } finally {
                    span.stop();
                }
                log.debug("Recorded updated status for {}: {}", p, ProtobufUtil.toString(currentStatus));
                // If we don't have any more work, just quit
                if (!StatusUtil.isWorkRequired(currentStatus)) {
                    return currentStatus;
                } else {
                    // Otherwise, let it loop and replicate some more data
                    lastStatus = currentStatus;
                }
            } else {
                log.debug("Did not replicate any new data for {} to {}, (state was {})", p, target, ProtobufUtil.toString(lastStatus));
                // we can just not record any updates, and it will be picked up again by the work assigner
                return status;
            }
        }
    } catch (LogHeaderIncompleteException e) {
        log.warn("Could not read header from {}, assuming that there is no data present in the WAL, therefore replication is complete", p);
        Status newStatus;
        // Bump up the begin to the (infinite) end, trying to be accurate
        if (status.getInfiniteEnd()) {
            newStatus = Status.newBuilder(status).setBegin(Long.MAX_VALUE).build();
        } else {
            newStatus = Status.newBuilder(status).setBegin(status.getEnd()).build();
        }
        Span span = Trace.start("Update replication table");
        try {
            helper.recordNewStatus(p, newStatus, target);
        } catch (TableNotFoundException tnfe) {
            log.error("Tried to update status in replication table for {} as {}, but the table did not exist", p, ProtobufUtil.toString(newStatus), e);
            throw new RuntimeException("Replication table did not exist, will retry", e);
        } finally {
            span.stop();
        }
        return newStatus;
    } catch (IOException e) {
        log.error("Could not create stream for WAL", e);
        // No data sent (bytes nor records) and no progress made
        return status;
    }
}
Also used: Status(org.apache.accumulo.server.replication.proto.Replication.Status) AccumuloException(org.apache.accumulo.core.client.AccumuloException) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) DataInputStream(java.io.DataInputStream) Span(org.apache.accumulo.core.trace.Span) LogHeaderIncompleteException(org.apache.accumulo.tserver.log.DfsLogger.LogHeaderIncompleteException) TTransportException(org.apache.thrift.transport.TTransportException) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) EOFException(java.io.EOFException) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException)
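
One detail worth pulling out of the batch loop above: begin advances by entriesConsumed after every round trip, and since both are longs the sum can wrap negative on overflow. The code treats that as "everything replicated" by clamping to Long.MAX_VALUE. The same saturating add in isolation:

long begin = Long.MAX_VALUE - 10;  // illustrative value near the ceiling
long entriesConsumed = 100;
long newBegin = begin + entriesConsumed;  // wraps negative on overflow
if (newBegin < 0) {
    newBegin = Long.MAX_VALUE;  // clamp: overflow means the whole file is consumed
}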

Aggregations

Status (org.apache.accumulo.server.replication.proto.Replication.Status) 77
Test (org.junit.Test) 57
Mutation (org.apache.accumulo.core.data.Mutation) 30
Text (org.apache.hadoop.io.Text) 29
BatchWriter (org.apache.accumulo.core.client.BatchWriter) 28
Key (org.apache.accumulo.core.data.Key) 27
Value (org.apache.accumulo.core.data.Value) 26
Scanner (org.apache.accumulo.core.client.Scanner) 21
ReplicationTarget (org.apache.accumulo.core.replication.ReplicationTarget) 20
Path (org.apache.hadoop.fs.Path) 17
HashMap (java.util.HashMap) 14
BatchWriterConfig (org.apache.accumulo.core.client.BatchWriterConfig) 14
Table (org.apache.accumulo.core.client.impl.Table) 14
ReplicationTable (org.apache.accumulo.core.replication.ReplicationTable) 13
AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException) 12
AccumuloException (org.apache.accumulo.core.client.AccumuloException) 11
Connector (org.apache.accumulo.core.client.Connector) 11
InvalidProtocolBufferException (com.google.protobuf.InvalidProtocolBufferException) 10
TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException) 10
DataInputStream (java.io.DataInputStream) 9