Search in sources :

Example 1 with ProbabilitySampler

use of org.apache.accumulo.core.trace.ProbabilitySampler in project accumulo by apache.

the class AccumuloReplicaSystem method _replicate.

/**
 * Replicates one file to the peer, retrying a configured number of times before giving up.
 *
 * <p>
 * Each attempt first asks the peer's coordinator which tserver should receive the data, then ships the file to that tserver. Files whose name ends with
 * {@code RFILE_SUFFIX} go through {@code replicateRFiles}; everything else goes through {@code replicateLogs}. Tracing is switched on for the duration of the
 * call and always switched off again on exit.
 *
 * @param p
 *          Path of the file to replicate
 * @param status
 *          Current status for the file
 * @param target
 *          Where we're replicating to
 * @param helper
 *          A helper for replication
 * @param localConf
 *          The local instance's configuration (trace percentage, attempt count, RPC timeout)
 * @param peerContext
 *          The ClientContext used to connect to the peer
 * @param accumuloUgi
 *          Handed to {@code replicateLogs}; not used on the RFile path
 * @return The status returned by the replication call, or the unchanged {@code status} when no attempt succeeded
 */
private Status _replicate(final Path p, final Status status, final ReplicationTarget target, final ReplicaSystemHelper helper, final AccumuloConfiguration localConf, final ClientContext peerContext, final UserGroupInformation accumuloUgi) {
    try {
        Trace.on("AccumuloReplicaSystem", new ProbabilitySampler(localConf.getFraction(Property.REPLICATION_TRACE_PERCENT)));
        // For this replica system the remote identifier is the peer's table id.
        final String remoteTableId = target.getRemoteIdentifier();
        // Bounded retries: once exhausted, the caller gets the original status back so the work can be re-queued.
        final int maxAttempts = localConf.getCount(Property.REPLICATION_WORK_ATTEMPTS);
        for (int attempt = 0; attempt < maxAttempts; attempt++) {
            log.debug("Attempt {}", attempt);
            String peerTserverStr;
            log.debug("Fetching peer tserver address");
            final Span lookupSpan = Trace.start("Fetch peer tserver");
            try {
                // The remote coordinator tells us which of its tservers should receive the data.
                peerTserverStr = ReplicationClient.executeCoordinatorWithReturn(peerContext, new ClientExecReturn<String, ReplicationCoordinator.Client>() {

                    @Override
                    public String execute(ReplicationCoordinator.Client client) throws Exception {
                        return client.getServicerAddress(remoteTableId, peerContext.rpcCreds());
                    }
                });
            } catch (AccumuloException | AccumuloSecurityException e) {
                // Nothing was sent; move straight on to the next attempt.
                log.error("Could not connect to master at {}, cannot proceed with replication. Will retry", target, e);
                continue;
            } finally {
                lookupSpan.stop();
            }
            if (peerTserverStr == null) {
                // The call succeeded but the remote did not name a tserver.
                log.warn("Did not receive tserver from master at {}, cannot proceed with replication. Will retry.", target);
                continue;
            }
            final HostAndPort peerTserver = HostAndPort.fromString(peerTserverStr);
            final long timeout = localConf.getTimeInMillis(Property.REPLICATION_RPC_TIMEOUT);
            // NOTE(review): this reads the instance's `conf`, not the `localConf` parameter used everywhere else here -- confirm that is intended.
            final long sizeLimit = conf.getAsBytes(Property.REPLICATION_MAX_UNIT_SIZE);
            try {
                // A tserver is available on the remote: pick the transfer path by file type.
                final boolean isRFile = p.getName().endsWith(RFILE_SUFFIX);
                final Span transferSpan = Trace.start(isRFile ? "RFile replication" : "WAL replication");
                Status finalStatus;
                try {
                    if (isRFile) {
                        finalStatus = replicateRFiles(peerContext, peerTserver, target, p, status, sizeLimit, remoteTableId, peerContext.rpcCreds(), helper, timeout);
                    } else {
                        finalStatus = replicateLogs(peerContext, peerTserver, target, p, status, sizeLimit, remoteTableId, peerContext.rpcCreds(), helper, accumuloUgi, timeout);
                    }
                } finally {
                    transferSpan.stop();
                }
                log.debug("New status for {} after replicating to {} is {}", p, peerContext.getInstance(), ProtobufUtil.toString(finalStatus));
                return finalStatus;
            } catch (TTransportException | AccumuloException | AccumuloSecurityException e) {
                log.warn("Could not connect to remote server {}, will retry", peerTserverStr, e);
                // Brief pause before the next attempt.
                sleepUninterruptibly(1, TimeUnit.SECONDS);
            }
        }
        log.info("No progress was made after {} attempts to replicate {}, returning so file can be re-queued", maxAttempts, p);
        // Every attempt failed: hand back the status we were given.
        return status;
    } finally {
        Trace.off();
    }
}
Also used : ProbabilitySampler(org.apache.accumulo.core.trace.ProbabilitySampler) Status(org.apache.accumulo.server.replication.proto.Replication.Status) AccumuloException(org.apache.accumulo.core.client.AccumuloException) ClientExecReturn(org.apache.accumulo.core.client.impl.ClientExecReturn) TTransportException(org.apache.thrift.transport.TTransportException) ReplicationCoordinator(org.apache.accumulo.core.replication.thrift.ReplicationCoordinator) Span(org.apache.accumulo.core.trace.Span) HostAndPort(org.apache.accumulo.core.util.HostAndPort) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) Client(org.apache.accumulo.core.replication.thrift.ReplicationServicer.Client) ReplicationClient(org.apache.accumulo.core.client.impl.ReplicationClient)

Example 2 with ProbabilitySampler

use of org.apache.accumulo.core.trace.ProbabilitySampler in project accumulo by apache.

the class MinorCompactionTask method run.

/**
 * Runs one minor compaction of the tablet under a sampled "minorCompaction" trace.
 *
 * <p>
 * Steps, each in its own child span: wait for in-flight commits, record the minor-compaction start event (retried until it succeeds), then perform the
 * compaction. Afterwards the tablet is either split or has a major compaction initiated. Every child span is stopped in a {@code finally} block so a failure
 * in any step cannot leave a span open on this thread.
 *
 * @throws RuntimeException
 *           wrapping any Throwable raised while compacting
 */
@Override
public void run() {
    tablet.minorCompactionStarted();
    ProbabilitySampler sampler = new ProbabilitySampler(tracePercent);
    Span minorCompaction = Trace.on("minorCompaction", sampler);
    try {
        FileRef newMapfileLocation = tablet.getNextMapFilename(mergeFile == null ? "F" : "M");
        FileRef tmpFileRef = new FileRef(newMapfileLocation.path() + "_tmp");
        Span span = Trace.start("waitForCommits");
        try {
            synchronized (tablet) {
                commitSession.waitForCommitsToFinish();
            }
        } finally {
            span.stop();
        }
        span = Trace.start("start");
        try {
            // Keep retrying until the start event has been written; an IOException only triggers another attempt.
            while (true) {
                try {
                    // the purpose of the minor compaction start event is to keep track of the filename... in the case
                    // where the metadata table write for the minor compaction finishes and the process dies before
                    // writing the minor compaction finish event, then the start event+filename in metadata table will
                    // prevent recovery of duplicate data... the minor compaction start event could be written at any time
                    // before the metadata write for the minor compaction
                    tablet.getTabletServer().minorCompactionStarted(commitSession, commitSession.getWALogSeq() + 1, newMapfileLocation.path().toString());
                    break;
                } catch (IOException e) {
                    log.warn("Failed to write to write ahead log {}", e.getMessage(), e);
                }
            }
        } finally {
            span.stop();
        }
        span = Trace.start("compact");
        try {
            this.stats = tablet.minorCompact(tablet.getTabletServer().getFileSystem(), tablet.getTabletMemory().getMinCMemTable(), tmpFileRef, newMapfileLocation, mergeFile, true, queued, commitSession, flushId, mincReason);
        } finally {
            span.stop();
        }
        minorCompaction.data("extent", tablet.getExtent().toString());
        minorCompaction.data("numEntries", Long.toString(this.stats.getNumEntries()));
        minorCompaction.data("size", Long.toString(this.stats.getSize()));
        // Stopped here, before the split / major-compaction follow-up below.
        // NOTE(review): the finally block stops this span again on the success path -- confirm Span.stop() tolerates a second call.
        minorCompaction.stop();
        if (tablet.needsSplit()) {
            tablet.getTabletServer().executeSplit(tablet);
        } else {
            tablet.initiateMajorCompaction(MajorCompactionReason.NORMAL);
        }
    } catch (Throwable t) {
        log.error("Unknown error during minor compaction for extent: " + tablet.getExtent(), t);
        throw new RuntimeException(t);
    } finally {
        tablet.minorCompactionComplete();
        minorCompaction.stop();
    }
}
Also used : ProbabilitySampler(org.apache.accumulo.core.trace.ProbabilitySampler) FileRef(org.apache.accumulo.server.fs.FileRef) IOException(java.io.IOException) Span(org.apache.accumulo.core.trace.Span)

Example 3 with ProbabilitySampler

use of org.apache.accumulo.core.trace.ProbabilitySampler in project accumulo by apache.

the class Tablet method majorCompact.

// END PRIVATE METHODS RELATED TO MAJOR COMPACTION
/**
 * Runs a major compaction of this tablet for the given reason, provided one is still warranted.
 *
 * <p>
 * Nothing is done, and {@code null} is returned, when the tablet is closing or closed, no longer needs a major compaction for this reason, already has one
 * running, or needs to be split. A canceled compaction, an IOException or a RuntimeException is logged rather than propagated.
 *
 * @param reason
 *          why the compaction was requested; {@code CHOP} additionally marks the tablet as chopped and notifies the master
 * @param queued
 *          forwarded unchanged to the timer when the operation is recorded
 * @return the stats produced by {@code _majorCompact}, or {@code null} if the compaction was skipped or failed before producing any
 */
CompactionStats majorCompact(MajorCompactionReason reason, long queued) {
    final long startTime = System.currentTimeMillis();
    timer.incrementStatusMajor();
    synchronized (this) {
        // Re-check under the lock that the compaction is still needed; a pending split takes precedence.
        majorCompactionQueued.remove(reason);
        final boolean skip = isClosing() || isClosed() || !needsMajorCompaction(reason) || isMajorCompactionRunning() || needsSplit();
        if (skip) {
            return null;
        }
        majorCompactionState = CompactionState.WAITING_TO_START;
    }
    CompactionStats compactionStats = null;
    boolean failed = true;
    Span traceSpan = null;
    try {
        traceSpan = Trace.on("majorCompaction", new ProbabilitySampler(tabletServer.getConfiguration().getFraction(Property.TSERV_MAJC_TRACE_PERCENT)));
        compactionStats = _majorCompact(reason);
        if (MajorCompactionReason.CHOP == reason) {
            MetadataTableUtil.chopped(getTabletServer(), getExtent(), this.getTabletServer().getLock());
            getTabletServer().enqueueMasterMessage(new TabletStatusMessage(TabletLoadState.CHOPPED, extent));
        }
        failed = false;
    } catch (CompactionCanceledException cce) {
        log.debug("Major compaction canceled, extent = {}", getExtent());
    } catch (IOException ioe) {
        log.error("MajC Failed, extent = " + getExtent(), ioe);
    } catch (RuntimeException e) {
        log.error("MajC Unexpected exception, extent = " + getExtent(), e);
    } finally {
        // Runs on every exit path, including exceptions: clear the compaction state and wake any waiters.
        synchronized (this) {
            majorCompactionState = null;
            this.notifyAll();
        }
        // traceSpan is still null if Trace.on (or the configuration lookup feeding it) threw.
        if (traceSpan != null) {
            traceSpan.data("extent", "" + getExtent());
            if (compactionStats != null) {
                traceSpan.data("read", "" + compactionStats.getEntriesRead());
                traceSpan.data("written", "" + compactionStats.getEntriesWritten());
            }
            traceSpan.stop();
        }
    }
    // Without stats, record the operation with zero entries read.
    final long entriesRead = (compactionStats == null) ? 0 : compactionStats.getEntriesRead();
    timer.updateTime(Operation.MAJOR, queued, startTime, entriesRead, failed);
    return compactionStats;
}
Also used : ProbabilitySampler(org.apache.accumulo.core.trace.ProbabilitySampler) TabletStatusMessage(org.apache.accumulo.tserver.mastermessage.TabletStatusMessage) CompactionCanceledException(org.apache.accumulo.tserver.tablet.Compactor.CompactionCanceledException) IOException(java.io.IOException) Span(org.apache.accumulo.core.trace.Span)

Example 4 with ProbabilitySampler

use of org.apache.accumulo.core.trace.ProbabilitySampler in project accumulo by apache.

the class ReplicationDriver method run.

/**
 * Main loop of the replication driver: for as long as this process is still the master, repeatedly runs the status maker, work maker, finished-work updater
 * and complete-record remover in that order, then sleeps before the next pass.
 *
 * <p>
 * The four workers are created lazily on the first pass that manages to obtain a connector. A failure in any one step is logged and does not prevent the
 * remaining steps from running.
 */
@Override
public void run() {
    final ProbabilitySampler traceSampler = new ProbabilitySampler(conf.getFraction(Property.REPLICATION_TRACE_PERCENT));
    final long startupDelayMillis = conf.getTimeInMillis(Property.REPLICATION_DRIVER_DELAY);
    log.debug("Waiting {}ms before starting main replication loop", startupDelayMillis);
    UtilWaitThread.sleep(startupDelayMillis);
    log.debug("Starting replication loop");
    while (master.stillMaster()) {
        // Lazily build the workers; workMaker doubles as the "already initialized" marker.
        if (workMaker == null) {
            try {
                conn = master.getConnector();
            } catch (AccumuloException | AccumuloSecurityException e) {
                // No connector yet: back off briefly and start the pass over.
                log.warn("Error trying to get connector to process replication records", e);
                UtilWaitThread.sleep(2000);
                continue;
            }
            statusMaker = new StatusMaker(conn, master.getFileSystem());
            workMaker = new WorkMaker(master, conn);
            finishedWorkUpdater = new FinishedWorkUpdater(conn);
            rcrr = new RemoveCompleteReplicationRecords(conn);
        }
        Trace.on("masterReplicationDriver", traceSampler);
        // Step 1: create status records (this also ends up creating the replication table).
        try {
            statusMaker.run();
        } catch (Exception e) {
            log.error("Caught Exception trying to create Replication status records", e);
        }
        // Step 2: turn status records into work records.
        try {
            workMaker.run();
        } catch (Exception e) {
            log.error("Caught Exception trying to create Replication work records", e);
        }
        // Step 3: fold finished work back into the status records.
        try {
            finishedWorkUpdater.run();
        } catch (Exception e) {
            log.error("Caught Exception trying to update Replication records using finished work records", e);
        }
        // Step 4: remove finished records. The steps are run one after another, never concurrently.
        try {
            rcrr.run();
        } catch (Exception e) {
            log.error("Caught Exception trying to remove finished Replication records", e);
        }
        Trace.off();
        // Pause before the next pass.
        final long pauseMillis = conf.getTimeInMillis(Property.MASTER_REPLICATION_SCAN_INTERVAL);
        log.debug("Sleeping for {}ms before re-running", pauseMillis);
        try {
            Thread.sleep(pauseMillis);
        } catch (InterruptedException e) {
            // NOTE(review): the interrupt is logged and otherwise ignored; the loop continues while stillMaster() holds.
            log.error("Interrupted while sleeping", e);
        }
    }
}
Also used : ProbabilitySampler(org.apache.accumulo.core.trace.ProbabilitySampler) AccumuloException(org.apache.accumulo.core.client.AccumuloException) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) AccumuloException(org.apache.accumulo.core.client.AccumuloException)

Example 5 with ProbabilitySampler

use of org.apache.accumulo.core.trace.ProbabilitySampler in project accumulo by apache.

the class SimpleGarbageCollector method run.

/**
 * Main loop of the garbage collector.
 *
 * <p>
 * Acquires the garbage collector's ZooKeeper lock (exiting the JVM with status 1 on failure), sleeps for the configured start delay, and then cycles forever:
 * collect data-file candidates for the root and metadata tables, close write-ahead-log references for replication, collect unused write-ahead logs, compact
 * the metadata and root tables, and sleep for the cycle delay. Each cycle runs under a sampled "gc" trace.
 *
 * <p>
 * The method returns only when a sleep is interrupted. In that case the thread's interrupt status is restored before returning so the caller can still
 * observe the interruption.
 */
private void run() {
    long tStart, tStop;
    log.info("Trying to acquire ZooKeeper lock for garbage collector");
    try {
        getZooLock(startStatsService());
    } catch (Exception ex) {
        log.error("{}", ex.getMessage(), ex);
        System.exit(1);
    }
    // Sleep for an initial period, giving the master time to start up and
    // old data files to be unused
    try {
        long delay = getStartDelay();
        log.debug("Sleeping for {} milliseconds before beginning garbage collection cycles", delay);
        Thread.sleep(delay);
    } catch (InterruptedException e) {
        log.warn("{}", e.getMessage(), e);
        // Catching InterruptedException clears the flag; set it again before bailing out.
        Thread.currentThread().interrupt();
        return;
    }
    ProbabilitySampler sampler = new ProbabilitySampler(getConfiguration().getFraction(Property.GC_TRACE_PERCENT));
    while (true) {
        Trace.on("gc", sampler);
        Span gcSpan = Trace.start("loop");
        tStart = System.currentTimeMillis();
        try {
            // make room
            System.gc();
            status.current.started = System.currentTimeMillis();
            new GarbageCollectionAlgorithm().collect(new GCEnv(RootTable.NAME));
            new GarbageCollectionAlgorithm().collect(new GCEnv(MetadataTable.NAME));
            log.info("Number of data file candidates for deletion: {}", status.current.candidates);
            log.info("Number of data file candidates still in use: {}", status.current.inUse);
            log.info("Number of successfully deleted data files: {}", status.current.deleted);
            log.info("Number of data files delete failures: {}", status.current.errors);
            status.current.finished = System.currentTimeMillis();
            // Roll the finished cycle into "last" and start a fresh stats object for the next cycle.
            status.last = status.current;
            status.current = new GcCycleStats();
        } catch (Exception e) {
            log.error("{}", e.getMessage(), e);
        }
        tStop = System.currentTimeMillis();
        log.info(String.format("Collect cycle took %.2f seconds", ((tStop - tStart) / 1000.0)));
        // We want to prune references to fully-replicated WALs from the replication table which are no longer referenced in the metadata table
        // before running GarbageCollectWriteAheadLogs to ensure we delete as many files as possible.
        Span replSpan = Trace.start("replicationClose");
        try {
            CloseWriteAheadLogReferences closeWals = new CloseWriteAheadLogReferences(this);
            closeWals.run();
        } catch (Exception e) {
            log.error("Error trying to close write-ahead logs for replication table", e);
        } finally {
            replSpan.stop();
        }
        // Clean up any unused write-ahead logs
        Span waLogs = Trace.start("walogs");
        try {
            GarbageCollectWriteAheadLogs walogCollector = new GarbageCollectWriteAheadLogs(this, fs, isUsingTrash());
            log.info("Beginning garbage collection of write-ahead logs");
            walogCollector.collect(status);
        } catch (Exception e) {
            log.error("{}", e.getMessage(), e);
        } finally {
            waLogs.stop();
        }
        gcSpan.stop();
        // we just made a lot of metadata changes: flush them out
        try {
            Connector connector = getConnector();
            connector.tableOperations().compact(MetadataTable.NAME, null, null, true, true);
            connector.tableOperations().compact(RootTable.NAME, null, null, true, true);
        } catch (Exception e) {
            log.warn("{}", e.getMessage(), e);
        }
        Trace.off();
        try {
            long gcDelay = getConfiguration().getTimeInMillis(Property.GC_CYCLE_DELAY);
            log.debug("Sleeping for {} milliseconds", gcDelay);
            Thread.sleep(gcDelay);
        } catch (InterruptedException e) {
            log.warn("{}", e.getMessage(), e);
            // Preserve the interrupt for the caller before leaving the loop.
            Thread.currentThread().interrupt();
            return;
        }
    }
}
Also used : ProbabilitySampler(org.apache.accumulo.core.trace.ProbabilitySampler) CloseWriteAheadLogReferences(org.apache.accumulo.gc.replication.CloseWriteAheadLogReferences) Connector(org.apache.accumulo.core.client.Connector) GcCycleStats(org.apache.accumulo.core.gc.thrift.GcCycleStats) Span(org.apache.accumulo.core.trace.Span) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) InvalidProtocolBufferException(com.google.protobuf.InvalidProtocolBufferException) MutationsRejectedException(org.apache.accumulo.core.client.MutationsRejectedException) FileNotFoundException(java.io.FileNotFoundException) ReplicationTableOfflineException(org.apache.accumulo.core.replication.ReplicationTableOfflineException) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) AccumuloException(org.apache.accumulo.core.client.AccumuloException)

Aggregations

ProbabilitySampler (org.apache.accumulo.core.trace.ProbabilitySampler)5 Span (org.apache.accumulo.core.trace.Span)4 IOException (java.io.IOException)3 AccumuloException (org.apache.accumulo.core.client.AccumuloException)3 AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException)3 InvalidProtocolBufferException (com.google.protobuf.InvalidProtocolBufferException)1 FileNotFoundException (java.io.FileNotFoundException)1 UnknownHostException (java.net.UnknownHostException)1 Connector (org.apache.accumulo.core.client.Connector)1 MutationsRejectedException (org.apache.accumulo.core.client.MutationsRejectedException)1 TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException)1 ClientExecReturn (org.apache.accumulo.core.client.impl.ClientExecReturn)1 ReplicationClient (org.apache.accumulo.core.client.impl.ReplicationClient)1 GcCycleStats (org.apache.accumulo.core.gc.thrift.GcCycleStats)1 ReplicationTableOfflineException (org.apache.accumulo.core.replication.ReplicationTableOfflineException)1 ReplicationCoordinator (org.apache.accumulo.core.replication.thrift.ReplicationCoordinator)1 Client (org.apache.accumulo.core.replication.thrift.ReplicationServicer.Client)1 HostAndPort (org.apache.accumulo.core.util.HostAndPort)1 CloseWriteAheadLogReferences (org.apache.accumulo.gc.replication.CloseWriteAheadLogReferences)1 FileRef (org.apache.accumulo.server.fs.FileRef)1