Search in sources :

Example 1 with GcMetrics

Use of org.apache.accumulo.gc.metrics.GcMetrics in the Apache Accumulo project.

The run method of the class SimpleGarbageCollector.

/**
 * Main loop of the garbage collector.
 *
 * <p>Starts the stats service, acquires the ZooKeeper lock (exiting the JVM with status 1 if
 * {@code getZooLock} throws), initializes metrics on a best-effort basis, sleeps for the
 * configured start delay, and then repeats the following cycle until the thread is interrupted:
 * <ol>
 * <li>collect data file candidates for the ROOT, METADATA and USER data levels;</li>
 * <li>close write-ahead log references for the replication table;</li>
 * <li>collect unused write-ahead logs;</li>
 * <li>run the configured post action ("compact" or "flush") on the metadata and root tables;</li>
 * <li>sleep for {@code Property.GC_CYCLE_DELAY}.</li>
 * </ol>
 *
 * <p>Failures inside an individual step are recorded on the active trace span and logged; they
 * do not end the loop. When a sleep is interrupted the interrupt status of the thread is
 * restored and the method returns.
 */
@Override
@SuppressFBWarnings(value = "DM_EXIT", justification = "main class can call System.exit")
public void run() {
    final VolumeManager fs = getContext().getVolumeManager();
    log.info("Trying to acquire ZooKeeper lock for garbage collector");
    HostAndPort address = startStatsService();
    try {
        getZooLock(address);
    } catch (Exception ex) {
        log.error("{}", ex.getMessage(), ex);
        System.exit(1);
    }
    try {
        MetricsUtil.initializeMetrics(getContext().getConfiguration(), this.applicationName, address);
        MetricsUtil.initializeProducers(new GcMetrics(this));
    } catch (Exception e1) {
        // Metrics are optional: keep running without them.
        log.error("Error initializing metrics, metrics will not be emitted.", e1);
    }
    // Sleep for an initial period, giving the manager time to start up and
    // old data files to be unused
    try {
        long delay = getStartDelay();
        log.debug("Sleeping for {} milliseconds before beginning garbage collection cycles", delay);
        Thread.sleep(delay);
    } catch (InterruptedException e) {
        log.warn("{}", e.getMessage(), e);
        // Restore the interrupt status so the caller can observe the interruption.
        Thread.currentThread().interrupt();
        return;
    }
    // This is created outside of the run loop and passed to the walogCollector so that
    // only a single timed task is created (internal to LiveTServerSet) using SimpleTimer.
    final LiveTServerSet liveTServerSet = new LiveTServerSet(getContext(), (current, deleted, added) -> {
        log.debug("Number of current servers {}, tservers added {}, removed {}", current == null ? -1 : current.size(), added, deleted);
        if (log.isTraceEnabled()) {
            log.trace("Current servers: {}\nAdded: {}\n Removed: {}", current, added, deleted);
        }
    });
    while (true) {
        Span outerSpan = TraceUtil.startSpan(this.getClass(), "gc");
        try (Scope outerScope = outerSpan.makeCurrent()) {
            Span innerSpan = TraceUtil.startSpan(this.getClass(), "loop");
            try (Scope innerScope = innerSpan.makeCurrent()) {
                final long tStart = System.nanoTime();
                try {
                    // make room
                    System.gc();
                    status.current.started = System.currentTimeMillis();
                    new GarbageCollectionAlgorithm().collect(new GCEnv(DataLevel.ROOT));
                    new GarbageCollectionAlgorithm().collect(new GCEnv(DataLevel.METADATA));
                    new GarbageCollectionAlgorithm().collect(new GCEnv(DataLevel.USER));
                    log.info("Number of data file candidates for deletion: {}", status.current.candidates);
                    log.info("Number of data file candidates still in use: {}", status.current.inUse);
                    log.info("Number of successfully deleted data files: {}", status.current.deleted);
                    log.info("Number of data files delete failures: {}", status.current.errors);
                    status.current.finished = System.currentTimeMillis();
                    // Publish the finished cycle and start a fresh stats object for the next one.
                    status.last = status.current;
                    gcCycleMetrics.setLastCollect(status.current);
                    status.current = new GcCycleStats();
                } catch (Exception e) {
                    TraceUtil.setException(innerSpan, e, false);
                    log.error("{}", e.getMessage(), e);
                }
                final long tStop = System.nanoTime();
                log.info("Collect cycle took {} seconds", String.format("%.2f", (TimeUnit.NANOSECONDS.toMillis(tStop - tStart) / 1000.0)));
                /*
                 * We want to prune references to fully-replicated WALs from the replication table which
                 * are no longer referenced in the metadata table before running
                 * GarbageCollectWriteAheadLogs to ensure we delete as many files as possible.
                 */
                Span replSpan = TraceUtil.startSpan(this.getClass(), "replicationClose");
                try (Scope replScope = replSpan.makeCurrent()) {
                    @SuppressWarnings("deprecation") Runnable closeWals = new org.apache.accumulo.gc.replication.CloseWriteAheadLogReferences(getContext());
                    closeWals.run();
                } catch (Exception e) {
                    TraceUtil.setException(replSpan, e, false);
                    log.error("Error trying to close write-ahead logs for replication table", e);
                } finally {
                    replSpan.end();
                }
                // Clean up any unused write-ahead logs
                Span walSpan = TraceUtil.startSpan(this.getClass(), "walogs");
                try (Scope walScope = walSpan.makeCurrent()) {
                    GarbageCollectWriteAheadLogs walogCollector = new GarbageCollectWriteAheadLogs(getContext(), fs, liveTServerSet, isUsingTrash());
                    log.info("Beginning garbage collection of write-ahead logs");
                    walogCollector.collect(status);
                    gcCycleMetrics.setLastWalCollect(status.lastLog);
                } catch (Exception e) {
                    TraceUtil.setException(walSpan, e, false);
                    log.error("{}", e.getMessage(), e);
                } finally {
                    walSpan.end();
                }
            } catch (Exception e) {
                TraceUtil.setException(innerSpan, e, true);
                throw e;
            } finally {
                innerSpan.end();
            }
            // we just made a lot of metadata changes: flush them out
            try {
                AccumuloClient accumuloClient = getContext();
                final long actionStart = System.nanoTime();
                String action = getConfiguration().get(Property.GC_USE_FULL_COMPACTION);
                log.debug("gc post action {} started", action);
                switch(action) {
                    case "compact":
                        accumuloClient.tableOperations().compact(MetadataTable.NAME, null, null, true, true);
                        accumuloClient.tableOperations().compact(RootTable.NAME, null, null, true, true);
                        break;
                    case "flush":
                        accumuloClient.tableOperations().flush(MetadataTable.NAME, null, null, true);
                        accumuloClient.tableOperations().flush(RootTable.NAME, null, null, true);
                        break;
                    default:
                        log.trace("'none - no action' or invalid value provided: {}", action);
                }
                final long actionComplete = System.nanoTime();
                gcCycleMetrics.setPostOpDurationNanos(actionComplete - actionStart);
                log.info("gc post action {} completed in {} seconds", action, String.format("%.2f", (TimeUnit.NANOSECONDS.toMillis(actionComplete - actionStart) / 1000.0)));
            } catch (Exception e) {
                TraceUtil.setException(outerSpan, e, false);
                log.warn("{}", e.getMessage(), e);
            }
        } catch (Exception e) {
            TraceUtil.setException(outerSpan, e, true);
            throw e;
        } finally {
            outerSpan.end();
        }
        try {
            gcCycleMetrics.incrementRunCycleCount();
            long gcDelay = getConfiguration().getTimeInMillis(Property.GC_CYCLE_DELAY);
            log.debug("Sleeping for {} milliseconds", gcDelay);
            Thread.sleep(gcDelay);
        } catch (InterruptedException e) {
            log.warn("{}", e.getMessage(), e);
            // Restore the interrupt status so the caller can observe the interruption.
            Thread.currentThread().interrupt();
            return;
        }
    }
}
Also used : AccumuloClient(org.apache.accumulo.core.client.AccumuloClient) VolumeManager(org.apache.accumulo.server.fs.VolumeManager) GcMetrics(org.apache.accumulo.gc.metrics.GcMetrics) GcCycleStats(org.apache.accumulo.core.gc.thrift.GcCycleStats) Span(io.opentelemetry.api.trace.Span) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) InvalidProtocolBufferException(com.google.protobuf.InvalidProtocolBufferException) FileNotFoundException(java.io.FileNotFoundException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) LiveTServerSet(org.apache.accumulo.server.manager.LiveTServerSet) HostAndPort(org.apache.accumulo.core.util.HostAndPort) Scope(io.opentelemetry.context.Scope) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings)

Aggregations

InvalidProtocolBufferException (com.google.protobuf.InvalidProtocolBufferException)1 SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings)1 Span (io.opentelemetry.api.trace.Span)1 Scope (io.opentelemetry.context.Scope)1 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 AccumuloClient (org.apache.accumulo.core.client.AccumuloClient)1 TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException)1 GcCycleStats (org.apache.accumulo.core.gc.thrift.GcCycleStats)1 HostAndPort (org.apache.accumulo.core.util.HostAndPort)1 GcMetrics (org.apache.accumulo.gc.metrics.GcMetrics)1 VolumeManager (org.apache.accumulo.server.fs.VolumeManager)1 LiveTServerSet (org.apache.accumulo.server.manager.LiveTServerSet)1 KeeperException (org.apache.zookeeper.KeeperException)1