Search in sources :

Example 1 with Membership

use of com.couchbase.connector.cluster.Membership in project couchbase-elasticsearch-connector by couchbase.

The class LeaderTask, method rebalance.

/**
 * Loads the connector configuration from Consul, validates it, and then
 * (re)assigns a group membership to every ready endpoint.
 *
 * <p>Each attempt stops streaming first and then walks the ready endpoints in
 * order, giving endpoint {@code i} (zero-based) the membership
 * "{@code i + 1} of {@code endpoints.size()}". If any assignment fails, the
 * method waits three seconds and repeats the whole attempt from the top.
 *
 * @throws ConfigException if the Consul key or its value is missing
 * @throws InterruptedException if interrupted while pausing between attempts
 *     (the helper calls made here may presumably raise it as well)
 */
private void rebalance() throws InterruptedException {
    final String configKey = ctx.keys().config();
    LOGGER.info("Reading connector config from Consul key: {}", configKey);
    final String configText = ctx.consul()
            .keyValueClient()
            .getValue(configKey)
            .orElseThrow(() -> new ConfigException("missing Consul config key: " + configKey))
            .getValueAsString(UTF_8)
            .orElseThrow(() -> new ConfigException("missing value for Consul key: " + configKey));

    // Parsed purely as a sanity check; the parsed result is discarded.
    ConnectorConfig.from(configText);

    boolean everyEndpointAssigned;
    do {
        LOGGER.info("Rebalancing the cluster");

        // dumb strategy: shut everything down, then reassign vbuckets
        stopStreaming();
        final List<RpcEndpoint> readyEndpoints = awaitReadyEndpoints();

        everyEndpointAssigned = true;
        for (int index = 0; index < readyEndpoints.size(); index++) {
            throwIfDone();

            // Member numbers are one-based; list indexes are zero-based.
            final Membership assignment = Membership.of(index + 1, readyEndpoints.size());
            final RpcEndpoint target = readyEndpoints.get(index);
            LOGGER.info("Assigning group membership {} to endpoint {}", assignment, target);

            try {
                target.service(WorkerService.class).startStreaming(assignment, configText);
            } catch (Throwable t) {
                // todo what happens here? What if it fails due to timeout, and the worker is actually doing the work?
                // For now, start the whole rebalance process over again. This is obviously not ideal.
                LOGGER.warn("Failed to assign group membership {} to endpoint {}", assignment, target, t);
                SECONDS.sleep(3);
                everyEndpointAssigned = false;
                break;
            }
        }
    } while (!everyEndpointAssigned);
    // success!
}
Also used : RpcEndpoint(com.couchbase.connector.cluster.consul.rpc.RpcEndpoint) Membership(com.couchbase.connector.cluster.Membership) ConfigException(com.couchbase.connector.config.ConfigException) List(java.util.List)

Example 2 with Membership

use of com.couchbase.connector.cluster.Membership in project couchbase-elasticsearch-connector by couchbase.

The class ElasticsearchConnector, method run.

/**
 * Runs the connector with the given configuration until it is interrupted or a
 * fatal error occurs. This method never returns normally; every exit path throws.
 *
 * <p>Sequence, as written below: create the Elasticsearch client and metrics
 * reporter, connect to Couchbase, build the checkpoint service and worker group,
 * connect the DCP client, work out which partitions belong to this member,
 * optionally sleep for the startup quiet period, start streaming, start the HTTP
 * server, then block until the worker group reports a fatal error.
 *
 * @param config connector configuration; its static membership determines the partitions streamed
 * @param panicButton invoked if the initial DCP connection fails; also receives a pre-panic hook
 *     that unregisters the checkpoint-saving shutdown hook
 * @param startupQuietPeriod how long to sleep before streaming starts; a zero duration skips the sleep
 * @throws InterruptedException when a graceful shutdown is requested (checkpoints are saved first)
 * @throws Throwable the fatal error reported by the worker group, or any error raised during setup
 */
public static void run(ConnectorConfig config, PanicButton panicButton, Duration startupQuietPeriod) throws Throwable {
    // Assigned only when the worker group reports a fatal error; rethrown at the very end.
    final Throwable fatalError;
    final Membership membership = config.group().staticMembership();
    LOGGER.info("Read configuration: {}", redactSystem(config));
    final ScheduledExecutorService checkpointExecutor = Executors.newSingleThreadScheduledExecutor();
    // The metrics reporter, HTTP server and Elasticsearch client are closed automatically when this block exits.
    try (Slf4jReporter metricReporter = newSlf4jReporter(config.metrics().logInterval());
        HttpServer httpServer = new HttpServer(config.metrics().httpPort(), membership);
        RestHighLevelClient esClient = newElasticsearchClient(config.elasticsearch(), config.trustStore())) {
        DocumentLifecycle.setLogLevel(config.logging().logDocumentLifecycle() ? LogLevel.INFO : LogLevel.DEBUG);
        LogRedaction.setRedactionLevel(config.logging().redactionLevel());
        DcpHelper.setRedactionLevel(config.logging().redactionLevel());
        final ClusterEnvironment env = CouchbaseHelper.environmentBuilder(config.couchbase(), config.trustStore()).build();
        final Cluster cluster = CouchbaseHelper.createCluster(config.couchbase(), env);
        final Version elasticsearchVersion = waitForElasticsearchAndRequireVersion(esClient, new Version(2, 0, 0), new Version(5, 6, 16));
        LOGGER.info("Elasticsearch version {}", elasticsearchVersion);
        validateConfig(elasticsearchVersion, config.elasticsearch());
        // Wait for couchbase server to come online, then open the bucket.
        final Bucket bucket = CouchbaseHelper.waitForBucket(cluster, config.couchbase().bucket());
        final Set<SeedNode> kvNodes = CouchbaseHelper.getKvNodes(config.couchbase(), bucket);
        // Reuse the already-open source bucket when the metadata bucket has the same name.
        final boolean storeMetadataInSourceBucket = config.couchbase().metadataBucket().equals(config.couchbase().bucket());
        final Bucket metadataBucket = storeMetadataInSourceBucket ? bucket : CouchbaseHelper.waitForBucket(cluster, config.couchbase().metadataBucket());
        final Collection metadataCollection = CouchbaseHelper.getMetadataCollection(metadataBucket, config.couchbase());
        final CheckpointDao checkpointDao = new CouchbaseCheckpointDao(metadataCollection, config.group().name());
        // todo get this from dcp client
        // (until then the bucket UUID is an empty-string placeholder)
        final String bucketUuid = "";
        final CheckpointService checkpointService = new CheckpointService(bucketUuid, checkpointDao);
        final RequestFactory requestFactory = new RequestFactory(config.elasticsearch().types(), config.elasticsearch().docStructure(), config.elasticsearch().rejectLog());
        final ElasticsearchWorkerGroup workers = new ElasticsearchWorkerGroup(esClient, checkpointService, requestFactory, ErrorListener.NOOP, config.elasticsearch().bulkRequest());
        Metrics.gauge("write.queue", "Document events currently buffered in memory.", workers, ElasticsearchWorkerGroup::getQueueSize);
        // High value indicates the connector has stalled
        Metrics.gauge("es.wait.ms", null, workers, ElasticsearchWorkerGroup::getCurrentRequestMillis);
        // Same as "es.wait.ms" but normalized to seconds for Prometheus
        Metrics.gauge("es.wait.seconds", "Duration of in-flight Elasticsearch bulk request (including any retries). Long duration may indicate connector has stalled.", workers, value -> value.getCurrentRequestMillis() / (double) SECONDS.toMillis(1));
        final Client dcpClient = DcpHelper.newClient(config.group().name(), config.couchbase(), kvNodes, config.trustStore());
        initEventListener(dcpClient, panicButton, workers::submit);
        // Not started here; this thread is only registered as a JVM shutdown hook further down.
        final Thread saveCheckpoints = new Thread(checkpointService::save, "save-checkpoints");
        try {
            try {
                dcpClient.connect().block(Duration.ofMillis(config.couchbase().dcp().connectTimeout().millis()));
            } catch (Throwable t) {
                // NOTE(review): execution continues past this catch unless panic() itself terminates or throws -- confirm.
                panicButton.panic("Failed to establish initial DCP connection within " + config.couchbase().dcp().connectTimeout(), t);
            }
            final int numPartitions = dcpClient.numPartitions();
            LOGGER.info("Bucket has {} partitions. Membership = {}", numPartitions, membership);
            final Set<Integer> partitions = membership.getPartitions(numPartitions);
            if (partitions.isEmpty()) {
                // need to do this check, because if we started streaming with an empty list, the DCP client would open streams for *all* partitions
                throw new IllegalArgumentException("There are more workers than Couchbase vbuckets; this worker doesn't have any work to do.");
            }
            checkpointService.init(numPartitions, () -> DcpHelper.getCurrentSeqnosAsMap(dcpClient, partitions, Duration.ofSeconds(5)));
            dcpClient.initializeState(StreamFrom.BEGINNING, StreamTo.INFINITY).block();
            initSessionState(dcpClient, checkpointService, partitions);
            // Optional quiet period, taken only after the connections above have succeeded and before
            // any streams are opened. Per the log message below, it gives peers time to terminate in
            // case of unsafe scaling.
            // NOTE(review): the original comment here was truncated to "configuration problems." --
            // restore the full wording from upstream if available.
            if (!startupQuietPeriod.isZero()) {
                LOGGER.info("Entering startup quiet period; sleeping for {} so peers can terminate in case of unsafe scaling.", startupQuietPeriod);
                MILLISECONDS.sleep(startupQuietPeriod.toMillis());
                LOGGER.info("Startup quiet period complete.");
            }
            // Save checkpoints periodically: first run after 10 seconds, then 10 seconds after each run completes.
            checkpointExecutor.scheduleWithFixedDelay(checkpointService::save, 10, 10, SECONDS);
            // Also save checkpoints when the JVM shuts down.
            RuntimeHelper.addShutdownHook(saveCheckpoints);
            // Unless shutdown is due to panic...
            panicButton.addPrePanicHook(() -> RuntimeHelper.removeShutdownHook(saveCheckpoints));
            try {
                LOGGER.debug("Opening DCP streams for partitions: {}", partitions);
                dcpClient.startStreaming(partitions).block();
            } catch (RuntimeException e) {
                // Presumably rethrows a wrapped InterruptedException cause so the shutdown handler below sees it -- verify against ThrowableHelper.
                ThrowableHelper.propagateCauseIfPossible(e, InterruptedException.class);
                throw e;
            }
            // Start HTTP server *after* other setup is complete, so the metrics endpoint
            // can be used as a "successful startup" probe.
            httpServer.start();
            // A negative configured port means the metrics HTTP server is disabled (see the else branch's message).
            if (config.metrics().httpPort() >= 0) {
                LOGGER.info("Prometheus metrics available at http://localhost:{}/metrics/prometheus", httpServer.getBoundPort());
                LOGGER.info("Dropwizard metrics available at http://localhost:{}/metrics/dropwizard?pretty", httpServer.getBoundPort());
            } else {
                LOGGER.info("Metrics HTTP server is disabled. Edit the [metrics] 'httpPort' config property to enable.");
            }
            LOGGER.info("Elasticsearch connector startup complete.");
            // Blocks here for the lifetime of the connector, until the worker group reports a fatal error.
            fatalError = workers.awaitFatalError();
            LOGGER.error("Terminating due to fatal error from worker", fatalError);
        } catch (InterruptedException shutdownRequest) {
            LOGGER.info("Graceful shutdown requested. Saving checkpoints and cleaning up.");
            checkpointService.save();
            throw shutdownRequest;
        } catch (Throwable t) {
            LOGGER.error("Terminating due to fatal error during setup", t);
            throw t;
        } finally {
            // If we get here it means there was a fatal exception, or the connector is running in distributed
            // or test mode and a graceful shutdown was requested. Don't need the shutdown hook for any of those cases.
            RuntimeHelper.removeShutdownHook(saveCheckpoints);
            // Teardown order matters: stop scheduling and reporting, disconnect DCP, then close the workers.
            checkpointExecutor.shutdown();
            metricReporter.stop();
            dcpClient.disconnect().block();
            // to avoid buffer leak, must close *after* dcp client stops feeding it events
            workers.close();
            // Give an in-progress checkpoint save up to 10 seconds to finish.
            checkpointExecutor.awaitTermination(10, SECONDS);
            cluster.disconnect();
            // can't reuse, because connector config might have different SSL settings next time
            env.shutdown();
        }
    }
    // Only reached when awaitFatalError() returned; every other path has already thrown.
    // give stdout a chance to quiet down so the stack trace on stderr isn't interleaved with stdout.
    MILLISECONDS.sleep(500);
    throw fatalError;
}
Also used : VersionHelper.getVersionString(com.couchbase.connector.VersionHelper.getVersionString) CouchbaseCheckpointDao(com.couchbase.connector.dcp.CouchbaseCheckpointDao) CheckpointService(com.couchbase.connector.dcp.CheckpointService) RequestFactory(com.couchbase.connector.elasticsearch.io.RequestFactory) Version(com.couchbase.client.dcp.util.Version) ElasticsearchHelper.waitForElasticsearchAndRequireVersion(com.couchbase.connector.elasticsearch.ElasticsearchHelper.waitForElasticsearchAndRequireVersion) HttpServer(com.couchbase.connector.util.HttpServer) Membership(com.couchbase.connector.cluster.Membership) DefaultKubernetesClient(io.fabric8.kubernetes.client.DefaultKubernetesClient) ElasticsearchHelper.newElasticsearchClient(com.couchbase.connector.elasticsearch.ElasticsearchHelper.newElasticsearchClient) Client(com.couchbase.client.dcp.Client) RestHighLevelClient(org.elasticsearch.client.RestHighLevelClient) KubernetesClient(io.fabric8.kubernetes.client.KubernetesClient) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) SeedNode(com.couchbase.client.core.env.SeedNode) Cluster(com.couchbase.client.java.Cluster) RestHighLevelClient(org.elasticsearch.client.RestHighLevelClient) ClusterEnvironment(com.couchbase.client.java.env.ClusterEnvironment) CheckpointDao(com.couchbase.connector.dcp.CheckpointDao) CouchbaseCheckpointDao(com.couchbase.connector.dcp.CouchbaseCheckpointDao) Bucket(com.couchbase.client.java.Bucket) Slf4jReporter(com.codahale.metrics.Slf4jReporter) Collection(com.couchbase.client.java.Collection)

Example 3 with Membership

use of com.couchbase.connector.cluster.Membership in project couchbase-elasticsearch-connector by couchbase.

The class ElasticsearchConnector, method main.

/**
 * Command-line entry point: loads the connector configuration file named on the
 * command line, optionally patches the group membership using information from
 * the Kubernetes environment, and then hands control to {@code run}.
 *
 * <p>Two environment variables are consulted:
 * <ul>
 *   <li>{@code CBES_K8S_WATCH_REPLICAS=true} takes the member number from the pod
 *       hostname and the group size from the Kubernetes client, and enables the
 *       startup quiet period.</li>
 *   <li>{@code CBES_K8S_STATEFUL_SET=true} takes only the member number from the
 *       pod hostname; the group size stays as configured.</li>
 * </ul>
 *
 * @param args command-line arguments, parsed by {@code OptionsParser}
 * @throws Throwable whatever {@code run} or the setup steps throw
 */
public static void main(String... args) throws Throwable {
    LOGGER.info("Couchbase Elasticsearch Connector version {}", getVersionString());

    final OptionsParser optionsParser = new OptionsParser();
    final OptionSet parsedOptions = optionsParser.parse(args);
    final File connectorConfigFile = parsedOptions.valueOf(optionsParser.configFile);
    System.out.println("Reading connector configuration from " + connectorConfigFile.getAbsoluteFile());

    ConnectorConfig config = ConnectorConfig.from(connectorConfigFile);
    final PanicButton panicButton = new DefaultPanicButton();

    final boolean watchReplicas = "true".equals(System.getenv("CBES_K8S_WATCH_REPLICAS"));
    // Watching replicas implies the member number also comes from the hostname.
    final boolean memberNumberFromHostname = watchReplicas || "true".equals(System.getenv("CBES_K8S_STATEFUL_SET"));

    if (memberNumberFromHostname) {
        final int memberNumber = StatefulSetInfo.fromHostname().podOrdinal + 1;
        LOGGER.info("Getting group member number from Kubernetes pod hostname: {}", memberNumber);

        // This is a kludge. The Membership class validates its arguments, so you can't have a Membership
        // of "4 of 1", for example. If we plan to get the group size from the Kubernetes StatefulSet,
        // bypass this validation by temporarily setting the group size to the largest sane value (1024).
        // We'll dial it down to the actual size of the StatefulSet a bit later on.
        final int provisionalGroupSize;
        if (watchReplicas) {
            provisionalGroupSize = 1024;
        } else {
            provisionalGroupSize = config.group().staticMembership().getClusterSize();
        }
        config = transformMembership(config, ignored -> Membership.of(memberNumber, provisionalGroupSize));
    }

    KubernetesClient kubernetesClient = null;
    try {
        if (watchReplicas) {
            kubernetesClient = new DefaultKubernetesClient();
            LOGGER.info("Activating native Kubernetes integration; connector will use StatefulSet spec"
                    + " to determine group size."
                    + " This mode requires a Kubernetes service account with 'get' and 'watch', and 'list'"
                    + " permissions for the StatefulSet.");
            final int replicaCount = ReplicaChangeWatcher.getReplicasAndPanicOnChange(kubernetesClient, panicButton);
            config = transformMembership(config, current -> Membership.of(current.getMemberNumber(), replicaCount));
        }

        // memberNumberFromHostname is already true whenever watchReplicas is.
        if (memberNumberFromHostname) {
            LOGGER.info("Patched configuration with info from Kubernetes environment; membership = {}", config.group().staticMembership());
        }

        if (config.group().staticMembership().getClusterSize() > 1024) {
            panicButton.panic("Invalid group size configuration; totalMembers must be <= 1024."
                    + " Did you forget to set the CBES_TOTAL_MEMBERS environment variable?");
        }

        final Duration quietPeriod = watchReplicas ? ReplicaChangeWatcher.startupQuietPeriod() : Duration.ZERO;
        run(config, panicButton, quietPeriod);
    } finally {
        if (kubernetesClient != null) {
            // so client threads don't prevent app from exiting
            kubernetesClient.close();
        }
    }
}
Also used : DcpHelper.initSessionState(com.couchbase.connector.dcp.DcpHelper.initSessionState) LoggerFactory(org.slf4j.LoggerFactory) ImmutableConnectorConfig(com.couchbase.connector.config.es.ImmutableConnectorConfig) Collection(com.couchbase.client.java.Collection) Duration(java.time.Duration) ClusterEnvironment(com.couchbase.client.java.env.ClusterEnvironment) RequestFactory(com.couchbase.connector.elasticsearch.io.RequestFactory) DefaultKubernetesClient(io.fabric8.kubernetes.client.DefaultKubernetesClient) DcpHelper.initEventListener(com.couchbase.connector.dcp.DcpHelper.initEventListener) ConnectorConfig(com.couchbase.connector.config.es.ConnectorConfig) OptionSet(joptsimple.OptionSet) StreamFrom(com.couchbase.client.dcp.StreamFrom) StatefulSetInfo(com.couchbase.connector.cluster.k8s.StatefulSetInfo) StreamTo(com.couchbase.client.dcp.StreamTo) LogLevel(com.couchbase.client.dcp.metrics.LogLevel) Set(java.util.Set) Version(com.couchbase.client.dcp.util.Version) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) Membership(com.couchbase.connector.cluster.Membership) Executors(java.util.concurrent.Executors) RuntimeHelper(com.couchbase.connector.util.RuntimeHelper) Bucket(com.couchbase.client.java.Bucket) ElasticsearchConfig(com.couchbase.connector.config.es.ElasticsearchConfig) ThrowableHelper(com.couchbase.connector.util.ThrowableHelper) Slf4jReporter(com.codahale.metrics.Slf4jReporter) CheckpointService(com.couchbase.connector.dcp.CheckpointService) ConfigException(com.couchbase.connector.config.ConfigException) TypeConfig(com.couchbase.connector.config.es.TypeConfig) ImmutableGroupConfig(com.couchbase.connector.config.common.ImmutableGroupConfig) ElasticsearchHelper.newElasticsearchClient(com.couchbase.connector.elasticsearch.ElasticsearchHelper.newElasticsearchClient) HttpServer(com.couchbase.connector.util.HttpServer) Client(com.couchbase.client.dcp.Client) LogRedaction(com.couchbase.client.core.logging.LogRedaction) 
SeedNode(com.couchbase.client.core.env.SeedNode) CheckpointDao(com.couchbase.connector.dcp.CheckpointDao) Function(java.util.function.Function) DefaultPanicButton(com.couchbase.connector.cluster.DefaultPanicButton) TimeValue(org.elasticsearch.common.unit.TimeValue) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AbstractCliCommand(com.couchbase.connector.elasticsearch.cli.AbstractCliCommand) Logger(org.slf4j.Logger) RedactableArgument.redactSystem(com.couchbase.client.core.logging.RedactableArgument.redactSystem) PanicButton(com.couchbase.connector.cluster.PanicButton) CouchbaseCheckpointDao(com.couchbase.connector.dcp.CouchbaseCheckpointDao) DcpHelper(com.couchbase.connector.dcp.DcpHelper) RestHighLevelClient(org.elasticsearch.client.RestHighLevelClient) File(java.io.File) Cluster(com.couchbase.client.java.Cluster) CouchbaseHelper(com.couchbase.connector.dcp.CouchbaseHelper) ReplicaChangeWatcher(com.couchbase.connector.cluster.k8s.ReplicaChangeWatcher) KubernetesClient(io.fabric8.kubernetes.client.KubernetesClient) ElasticsearchHelper.waitForElasticsearchAndRequireVersion(com.couchbase.connector.elasticsearch.ElasticsearchHelper.waitForElasticsearchAndRequireVersion) SECONDS(java.util.concurrent.TimeUnit.SECONDS) VersionHelper.getVersionString(com.couchbase.connector.VersionHelper.getVersionString) DefaultKubernetesClient(io.fabric8.kubernetes.client.DefaultKubernetesClient) KubernetesClient(io.fabric8.kubernetes.client.KubernetesClient) ImmutableConnectorConfig(com.couchbase.connector.config.es.ImmutableConnectorConfig) ConnectorConfig(com.couchbase.connector.config.es.ConnectorConfig) DefaultPanicButton(com.couchbase.connector.cluster.DefaultPanicButton) PanicButton(com.couchbase.connector.cluster.PanicButton) Duration(java.time.Duration) DefaultKubernetesClient(io.fabric8.kubernetes.client.DefaultKubernetesClient) OptionSet(joptsimple.OptionSet) File(java.io.File) DefaultPanicButton(com.couchbase.connector.cluster.DefaultPanicButton)

Aggregations

Membership (com.couchbase.connector.cluster.Membership)3 Slf4jReporter (com.codahale.metrics.Slf4jReporter)2 SeedNode (com.couchbase.client.core.env.SeedNode)2 Client (com.couchbase.client.dcp.Client)2 Version (com.couchbase.client.dcp.util.Version)2 Bucket (com.couchbase.client.java.Bucket)2 Cluster (com.couchbase.client.java.Cluster)2 Collection (com.couchbase.client.java.Collection)2 ClusterEnvironment (com.couchbase.client.java.env.ClusterEnvironment)2 VersionHelper.getVersionString (com.couchbase.connector.VersionHelper.getVersionString)2 ConfigException (com.couchbase.connector.config.ConfigException)2 CheckpointDao (com.couchbase.connector.dcp.CheckpointDao)2 CheckpointService (com.couchbase.connector.dcp.CheckpointService)2 CouchbaseCheckpointDao (com.couchbase.connector.dcp.CouchbaseCheckpointDao)2 ElasticsearchHelper.newElasticsearchClient (com.couchbase.connector.elasticsearch.ElasticsearchHelper.newElasticsearchClient)2 ElasticsearchHelper.waitForElasticsearchAndRequireVersion (com.couchbase.connector.elasticsearch.ElasticsearchHelper.waitForElasticsearchAndRequireVersion)2 RequestFactory (com.couchbase.connector.elasticsearch.io.RequestFactory)2 HttpServer (com.couchbase.connector.util.HttpServer)2 DefaultKubernetesClient (io.fabric8.kubernetes.client.DefaultKubernetesClient)2 LogRedaction (com.couchbase.client.core.logging.LogRedaction)1