
Example 6 with DataflowPipelineDebugOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project beam by apache.

the class DataflowRunnerTest method testHasExperiment.

@Test
public void testHasExperiment() {
    DataflowPipelineDebugOptions options = PipelineOptionsFactory.as(DataflowPipelineDebugOptions.class);
    options.setExperiments(null);
    assertFalse(DataflowRunner.hasExperiment(options, "foo"));
    options.setExperiments(ImmutableList.of("foo", "bar"));
    assertTrue(DataflowRunner.hasExperiment(options, "foo"));
    assertTrue(DataflowRunner.hasExperiment(options, "bar"));
    assertFalse(DataflowRunner.hasExperiment(options, "baz"));
    assertFalse(DataflowRunner.hasExperiment(options, "ba"));
    assertFalse(DataflowRunner.hasExperiment(options, "BAR"));
}
Also used: DataflowPipelineDebugOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions), Test (org.junit.Test)
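
hasExperiment does an exact, case-sensitive match against the experiments list, which is why "ba" and "BAR" fail above. Below is a minimal sketch of the usual flow, assuming the options are built from command-line style arguments (if DataflowRunner.hasExperiment is not visible from your package, ExperimentalOptions.hasExperiment in the core SDK is the public equivalent):

DataflowPipelineDebugOptions options =
    PipelineOptionsFactory.fromArgs("--experiments=foo,bar")
        .as(DataflowPipelineDebugOptions.class);
// Exact, case-sensitive lookup in the experiments list.
boolean hasFoo = DataflowRunner.hasExperiment(options, "foo"); // true
boolean hasBar = DataflowRunner.hasExperiment(options, "BAR"); // false: case differs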

Example 7 with DataflowPipelineDebugOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project java-bigtable-hbase by googleapis.

the class Utils method tweakOptions.

/**
 * Helper to tweak the default pipeline options for import/export jobs.
 *
 * @param opts the user-supplied pipeline options
 * @return the adjusted PipelineOptions
 */
public static PipelineOptions tweakOptions(PipelineOptions opts) {
    if (!DataflowRunner.class.isAssignableFrom(opts.getRunner())) {
        return opts;
    }
    DataflowPipelineOptions dataflowOpts = opts.as(DataflowPipelineOptions.class);
    // If no region was set, derive it from the zone by chopping off the last
    // dash-separated segment (e.g. "us-east1-b" becomes "us-east1").
    if (Strings.isNullOrEmpty(dataflowOpts.getRegion())) {
        String zone = dataflowOpts.getWorkerZone();
        if (Strings.isNullOrEmpty(zone)) {
            zone = dataflowOpts.getZone();
        }
        if (!Strings.isNullOrEmpty(zone)) {
            String region = zone.replaceAll("-[^-]+$", "");
            dataflowOpts.setRegion(region);
        }
    }
    // If the user did not request an explicit disk size (0 means unset), default to 25 GB.
    if (dataflowOpts.getDiskSizeGb() == 0) {
        dataflowOpts.setDiskSizeGb(25);
    }
    /**
     * Bigtable pipelines are very GC intensive. For each cell in Bigtable we create the following
     * objects: 1. Row key 2. Column qualifier 3. Timestamp 4. Value 5. A cell object that contains
     * the above 4 objects.
     *
     * <p>So each cell has at least 5 objects. On top of that, each cell may be represented by
     * different kinds of objects. For example, the import job creates an HBase Result object and
     * Mutation objects for all the cells. The same is the case with snapshot related pipelines.
     *
     * <p>Given this abundance of objects, for cells with smaller values the pipeline may incur
     * high GC overhead, even though it continues to make progress. The MemoryMonitor on the
     * Dataflow worker then kills the pipeline, resulting in wasted work.
     *
     * <p>The above is true for most Dataflow pipelines, but this specific use case is different,
     * as the pipeline does nothing else: CPU is only used for object transformation and GC. So we
     * disable the memory monitor on Bigtable pipelines. If the pipeline stalls, it will OOM and
     * human intervention will be required. As a mitigation, users should choose a worker machine
     * with more memory or reduce the parallelism on the workers (by setting
     * --numberOfWorkerHarnessThreads).
     */
    DataflowPipelineDebugOptions debugOptions = dataflowOpts.as(DataflowPipelineDebugOptions.class);
    debugOptions.setGCThrashingPercentagePerPeriod(100.00);
    return debugOptions;
}
Also used: DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions), DataflowRunner (org.apache.beam.runners.dataflow.DataflowRunner), DataflowPipelineDebugOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions)
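
A hedged usage sketch of the helper above; the project id and zone are placeholders. With only a worker zone supplied, tweakOptions derives the region by dropping the zone's last segment, applies the 25 GB disk default, and raises the GC thrashing threshold to 100%, which effectively disables the memory monitor:

PipelineOptions opts = PipelineOptionsFactory.fromArgs(
        "--runner=DataflowRunner",
        "--project=my-project",      // placeholder project id
        "--workerZone=us-east1-b")   // region will be derived as "us-east1"
    .create();
PipelineOptions tweaked = tweakOptions(opts);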

Example 8 with DataflowPipelineDebugOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project beam by apache.

the class DataflowRunner method fromOptions.

/**
 * Construct a runner from the provided options.
 *
 * @param options Properties that configure the runner.
 * @return The newly created runner.
 */
public static DataflowRunner fromOptions(PipelineOptions options) {
    DataflowPipelineOptions dataflowOptions = PipelineOptionsValidator.validate(DataflowPipelineOptions.class, options);
    ArrayList<String> missing = new ArrayList<>();
    if (dataflowOptions.getAppName() == null) {
        missing.add("appName");
    }
    if (Strings.isNullOrEmpty(dataflowOptions.getRegion()) && isServiceEndpoint(dataflowOptions.getDataflowEndpoint())) {
        missing.add("region");
    }
    if (missing.size() > 0) {
        throw new IllegalArgumentException("Missing required pipeline options: " + Joiner.on(',').join(missing));
    }
    validateWorkerSettings(PipelineOptionsValidator.validate(DataflowPipelineWorkerPoolOptions.class, options));
    PathValidator validator = dataflowOptions.getPathValidator();
    String gcpTempLocation;
    try {
        gcpTempLocation = dataflowOptions.getGcpTempLocation();
    } catch (Exception e) {
        throw new IllegalArgumentException("DataflowRunner requires gcpTempLocation, " + "but failed to retrieve a value from PipelineOptions", e);
    }
    validator.validateOutputFilePrefixSupported(gcpTempLocation);
    String stagingLocation;
    try {
        stagingLocation = dataflowOptions.getStagingLocation();
    } catch (Exception e) {
        throw new IllegalArgumentException("DataflowRunner requires stagingLocation, " + "but failed to retrieve a value from PipelineOptions", e);
    }
    validator.validateOutputFilePrefixSupported(stagingLocation);
    if (!isNullOrEmpty(dataflowOptions.getSaveProfilesToGcs())) {
        validator.validateOutputFilePrefixSupported(dataflowOptions.getSaveProfilesToGcs());
    }
    if (dataflowOptions.getFilesToStage() != null) {
        // The user specifically requested these files, so fail now if they do not exist.
        // (automatically detected classpath elements are permitted to not exist, so later
        // staging will not fail on nonexistent files)
        dataflowOptions.getFilesToStage().stream().forEach(stagedFileSpec -> {
            File localFile;
            if (stagedFileSpec.contains("=")) {
                String[] components = stagedFileSpec.split("=", 2);
                localFile = new File(components[1]);
            } else {
                localFile = new File(stagedFileSpec);
            }
            if (!localFile.exists()) {
                // cannot add checked exception
                throw new RuntimeException(String.format("Non-existent files specified in filesToStage: %s", localFile));
            }
        });
    } else {
        dataflowOptions.setFilesToStage(detectClassPathResourcesToStage(DataflowRunner.class.getClassLoader(), options));
        if (dataflowOptions.getFilesToStage().isEmpty()) {
            throw new IllegalArgumentException("No files to stage have been found.");
        } else {
            LOG.info("PipelineOptions.filesToStage was not specified. " + "Defaulting to files from the classpath: will stage {} files. " + "Enable logging at DEBUG level to see which files will be staged.", dataflowOptions.getFilesToStage().size());
            LOG.debug("Classpath elements: {}", dataflowOptions.getFilesToStage());
        }
    }
    // Verify jobName according to service requirements, truncating and converting to lowercase
    // if necessary.
    String jobName = dataflowOptions.getJobName().toLowerCase();
    checkArgument(jobName.matches("[a-z]([-a-z0-9]*[a-z0-9])?"), "JobName invalid; the name must consist of only the characters " + "[-a-z0-9], starting with a letter and ending with a letter " + "or number");
    if (!jobName.equals(dataflowOptions.getJobName())) {
        LOG.info("PipelineOptions.jobName did not match the service requirements. " + "Using {} instead of {}.", jobName, dataflowOptions.getJobName());
    }
    dataflowOptions.setJobName(jobName);
    // Verify project
    String project = dataflowOptions.getProject();
    if (project.matches("[0-9]*")) {
        throw new IllegalArgumentException("Project ID '" + project + "' invalid. Please make sure you specified the Project ID, not project number.");
    } else if (!project.matches(PROJECT_ID_REGEXP)) {
        throw new IllegalArgumentException("Project ID '" + project + "' invalid. Please make sure you specified the Project ID, not project" + " description.");
    }
    DataflowPipelineDebugOptions debugOptions = dataflowOptions.as(DataflowPipelineDebugOptions.class);
    // Verify the number of worker threads is a valid value
    if (debugOptions.getNumberOfWorkerHarnessThreads() < 0) {
        throw new IllegalArgumentException("Number of worker harness threads '" + debugOptions.getNumberOfWorkerHarnessThreads() + "' invalid. Please make sure the value is non-negative.");
    }
    // Verify that if recordJfrOnGcThrashing is set, the pipeline is at least on java 11
    if (dataflowOptions.getRecordJfrOnGcThrashing() && Environments.getJavaVersion() == Environments.JavaVersion.java8) {
        throw new IllegalArgumentException("recordJfrOnGcThrashing is only supported on java 9 and up.");
    }
    if (dataflowOptions.isStreaming() && dataflowOptions.getGcsUploadBufferSizeBytes() == null) {
        dataflowOptions.setGcsUploadBufferSizeBytes(GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT);
    }
    // Adding the Java version to the SDK name for user's and support convenience.
    String agentJavaVer = "(JRE 8 environment)";
    if (Environments.getJavaVersion() == Environments.JavaVersion.java17) {
        agentJavaVer = "(JRE 17 environment)";
    } else if (Environments.getJavaVersion() == Environments.JavaVersion.java11) {
        agentJavaVer = "(JRE 11 environment)";
    }
    DataflowRunnerInfo dataflowRunnerInfo = DataflowRunnerInfo.getDataflowRunnerInfo();
    String userAgent = String.format("%s/%s%s", dataflowRunnerInfo.getName(), dataflowRunnerInfo.getVersion(), agentJavaVer).replace(" ", "_");
    dataflowOptions.setUserAgent(userAgent);
    return new DataflowRunner(dataflowOptions);
}
Also used: DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions), ArrayList (java.util.ArrayList), PathValidator (org.apache.beam.sdk.extensions.gcp.storage.PathValidator), StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString), InvalidProtocolBufferException (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException), NonDeterministicException (org.apache.beam.sdk.coders.Coder.NonDeterministicException), IOException (java.io.IOException), GoogleJsonResponseException (com.google.api.client.googleapis.json.GoogleJsonResponseException), DataflowPipelineDebugOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions), File (java.io.File), StagedFile (org.apache.beam.runners.dataflow.util.PackageUtil.StagedFile), DataflowPipelineWorkerPoolOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions)
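
For orientation, a hedged sketch of invoking fromOptions directly; normally Pipeline.run() calls it for you once options.setRunner(DataflowRunner.class) is set. The project id and bucket below are placeholders, and defaults such as appName, jobName, stagingLocation, and filesToStage are derived automatically at runtime:

DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setRunner(DataflowRunner.class);
options.setProject("my-project");                  // placeholder; must be a project ID, not a number
options.setRegion("us-central1");
options.setGcpTempLocation("gs://my-bucket/tmp");  // placeholder bucket
// Throws IllegalArgumentException on a missing region, invalid job name,
// numeric project id, or negative worker harness thread count.
DataflowRunner runner = DataflowRunner.fromOptions(options);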

Example 9 with DataflowPipelineDebugOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project beam by apache.

the class MemoryMonitor method fromOptions.

public static MemoryMonitor fromOptions(PipelineOptions options) {
    DataflowPipelineDebugOptions debugOptions = options.as(DataflowPipelineDebugOptions.class);
    DataflowWorkerHarnessOptions workerHarnessOptions = options.as(DataflowWorkerHarnessOptions.class);
    String uploadToGCSPath = debugOptions.getSaveHeapDumpsToGcsPath();
    String workerId = workerHarnessOptions.getWorkerId();
    boolean canDumpHeap = uploadToGCSPath != null || debugOptions.getDumpHeapOnOOM();
    double gcThrashingPercentagePerPeriod = debugOptions.getGCThrashingPercentagePerPeriod();
    Duration jfrProfileDuration;
    if (uploadToGCSPath != null && debugOptions.getRecordJfrOnGcThrashing()) {
        if (Environments.getJavaVersion() == Environments.JavaVersion.java8) {
            throw new IllegalArgumentException("recordJfrOnGcThrashing is only supported on java 9 and up.");
        }
        jfrProfileDuration = Duration.ofSeconds(debugOptions.getJfrRecordingDurationSec());
    } else {
        jfrProfileDuration = null;
    }
    return new MemoryMonitor(new SystemGCStatsProvider(), DEFAULT_SLEEP_TIME_MILLIS, DEFAULT_SHUT_DOWN_AFTER_NUM_GCTHRASHING, canDumpHeap, gcThrashingPercentagePerPeriod, uploadToGCSPath, getLoggingDir(), workerId, jfrProfileDuration, Clock.systemUTC());
}
Also used: DataflowWorkerHarnessOptions (org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions), DataflowPipelineDebugOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions), Duration (java.time.Duration)
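
A hedged sketch of the debug options this factory reads. MemoryMonitor is part of the Dataflow worker harness, so user code normally just sets these options and lets the worker build the monitor; the GCS path is a placeholder:

DataflowPipelineDebugOptions debugOptions =
    PipelineOptionsFactory.as(DataflowPipelineDebugOptions.class);
debugOptions.setDumpHeapOnOOM(true);                            // allow heap dumps on OOM
debugOptions.setSaveHeapDumpsToGcsPath("gs://my-bucket/dumps"); // placeholder bucket
debugOptions.setGCThrashingPercentagePerPeriod(50.0);           // GC-time threshold per period
MemoryMonitor monitor = MemoryMonitor.fromOptions(debugOptions);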

Aggregations

DataflowPipelineDebugOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions): 9
Test (org.junit.Test): 4
DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions): 3
DataflowWorkerHarnessOptions (org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions): 2
CloudObject (org.apache.beam.runners.dataflow.util.CloudObject): 2
TestCountingSource (org.apache.beam.runners.dataflow.worker.testing.TestCountingSource): 2
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 2
GoogleJsonResponseException (com.google.api.client.googleapis.json.GoogleJsonResponseException): 1
CounterUpdate (com.google.api.services.dataflow.model.CounterUpdate): 1
InstructionOutput (com.google.api.services.dataflow.model.InstructionOutput): 1
ParallelInstruction (com.google.api.services.dataflow.model.ParallelInstruction): 1
ReadInstruction (com.google.api.services.dataflow.model.ReadInstruction): 1
File (java.io.File): 1
IOException (java.io.IOException): 1
Duration (java.time.Duration): 1
ArrayList (java.util.ArrayList): 1
AtomicLong (java.util.concurrent.atomic.AtomicLong): 1
ApiServiceDescriptor (org.apache.beam.model.pipeline.v1.Endpoints.ApiServiceDescriptor): 1
RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 1
DataflowRunner (org.apache.beam.runners.dataflow.DataflowRunner): 1