use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project beam by apache.
the class DataflowRunnerTest method testHasExperiment.
@Test
public void testHasExperiment() {
  DataflowPipelineDebugOptions options = PipelineOptionsFactory.as(DataflowPipelineDebugOptions.class);
  options.setExperiments(null);
  assertFalse(DataflowRunner.hasExperiment(options, "foo"));
  options.setExperiments(ImmutableList.of("foo", "bar"));
  assertTrue(DataflowRunner.hasExperiment(options, "foo"));
  assertTrue(DataflowRunner.hasExperiment(options, "bar"));
  assertFalse(DataflowRunner.hasExperiment(options, "baz"));
  assertFalse(DataflowRunner.hasExperiment(options, "ba"));
  assertFalse(DataflowRunner.hasExperiment(options, "BAR"));
}
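The test pins down the matching semantics: lookup is exact and case-sensitive, substrings do not match, and a null experiment list behaves as an empty one. A minimal sketch of logic consistent with these assertions (not necessarily Beam's actual implementation):

// Sketch only; written to satisfy the test above, may differ from Beam's real code.
// (Assumes java.util.List and java.util.Collections imports.)
public static boolean hasExperiment(DataflowPipelineDebugOptions options, String experiment) {
  List<String> experiments =
      options.getExperiments() == null ? Collections.emptyList() : options.getExperiments();
  // List.contains uses String.equals, so matching is exact and case-sensitive.
  return experiments.contains(experiment);
}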
use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project java-bigtable-hbase by googleapis.
the class Utils method tweakOptions.
/**
 * Helper to tweak the default pipeline options for import/export jobs.
 *
 * @param opts the options to adjust
 * @return the tweaked PipelineOptions
 */
public static PipelineOptions tweakOptions(PipelineOptions opts) {
  if (!DataflowRunner.class.isAssignableFrom(opts.getRunner())) {
    return opts;
  }
  DataflowPipelineOptions dataflowOpts = opts.as(DataflowPipelineOptions.class);
  // If no region was set, derive it from the worker zone by chopping off the last
  // dash-delimited segment, e.g. "us-central1-b" -> "us-central1".
  if (Strings.isNullOrEmpty(dataflowOpts.getRegion())) {
    String zone = dataflowOpts.getWorkerZone();
    if (Strings.isNullOrEmpty(zone)) {
      zone = dataflowOpts.getZone();
    }
    if (!Strings.isNullOrEmpty(zone)) {
      String region = zone.replaceAll("-[^-]+$", "");
      dataflowOpts.setRegion(region);
    }
  }
  // Use a modest default disk size unless the user requested an explicit size
  // (getDiskSizeGb() == 0 means unset).
  if (dataflowOpts.getDiskSizeGb() == 0) {
    dataflowOpts.setDiskSizeGb(25);
  }
  /**
   * Bigtable pipelines are very GC intensive. For each cell in Bigtable we create the following
   * objects: 1. Row key 2. Column qualifier 3. Timestamp 4. Value 5. A cell object that contains
   * the above 4 objects.
   *
   * <p>So each cell has at least 5 objects. On top of that, each cell may be represented by
   * different kinds of objects. For example, the import job creates an HBase Result object and
   * Mutation objects for all the cells. The same is the case with snapshot-related pipelines.
   *
   * <p>Given this abundance of objects, for cells with smaller values the pipeline may incur
   * high GC overhead, but it does make progress. The MemoryMonitor on the Dataflow worker would
   * kill such a pipeline and waste the work done so far.
   *
   * <p>The above is true for most Dataflow pipelines, but this specific use case is different:
   * the pipeline does nothing else, so CPU is only used for object transformation and GC. Hence
   * we disable the memory monitor on Bigtable pipelines. If the pipeline stalls, it will OOM and
   * human intervention will be required. As a mitigation, users should choose a worker machine
   * with more memory or reduce the parallelism on the workers (by setting
   * --numberOfWorkerHarnessThreads).
   */
  DataflowPipelineDebugOptions debugOptions = dataflowOpts.as(DataflowPipelineDebugOptions.class);
  debugOptions.setGCThrashingPercentagePerPeriod(100.00);
  return debugOptions;
}
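A hypothetical driver might apply tweakOptions once, right before constructing the pipeline; the argument handling below is illustrative, not taken from the source:

// Hypothetical usage; argument values come from the command line.
public static void main(String[] args) {
  PipelineOptions opts = PipelineOptionsFactory.fromArgs(args).withValidation().create();
  // Apply the Bigtable-specific defaults (region fallback, 25 GB disks, memory monitor off).
  Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
  // ... build the import/export transforms here ...
  pipeline.run();
}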
use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project beam by apache.
the class DataflowRunner method fromOptions.
/**
* Construct a runner from the provided options.
*
* @param options Properties that configure the runner.
* @return The newly created runner.
*/
public static DataflowRunner fromOptions(PipelineOptions options) {
  DataflowPipelineOptions dataflowOptions = PipelineOptionsValidator.validate(DataflowPipelineOptions.class, options);
  ArrayList<String> missing = new ArrayList<>();
  if (dataflowOptions.getAppName() == null) {
    missing.add("appName");
  }
  if (Strings.isNullOrEmpty(dataflowOptions.getRegion()) && isServiceEndpoint(dataflowOptions.getDataflowEndpoint())) {
    missing.add("region");
  }
  if (missing.size() > 0) {
    throw new IllegalArgumentException("Missing required pipeline options: " + Joiner.on(',').join(missing));
  }
  validateWorkerSettings(PipelineOptionsValidator.validate(DataflowPipelineWorkerPoolOptions.class, options));
  PathValidator validator = dataflowOptions.getPathValidator();
  String gcpTempLocation;
  try {
    gcpTempLocation = dataflowOptions.getGcpTempLocation();
  } catch (Exception e) {
    throw new IllegalArgumentException("DataflowRunner requires gcpTempLocation, but failed to retrieve a value from PipelineOptions", e);
  }
  validator.validateOutputFilePrefixSupported(gcpTempLocation);
  String stagingLocation;
  try {
    stagingLocation = dataflowOptions.getStagingLocation();
  } catch (Exception e) {
    throw new IllegalArgumentException("DataflowRunner requires stagingLocation, but failed to retrieve a value from PipelineOptions", e);
  }
  validator.validateOutputFilePrefixSupported(stagingLocation);
  if (!isNullOrEmpty(dataflowOptions.getSaveProfilesToGcs())) {
    validator.validateOutputFilePrefixSupported(dataflowOptions.getSaveProfilesToGcs());
  }
  if (dataflowOptions.getFilesToStage() != null) {
    // The user specifically requested these files, so fail now if they do not exist.
    // (Automatically detected classpath elements are permitted to not exist; later
    // staging will not fail on nonexistent files.)
    dataflowOptions.getFilesToStage().stream().forEach(stagedFileSpec -> {
      File localFile;
      if (stagedFileSpec.contains("=")) {
        String[] components = stagedFileSpec.split("=", 2);
        localFile = new File(components[1]);
      } else {
        localFile = new File(stagedFileSpec);
      }
      if (!localFile.exists()) {
        // Cannot throw a checked exception from inside the lambda, so wrap it.
        throw new RuntimeException(String.format("Non-existent files specified in filesToStage: %s", localFile));
      }
    });
  } else {
    dataflowOptions.setFilesToStage(detectClassPathResourcesToStage(DataflowRunner.class.getClassLoader(), options));
    if (dataflowOptions.getFilesToStage().isEmpty()) {
      throw new IllegalArgumentException("No files to stage have been found.");
    } else {
      LOG.info("PipelineOptions.filesToStage was not specified. Defaulting to files from the classpath: will stage {} files. Enable logging at DEBUG level to see which files will be staged.", dataflowOptions.getFilesToStage().size());
      LOG.debug("Classpath elements: {}", dataflowOptions.getFilesToStage());
    }
  }
  // Verify jobName according to service requirements, truncating and converting to lowercase if
  // necessary.
  String jobName = dataflowOptions.getJobName().toLowerCase();
  checkArgument(jobName.matches("[a-z]([-a-z0-9]*[a-z0-9])?"), "JobName invalid; the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number");
  if (!jobName.equals(dataflowOptions.getJobName())) {
    LOG.info("PipelineOptions.jobName did not match the service requirements. Using {} instead of {}.", jobName, dataflowOptions.getJobName());
  }
  dataflowOptions.setJobName(jobName);
  // Verify the project ID.
  String project = dataflowOptions.getProject();
  if (project.matches("[0-9]*")) {
    throw new IllegalArgumentException("Project ID '" + project + "' invalid. Please make sure you specified the Project ID, not the project number.");
  } else if (!project.matches(PROJECT_ID_REGEXP)) {
    throw new IllegalArgumentException("Project ID '" + project + "' invalid. Please make sure you specified the Project ID, not the project description.");
  }
  DataflowPipelineDebugOptions debugOptions = dataflowOptions.as(DataflowPipelineDebugOptions.class);
  // Verify that the number of worker threads is a valid value.
  if (debugOptions.getNumberOfWorkerHarnessThreads() < 0) {
    throw new IllegalArgumentException("Number of worker harness threads '" + debugOptions.getNumberOfWorkerHarnessThreads() + "' invalid. Please make sure the value is non-negative.");
  }
  // Verify that if recordJfrOnGcThrashing is set, the pipeline is at least on Java 11.
  if (dataflowOptions.getRecordJfrOnGcThrashing() && Environments.getJavaVersion() == Environments.JavaVersion.java8) {
    throw new IllegalArgumentException("recordJfrOnGcThrashing is only supported on java 9 and up.");
  }
  if (dataflowOptions.isStreaming() && dataflowOptions.getGcsUploadBufferSizeBytes() == null) {
    dataflowOptions.setGcsUploadBufferSizeBytes(GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT);
  }
  // Add the Java version to the SDK name for users' and support's convenience.
  String agentJavaVer = "(JRE 8 environment)";
  if (Environments.getJavaVersion() == Environments.JavaVersion.java17) {
    agentJavaVer = "(JRE 17 environment)";
  } else if (Environments.getJavaVersion() == Environments.JavaVersion.java11) {
    agentJavaVer = "(JRE 11 environment)";
  }
  DataflowRunnerInfo dataflowRunnerInfo = DataflowRunnerInfo.getDataflowRunnerInfo();
  String userAgent = String.format("%s/%s%s", dataflowRunnerInfo.getName(), dataflowRunnerInfo.getVersion(), agentJavaVer).replace(" ", "_");
  dataflowOptions.setUserAgent(userAgent);
  return new DataflowRunner(dataflowOptions);
}
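For illustration, a minimal set of options that should pass the validations above. All values are hypothetical, and this assumes stagingLocation defaults from gcpTempLocation when unset:

// Sketch: options satisfying fromOptions' checks; project, region, and bucket are made up.
DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setAppName("ExampleApp");
options.setProject("my-project");                  // a project ID, not a project number
options.setRegion("us-central1");
options.setGcpTempLocation("gs://my-bucket/tmp");  // hypothetical bucket
options.setJobName("example-job");                 // lowercase letters, digits, dashes
DataflowRunner runner = DataflowRunner.fromOptions(options);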
use of org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions in project beam by apache.
the class MemoryMonitor method fromOptions.
public static MemoryMonitor fromOptions(PipelineOptions options) {
  DataflowPipelineDebugOptions debugOptions = options.as(DataflowPipelineDebugOptions.class);
  DataflowWorkerHarnessOptions workerHarnessOptions = options.as(DataflowWorkerHarnessOptions.class);
  String uploadToGCSPath = debugOptions.getSaveHeapDumpsToGcsPath();
  String workerId = workerHarnessOptions.getWorkerId();
  boolean canDumpHeap = uploadToGCSPath != null || debugOptions.getDumpHeapOnOOM();
  double gcThrashingPercentagePerPeriod = debugOptions.getGCThrashingPercentagePerPeriod();
  Duration jfrProfileDuration;
  if (uploadToGCSPath != null && debugOptions.getRecordJfrOnGcThrashing()) {
    if (Environments.getJavaVersion() == Environments.JavaVersion.java8) {
      throw new IllegalArgumentException("recordJfrOnGcThrashing is only supported on java 9 and up.");
    }
    jfrProfileDuration = Duration.ofSeconds(debugOptions.getJfrRecordingDurationSec());
  } else {
    jfrProfileDuration = null;
  }
  return new MemoryMonitor(new SystemGCStatsProvider(), DEFAULT_SLEEP_TIME_MILLIS, DEFAULT_SHUT_DOWN_AFTER_NUM_GCTHRASHING, canDumpHeap, gcThrashingPercentagePerPeriod, uploadToGCSPath, getLoggingDir(), workerId, jfrProfileDuration, Clock.systemUTC());
}
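The worker harness runs the monitor on a background thread. A hedged sketch of that wiring, assuming MemoryMonitor implements Runnable (the thread name is illustrative):

// Sketch: running the monitor on a daemon thread so it doesn't block JVM shutdown.
MemoryMonitor monitor = MemoryMonitor.fromOptions(options);
Thread monitorThread = new Thread(monitor, "MemoryMonitor");  // name is illustrative
monitorThread.setDaemon(true);
monitorThread.start();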