use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.
the class GreedyPipelineFuser method sanitizeDanglingPTransformInputs.
/**
 * Returns a copy of {@code stage} whose PTransforms no longer reference "dangling" inputs, i.e.
 * input PCollection ids that are neither the stage input, a declared stage output, a side input,
 * nor the output of a PTransform inside the stage. The dangling PCollections are also dropped from
 * the stage components; every other part of the stage is carried over unchanged.
 *
 * @param stage the stage to sanitize
 * @return a new stage built from the sanitized transforms and components
 */
private static ExecutableStage sanitizeDanglingPTransformInputs(ExecutableStage stage) {
  // Collect every PCollection id that a transform of this stage is allowed to consume.
  Set<String> allowedInputIds = new HashSet<>();
  allowedInputIds.add(stage.getInputPCollection().getId());
  stage.getOutputPCollections().forEach(output -> allowedInputIds.add(output.getId()));
  stage.getSideInputs().forEach(sideInput -> allowedInputIds.add(sideInput.collection().getId()));
  stage
      .getTransforms()
      .forEach(node -> allowedInputIds.addAll(node.getTransform().getOutputsMap().values()));

  // Any consumed id outside of the allowed set is dangling.
  Set<String> danglingIds = new HashSet<>();
  for (PTransformNode node : stage.getTransforms()) {
    for (String inputId : node.getTransform().getInputsMap().values()) {
      if (!allowedInputIds.contains(inputId)) {
        danglingIds.add(inputId);
      }
    }
  }

  // Rebuild only those transforms that actually lose an input; keep the others as they are.
  ImmutableList.Builder<PTransformNode> sanitizedNodesBuilder = ImmutableList.builder();
  for (PTransformNode node : stage.getTransforms()) {
    PTransform original = node.getTransform();
    Map<String, String> keptInputs =
        original.getInputsMap().entrySet().stream()
            .filter(entry -> !danglingIds.contains(entry.getValue()))
            .collect(Collectors.toMap(Entry::getKey, Entry::getValue));
    boolean untouched = keptInputs.equals(original.getInputsMap());
    sanitizedNodesBuilder.add(
        untouched
            ? node
            : PipelineNode.pTransform(
                node.getId(),
                original.toBuilder().clearInputs().putAllInputs(keptInputs).build()));
  }
  ImmutableList<PTransformNode> sanitizedNodes = sanitizedNodesBuilder.build();

  // Replace the transforms and PCollections held by the components with the sanitized views.
  Map<String, PTransform> transformsById =
      sanitizedNodes.stream()
          .collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform));
  Map<String, PCollection> retainedPCollections =
      stage.getComponents().getPcollectionsMap().entrySet().stream()
          .filter(entry -> !danglingIds.contains(entry.getKey()))
          .collect(Collectors.toMap(Entry::getKey, Entry::getValue));
  Components sanitizedComponents =
      stage
          .getComponents()
          .toBuilder()
          .clearTransforms()
          .putAllTransforms(transformsById)
          .clearPcollections()
          .putAllPcollections(retainedPCollections)
          .build();

  return ImmutableExecutableStage.of(
      sanitizedComponents,
      stage.getEnvironment(),
      stage.getInputPCollection(),
      stage.getSideInputs(),
      stage.getUserStates(),
      stage.getTimers(),
      sanitizedNodes,
      stage.getOutputPCollections(),
      stage.getWireCoderSettings());
}
use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.
the class FlinkPipelineExecutionEnvironmentTest method shouldUseTransformOverrides.
/**
 * Checks, for both streaming and batch mode, that translating a pipeline calls
 * {@code replaceAll} with a non-empty override list whose size equals that of the default
 * Flink transform overrides for the same options.
 */
@Test
public void shouldUseTransformOverrides() {
  for (boolean streaming : new boolean[] {true, false}) {
    FlinkPipelineOptions options = FlinkPipelineOptions.defaults();
    options.setStreaming(streaming);
    options.setRunner(FlinkRunner.class);

    FlinkPipelineExecutionEnvironment environment =
        new FlinkPipelineExecutionEnvironment(options);
    Pipeline spiedPipeline = Mockito.spy(Pipeline.create(options));
    environment.translate(spiedPipeline);

    // Capture the override list handed to the spied pipeline during translation.
    ArgumentCaptor<ImmutableList> overridesCaptor = ArgumentCaptor.forClass(ImmutableList.class);
    Mockito.verify(spiedPipeline).replaceAll(overridesCaptor.capture());
    ImmutableList<PTransformOverride> appliedOverrides = overridesCaptor.getValue();

    assertThat(appliedOverrides.isEmpty(), is(false));
    assertThat(
        appliedOverrides.size(), is(FlinkTransformOverrides.getDefaultOverrides(options).size()));
  }
}
use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.
the class FlinkPipelineExecutionEnvironmentTest method shouldProvideParallelismToTransformOverrides.
/**
 * Checks that, in streaming mode, the overrides passed to {@code replaceAll} contain a
 * {@code StreamingShardedWriteFactory} whose options report a parallelism greater than zero.
 */
@Test
public void shouldProvideParallelismToTransformOverrides() {
  FlinkPipelineOptions options = FlinkPipelineOptions.defaults();
  options.setStreaming(true);
  options.setRunner(FlinkRunner.class);
  FlinkPipelineExecutionEnvironment environment = new FlinkPipelineExecutionEnvironment(options);

  // Create a transform applicable for PTransformMatchers.writeWithRunnerDeterminedSharding()
  // which requires parallelism
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of("test")).apply(TextIO.write().to("/tmp"));
  Pipeline spiedPipeline = Mockito.spy(pipeline);

  // Translation completing without an exception is the first half of the check.
  environment.translate(spiedPipeline);

  // Second half: the expected replacement transform was among the applied overrides.
  ArgumentCaptor<ImmutableList> overridesCaptor = ArgumentCaptor.forClass(ImmutableList.class);
  Mockito.verify(spiedPipeline).replaceAll(overridesCaptor.capture());
  ImmutableList<PTransformOverride> appliedOverrides = overridesCaptor.getValue();

  BaseMatcher<PTransformOverride> shardedWriteWithParallelism =
      new BaseMatcher<PTransformOverride>() {
        @Override
        public void describeTo(Description description) {
          // Intentionally left without a description.
        }

        @Override
        public boolean matches(Object actual) {
          if (!(actual instanceof PTransformOverride)) {
            return false;
          }
          PTransformOverrideFactory overrideFactory =
              ((PTransformOverride) actual).getOverrideFactory();
          if (!(overrideFactory
              instanceof FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory)) {
            return false;
          }
          FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory shardedWriteFactory =
              (FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory) overrideFactory;
          return shardedWriteFactory.options.getParallelism() > 0;
        }
      };
  assertThat(appliedOverrides, hasItem(shardedWriteWithParallelism));
}
use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.
the class DataflowRunner method run.
/**
 * Translates {@code pipeline} into a Dataflow job and submits it through {@code dataflowClient}.
 *
 * <p>The method stages the portable pipeline proto, rewrites the pipeline for v1 submission,
 * translates it into a job specification, copies the relevant pipeline options onto the job
 * environment and finally creates the job. When a template location is configured the job
 * specification is written to that location and a {@link DataflowTemplateJob} is returned without
 * submitting anything.
 *
 * @param pipeline the pipeline to run; it is mutated by the v1 transform replacement
 * @return a handle to the submitted job, or a {@link DataflowTemplateJob} for template creation
 * @throws IllegalArgumentException if the pipeline options cannot be serialized to JSON, or if the
 *     job file / template location arguments are invalid
 * @throws RuntimeException if the job cannot be created or a template file cannot be written
 * @throws DataflowJobAlreadyUpdatedException if updating and the service returns a job carrying a
 *     different client request id
 * @throws DataflowJobAlreadyExistsException if not updating and the service returns a job carrying
 *     a different client request id
 */
@Override
public DataflowPipelineJob run(Pipeline pipeline) {
  if (useUnifiedWorker(options)) {
    // non-null if useUnifiedWorker is true
    List<String> experiments = options.getExperiments();
    // Build a fresh list instead of calling add() on the list held by the options: that list may
    // be immutable (this method itself stores ImmutableList instances into the option), in which
    // case add() would throw UnsupportedOperationException.
    ImmutableList.Builder<String> unifiedWorkerExperiments =
        ImmutableList.<String>builder().addAll(experiments);
    String[] requiredExperiments = {
      "use_runner_v2", "use_unified_worker", "beam_fn_api", "use_portable_job_submission"
    };
    for (String requiredExperiment : requiredExperiments) {
      if (!experiments.contains(requiredExperiment)) {
        unifiedWorkerExperiments.add(requiredExperiment);
      }
    }
    options.setExperiments(unifiedWorkerExperiments.build());
  }
  logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
  if (containsUnboundedPCollection(pipeline)) {
    options.setStreaming(true);
  }
  LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications " + "related to Google Compute Engine usage and other Google Cloud Services.");
  DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
  String workerHarnessContainerImageURL = DataflowRunner.getContainerImageForJob(dataflowOptions);
  // This incorrectly puns the worker harness container image (which implements v1beta3 API)
  // with the SDK harness image (which implements Fn API).
  //
  // The same Environment is used in different and contradictory ways, depending on whether
  // it is a v1 or v2 job submission.
  RunnerApi.Environment defaultEnvironmentForDataflow = Environments.createDockerEnvironment(workerHarnessContainerImageURL);
  // The SdkComponents for portable and non-portable job submission must be kept distinct. Both
  // need the default environment.
  SdkComponents portableComponents = SdkComponents.create();
  portableComponents.registerEnvironment(defaultEnvironmentForDataflow.toBuilder().addAllDependencies(getDefaultArtifacts()).addAllCapabilities(Environments.getJavaCapabilities()).build());
  RunnerApi.Pipeline portablePipelineProto = PipelineTranslation.toProto(pipeline, portableComponents, false);
  // Note that `stageArtifacts` has to be called before `resolveArtifact` because
  // `resolveArtifact` updates local paths to staged paths in pipeline proto.
  List<DataflowPackage> packages = stageArtifacts(portablePipelineProto);
  portablePipelineProto = resolveArtifacts(portablePipelineProto);
  portablePipelineProto = applySdkEnvironmentOverrides(portablePipelineProto, options);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Portable pipeline proto:\n{}", TextFormat.printer().printToString(portablePipelineProto));
  }
  // Stage the portable pipeline proto, retrieving the staged pipeline path, then update
  // the options on the new job
  // TODO: add an explicit `pipeline` parameter to the submission instead of pipeline options
  LOG.info("Staging portable pipeline proto to {}", options.getStagingLocation());
  byte[] serializedProtoPipeline = portablePipelineProto.toByteArray();
  DataflowPackage stagedPipeline = options.getStager().stageToFile(serializedProtoPipeline, PIPELINE_FILE_NAME);
  dataflowOptions.setPipelineUrl(stagedPipeline.getLocation());
  // Now rewrite things to be as needed for v1 (mutates the pipeline)
  // This way the job submitted is valid for v1 and v2, simultaneously
  replaceV1Transforms(pipeline);
  // Capture the SdkComponents for look up during step translations
  SdkComponents dataflowV1Components = SdkComponents.create();
  dataflowV1Components.registerEnvironment(defaultEnvironmentForDataflow.toBuilder().addAllDependencies(getDefaultArtifacts()).addAllCapabilities(Environments.getJavaCapabilities()).build());
  RunnerApi.Pipeline dataflowV1PipelineProto = PipelineTranslation.toProto(pipeline, dataflowV1Components, true);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Dataflow v1 pipeline proto:\n{}", TextFormat.printer().printToString(dataflowV1PipelineProto));
  }
  // Set a unique client_request_id in the CreateJob request.
  // This is used to ensure idempotence of job creation across retried
  // attempts to create a job. Specifically, if the service returns a job with
  // a different client_request_id, it means the returned one is a different
  // job previously created with the same job name, and that the job creation
  // has been effectively rejected. The SDK should return
  // Error::Already_Exists to user in that case.
  int randomNum = new Random().nextInt(9000) + 1000;
  // The trailing "SSS" is the millisecond fraction; the previous "mmm" repeated the
  // minute-of-hour field and therefore contributed nothing to uniqueness.
  String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssSSS").withZone(DateTimeZone.UTC).print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
  // Try to create a debuggee ID. This must happen before the job is translated since it may
  // update the options.
  maybeRegisterDebuggee(dataflowOptions, requestId);
  JobSpecification jobSpecification = translator.translate(pipeline, dataflowV1PipelineProto, dataflowV1Components, this, packages);
  // When a worker jar is supplied and the unified worker is not in use, make sure the
  // use_staged_dataflow_worker_jar experiment is present.
  if (!isNullOrEmpty(dataflowOptions.getDataflowWorkerJar()) && !useUnifiedWorker(options)) {
    List<String> experiments = firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList());
    if (!experiments.contains("use_staged_dataflow_worker_jar")) {
      dataflowOptions.setExperiments(ImmutableList.<String>builder().addAll(experiments).add("use_staged_dataflow_worker_jar").build());
    }
  }
  Job newJob = jobSpecification.getJob();
  try {
    newJob.getEnvironment().setSdkPipelineOptions(MAPPER.readValue(MAPPER_WITH_MODULES.writeValueAsBytes(options), Map.class));
  } catch (IOException e) {
    throw new IllegalArgumentException("PipelineOptions specified failed to serialize to JSON.", e);
  }
  newJob.setClientRequestId(requestId);
  DataflowRunnerInfo dataflowRunnerInfo = DataflowRunnerInfo.getDataflowRunnerInfo();
  String version = dataflowRunnerInfo.getVersion();
  checkState(!"${pom.version}".equals(version), "Unable to submit a job to the Dataflow service with unset version ${pom.version}");
  LOG.info("Dataflow SDK version: {}", version);
  newJob.getEnvironment().setUserAgent((Map) dataflowRunnerInfo.getProperties());
  // The GCP temp location, when set, is passed through the path validator before it is used as
  // the temp storage prefix of the job environment.
  if (!isNullOrEmpty(options.getGcpTempLocation())) {
    newJob.getEnvironment().setTempStoragePrefix(dataflowOptions.getPathValidator().verifyPath(options.getGcpTempLocation()));
  }
  newJob.getEnvironment().setDataset(options.getTempDatasetId());
  if (options.getWorkerRegion() != null) {
    newJob.getEnvironment().setWorkerRegion(options.getWorkerRegion());
  }
  if (options.getWorkerZone() != null) {
    newJob.getEnvironment().setWorkerZone(options.getWorkerZone());
  }
  if (options.getFlexRSGoal() == DataflowPipelineOptions.FlexResourceSchedulingGoal.COST_OPTIMIZED) {
    newJob.getEnvironment().setFlexResourceSchedulingGoal("FLEXRS_COST_OPTIMIZED");
  } else if (options.getFlexRSGoal() == DataflowPipelineOptions.FlexResourceSchedulingGoal.SPEED_OPTIMIZED) {
    newJob.getEnvironment().setFlexResourceSchedulingGoal("FLEXRS_SPEED_OPTIMIZED");
  }
  // Represent the minCpuPlatform pipeline option as an experiment, if not already present.
  if (!isNullOrEmpty(dataflowOptions.getMinCpuPlatform())) {
    List<String> experiments = firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList());
    List<String> minCpuFlags = experiments.stream().filter(p -> p.startsWith("min_cpu_platform")).collect(Collectors.toList());
    if (minCpuFlags.isEmpty()) {
      dataflowOptions.setExperiments(ImmutableList.<String>builder().addAll(experiments).add("min_cpu_platform=" + dataflowOptions.getMinCpuPlatform()).build());
    } else {
      LOG.warn("Flag min_cpu_platform is defined in both top level PipelineOption, " + "as well as under experiments. Proceed using {}.", minCpuFlags.get(0));
    }
  }
  newJob.getEnvironment().setExperiments(ImmutableList.copyOf(firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList())));
  // Set the Docker container image that executes Dataflow worker harness, residing in Google
  // Container Registry. Translator is guaranteed to create a worker pool prior to this point.
  // For runner_v1, only worker_harness_container is set.
  // For runner_v2, both worker_harness_container and sdk_harness_container are set to the same
  // value.
  String containerImage = getContainerImageForJob(options);
  for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
    workerPool.setWorkerHarnessContainerImage(containerImage);
  }
  configureSdkHarnessContainerImages(options, portablePipelineProto, newJob);
  newJob.getEnvironment().setVersion(getEnvironmentVersion(options));
  if (hooks != null) {
    hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
  }
  // With the upload_graph experiment, stage the serialized job and replace the inline steps
  // with the staged location; the graph will be downloaded from GCS by the service.
  if (hasExperiment(options, "upload_graph")) {
    DataflowPackage stagedGraph = options.getStager().stageToFile(DataflowPipelineTranslator.jobToString(newJob).getBytes(UTF_8), DATAFLOW_GRAPH_FILE_NAME);
    newJob.getSteps().clear();
    newJob.setStepsLocation(stagedGraph.getLocation());
  }
  // Optionally write the job specification to a job file or template location.
  if (!isNullOrEmpty(options.getDataflowJobFile()) || !isNullOrEmpty(options.getTemplateLocation())) {
    boolean isTemplate = !isNullOrEmpty(options.getTemplateLocation());
    if (isTemplate) {
      checkArgument(isNullOrEmpty(options.getDataflowJobFile()), "--dataflowJobFile and --templateLocation are mutually exclusive.");
    }
    String fileLocation = firstNonNull(options.getTemplateLocation(), options.getDataflowJobFile());
    checkArgument(fileLocation.startsWith("/") || fileLocation.startsWith("gs://"), "Location must be local or on Cloud Storage, got %s.", fileLocation);
    ResourceId fileResource = FileSystems.matchNewResource(fileLocation, false);
    String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
    try (PrintWriter printWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(Channels.newOutputStream(FileSystems.create(fileResource, MimeTypes.TEXT)), UTF_8)))) {
      printWriter.print(workSpecJson);
      LOG.info("Printed job specification to {}", fileLocation);
    } catch (IOException ex) {
      String error = String.format("Cannot create output file at %s", fileLocation);
      // A failed write is fatal for template creation but only a warning for a job file.
      if (isTemplate) {
        throw new RuntimeException(error, ex);
      } else {
        LOG.warn(error, ex);
      }
    }
    if (isTemplate) {
      LOG.info("Template successfully created.");
      return new DataflowTemplateJob();
    }
  }
  String jobIdToUpdate = null;
  if (options.isUpdate()) {
    jobIdToUpdate = getJobIdFromName(options.getJobName());
    newJob.setTransformNameMapping(options.getTransformNameMapping());
    newJob.setReplaceJobId(jobIdToUpdate);
  }
  if (options.getCreateFromSnapshot() != null && !options.getCreateFromSnapshot().isEmpty()) {
    newJob.setCreatedFromSnapshotId(options.getCreateFromSnapshot());
  }
  Job jobResult;
  try {
    jobResult = dataflowClient.createJob(newJob);
  } catch (GoogleJsonResponseException e) {
    String errorMessages = "Unexpected errors";
    if (e.getDetails() != null) {
      if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
        errorMessages = "The size of the serialized JSON representation of the pipeline " + "exceeds the allowable limit. " + "For more information, please see the documentation on job submission:\n" + "https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#jobs";
      } else {
        errorMessages = e.getDetails().getMessage();
      }
    }
    throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
  } catch (IOException e) {
    throw new RuntimeException("Failed to create a workflow job", e);
  }
  // Use a raw client for post-launch monitoring, as status calls may fail
  // regularly and need not be retried automatically.
  DataflowPipelineJob dataflowPipelineJob = new DataflowPipelineJob(DataflowClient.create(options), jobResult.getId(), options, jobSpecification != null ? jobSpecification.getStepNames() : Collections.emptyMap(), portablePipelineProto);
  // If the service returned a client request id that differs from the one generated for this
  // request, the returned job was not created by this request: throw an already-updated or
  // already-exists exception depending on whether this is an update or not.
  if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty() && !jobResult.getClientRequestId().equals(requestId)) {
    // If updating a job.
    if (options.isUpdate()) {
      throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob, String.format("The job named %s with id: %s has already been updated into job id: %s " + "and cannot be updated again.", newJob.getName(), jobIdToUpdate, jobResult.getId()));
    } else {
      throw new DataflowJobAlreadyExistsException(dataflowPipelineJob, String.format("There is already an active job named %s with id: %s. If you want to submit a" + " second job, try again by setting a different name using --jobName.", newJob.getName(), jobResult.getId()));
    }
  }
  LOG.info("To access the Dataflow monitoring console, please navigate to {}", MonitoringUtil.getJobMonitoringPageURL(options.getProject(), options.getRegion(), jobResult.getId()));
  LOG.info("Submitted job: {}", jobResult.getId());
  LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}", MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
  return dataflowPipelineJob;
}
use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.
the class ExecutableStageDoFnOperatorTest method testEnsureStateCleanupWithKeyedInputStateCleaner.
/**
 * Seeds one bag state per user state name with a single element, then checks that the bags are
 * empty after {@code clearForWindow} and {@code cleanupState} have been invoked on a
 * {@code StateCleaner} configured for those state names.
 */
@Test
public void testEnsureStateCleanupWithKeyedInputStateCleaner() throws Exception {
  GlobalWindow.Coder globalWindowCoder = GlobalWindow.Coder.INSTANCE;
  InMemoryStateInternals<String> internals = InMemoryStateInternals.forKey("key");
  List<String> stateNames = ImmutableList.of("state1", "state2");

  // Populate every user state with exactly one element.
  ImmutableList.Builder<BagState<String>> seededBagsBuilder = ImmutableList.builder();
  for (String stateName : stateNames) {
    BagState<String> bag =
        internals.state(
            StateNamespaces.window(globalWindowCoder, GlobalWindow.INSTANCE),
            StateTags.bag(stateName, StringUtf8Coder.of()));
    seededBagsBuilder.add(bag);
    bag.add("this should be cleaned");
  }
  ImmutableList<BagState<String>> seededBags = seededBagsBuilder.build();

  MutableObject<ByteBuffer> currentKey =
      new MutableObject<>(
          ByteBuffer.wrap(internals.getKey().getBytes(StandardCharsets.UTF_8)));

  // Test that state is cleaned up correctly
  ExecutableStageDoFnOperator.StateCleaner cleaner =
      new ExecutableStageDoFnOperator.StateCleaner(
          stateNames, globalWindowCoder, currentKey::getValue, ts -> false, null);

  // Before cleanup each bag still holds its single element.
  seededBags.forEach(bag -> assertThat(Iterables.size(bag.read()), is(1)));

  cleaner.clearForWindow(GlobalWindow.INSTANCE);
  cleaner.cleanupState(internals, currentKey::setValue);

  // After cleanup every bag must be empty.
  seededBags.forEach(bag -> assertThat(Iterables.size(bag.read()), is(0)));
}
Aggregations