Search in sources :

Example 6 with ImmutableList

use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.

the class GreedyPipelineFuser method sanitizeDanglingPTransformInputs.

/**
 * Returns a copy of {@code stage} in which every PTransform input that nothing in the stage can
 * supply has been dropped, together with the matching PCollection entries in the components.
 *
 * @param stage the stage to sanitize
 * @return a new stage with the same environment, input, side inputs, user states, timers,
 *     outputs and wire coder settings, but with sanitized transforms and components
 */
private static ExecutableStage sanitizeDanglingPTransformInputs(ExecutableStage stage) {
    // Gather every PCollection id that a PTransform of this stage may consume:
    //   - the explicit input PCollection of the stage,
    //   - the explicit output PCollections of the stage,
    //   - the side input PCollections of the stage,
    //   - the outputs of any PTransform that lives in the stage.
    Set<String> consumableIds = new HashSet<>();
    consumableIds.add(stage.getInputPCollection().getId());
    stage.getOutputPCollections().forEach(output -> consumableIds.add(output.getId()));
    stage.getSideInputs().forEach(sideInput -> consumableIds.add(sideInput.collection().getId()));
    stage.getTransforms().forEach(node -> consumableIds.addAll(node.getTransform().getOutputsMap().values()));

    // Any consumed id that is not in the set above is dangling.
    Set<String> unresolvedIds = new HashSet<>();
    for (PTransformNode node : stage.getTransforms()) {
        for (String consumedId : node.getTransform().getInputsMap().values()) {
            if (!consumableIds.contains(consumedId)) {
                unresolvedIds.add(consumedId);
            }
        }
    }

    ImmutableList.Builder<PTransformNode> sanitizedNodes = ImmutableList.builder();
    for (PTransformNode node : stage.getTransforms()) {
        PTransform original = node.getTransform();
        Map<String, String> retainedInputs = original.getInputsMap().entrySet().stream()
                .filter(input -> !unresolvedIds.contains(input.getValue()))
                .collect(Collectors.toMap(Entry::getKey, Entry::getValue));
        if (retainedInputs.equals(original.getInputsMap())) {
            // Nothing was dropped, keep the node untouched.
            sanitizedNodes.add(node);
        } else {
            // At least one input was dangling: rebuild the PTransform with the retained inputs only.
            PTransform trimmed = original.toBuilder().clearInputs().putAllInputs(retainedInputs).build();
            sanitizedNodes.add(PipelineNode.pTransform(node.getId(), trimmed));
        }
    }
    ImmutableList<PTransformNode> transforms = sanitizedNodes.build();

    // Rebuild the components so that they hold the sanitized transforms and no dangling PCollection.
    Map<String, PTransform> transformsById = transforms.stream()
            .collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform));
    Map<String, PCollection> retainedPCollections = stage.getComponents().getPcollectionsMap().entrySet().stream()
            .filter(pc -> !unresolvedIds.contains(pc.getKey()))
            .collect(Collectors.toMap(Entry::getKey, Entry::getValue));
    Components.Builder sanitizedComponents = stage.getComponents().toBuilder();
    sanitizedComponents.clearTransforms().putAllTransforms(transformsById);
    sanitizedComponents.clearPcollections().putAllPcollections(retainedPCollections);

    return ImmutableExecutableStage.of(
            sanitizedComponents.build(),
            stage.getEnvironment(),
            stage.getInputPCollection(),
            stage.getSideInputs(),
            stage.getUserStates(),
            stage.getTimers(),
            transforms,
            stage.getOutputPCollections(),
            stage.getWireCoderSettings());
}
Also used : PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) HashMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.HashMultimap) TreeSet(java.util.TreeSet) ComparisonChain(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ComparisonChain) HashSet(java.util.HashSet) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) Map(java.util.Map) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) LinkedHashSet(java.util.LinkedHashSet) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Logger(org.slf4j.Logger) Collection(java.util.Collection) Set(java.util.Set) NavigableSet(java.util.NavigableSet) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) Collectors(java.util.stream.Collectors) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) AutoValue(com.google.auto.value.AutoValue) Entry(java.util.Map.Entry) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Queue(java.util.Queue) ArrayDeque(java.util.ArrayDeque) Comparator(java.util.Comparator) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) 
ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)

Example 7 with ImmutableList

use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.

the class FlinkPipelineExecutionEnvironmentTest method shouldUseTransformOverrides.

/**
 * Checks, for both streaming and batch mode, that translating a pipeline calls
 * {@code replaceAll} with a non-empty list as long as the default Flink overrides.
 */
@Test
public void shouldUseTransformOverrides() {
    for (boolean streaming : new boolean[] { true, false }) {
        FlinkPipelineOptions flinkOptions = FlinkPipelineOptions.defaults();
        flinkOptions.setStreaming(streaming);
        flinkOptions.setRunner(FlinkRunner.class);
        FlinkPipelineExecutionEnvironment environment = new FlinkPipelineExecutionEnvironment(flinkOptions);
        // Spy on the pipeline so the argument handed to replaceAll can be captured.
        Pipeline spiedPipeline = Mockito.spy(Pipeline.create(flinkOptions));
        environment.translate(spiedPipeline);

        ArgumentCaptor<ImmutableList> replaceAllArgument = ArgumentCaptor.forClass(ImmutableList.class);
        Mockito.verify(spiedPipeline).replaceAll(replaceAllArgument.capture());
        ImmutableList<PTransformOverride> appliedOverrides = replaceAllArgument.getValue();

        assertThat(appliedOverrides.isEmpty(), is(false));
        int expectedCount = FlinkTransformOverrides.getDefaultOverrides(flinkOptions).size();
        assertThat(appliedOverrides.size(), is(expectedCount));
    }
}
Also used : ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 8 with ImmutableList

use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.

the class FlinkPipelineExecutionEnvironmentTest method shouldProvideParallelismToTransformOverrides.

/**
 * Checks that, in streaming mode, the overrides handed to {@code replaceAll} contain a
 * {@code StreamingShardedWriteFactory} whose options report a parallelism greater than zero.
 */
@Test
public void shouldProvideParallelismToTransformOverrides() {
    FlinkPipelineOptions options = FlinkPipelineOptions.defaults();
    options.setStreaming(true);
    options.setRunner(FlinkRunner.class);
    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline p = Pipeline.create(options);
    // Create a transform applicable for PTransformMatchers.writeWithRunnerDeterminedSharding()
    // which requires parallelism
    p.apply(Create.of("test")).apply(TextIO.write().to("/tmp"));
    p = Mockito.spy(p);
    // If this succeeds we're ok
    flinkEnv.translate(p);
    // Verify we were using desired replacement transform
    ArgumentCaptor<ImmutableList> captor = ArgumentCaptor.forClass(ImmutableList.class);
    Mockito.verify(p).replaceAll(captor.capture());
    ImmutableList<PTransformOverride> overridesList = captor.getValue();
    assertThat(overridesList, hasItem(new BaseMatcher<PTransformOverride>() {

        @Override
        public void describeTo(Description description) {
            // Without a description a failing assertion reports an empty expectation.
            description.appendText("a PTransformOverride using a StreamingShardedWriteFactory with parallelism > 0");
        }

        @Override
        public boolean matches(Object actual) {
            if (!(actual instanceof PTransformOverride)) {
                return false;
            }
            PTransformOverrideFactory overrideFactory = ((PTransformOverride) actual).getOverrideFactory();
            if (!(overrideFactory instanceof FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory)) {
                return false;
            }
            FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory factory = (FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory) overrideFactory;
            return factory.options.getParallelism() > 0;
        }
    }));
}
Also used : Description(org.hamcrest.Description) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride) Pipeline(org.apache.beam.sdk.Pipeline) BaseMatcher(org.hamcrest.BaseMatcher) PTransformOverrideFactory(org.apache.beam.sdk.runners.PTransformOverrideFactory) Test(org.junit.Test)

Example 9 with ImmutableList

use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in project beam by apache.

the class DataflowRunner method run.

/**
 * Translates {@code pipeline} into a Dataflow job and submits it to the Dataflow service.
 *
 * <p>The pipeline is first converted to a portable proto, which is staged and referenced from the
 * options; it is then rewritten with the v1 transform replacements and translated into the job
 * that is sent to the service. When a template location is configured, the job specification is
 * written to that location and no job is created.
 *
 * @param pipeline the pipeline to run; it is mutated by the v1 transform replacements
 * @return a handle on the submitted job, or a {@link DataflowTemplateJob} when a template was
 *     written instead of submitting
 * @throws IllegalArgumentException if the pipeline options cannot be serialized to JSON, or if
 *     the job file / template location arguments are invalid
 * @throws RuntimeException if the template file cannot be written or the job cannot be created
 * @throws DataflowJobAlreadyUpdatedException if updating and the service returned a job carrying
 *     a different client request id
 * @throws DataflowJobAlreadyExistsException if not updating and the service returned a job
 *     carrying a different client request id
 */
@Override
public DataflowPipelineJob run(Pipeline pipeline) {
    if (useUnifiedWorker(options)) {
        // non-null if useUnifiedWorker is true
        // Work on a copy: the list held by the options may be immutable (this method itself
        // stores ImmutableList instances through setExperiments), in which case add() would throw.
        List<String> experiments = new ArrayList<>(options.getExperiments());
        if (!experiments.contains("use_runner_v2")) {
            experiments.add("use_runner_v2");
        }
        if (!experiments.contains("use_unified_worker")) {
            experiments.add("use_unified_worker");
        }
        if (!experiments.contains("beam_fn_api")) {
            experiments.add("beam_fn_api");
        }
        if (!experiments.contains("use_portable_job_submission")) {
            experiments.add("use_portable_job_submission");
        }
        options.setExperiments(ImmutableList.copyOf(experiments));
    }
    logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
    if (containsUnboundedPCollection(pipeline)) {
        options.setStreaming(true);
    }
    LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications " + "related to Google Compute Engine usage and other Google Cloud Services.");
    DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    String workerHarnessContainerImageURL = DataflowRunner.getContainerImageForJob(dataflowOptions);
    // This incorrectly puns the worker harness container image (which implements v1beta3 API)
    // with the SDK harness image (which implements Fn API).
    //
    // The same Environment is used in different and contradictory ways, depending on whether
    // it is a v1 or v2 job submission.
    RunnerApi.Environment defaultEnvironmentForDataflow = Environments.createDockerEnvironment(workerHarnessContainerImageURL);
    // The SdkComponents for portable and non-portable job submission must be kept distinct. Both
    // need the default environment.
    SdkComponents portableComponents = SdkComponents.create();
    portableComponents.registerEnvironment(defaultEnvironmentForDataflow.toBuilder().addAllDependencies(getDefaultArtifacts()).addAllCapabilities(Environments.getJavaCapabilities()).build());
    RunnerApi.Pipeline portablePipelineProto = PipelineTranslation.toProto(pipeline, portableComponents, false);
    // Note that `stageArtifacts` has to be called before `resolveArtifact` because
    // `resolveArtifact` updates local paths to staged paths in pipeline proto.
    List<DataflowPackage> packages = stageArtifacts(portablePipelineProto);
    portablePipelineProto = resolveArtifacts(portablePipelineProto);
    portablePipelineProto = applySdkEnvironmentOverrides(portablePipelineProto, options);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Portable pipeline proto:\n{}", TextFormat.printer().printToString(portablePipelineProto));
    }
    // Stage the portable pipeline proto, retrieving the staged pipeline path, then update
    // the options on the new job
    // TODO: add an explicit `pipeline` parameter to the submission instead of pipeline options
    LOG.info("Staging portable pipeline proto to {}", options.getStagingLocation());
    byte[] serializedProtoPipeline = portablePipelineProto.toByteArray();
    DataflowPackage stagedPipeline = options.getStager().stageToFile(serializedProtoPipeline, PIPELINE_FILE_NAME);
    dataflowOptions.setPipelineUrl(stagedPipeline.getLocation());
    // Now rewrite things to be as needed for v1 (mutates the pipeline)
    // This way the job submitted is valid for v1 and v2, simultaneously
    replaceV1Transforms(pipeline);
    // Capture the SdkComponents for look up during step translations
    SdkComponents dataflowV1Components = SdkComponents.create();
    dataflowV1Components.registerEnvironment(defaultEnvironmentForDataflow.toBuilder().addAllDependencies(getDefaultArtifacts()).addAllCapabilities(Environments.getJavaCapabilities()).build());
    RunnerApi.Pipeline dataflowV1PipelineProto = PipelineTranslation.toProto(pipeline, dataflowV1Components, true);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Dataflow v1 pipeline proto:\n{}", TextFormat.printer().printToString(dataflowV1PipelineProto));
    }
    // Set a unique client_request_id in the CreateJob request.
    // This is used to ensure idempotence of job creation across retried
    // attempts to create a job. Specifically, if the service returns a job with
    // a different client_request_id, it means the returned one is a different
    // job previously created with the same job name, and that the job creation
    // has been effectively rejected. The SDK should return
    // Error::Already_Exists to user in that case.
    int randomNum = new Random().nextInt(9000) + 1000;
    // The trailing "SSS" is the millisecond fraction; "mmm" would print the minute of hour again.
    String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssSSS").withZone(DateTimeZone.UTC).print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
    // Try to create a debuggee ID. This must happen before the job is translated since it may
    // update the options.
    maybeRegisterDebuggee(dataflowOptions, requestId);
    JobSpecification jobSpecification = translator.translate(pipeline, dataflowV1PipelineProto, dataflowV1Components, this, packages);
    if (!isNullOrEmpty(dataflowOptions.getDataflowWorkerJar()) && !useUnifiedWorker(options)) {
        List<String> experiments = firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList());
        if (!experiments.contains("use_staged_dataflow_worker_jar")) {
            dataflowOptions.setExperiments(ImmutableList.<String>builder().addAll(experiments).add("use_staged_dataflow_worker_jar").build());
        }
    }
    Job newJob = jobSpecification.getJob();
    try {
        newJob.getEnvironment().setSdkPipelineOptions(MAPPER.readValue(MAPPER_WITH_MODULES.writeValueAsBytes(options), Map.class));
    } catch (IOException e) {
        throw new IllegalArgumentException("PipelineOptions specified failed to serialize to JSON.", e);
    }
    newJob.setClientRequestId(requestId);
    DataflowRunnerInfo dataflowRunnerInfo = DataflowRunnerInfo.getDataflowRunnerInfo();
    String version = dataflowRunnerInfo.getVersion();
    checkState(!"${pom.version}".equals(version), "Unable to submit a job to the Dataflow service with unset version ${pom.version}");
    LOG.info("Dataflow SDK version: {}", version);
    newJob.getEnvironment().setUserAgent((Map) dataflowRunnerInfo.getProperties());
    // When a GCP temp location is set, it is checked by the path validator before being used
    // as the temp storage prefix of the job.
    if (!isNullOrEmpty(options.getGcpTempLocation())) {
        newJob.getEnvironment().setTempStoragePrefix(dataflowOptions.getPathValidator().verifyPath(options.getGcpTempLocation()));
    }
    newJob.getEnvironment().setDataset(options.getTempDatasetId());
    if (options.getWorkerRegion() != null) {
        newJob.getEnvironment().setWorkerRegion(options.getWorkerRegion());
    }
    if (options.getWorkerZone() != null) {
        newJob.getEnvironment().setWorkerZone(options.getWorkerZone());
    }
    if (options.getFlexRSGoal() == DataflowPipelineOptions.FlexResourceSchedulingGoal.COST_OPTIMIZED) {
        newJob.getEnvironment().setFlexResourceSchedulingGoal("FLEXRS_COST_OPTIMIZED");
    } else if (options.getFlexRSGoal() == DataflowPipelineOptions.FlexResourceSchedulingGoal.SPEED_OPTIMIZED) {
        newJob.getEnvironment().setFlexResourceSchedulingGoal("FLEXRS_SPEED_OPTIMIZED");
    }
    // Represent the minCpuPlatform pipeline option as an experiment, if not already present.
    if (!isNullOrEmpty(dataflowOptions.getMinCpuPlatform())) {
        List<String> experiments = firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList());
        List<String> minCpuFlags = experiments.stream().filter(p -> p.startsWith("min_cpu_platform")).collect(Collectors.toList());
        if (minCpuFlags.isEmpty()) {
            dataflowOptions.setExperiments(ImmutableList.<String>builder().addAll(experiments).add("min_cpu_platform=" + dataflowOptions.getMinCpuPlatform()).build());
        } else {
            LOG.warn("Flag min_cpu_platform is defined in both top level PipelineOption, " + "as well as under experiments. Proceed using {}.", minCpuFlags.get(0));
        }
    }
    newJob.getEnvironment().setExperiments(ImmutableList.copyOf(firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList())));
    // Set the Docker container image that executes Dataflow worker harness, residing in Google
    // Container Registry. Translator is guaranteed to create a worker pool prior to this point.
    // For runner_v1, only worker_harness_container is set.
    // For runner_v2, both worker_harness_container and sdk_harness_container are set to the same
    // value.
    String containerImage = getContainerImageForJob(options);
    for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
        workerPool.setWorkerHarnessContainerImage(containerImage);
    }
    configureSdkHarnessContainerImages(options, portablePipelineProto, newJob);
    newJob.getEnvironment().setVersion(getEnvironmentVersion(options));
    if (hooks != null) {
        hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
    }
    // With the upload_graph experiment the serialized job is staged to a file and the steps are
    // removed from the request; only the location of the staged steps is sent along.
    if (hasExperiment(options, "upload_graph")) {
        DataflowPackage stagedGraph = options.getStager().stageToFile(DataflowPipelineTranslator.jobToString(newJob).getBytes(UTF_8), DATAFLOW_GRAPH_FILE_NAME);
        newJob.getSteps().clear();
        newJob.setStepsLocation(stagedGraph.getLocation());
    }
    if (!isNullOrEmpty(options.getDataflowJobFile()) || !isNullOrEmpty(options.getTemplateLocation())) {
        boolean isTemplate = !isNullOrEmpty(options.getTemplateLocation());
        if (isTemplate) {
            checkArgument(isNullOrEmpty(options.getDataflowJobFile()), "--dataflowJobFile and --templateLocation are mutually exclusive.");
        }
        String fileLocation = firstNonNull(options.getTemplateLocation(), options.getDataflowJobFile());
        checkArgument(fileLocation.startsWith("/") || fileLocation.startsWith("gs://"), "Location must be local or on Cloud Storage, got %s.", fileLocation);
        ResourceId fileResource = FileSystems.matchNewResource(fileLocation, false);
        String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
        try (PrintWriter printWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(Channels.newOutputStream(FileSystems.create(fileResource, MimeTypes.TEXT)), UTF_8)))) {
            printWriter.print(workSpecJson);
            LOG.info("Printed job specification to {}", fileLocation);
        } catch (IOException ex) {
            String error = String.format("Cannot create output file at %s", fileLocation);
            // Failing to write a template is fatal; failing to write a plain job file is only logged.
            if (isTemplate) {
                throw new RuntimeException(error, ex);
            } else {
                LOG.warn(error, ex);
            }
        }
        if (isTemplate) {
            LOG.info("Template successfully created.");
            return new DataflowTemplateJob();
        }
    }
    String jobIdToUpdate = null;
    if (options.isUpdate()) {
        jobIdToUpdate = getJobIdFromName(options.getJobName());
        newJob.setTransformNameMapping(options.getTransformNameMapping());
        newJob.setReplaceJobId(jobIdToUpdate);
    }
    if (options.getCreateFromSnapshot() != null && !options.getCreateFromSnapshot().isEmpty()) {
        newJob.setCreatedFromSnapshotId(options.getCreateFromSnapshot());
    }
    Job jobResult;
    try {
        jobResult = dataflowClient.createJob(newJob);
    } catch (GoogleJsonResponseException e) {
        String errorMessages = "Unexpected errors";
        if (e.getDetails() != null) {
            if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
                errorMessages = "The size of the serialized JSON representation of the pipeline " + "exceeds the allowable limit. " + "For more information, please see the documentation on job submission:\n" + "https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#jobs";
            } else {
                errorMessages = e.getDetails().getMessage();
            }
        }
        throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
    } catch (IOException e) {
        throw new RuntimeException("Failed to create a workflow job", e);
    }
    // Use a raw client for post-launch monitoring, as status calls may fail
    // regularly and need not be retried automatically.
    // jobSpecification has already been dereferenced above, so it cannot be null here.
    DataflowPipelineJob dataflowPipelineJob = new DataflowPipelineJob(DataflowClient.create(options), jobResult.getId(), options, jobSpecification.getStepNames(), portablePipelineProto);
    // A returned job carrying a different, non-empty client request id is not the one created by
    // this request; which exception is raised depends on whether this is an update or not.
    if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty() && !jobResult.getClientRequestId().equals(requestId)) {
        // If updating a job.
        if (options.isUpdate()) {
            throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob, String.format("The job named %s with id: %s has already been updated into job id: %s " + "and cannot be updated again.", newJob.getName(), jobIdToUpdate, jobResult.getId()));
        } else {
            throw new DataflowJobAlreadyExistsException(dataflowPipelineJob, String.format("There is already an active job named %s with id: %s. If you want to submit a" + " second job, try again by setting a different name using --jobName.", newJob.getName(), jobResult.getId()));
        }
    }
    LOG.info("To access the Dataflow monitoring console, please navigate to {}", MonitoringUtil.getJobMonitoringPageURL(options.getProject(), options.getRegion(), jobResult.getId()));
    LOG.info("Submitted job: {}", jobResult.getId());
    LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}", MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
    return dataflowPipelineJob;
}
Also used : CombineFn(org.apache.beam.sdk.transforms.Combine.CombineFn) Arrays(java.util.Arrays) DateTimeZone(org.joda.time.DateTimeZone) PBegin(org.apache.beam.sdk.values.PBegin) UnboundedSource(org.apache.beam.sdk.io.UnboundedSource) Joiner(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Joiner) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) IsBounded(org.apache.beam.sdk.values.PCollection.IsBounded) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) PubsubMessage(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) Strings(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings) PTransformOverrideFactory(org.apache.beam.sdk.runners.PTransformOverrideFactory) DataflowPipelineDebugOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions) Map(java.util.Map) Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) PrintWriter(java.io.PrintWriter) KvCoder(org.apache.beam.sdk.coders.KvCoder) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) Set(java.util.Set) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) EmptyFlattenAsCreateFactory(org.apache.beam.runners.core.construction.EmptyFlattenAsCreateFactory) VisibleForTesting(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting) AutoValue(com.google.auto.value.AutoValue) PipelineVisitor(org.apache.beam.sdk.Pipeline.PipelineVisitor) State(org.apache.beam.sdk.PipelineResult.State) PathValidator(org.apache.beam.sdk.extensions.gcp.storage.PathValidator) PubsubUnboundedSink(org.apache.beam.sdk.io.gcp.pubsub.PubsubUnboundedSink) ValueWithRecordId(org.apache.beam.sdk.values.ValueWithRecordId) KV(org.apache.beam.sdk.values.KV) 
DataflowPipelineWorkerPoolOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions) SetState(org.apache.beam.sdk.state.SetState) Experimental(org.apache.beam.sdk.annotations.Experimental) Combine(org.apache.beam.sdk.transforms.Combine) Impulse(org.apache.beam.sdk.transforms.Impulse) View(org.apache.beam.sdk.transforms.View) PTransformReplacements(org.apache.beam.runners.core.construction.PTransformReplacements) Environments(org.apache.beam.runners.core.construction.Environments) TreeSet(java.util.TreeSet) PubsubMessageWithAttributesCoder(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessageWithAttributesCoder) ArrayList(java.util.ArrayList) FileBasedSink(org.apache.beam.sdk.io.FileBasedSink) NestedValueProvider(org.apache.beam.sdk.options.ValueProvider.NestedValueProvider) MonitoringUtil(org.apache.beam.runners.dataflow.util.MonitoringUtil) External(org.apache.beam.runners.core.construction.External) Read(org.apache.beam.sdk.io.Read) PipelineRunner(org.apache.beam.sdk.PipelineRunner) RegisterDebuggeeResponse(com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeResponse) TupleTag(org.apache.beam.sdk.values.TupleTag) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Pipeline(org.apache.beam.sdk.Pipeline) PInput(org.apache.beam.sdk.values.PInput) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) NonDeterministicException(org.apache.beam.sdk.coders.Coder.NonDeterministicException) PDone(org.apache.beam.sdk.values.PDone) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) BufferedWriter(java.io.BufferedWriter) 
PipelineResources.detectClassPathResourcesToStage(org.apache.beam.runners.core.construction.resources.PipelineResources.detectClassPathResourcesToStage) IOException(java.io.IOException) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) File(java.io.File) BeamUrns(org.apache.beam.runners.core.construction.BeamUrns) DataflowTemplateJob(org.apache.beam.runners.dataflow.util.DataflowTemplateJob) BoundedSource(org.apache.beam.sdk.io.BoundedSource) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) InstanceBuilder(org.apache.beam.sdk.util.InstanceBuilder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) PubsubUnboundedSource(org.apache.beam.sdk.io.gcp.pubsub.PubsubUnboundedSource) MoreObjects.firstNonNull(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects.firstNonNull) FileSystems(org.apache.beam.sdk.io.FileSystems) Module(com.fasterxml.jackson.databind.Module) MimeTypes(org.apache.beam.sdk.util.MimeTypes) CloudDebugger(com.google.api.services.clouddebugger.v2.CloudDebugger) SortedSet(java.util.SortedSet) WriteFilesResult(org.apache.beam.sdk.io.WriteFilesResult) WindowedValue(org.apache.beam.sdk.util.WindowedValue) DataflowTransport(org.apache.beam.runners.dataflow.util.DataflowTransport) LoggerFactory(org.slf4j.LoggerFactory) SdkHarnessContainerImage(com.google.api.services.dataflow.model.SdkHarnessContainerImage) Random(java.util.Random) NameUtils(org.apache.beam.sdk.util.NameUtils) SingleInputOutputOverrideFactory(org.apache.beam.runners.core.construction.SingleInputOutputOverrideFactory) WriteFiles(org.apache.beam.sdk.io.WriteFiles) Job(com.google.api.services.dataflow.model.Job) KafkaIO(org.apache.beam.sdk.io.kafka.KafkaIO) GoogleJsonResponseException(com.google.api.client.googleapis.json.GoogleJsonResponseException) 
ListJobsResponse(com.google.api.services.dataflow.model.ListJobsResponse) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) WriteFilesTranslation(org.apache.beam.runners.core.construction.WriteFilesTranslation) DateTimeFormat(org.joda.time.format.DateTimeFormat) Utf8(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Utf8) Strings.isNullOrEmpty(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings.isNullOrEmpty) StepTranslationContext(org.apache.beam.runners.dataflow.TransformTranslator.StepTranslationContext) Collectors(java.util.stream.Collectors) PipelineOptionsValidator(org.apache.beam.sdk.options.PipelineOptionsValidator) TransformHierarchy(org.apache.beam.sdk.runners.TransformHierarchy) List(java.util.List) PValue(org.apache.beam.sdk.values.PValue) ParDo(org.apache.beam.sdk.transforms.ParDo) ReplacementOutputs(org.apache.beam.runners.core.construction.ReplacementOutputs) ResolveOptions(org.apache.beam.sdk.io.fs.ResolveOptions) ReflectHelpers(org.apache.beam.sdk.util.common.ReflectHelpers) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) UnconsumedReads(org.apache.beam.runners.core.construction.UnconsumedReads) GroupedValues(org.apache.beam.sdk.transforms.Combine.GroupedValues) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pattern(java.util.regex.Pattern) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) StagedFile(org.apache.beam.runners.dataflow.util.PackageUtil.StagedFile) DateTimeUtils(org.joda.time.DateTimeUtils) CoderUtils.encodeToByteArray(org.apache.beam.sdk.util.CoderUtils.encodeToByteArray) Coder(org.apache.beam.sdk.coders.Coder) PipelineTranslation(org.apache.beam.runners.core.construction.PipelineTranslation) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride) AtomicReference(java.util.concurrent.atomic.AtomicReference) 
TextFormat(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.TextFormat) PTransform(org.apache.beam.sdk.transforms.PTransform) HashSet(java.util.HashSet) SplittableParDoNaiveBounded(org.apache.beam.runners.core.construction.SplittableParDoNaiveBounded) MapState(org.apache.beam.sdk.state.MapState) ReleaseInfo(org.apache.beam.sdk.util.ReleaseInfo) Files(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.Files) SerializableUtils.serializeToByteArray(org.apache.beam.sdk.util.SerializableUtils.serializeToByteArray) DeduplicatedFlattenFactory(org.apache.beam.runners.core.construction.DeduplicatedFlattenFactory) OutputStreamWriter(java.io.OutputStreamWriter) Hashing(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.hash.Hashing) UnboundedReadFromBoundedSource(org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) GroupIntoBatches(org.apache.beam.sdk.transforms.GroupIntoBatches) WorkerPool(com.google.api.services.dataflow.model.WorkerPool) HashCode(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.hash.HashCode) RegisterDebuggeeRequest(com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeRequest) Reshuffle(org.apache.beam.sdk.transforms.Reshuffle) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) WithKeys(org.apache.beam.sdk.transforms.WithKeys) Logger(org.slf4j.Logger) UTF_8(java.nio.charset.StandardCharsets.UTF_8) Channels(java.nio.channels.Channels) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) SplittableParDo(org.apache.beam.runners.core.construction.SplittableParDo) StreamingCreatePCollectionViewFactory(org.apache.beam.runners.dataflow.StreamingViewOverrides.StreamingCreatePCollectionViewFactory) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) PCollection(org.apache.beam.sdk.values.PCollection) 
Debuggee(com.google.api.services.clouddebugger.v2.model.Debuggee) Preconditions(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions) PubsubMessageWithAttributesAndMessageIdCoder(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessageWithAttributesAndMessageIdCoder) CreatePCollectionView(org.apache.beam.sdk.transforms.View.CreatePCollectionView) Collections(java.util.Collections) PTransformMatchers(org.apache.beam.runners.core.construction.PTransformMatchers) PropertyNames(org.apache.beam.runners.dataflow.util.PropertyNames) DataflowTemplateJob(org.apache.beam.runners.dataflow.util.DataflowTemplateJob) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) BufferedWriter(java.io.BufferedWriter) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Random(java.util.Random) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) DataflowTemplateJob(org.apache.beam.runners.dataflow.util.DataflowTemplateJob) Job(com.google.api.services.dataflow.model.Job) PrintWriter(java.io.PrintWriter) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) IOException(java.io.IOException) GoogleJsonResponseException(com.google.api.client.googleapis.json.GoogleJsonResponseException) WorkerPool(com.google.api.services.dataflow.model.WorkerPool) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) OutputStreamWriter(java.io.OutputStreamWriter) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride)

Example 10 with ImmutableList

Use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList in the Apache Beam project (apache/beam).

From the class ExecutableStageDoFnOperatorTest, method testEnsureStateCleanupWithKeyedInputStateCleaner:

/**
 * Checks that a {@code StateCleaner} empties every listed user state for the global window.
 *
 * <p>Each bag state is given one element, is confirmed to still hold that element once the
 * cleaner has been constructed, and must read as empty after {@code clearForWindow} followed by
 * {@code cleanupState}.
 */
@Test
public void testEnsureStateCleanupWithKeyedInputStateCleaner() throws Exception {
    GlobalWindow.Coder coder = GlobalWindow.Coder.INSTANCE;
    InMemoryStateInternals<String> internals = InMemoryStateInternals.forKey("key");
    List<String> stateNames = ImmutableList.of("state1", "state2");

    // Create one bag per user state name and put a single element into each.
    ImmutableList.Builder<BagState<String>> collectedBags = ImmutableList.builder();
    for (String stateName : stateNames) {
        BagState<String> bag = internals.state(StateNamespaces.window(coder, GlobalWindow.INSTANCE), StateTags.bag(stateName, StringUtf8Coder.of()));
        bag.add("this should be cleaned");
        collectedBags.add(bag);
    }
    ImmutableList<BagState<String>> populatedBags = collectedBags.build();

    // The cleaner reads the current key through the holder and writes it back via setValue.
    byte[] keyBytes = internals.getKey().getBytes(StandardCharsets.UTF_8);
    MutableObject<ByteBuffer> currentKey = new MutableObject<>(ByteBuffer.wrap(keyBytes));
    ExecutableStageDoFnOperator.StateCleaner cleaner = new ExecutableStageDoFnOperator.StateCleaner(stateNames, coder, currentKey::getValue, ts -> false, null);

    // Before any cleanup is requested, every bag still holds its single element.
    populatedBags.forEach(filled -> assertThat(Iterables.size(filled.read()), is(1)));

    cleaner.clearForWindow(GlobalWindow.INSTANCE);
    cleaner.cleanupState(internals, currentKey::setValue);

    // After cleanup, every bag must be empty.
    populatedBags.forEach(emptied -> assertThat(Iterables.size(emptied.read()), is(0)));
}
Also used : ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) ArgumentMatchers.anyString(org.mockito.ArgumentMatchers.anyString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) ByteBuffer(java.nio.ByteBuffer) BagState(org.apache.beam.sdk.state.BagState) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) MutableObject(org.apache.beam.repackaged.core.org.apache.commons.lang3.mutable.MutableObject) Test(org.junit.Test) FlinkStateInternalsTest(org.apache.beam.runners.flink.streaming.FlinkStateInternalsTest)

Aggregations

ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList): 10; Pipeline (org.apache.beam.sdk.Pipeline): 4; PTransformOverride (org.apache.beam.sdk.runners.PTransformOverride): 4; Test (org.junit.Test): 4; AutoValue (com.google.auto.value.AutoValue): 2; IOException (java.io.IOException): 2; HashSet (java.util.HashSet): 2; Map (java.util.Map): 2; Set (java.util.Set): 2; TreeSet (java.util.TreeSet): 2; TimeoutException (java.util.concurrent.TimeoutException): 2; Collectors (java.util.stream.Collectors): 2; RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 2; InstructionRequestHandler (org.apache.beam.runners.fnexecution.control.InstructionRequestHandler): 2; RemoteEnvironmentOptions (org.apache.beam.sdk.options.RemoteEnvironmentOptions): 2; PTransformOverrideFactory (org.apache.beam.sdk.runners.PTransformOverrideFactory): 2; GlobalWindow (org.apache.beam.sdk.transforms.windowing.GlobalWindow): 2; Preconditions.checkArgument (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument): 2; Preconditions.checkState (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState): 2; Logger (org.slf4j.Logger): 2