use of org.apache.beam.runners.core.construction.SdkComponents in project beam by apache.
the class RegisterNodeFunction method apply.
@Override
public Node apply(MutableNetwork<Node, Edge> input) {
for (Node node : input.nodes()) {
if (node instanceof RemoteGrpcPortNode || node instanceof ParallelInstructionNode || node instanceof InstructionOutputNode) {
continue;
}
throw new IllegalArgumentException(String.format("Network contains unknown type of node: %s", input));
}
// Fix all non output nodes to have named edges.
for (Node node : input.nodes()) {
if (node instanceof InstructionOutputNode) {
continue;
}
for (Node successor : input.successors(node)) {
for (Edge edge : input.edgesConnecting(node, successor)) {
if (edge instanceof DefaultEdge) {
input.removeEdge(edge);
input.addEdge(node, successor, MultiOutputInfoEdge.create(new MultiOutputInfo().setTag(idGenerator.getId())));
}
}
}
}
// We start off by replacing all edges within the graph with edges that have the named
// outputs from the predecessor step. For ParallelInstruction Source nodes and RemoteGrpcPort
// nodes this is a generated port id. All ParDoInstructions will have already
ProcessBundleDescriptor.Builder processBundleDescriptor = ProcessBundleDescriptor.newBuilder().setId(idGenerator.getId()).setStateApiServiceDescriptor(stateApiServiceDescriptor);
// For intermediate PCollections we fabricate, we make a bogus WindowingStrategy
// TODO: create a correct windowing strategy, including coders and environment
SdkComponents sdkComponents = SdkComponents.create(pipeline.getComponents(), null);
// Default to use the Java environment if pipeline doesn't have environment specified.
if (pipeline.getComponents().getEnvironmentsMap().isEmpty()) {
sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
}
String fakeWindowingStrategyId = "fakeWindowingStrategy" + idGenerator.getId();
try {
RunnerApi.MessageWithComponents fakeWindowingStrategyProto = WindowingStrategyTranslation.toMessageProto(WindowingStrategy.globalDefault(), sdkComponents);
processBundleDescriptor.putWindowingStrategies(fakeWindowingStrategyId, fakeWindowingStrategyProto.getWindowingStrategy()).putAllCoders(fakeWindowingStrategyProto.getComponents().getCodersMap()).putAllEnvironments(fakeWindowingStrategyProto.getComponents().getEnvironmentsMap());
} catch (IOException exc) {
throw new RuntimeException("Could not convert default windowing stratey to proto", exc);
}
Map<Node, String> nodesToPCollections = new HashMap<>();
ImmutableMap.Builder<String, NameContext> ptransformIdToNameContexts = ImmutableMap.builder();
ImmutableMap.Builder<String, Iterable<SideInputInfo>> ptransformIdToSideInputInfos = ImmutableMap.builder();
ImmutableMap.Builder<String, Iterable<PCollectionView<?>>> ptransformIdToPCollectionViews = ImmutableMap.builder();
ImmutableMap.Builder<String, NameContext> pcollectionIdToNameContexts = ImmutableMap.builder();
ImmutableMap.Builder<InstructionOutputNode, String> instructionOutputNodeToCoderIdBuilder = ImmutableMap.builder();
// 2. Generate new PCollectionId and register it with ProcessBundleDescriptor.
for (InstructionOutputNode node : Iterables.filter(input.nodes(), InstructionOutputNode.class)) {
InstructionOutput instructionOutput = node.getInstructionOutput();
String coderId = "generatedCoder" + idGenerator.getId();
instructionOutputNodeToCoderIdBuilder.put(node, coderId);
try (ByteString.Output output = ByteString.newOutput()) {
try {
Coder<?> javaCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(instructionOutput.getCodec()));
sdkComponents.registerCoder(javaCoder);
RunnerApi.Coder coderProto = CoderTranslation.toProto(javaCoder, sdkComponents);
processBundleDescriptor.putCoders(coderId, coderProto);
} catch (IOException e) {
throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
} catch (Exception e) {
// Coder probably wasn't a java coder
OBJECT_MAPPER.writeValue(output, instructionOutput.getCodec());
processBundleDescriptor.putCoders(coderId, RunnerApi.Coder.newBuilder().setSpec(RunnerApi.FunctionSpec.newBuilder().setPayload(output.toByteString())).build());
}
} catch (IOException e) {
throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
}
// Generate new PCollection ID and map it to relevant node.
// Will later be used to fill PTransform inputs/outputs information.
String pcollectionId = "generatedPcollection" + idGenerator.getId();
processBundleDescriptor.putPcollections(pcollectionId, RunnerApi.PCollection.newBuilder().setCoderId(coderId).setWindowingStrategyId(fakeWindowingStrategyId).build());
nodesToPCollections.put(node, pcollectionId);
pcollectionIdToNameContexts.put(pcollectionId, NameContext.create(null, instructionOutput.getOriginalName(), instructionOutput.getSystemName(), instructionOutput.getName()));
}
processBundleDescriptor.putAllCoders(sdkComponents.toComponents().getCodersMap());
Map<InstructionOutputNode, String> instructionOutputNodeToCoderIdMap = instructionOutputNodeToCoderIdBuilder.build();
for (ParallelInstructionNode node : Iterables.filter(input.nodes(), ParallelInstructionNode.class)) {
ParallelInstruction parallelInstruction = node.getParallelInstruction();
String ptransformId = "generatedPtransform" + idGenerator.getId();
ptransformIdToNameContexts.put(ptransformId, NameContext.create(null, parallelInstruction.getOriginalName(), parallelInstruction.getSystemName(), parallelInstruction.getName()));
RunnerApi.PTransform.Builder pTransform = RunnerApi.PTransform.newBuilder();
RunnerApi.FunctionSpec.Builder transformSpec = RunnerApi.FunctionSpec.newBuilder();
if (parallelInstruction.getParDo() != null) {
ParDoInstruction parDoInstruction = parallelInstruction.getParDo();
CloudObject userFnSpec = CloudObject.fromSpec(parDoInstruction.getUserFn());
String userFnClassName = userFnSpec.getClassName();
if ("CombineValuesFn".equals(userFnClassName) || "KeyedCombineFn".equals(userFnClassName)) {
transformSpec = transformCombineValuesFnToFunctionSpec(userFnSpec);
ptransformIdToPCollectionViews.put(ptransformId, Collections.emptyList());
} else {
String parDoPTransformId = getString(userFnSpec, PropertyNames.SERIALIZED_FN);
RunnerApi.PTransform parDoPTransform = pipeline.getComponents().getTransformsOrDefault(parDoPTransformId, null);
// TODO: only the non-null branch should exist; for migration ease only
if (parDoPTransform != null) {
checkArgument(parDoPTransform.getSpec().getUrn().equals(PTransformTranslation.PAR_DO_TRANSFORM_URN), "Found transform \"%s\" for ParallelDo instruction, " + " but that transform had unexpected URN \"%s\" (expected \"%s\")", parDoPTransformId, parDoPTransform.getSpec().getUrn(), PTransformTranslation.PAR_DO_TRANSFORM_URN);
RunnerApi.ParDoPayload parDoPayload;
try {
parDoPayload = RunnerApi.ParDoPayload.parseFrom(parDoPTransform.getSpec().getPayload());
} catch (InvalidProtocolBufferException exc) {
throw new RuntimeException("ParDo did not have a ParDoPayload", exc);
}
ImmutableList.Builder<PCollectionView<?>> pcollectionViews = ImmutableList.builder();
for (Map.Entry<String, SideInput> sideInputEntry : parDoPayload.getSideInputsMap().entrySet()) {
pcollectionViews.add(transformSideInputForRunner(pipeline, parDoPTransform, sideInputEntry.getKey(), sideInputEntry.getValue()));
transformSideInputForSdk(pipeline, parDoPTransform, sideInputEntry.getKey(), processBundleDescriptor, pTransform);
}
ptransformIdToPCollectionViews.put(ptransformId, pcollectionViews.build());
transformSpec.setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(parDoPayload.toByteString());
} else {
// legacy path - bytes are the FunctionSpec's payload field, basically, and
// SDKs expect it in the PTransform's payload field
byte[] userFnBytes = getBytes(userFnSpec, PropertyNames.SERIALIZED_FN);
transformSpec.setUrn(ParDoTranslation.CUSTOM_JAVA_DO_FN_URN).setPayload(ByteString.copyFrom(userFnBytes));
}
// Add side input information for batch pipelines
if (parDoInstruction.getSideInputs() != null) {
ptransformIdToSideInputInfos.put(ptransformId, forSideInputInfos(parDoInstruction.getSideInputs(), true));
}
}
} else if (parallelInstruction.getRead() != null) {
ReadInstruction readInstruction = parallelInstruction.getRead();
CloudObject sourceSpec = CloudObject.fromSpec(CloudSourceUtils.flattenBaseSpecs(readInstruction.getSource()).getSpec());
// TODO: Need to plumb through the SDK specific function spec.
transformSpec.setUrn(JAVA_SOURCE_URN);
try {
byte[] serializedSource = Base64.getDecoder().decode(getString(sourceSpec, SERIALIZED_SOURCE));
ByteString sourceByteString = ByteString.copyFrom(serializedSource);
transformSpec.setPayload(sourceByteString);
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Unable to process Read %s", parallelInstruction), e);
}
} else if (parallelInstruction.getFlatten() != null) {
transformSpec.setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN);
} else {
throw new IllegalArgumentException(String.format("Unknown type of ParallelInstruction %s", parallelInstruction));
}
for (Node predecessorOutput : input.predecessors(node)) {
pTransform.putInputs("generatedInput" + idGenerator.getId(), nodesToPCollections.get(predecessorOutput));
}
for (Edge edge : input.outEdges(node)) {
Node nodeOutput = input.incidentNodes(edge).target();
MultiOutputInfoEdge edge2 = (MultiOutputInfoEdge) edge;
pTransform.putOutputs(edge2.getMultiOutputInfo().getTag(), nodesToPCollections.get(nodeOutput));
}
pTransform.setSpec(transformSpec);
processBundleDescriptor.putTransforms(ptransformId, pTransform.build());
}
// Add the PTransforms representing the remote gRPC nodes
for (RemoteGrpcPortNode node : Iterables.filter(input.nodes(), RemoteGrpcPortNode.class)) {
RunnerApi.PTransform.Builder pTransform = RunnerApi.PTransform.newBuilder();
Set<Node> predecessors = input.predecessors(node);
Set<Node> successors = input.successors(node);
if (predecessors.isEmpty() && !successors.isEmpty()) {
Node instructionOutputNode = Iterables.getOnlyElement(successors);
pTransform.putOutputs("generatedOutput" + idGenerator.getId(), nodesToPCollections.get(instructionOutputNode));
pTransform.setSpec(RunnerApi.FunctionSpec.newBuilder().setUrn(DATA_INPUT_URN).setPayload(node.getRemoteGrpcPort().toBuilder().setCoderId(instructionOutputNodeToCoderIdMap.get(instructionOutputNode)).build().toByteString()).build());
} else if (!predecessors.isEmpty() && successors.isEmpty()) {
Node instructionOutputNode = Iterables.getOnlyElement(predecessors);
pTransform.putInputs("generatedInput" + idGenerator.getId(), nodesToPCollections.get(instructionOutputNode));
pTransform.setSpec(RunnerApi.FunctionSpec.newBuilder().setUrn(DATA_OUTPUT_URN).setPayload(node.getRemoteGrpcPort().toBuilder().setCoderId(instructionOutputNodeToCoderIdMap.get(instructionOutputNode)).build().toByteString()).build());
} else {
throw new IllegalStateException("Expected either one input OR one output " + "InstructionOutputNode for this RemoteGrpcPortNode");
}
processBundleDescriptor.putTransforms(node.getPrimitiveTransformId(), pTransform.build());
}
return RegisterRequestNode.create(RegisterRequest.newBuilder().addProcessBundleDescriptor(processBundleDescriptor).build(), ptransformIdToNameContexts.build(), ptransformIdToSideInputInfos.build(), ptransformIdToPCollectionViews.build(), pcollectionIdToNameContexts.build());
}
use of org.apache.beam.runners.core.construction.SdkComponents in project beam by apache.
the class DataflowRunner method run.
@Override
public DataflowPipelineJob run(Pipeline pipeline) {
if (useUnifiedWorker(options)) {
// non-null if useUnifiedWorker is true
List<String> experiments = options.getExperiments();
if (!experiments.contains("use_runner_v2")) {
experiments.add("use_runner_v2");
}
if (!experiments.contains("use_unified_worker")) {
experiments.add("use_unified_worker");
}
if (!experiments.contains("beam_fn_api")) {
experiments.add("beam_fn_api");
}
if (!experiments.contains("use_portable_job_submission")) {
experiments.add("use_portable_job_submission");
}
options.setExperiments(ImmutableList.copyOf(experiments));
}
logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
if (containsUnboundedPCollection(pipeline)) {
options.setStreaming(true);
}
LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications " + "related to Google Compute Engine usage and other Google Cloud Services.");
DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
String workerHarnessContainerImageURL = DataflowRunner.getContainerImageForJob(dataflowOptions);
// This incorrectly puns the worker harness container image (which implements v1beta3 API)
// with the SDK harness image (which implements Fn API).
//
// The same Environment is used in different and contradictory ways, depending on whether
// it is a v1 or v2 job submission.
RunnerApi.Environment defaultEnvironmentForDataflow = Environments.createDockerEnvironment(workerHarnessContainerImageURL);
// The SdkComponents for portable an non-portable job submission must be kept distinct. Both
// need the default environment.
SdkComponents portableComponents = SdkComponents.create();
portableComponents.registerEnvironment(defaultEnvironmentForDataflow.toBuilder().addAllDependencies(getDefaultArtifacts()).addAllCapabilities(Environments.getJavaCapabilities()).build());
RunnerApi.Pipeline portablePipelineProto = PipelineTranslation.toProto(pipeline, portableComponents, false);
// Note that `stageArtifacts` has to be called before `resolveArtifact` because
// `resolveArtifact` updates local paths to staged paths in pipeline proto.
List<DataflowPackage> packages = stageArtifacts(portablePipelineProto);
portablePipelineProto = resolveArtifacts(portablePipelineProto);
portablePipelineProto = applySdkEnvironmentOverrides(portablePipelineProto, options);
if (LOG.isDebugEnabled()) {
LOG.debug("Portable pipeline proto:\n{}", TextFormat.printer().printToString(portablePipelineProto));
}
// Stage the portable pipeline proto, retrieving the staged pipeline path, then update
// the options on the new job
// TODO: add an explicit `pipeline` parameter to the submission instead of pipeline options
LOG.info("Staging portable pipeline proto to {}", options.getStagingLocation());
byte[] serializedProtoPipeline = portablePipelineProto.toByteArray();
DataflowPackage stagedPipeline = options.getStager().stageToFile(serializedProtoPipeline, PIPELINE_FILE_NAME);
dataflowOptions.setPipelineUrl(stagedPipeline.getLocation());
// Now rewrite things to be as needed for v1 (mutates the pipeline)
// This way the job submitted is valid for v1 and v2, simultaneously
replaceV1Transforms(pipeline);
// Capture the SdkComponents for look up during step translations
SdkComponents dataflowV1Components = SdkComponents.create();
dataflowV1Components.registerEnvironment(defaultEnvironmentForDataflow.toBuilder().addAllDependencies(getDefaultArtifacts()).addAllCapabilities(Environments.getJavaCapabilities()).build());
RunnerApi.Pipeline dataflowV1PipelineProto = PipelineTranslation.toProto(pipeline, dataflowV1Components, true);
if (LOG.isDebugEnabled()) {
LOG.debug("Dataflow v1 pipeline proto:\n{}", TextFormat.printer().printToString(dataflowV1PipelineProto));
}
// Set a unique client_request_id in the CreateJob request.
// This is used to ensure idempotence of job creation across retried
// attempts to create a job. Specifically, if the service returns a job with
// a different client_request_id, it means the returned one is a different
// job previously created with the same job name, and that the job creation
// has been effectively rejected. The SDK should return
// Error::Already_Exists to user in that case.
int randomNum = new Random().nextInt(9000) + 1000;
String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC).print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
// Try to create a debuggee ID. This must happen before the job is translated since it may
// update the options.
maybeRegisterDebuggee(dataflowOptions, requestId);
JobSpecification jobSpecification = translator.translate(pipeline, dataflowV1PipelineProto, dataflowV1Components, this, packages);
if (!isNullOrEmpty(dataflowOptions.getDataflowWorkerJar()) && !useUnifiedWorker(options)) {
List<String> experiments = firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList());
if (!experiments.contains("use_staged_dataflow_worker_jar")) {
dataflowOptions.setExperiments(ImmutableList.<String>builder().addAll(experiments).add("use_staged_dataflow_worker_jar").build());
}
}
Job newJob = jobSpecification.getJob();
try {
newJob.getEnvironment().setSdkPipelineOptions(MAPPER.readValue(MAPPER_WITH_MODULES.writeValueAsBytes(options), Map.class));
} catch (IOException e) {
throw new IllegalArgumentException("PipelineOptions specified failed to serialize to JSON.", e);
}
newJob.setClientRequestId(requestId);
DataflowRunnerInfo dataflowRunnerInfo = DataflowRunnerInfo.getDataflowRunnerInfo();
String version = dataflowRunnerInfo.getVersion();
checkState(!"${pom.version}".equals(version), "Unable to submit a job to the Dataflow service with unset version ${pom.version}");
LOG.info("Dataflow SDK version: {}", version);
newJob.getEnvironment().setUserAgent((Map) dataflowRunnerInfo.getProperties());
// must be verified.
if (!isNullOrEmpty(options.getGcpTempLocation())) {
newJob.getEnvironment().setTempStoragePrefix(dataflowOptions.getPathValidator().verifyPath(options.getGcpTempLocation()));
}
newJob.getEnvironment().setDataset(options.getTempDatasetId());
if (options.getWorkerRegion() != null) {
newJob.getEnvironment().setWorkerRegion(options.getWorkerRegion());
}
if (options.getWorkerZone() != null) {
newJob.getEnvironment().setWorkerZone(options.getWorkerZone());
}
if (options.getFlexRSGoal() == DataflowPipelineOptions.FlexResourceSchedulingGoal.COST_OPTIMIZED) {
newJob.getEnvironment().setFlexResourceSchedulingGoal("FLEXRS_COST_OPTIMIZED");
} else if (options.getFlexRSGoal() == DataflowPipelineOptions.FlexResourceSchedulingGoal.SPEED_OPTIMIZED) {
newJob.getEnvironment().setFlexResourceSchedulingGoal("FLEXRS_SPEED_OPTIMIZED");
}
// Represent the minCpuPlatform pipeline option as an experiment, if not already present.
if (!isNullOrEmpty(dataflowOptions.getMinCpuPlatform())) {
List<String> experiments = firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList());
List<String> minCpuFlags = experiments.stream().filter(p -> p.startsWith("min_cpu_platform")).collect(Collectors.toList());
if (minCpuFlags.isEmpty()) {
dataflowOptions.setExperiments(ImmutableList.<String>builder().addAll(experiments).add("min_cpu_platform=" + dataflowOptions.getMinCpuPlatform()).build());
} else {
LOG.warn("Flag min_cpu_platform is defined in both top level PipelineOption, " + "as well as under experiments. Proceed using {}.", minCpuFlags.get(0));
}
}
newJob.getEnvironment().setExperiments(ImmutableList.copyOf(firstNonNull(dataflowOptions.getExperiments(), Collections.emptyList())));
// Set the Docker container image that executes Dataflow worker harness, residing in Google
// Container Registry. Translator is guaranteed to create a worker pool prior to this point.
// For runner_v1, only worker_harness_container is set.
// For runner_v2, both worker_harness_container and sdk_harness_container are set to the same
// value.
String containerImage = getContainerImageForJob(options);
for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
workerPool.setWorkerHarnessContainerImage(containerImage);
}
configureSdkHarnessContainerImages(options, portablePipelineProto, newJob);
newJob.getEnvironment().setVersion(getEnvironmentVersion(options));
if (hooks != null) {
hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
}
// will be downloaded from GCS by the service.
if (hasExperiment(options, "upload_graph")) {
DataflowPackage stagedGraph = options.getStager().stageToFile(DataflowPipelineTranslator.jobToString(newJob).getBytes(UTF_8), DATAFLOW_GRAPH_FILE_NAME);
newJob.getSteps().clear();
newJob.setStepsLocation(stagedGraph.getLocation());
}
if (!isNullOrEmpty(options.getDataflowJobFile()) || !isNullOrEmpty(options.getTemplateLocation())) {
boolean isTemplate = !isNullOrEmpty(options.getTemplateLocation());
if (isTemplate) {
checkArgument(isNullOrEmpty(options.getDataflowJobFile()), "--dataflowJobFile and --templateLocation are mutually exclusive.");
}
String fileLocation = firstNonNull(options.getTemplateLocation(), options.getDataflowJobFile());
checkArgument(fileLocation.startsWith("/") || fileLocation.startsWith("gs://"), "Location must be local or on Cloud Storage, got %s.", fileLocation);
ResourceId fileResource = FileSystems.matchNewResource(fileLocation, false);
String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
try (PrintWriter printWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(Channels.newOutputStream(FileSystems.create(fileResource, MimeTypes.TEXT)), UTF_8)))) {
printWriter.print(workSpecJson);
LOG.info("Printed job specification to {}", fileLocation);
} catch (IOException ex) {
String error = String.format("Cannot create output file at %s", fileLocation);
if (isTemplate) {
throw new RuntimeException(error, ex);
} else {
LOG.warn(error, ex);
}
}
if (isTemplate) {
LOG.info("Template successfully created.");
return new DataflowTemplateJob();
}
}
String jobIdToUpdate = null;
if (options.isUpdate()) {
jobIdToUpdate = getJobIdFromName(options.getJobName());
newJob.setTransformNameMapping(options.getTransformNameMapping());
newJob.setReplaceJobId(jobIdToUpdate);
}
if (options.getCreateFromSnapshot() != null && !options.getCreateFromSnapshot().isEmpty()) {
newJob.setCreatedFromSnapshotId(options.getCreateFromSnapshot());
}
Job jobResult;
try {
jobResult = dataflowClient.createJob(newJob);
} catch (GoogleJsonResponseException e) {
String errorMessages = "Unexpected errors";
if (e.getDetails() != null) {
if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
errorMessages = "The size of the serialized JSON representation of the pipeline " + "exceeds the allowable limit. " + "For more information, please see the documentation on job submission:\n" + "https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#jobs";
} else {
errorMessages = e.getDetails().getMessage();
}
}
throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
} catch (IOException e) {
throw new RuntimeException("Failed to create a workflow job", e);
}
// Use a raw client for post-launch monitoring, as status calls may fail
// regularly and need not be retried automatically.
DataflowPipelineJob dataflowPipelineJob = new DataflowPipelineJob(DataflowClient.create(options), jobResult.getId(), options, jobSpecification != null ? jobSpecification.getStepNames() : Collections.emptyMap(), portablePipelineProto);
// depending on whether this is a reload or not.
if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty() && !jobResult.getClientRequestId().equals(requestId)) {
// If updating a job.
if (options.isUpdate()) {
throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob, String.format("The job named %s with id: %s has already been updated into job id: %s " + "and cannot be updated again.", newJob.getName(), jobIdToUpdate, jobResult.getId()));
} else {
throw new DataflowJobAlreadyExistsException(dataflowPipelineJob, String.format("There is already an active job named %s with id: %s. If you want to submit a" + " second job, try again by setting a different name using --jobName.", newJob.getName(), jobResult.getId()));
}
}
LOG.info("To access the Dataflow monitoring console, please navigate to {}", MonitoringUtil.getJobMonitoringPageURL(options.getProject(), options.getRegion(), jobResult.getId()));
LOG.info("Submitted job: {}", jobResult.getId());
LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}", MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
return dataflowPipelineJob;
}
use of org.apache.beam.runners.core.construction.SdkComponents in project beam by apache.
the class StreamingDataflowWorkerTest method runMergeSessionsActions.
// Helper for running tests for merging sessions based upon Actions consisting of GetWorkResponse
// and expected timers and holds in the corresponding commit. All GetData requests are responded
// to with empty state, relying on user worker caching to keep data written.
private void runMergeSessionsActions(List<Action> actions) throws Exception {
Coder<KV<String, String>> kvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
Coder<WindowedValue<KV<String, String>>> windowedKvCoder = FullWindowedValueCoder.of(kvCoder, IntervalWindow.getCoder());
KvCoder<String, List<String>> groupedCoder = KvCoder.of(StringUtf8Coder.of(), ListCoder.of(StringUtf8Coder.of()));
Coder<WindowedValue<KV<String, List<String>>>> windowedGroupedCoder = FullWindowedValueCoder.of(groupedCoder, IntervalWindow.getCoder());
CloudObject spec = CloudObject.forClassName("MergeWindowsDoFn");
SdkComponents sdkComponents = SdkComponents.create();
sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
addString(spec, PropertyNames.SERIALIZED_FN, StringUtils.byteArrayToJsonString(WindowingStrategyTranslation.toMessageProto(WindowingStrategy.of(Sessions.withGapDuration(Duration.millis(10))).withMode(AccumulationMode.DISCARDING_FIRED_PANES).withTrigger(Repeatedly.forever(AfterWatermark.pastEndOfWindow().withLateFirings(AfterPane.elementCountAtLeast(1)))).withAllowedLateness(Duration.standardMinutes(60)), sdkComponents).toByteArray()));
addObject(spec, WorkerPropertyNames.INPUT_CODER, CloudObjects.asCloudObject(windowedKvCoder, /*sdkComponents=*/
null));
ParallelInstruction mergeWindowsInstruction = new ParallelInstruction().setSystemName("MergeWindows-System").setName("MergeWindowsStep").setOriginalName("MergeWindowsOriginal").setParDo(new ParDoInstruction().setInput(new InstructionInput().setProducerInstructionIndex(0).setOutputNum(0)).setNumOutputs(1).setUserFn(spec)).setOutputs(Arrays.asList(new InstructionOutput().setOriginalName(DEFAULT_OUTPUT_ORIGINAL_NAME).setSystemName(DEFAULT_OUTPUT_SYSTEM_NAME).setName("output").setCodec(CloudObjects.asCloudObject(windowedGroupedCoder, /*sdkComponents=*/
null))));
List<ParallelInstruction> instructions = Arrays.asList(makeWindowingSourceInstruction(kvCoder), mergeWindowsInstruction, makeSinkInstruction(groupedCoder, 1));
FakeWindmillServer server = new FakeWindmillServer(errorCollector);
StreamingDataflowWorker worker = makeWorker(instructions, createTestingPipelineOptions(server), false);
Map<String, String> nameMap = new HashMap<>();
nameMap.put("MergeWindowsStep", "MergeWindows");
worker.addStateNameMappings(nameMap);
worker.start();
// Respond to any GetData requests with empty state.
for (int i = 0; i < 1000; ++i) {
server.addDataFnToOffer(EMPTY_DATA_RESPONDER);
}
for (int i = 0; i < actions.size(); ++i) {
Action action = actions.get(i);
server.addWorkToOffer(action.response);
Map<Long, Windmill.WorkItemCommitRequest> result = server.waitForAndGetCommits(1);
WorkItemCommitRequest actualOutput = result.get(i + 1L);
assertThat(actualOutput, Matchers.not(Matchers.nullValue()));
verifyTimers(actualOutput, action.expectedTimers);
verifyHolds(actualOutput, action.expectedHolds);
}
}
use of org.apache.beam.runners.core.construction.SdkComponents in project beam by apache.
the class StreamingDataflowWorkerTest method testAssignWindows.
@Test
public void testAssignWindows() throws Exception {
Duration gapDuration = Duration.standardSeconds(1);
CloudObject spec = CloudObject.forClassName("AssignWindowsDoFn");
SdkComponents sdkComponents = SdkComponents.create();
sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
addString(spec, PropertyNames.SERIALIZED_FN, StringUtils.byteArrayToJsonString(WindowingStrategyTranslation.toMessageProto(WindowingStrategy.of(FixedWindows.of(gapDuration)), sdkComponents).toByteArray()));
ParallelInstruction addWindowsInstruction = new ParallelInstruction().setSystemName("AssignWindows").setName("AssignWindows").setOriginalName("AssignWindowsOriginal").setParDo(new ParDoInstruction().setInput(new InstructionInput().setProducerInstructionIndex(0).setOutputNum(0)).setNumOutputs(1).setUserFn(spec)).setOutputs(Arrays.asList(new InstructionOutput().setOriginalName(DEFAULT_OUTPUT_ORIGINAL_NAME).setSystemName(DEFAULT_OUTPUT_SYSTEM_NAME).setName("output").setCodec(CloudObjects.asCloudObject(WindowedValue.getFullCoder(StringUtf8Coder.of(), IntervalWindow.getCoder()), /*sdkComponents=*/
null))));
List<ParallelInstruction> instructions = Arrays.asList(makeSourceInstruction(StringUtf8Coder.of()), addWindowsInstruction, makeSinkInstruction(StringUtf8Coder.of(), 1));
FakeWindmillServer server = new FakeWindmillServer(errorCollector);
int timestamp1 = 0;
int timestamp2 = 1000000;
server.addWorkToOffer(makeInput(timestamp1, timestamp1));
server.addWorkToOffer(makeInput(timestamp2, timestamp2));
StreamingDataflowWorker worker = makeWorker(instructions, createTestingPipelineOptions(server), false);
worker.start();
Map<Long, Windmill.WorkItemCommitRequest> result = server.waitForAndGetCommits(2);
assertThat(result.get((long) timestamp1), equalTo(setMessagesMetadata(PaneInfo.NO_FIRING, intervalWindowBytes(WINDOW_AT_ZERO), makeExpectedOutput(timestamp1, timestamp1)).build()));
assertThat(result.get((long) timestamp2), equalTo(setMessagesMetadata(PaneInfo.NO_FIRING, intervalWindowBytes(WINDOW_AT_ONE_SECOND), makeExpectedOutput(timestamp2, timestamp2)).build()));
}
use of org.apache.beam.runners.core.construction.SdkComponents in project beam by apache.
the class StreamingDataflowWorkerTest method testMergeWindowsCaching.
@Test
public // the first processing having is_new_key set.
void testMergeWindowsCaching() throws Exception {
Coder<KV<String, String>> kvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
Coder<WindowedValue<KV<String, String>>> windowedKvCoder = FullWindowedValueCoder.of(kvCoder, IntervalWindow.getCoder());
KvCoder<String, List<String>> groupedCoder = KvCoder.of(StringUtf8Coder.of(), ListCoder.of(StringUtf8Coder.of()));
Coder<WindowedValue<KV<String, List<String>>>> windowedGroupedCoder = FullWindowedValueCoder.of(groupedCoder, IntervalWindow.getCoder());
CloudObject spec = CloudObject.forClassName("MergeWindowsDoFn");
SdkComponents sdkComponents = SdkComponents.create();
sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
addString(spec, PropertyNames.SERIALIZED_FN, StringUtils.byteArrayToJsonString(WindowingStrategyTranslation.toMessageProto(WindowingStrategy.of(FixedWindows.of(Duration.standardSeconds(1))).withTimestampCombiner(TimestampCombiner.EARLIEST), sdkComponents).toByteArray()));
addObject(spec, WorkerPropertyNames.INPUT_CODER, CloudObjects.asCloudObject(windowedKvCoder, /*sdkComponents=*/
null));
ParallelInstruction mergeWindowsInstruction = new ParallelInstruction().setSystemName("MergeWindows-System").setName("MergeWindowsStep").setOriginalName("MergeWindowsOriginal").setParDo(new ParDoInstruction().setInput(new InstructionInput().setProducerInstructionIndex(0).setOutputNum(0)).setNumOutputs(1).setUserFn(spec)).setOutputs(Arrays.asList(new InstructionOutput().setOriginalName(DEFAULT_OUTPUT_ORIGINAL_NAME).setSystemName(DEFAULT_OUTPUT_SYSTEM_NAME).setName("output").setCodec(CloudObjects.asCloudObject(windowedGroupedCoder, /*sdkComponents=*/
null))));
List<ParallelInstruction> instructions = Arrays.asList(makeWindowingSourceInstruction(kvCoder), mergeWindowsInstruction, // Use multiple stages in the maptask to test caching with multiple stages.
makeDoFnInstruction(new PassthroughDoFn(), 1, groupedCoder), makeSinkInstruction(groupedCoder, 2));
FakeWindmillServer server = new FakeWindmillServer(errorCollector);
StreamingDataflowWorker worker = makeWorker(instructions, createTestingPipelineOptions(server), false);
Map<String, String> nameMap = new HashMap<>();
nameMap.put("MergeWindowsStep", "MergeWindows");
worker.addStateNameMappings(nameMap);
worker.start();
server.addWorkToOffer(buildInput("work {" + " computation_id: \"" + DEFAULT_COMPUTATION_ID + "\"" + " input_data_watermark: 0" + " work {" + " key: \"" + DEFAULT_KEY_STRING + "\"" + " sharding_key: " + DEFAULT_SHARDING_KEY + " cache_token: 1" + " work_token: 1" + " is_new_key: 1" + " message_bundles {" + " source_computation_id: \"" + DEFAULT_SOURCE_COMPUTATION_ID + "\"" + " messages {" + " timestamp: 0" + " data: \"" + dataStringForIndex(0) + "\"" + " }" + " }" + " }" + "}", intervalWindowBytes(WINDOW_AT_ZERO)));
Map<Long, Windmill.WorkItemCommitRequest> result = server.waitForAndGetCommits(1);
Iterable<CounterUpdate> counters = worker.buildCounters();
// These tags and data are opaque strings and this is a change detector test.
// The "/u" indicates the user's namespace, versus "/s" for system namespace
String window = "/gAAAAAAAA-joBw/";
String timerTagPrefix = "/s" + window + "+0";
ByteString bufferTag = ByteString.copyFromUtf8(window + "+ubuf");
ByteString paneInfoTag = ByteString.copyFromUtf8(window + "+upane");
String watermarkDataHoldTag = window + "+uhold";
String watermarkExtraHoldTag = window + "+uextra";
String stateFamily = "MergeWindows";
ByteString bufferData = ByteString.copyFromUtf8("data0");
// Encoded form for Iterable<String>: -1, true, 'data0', false
ByteString outputData = ByteString.copyFrom(new byte[] { (byte) 0xff, (byte) 0xff, (byte) 0xff, (byte) 0xff, 0x01, 0x05, 0x64, 0x61, 0x74, 0x61, 0x30, 0x00 });
// These values are not essential to the change detector test
long timerTimestamp = 999000L;
WorkItemCommitRequest actualOutput = result.get(1L);
// Set timer
verifyTimers(actualOutput, buildWatermarkTimer(timerTagPrefix, 999));
assertThat(actualOutput.getBagUpdatesList(), Matchers.contains(Matchers.equalTo(Windmill.TagBag.newBuilder().setTag(bufferTag).setStateFamily(stateFamily).addValues(bufferData).build())));
verifyHolds(actualOutput, buildHold(watermarkDataHoldTag, 0, false));
// No state reads
assertEquals(0L, splitIntToLong(getCounter(counters, "WindmillStateBytesRead").getInteger()));
// Timer + buffer + watermark hold
assertEquals(Windmill.WorkItemCommitRequest.newBuilder(actualOutput).clearCounterUpdates().clearOutputMessages().build().getSerializedSize(), splitIntToLong(getCounter(counters, "WindmillStateBytesWritten").getInteger()));
// Input messages
assertEquals(VarInt.getLength(0L) + dataStringForIndex(0).length() + addPaneTag(PaneInfo.NO_FIRING, intervalWindowBytes(WINDOW_AT_ZERO)).size() + // proto overhead
5L, splitIntToLong(getCounter(counters, "WindmillShuffleBytesRead").getInteger()));
Windmill.GetWorkResponse.Builder getWorkResponse = Windmill.GetWorkResponse.newBuilder();
getWorkResponse.addWorkBuilder().setComputationId(DEFAULT_COMPUTATION_ID).setInputDataWatermark(timerTimestamp + 1000).addWorkBuilder().setKey(ByteString.copyFromUtf8(DEFAULT_KEY_STRING)).setShardingKey(DEFAULT_SHARDING_KEY).setWorkToken(2).setCacheToken(1).getTimersBuilder().addTimers(buildWatermarkTimer(timerTagPrefix, timerTimestamp));
server.addWorkToOffer(getWorkResponse.build());
long expectedBytesRead = 0L;
Windmill.GetDataResponse.Builder dataResponse = Windmill.GetDataResponse.newBuilder();
Windmill.KeyedGetDataResponse.Builder dataBuilder = dataResponse.addDataBuilder().setComputationId(DEFAULT_COMPUTATION_ID).addDataBuilder().setKey(ByteString.copyFromUtf8(DEFAULT_KEY_STRING)).setShardingKey(DEFAULT_SHARDING_KEY);
// These reads are skipped due to being cached from accesses in the first work item processing.
// dataBuilder
// .addBagsBuilder()
// .setTag(bufferTag)
// .setStateFamily(stateFamily)
// .addValues(bufferData);
// dataBuilder
// .addWatermarkHoldsBuilder()
// .setTag(ByteString.copyFromUtf8(watermarkDataHoldTag))
// .setStateFamily(stateFamily)
// .addTimestamps(0);
dataBuilder.addWatermarkHoldsBuilder().setTag(ByteString.copyFromUtf8(watermarkExtraHoldTag)).setStateFamily(stateFamily).addTimestamps(0);
dataBuilder.addValuesBuilder().setTag(paneInfoTag).setStateFamily(stateFamily).getValueBuilder().setTimestamp(0).setData(ByteString.EMPTY);
server.addDataToOffer(dataResponse.build());
expectedBytesRead += dataBuilder.build().getSerializedSize();
result = server.waitForAndGetCommits(1);
counters = worker.buildCounters();
actualOutput = result.get(2L);
assertEquals(1, actualOutput.getOutputMessagesCount());
assertEquals(DEFAULT_DESTINATION_STREAM_ID, actualOutput.getOutputMessages(0).getDestinationStreamId());
assertEquals(DEFAULT_KEY_STRING, actualOutput.getOutputMessages(0).getBundles(0).getKey().toStringUtf8());
assertEquals(0, actualOutput.getOutputMessages(0).getBundles(0).getMessages(0).getTimestamp());
assertEquals(outputData, actualOutput.getOutputMessages(0).getBundles(0).getMessages(0).getData());
ByteString metadata = actualOutput.getOutputMessages(0).getBundles(0).getMessages(0).getMetadata();
InputStream inStream = metadata.newInput();
assertEquals(PaneInfo.createPane(true, true, Timing.ON_TIME), PaneInfoCoder.INSTANCE.decode(inStream));
assertEquals(Arrays.asList(WINDOW_AT_ZERO), DEFAULT_WINDOW_COLLECTION_CODER.decode(inStream, Coder.Context.OUTER));
// Data was deleted
assertThat("" + actualOutput.getValueUpdatesList(), actualOutput.getValueUpdatesList(), Matchers.contains(Matchers.equalTo(Windmill.TagValue.newBuilder().setTag(paneInfoTag).setStateFamily(stateFamily).setValue(Windmill.Value.newBuilder().setTimestamp(Long.MAX_VALUE).setData(ByteString.EMPTY)).build())));
assertThat("" + actualOutput.getBagUpdatesList(), actualOutput.getBagUpdatesList(), Matchers.contains(Matchers.equalTo(Windmill.TagBag.newBuilder().setTag(bufferTag).setStateFamily(stateFamily).setDeleteAll(true).build())));
verifyHolds(actualOutput, buildHold(watermarkDataHoldTag, -1, true), buildHold(watermarkExtraHoldTag, -1, true));
// State reads for windowing
assertEquals(expectedBytesRead, splitIntToLong(getCounter(counters, "WindmillStateBytesRead").getInteger()));
// State updates to clear state
assertEquals(Windmill.WorkItemCommitRequest.newBuilder(actualOutput).clearCounterUpdates().clearOutputMessages().build().getSerializedSize(), splitIntToLong(getCounter(counters, "WindmillStateBytesWritten").getInteger()));
// No input messages
assertEquals(0L, splitIntToLong(getCounter(counters, "WindmillShuffleBytesRead").getInteger()));
CacheStats stats = worker.stateCache.getCacheStats();
LOG.info("cache stats {}", stats);
assertEquals(1, stats.hitCount());
assertEquals(4, stats.missCount());
}
Aggregations