use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.
the class CustomSources method serializeToCloudSource.
public static com.google.api.services.dataflow.model.Source serializeToCloudSource(
    Source<?> source, PipelineOptions options) throws Exception {
  com.google.api.services.dataflow.model.Source cloudSource =
      new com.google.api.services.dataflow.model.Source();
  // We ourselves act as the SourceFormat.
  cloudSource.setSpec(CloudObject.forClass(CustomSources.class));
  addString(
      cloudSource.getSpec(), SERIALIZED_SOURCE, encodeBase64String(serializeToByteArray(source)));
  SourceMetadata metadata = new SourceMetadata();
  if (source instanceof BoundedSource) {
    BoundedSource<?> boundedSource = (BoundedSource<?>) source;
    // Size estimation is best effort so we continue even if it fails here.
    try {
      metadata.setEstimatedSizeBytes(boundedSource.getEstimatedSizeBytes(options));
    } catch (Exception e) {
      LOG.warn("Size estimation of the source failed: " + source, e);
    }
  } else if (source instanceof UnboundedSource) {
    UnboundedSource<?, ?> unboundedSource = (UnboundedSource<?, ?>) source;
    metadata.setInfinite(true);
    List<String> encodedSplits = new ArrayList<>();
    int desiredNumSplits =
        getDesiredNumUnboundedSourceSplits(options.as(DataflowPipelineOptions.class));
    for (UnboundedSource<?, ?> split : unboundedSource.split(desiredNumSplits, options)) {
      encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
    }
    checkArgument(!encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
    addStringList(cloudSource.getSpec(), SERIALIZED_SOURCE_SPLITS, encodedSplits);
  } else {
    throw new IllegalArgumentException("Unexpected source kind: " + source.getClass());
  }
  cloudSource.setMetadata(metadata);
  return cloudSource;
}
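For context, here is a minimal sketch of how this helper could be invoked; it is not taken from the Beam sources, and the CountingSource input and options setup are illustrative assumptions.
// Hypothetical usage sketch: serialize a simple BoundedSource into a Dataflow service Source.
PipelineOptions options = PipelineOptionsFactory.create();
BoundedSource<Long> source = CountingSource.upTo(1000L);  // assumed example source
com.google.api.services.dataflow.model.Source cloudSource =
    CustomSources.serializeToCloudSource(source, options);
// cloudSource's spec now carries the base64-encoded, Java-serialized source,
// and its metadata carries the best-effort size estimate.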
use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.
the class WorkerCustomSources method performSplitWithApiLimit.
/**
* A helper method like {@link #performSplit(SourceSplitRequest, PipelineOptions)} but that allows
* overriding the API size limit for testing.
*/
static SourceOperationResponse performSplitWithApiLimit(
    SourceSplitRequest request, PipelineOptions options, int numBundlesLimit, long apiByteLimit)
    throws Exception {
  // Compute the desired bundle size given by the service, or default if none was provided.
  long desiredBundleSizeBytes = DEFAULT_DESIRED_BUNDLE_SIZE_BYTES;
  SourceSplitOptions splitOptions = request.getOptions();
  if (splitOptions != null && splitOptions.getDesiredBundleSizeBytes() != null) {
    desiredBundleSizeBytes = splitOptions.getDesiredBundleSizeBytes();
  }
  Source<?> anySource = deserializeFromCloudSource(request.getSource().getSpec());
  checkArgument(
      anySource instanceof BoundedSource, "Cannot split a non-Bounded source: %s", anySource);
  return performSplitTyped(
      options, (BoundedSource<?>) anySource, desiredBundleSizeBytes, numBundlesLimit, apiByteLimit);
}
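A test-style sketch of how this overload might be driven with an artificially low API byte limit is shown below. It is an assumption, not code from the Beam tests, and it presumes the usual chained setters on the generated Dataflow model classes (setSource, setOptions, setDesiredBundleSizeBytes) plus a same-package caller, since the method is package-private.
// Hypothetical test-style sketch: force the rebundling path with a tiny apiByteLimit.
PipelineOptions options = PipelineOptionsFactory.create();
com.google.api.services.dataflow.model.Source cloudSource =
    CustomSources.serializeToCloudSource(CountingSource.upTo(1_000_000L), options);
SourceSplitRequest request =
    new SourceSplitRequest()
        .setSource(cloudSource)
        .setOptions(new SourceSplitOptions().setDesiredBundleSizeBytes(1024L));  // assumed setters
SourceOperationResponse response =
    WorkerCustomSources.performSplitWithApiLimit(request, options, 100, 10 * 1024 * 1024);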
use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.
the class WorkerCustomSources method performSplitTyped.
private static <T> SourceOperationResponse performSplitTyped(
    PipelineOptions options, BoundedSource<T> source, long desiredBundleSizeBytes,
    int numBundlesLimit, long apiByteLimit) throws Exception {
  // Try to split normally
  List<BoundedSource<T>> bundles = splitAndValidate(source, desiredBundleSizeBytes, options);
  // If serialized size is too big, try splitting with a proportionally larger desiredBundleSize
  // to reduce the oversplitting.
  long serializedSize =
      DataflowApiUtils.computeSerializedSizeBytes(wrapIntoSourceSplitResponse(bundles));
  // If split response is too large, scale desired size for expected DATAFLOW_API_SIZE_BYTES/2.
  if (serializedSize > apiByteLimit) {
    double expansion = 2 * (double) serializedSize / apiByteLimit;
    long expandedBundleSizeBytes = (long) (desiredBundleSizeBytes * expansion);
    LOG.warn(
        "Splitting source {} into bundles of estimated size {} bytes produced {} bundles, which"
            + " have total serialized size {} bytes. As this is too large for the Google Cloud"
            + " Dataflow API, retrying splitting once with increased desiredBundleSizeBytes {}"
            + " to reduce the number of splits.",
        source, desiredBundleSizeBytes, bundles.size(), serializedSize, expandedBundleSizeBytes);
    desiredBundleSizeBytes = expandedBundleSizeBytes;
    bundles = splitAndValidate(source, desiredBundleSizeBytes, options);
    serializedSize =
        DataflowApiUtils.computeSerializedSizeBytes(wrapIntoSourceSplitResponse(bundles));
    LOG.info(
        "Splitting with desiredBundleSizeBytes {} produced {} bundles "
            + "with total serialized size {} bytes",
        desiredBundleSizeBytes, bundles.size(), serializedSize);
  }
  int numBundlesBeforeRebundling = bundles.size();
  // If splitting still produced more bundles than the limit allows, group
  // the sources into numBundlesLimit compressed serialized bundles.
  if (bundles.size() > numBundlesLimit) {
    LOG.warn(
        "Splitting source {} into bundles of estimated size {} bytes produced {} bundles. "
            + "Rebundling into {} bundles.",
        source, desiredBundleSizeBytes, bundles.size(), numBundlesLimit);
    bundles = limitNumberOfBundles(bundles, numBundlesLimit);
  }
  SourceOperationResponse response =
      new SourceOperationResponse().setSplit(wrapIntoSourceSplitResponse(bundles));
  long finalResponseSize = DataflowApiUtils.computeSerializedSizeBytes(response);
  LOG.info(
      "Splitting source {} produced {} bundles with total serialized response size {}",
      source, bundles.size(), finalResponseSize);
  if (finalResponseSize > apiByteLimit) {
    String message = String.format(
        "Total size of the BoundedSource objects generated by split() operation is larger "
            + "than the allowable limit. When splitting %s into bundles of %d bytes "
            + "it generated %d BoundedSource objects with total serialized size of %d bytes "
            + "which is larger than the limit %d. "
            + "For more information, please check the corresponding FAQ entry at "
            + "https://cloud.google.com/dataflow/pipelines/troubleshooting-your-pipeline",
        source, desiredBundleSizeBytes, numBundlesBeforeRebundling, finalResponseSize, apiByteLimit);
    throw new IllegalArgumentException(message);
  }
  return response;
}
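The retry heuristic scales desiredBundleSizeBytes by 2 * serializedSize / apiByteLimit, so the retried split is expected to come in around half the API limit. A small worked example with made-up numbers:
// Illustrative arithmetic for the retry heuristic above (values are invented).
long apiByteLimit = 10_000_000L;         // allowed response size
long serializedSize = 40_000_000L;       // first split response was 4x too large
long desiredBundleSizeBytes = 64L << 20; // 64 MiB
double expansion = 2 * (double) serializedSize / apiByteLimit;          // = 8.0
long expandedBundleSizeBytes = (long) (desiredBundleSizeBytes * expansion);  // 512 MiB
// Bundles become ~8x larger, so roughly 8x fewer are produced, and the retried
// response should land near apiByteLimit / 2.
System.out.println(expansion + " -> " + expandedBundleSizeBytes);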
use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.
the class BoundedSourceWrapper method run.
@Override
public void run(SourceContext<WindowedValue<OutputT>> ctx) throws Exception {
  // figure out which split sources we're responsible for
  int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
  int numSubtasks = getRuntimeContext().getNumberOfParallelSubtasks();
  List<BoundedSource<OutputT>> localSources = new ArrayList<>();
  for (int i = 0; i < splitSources.size(); i++) {
    if (i % numSubtasks == subtaskIndex) {
      localSources.add(splitSources.get(i));
    }
  }
  LOG.info(
      "Bounded Flink Source {}/{} is reading from sources: {}",
      subtaskIndex, numSubtasks, localSources);
  FlinkMetricContainer metricContainer = new FlinkMetricContainer(getRuntimeContext());
  ReaderInvocationUtil<OutputT, BoundedSource.BoundedReader<OutputT>> readerInvoker =
      new ReaderInvocationUtil<>(
          stepName, serializedOptions.getPipelineOptions(), metricContainer);
  readers = new ArrayList<>();
  // initialize readers from scratch
  for (BoundedSource<OutputT> source : localSources) {
    readers.add(source.createReader(serializedOptions.getPipelineOptions()));
  }
  if (readers.size() == 1) {
    // the easy case, we just read from one reader
    BoundedSource.BoundedReader<OutputT> reader = readers.get(0);
    boolean dataAvailable = readerInvoker.invokeStart(reader);
    if (dataAvailable) {
      emitElement(ctx, reader);
    }
    while (isRunning) {
      dataAvailable = readerInvoker.invokeAdvance(reader);
      if (dataAvailable) {
        emitElement(ctx, reader);
      } else {
        break;
      }
    }
  } else {
    // a bit more complicated, we are responsible for several readers
    // loop through them and sleep if none of them had any data
    int currentReader = 0;
    // start each reader and emit data if immediately available
    for (BoundedSource.BoundedReader<OutputT> reader : readers) {
      boolean dataAvailable = readerInvoker.invokeStart(reader);
      if (dataAvailable) {
        emitElement(ctx, reader);
      }
    }
    // a flag telling us whether any of the readers had data
    // if no reader had data, sleep for a bit
    boolean hadData = false;
    while (isRunning && !readers.isEmpty()) {
      BoundedSource.BoundedReader<OutputT> reader = readers.get(currentReader);
      boolean dataAvailable = readerInvoker.invokeAdvance(reader);
      if (dataAvailable) {
        emitElement(ctx, reader);
        hadData = true;
      } else {
        readers.remove(currentReader);
        currentReader--;
        if (readers.isEmpty()) {
          break;
        }
      }
      currentReader = (currentReader + 1) % readers.size();
      if (currentReader == 0 && !hadData) {
        Thread.sleep(50);
      } else if (currentReader == 0) {
        hadData = false;
      }
    }
  }
  // emit final Long.MAX_VALUE watermark, just to be sure
  ctx.emitWatermark(new Watermark(Long.MAX_VALUE));
}
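The split-to-subtask assignment at the top of run() is a simple modulo partition: subtask i takes every numSubtasks-th split. A standalone sketch (plain Java, not Beam or Flink API) makes the resulting distribution concrete:
// Standalone illustration of the i % numSubtasks == subtaskIndex assignment.
int numSubtasks = 3;
int numSplits = 10;
List<List<Integer>> assignment = new ArrayList<>();
for (int s = 0; s < numSubtasks; s++) {
  assignment.add(new ArrayList<>());
}
for (int i = 0; i < numSplits; i++) {
  assignment.get(i % numSubtasks).add(i);
}
// assignment = [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]
System.out.println(assignment);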
use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.
the class BigQuerySourceBase method createSources.
private List<BoundedSource<TableRow>> createSources(
    List<ResourceId> files, TableSchema tableSchema) throws IOException, InterruptedException {
  final String jsonSchema = BigQueryIO.JSON_FACTORY.toString(tableSchema);
  SerializableFunction<GenericRecord, TableRow> function =
      new SerializableFunction<GenericRecord, TableRow>() {
        @Override
        public TableRow apply(GenericRecord input) {
          return BigQueryAvroUtils.convertGenericRecordToTableRow(
              input, BigQueryHelpers.fromJsonString(jsonSchema, TableSchema.class));
        }
      };
  List<BoundedSource<TableRow>> avroSources = Lists.newArrayList();
  for (ResourceId file : files) {
    avroSources.add(
        new TransformingSource<>(
            AvroSource.from(file.toString()), function, getDefaultOutputCoder()));
  }
  return ImmutableList.copyOf(avroSources);
}
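As a rough illustration of how the returned sources can be consumed, here is a hypothetical fragment, not from the Beam sources. It assumes files and tableSchema are in scope inside BigQuerySourceBase (createSources is private) and reads the first source directly with its BoundedReader:
// Hypothetical sketch: read one of the returned sources with a BoundedReader.
PipelineOptions options = PipelineOptionsFactory.create();
BoundedSource<TableRow> avroSource = createSources(files, tableSchema).get(0);
try (BoundedSource.BoundedReader<TableRow> reader = avroSource.createReader(options)) {
  for (boolean more = reader.start(); more; more = reader.advance()) {
    TableRow row = reader.getCurrent();
    // process row ...
  }
}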