Example 6 with BoundedSource

Use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.

The class CustomSources, method serializeToCloudSource.

public static com.google.api.services.dataflow.model.Source serializeToCloudSource(Source<?> source, PipelineOptions options) throws Exception {
    com.google.api.services.dataflow.model.Source cloudSource = new com.google.api.services.dataflow.model.Source();
    // We ourselves act as the SourceFormat.
    cloudSource.setSpec(CloudObject.forClass(CustomSources.class));
    addString(cloudSource.getSpec(), SERIALIZED_SOURCE, encodeBase64String(serializeToByteArray(source)));
    SourceMetadata metadata = new SourceMetadata();
    if (source instanceof BoundedSource) {
        BoundedSource<?> boundedSource = (BoundedSource<?>) source;
        // Size estimation is best effort so we continue even if it fails here.
        try {
            metadata.setEstimatedSizeBytes(boundedSource.getEstimatedSizeBytes(options));
        } catch (Exception e) {
            LOG.warn("Size estimation of the source failed: " + source, e);
        }
    } else if (source instanceof UnboundedSource) {
        UnboundedSource<?, ?> unboundedSource = (UnboundedSource<?, ?>) source;
        metadata.setInfinite(true);
        List<String> encodedSplits = new ArrayList<>();
        int desiredNumSplits = getDesiredNumUnboundedSourceSplits(options.as(DataflowPipelineOptions.class));
        for (UnboundedSource<?, ?> split : unboundedSource.split(desiredNumSplits, options)) {
            encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
        }
        checkArgument(!encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
        addStringList(cloudSource.getSpec(), SERIALIZED_SOURCE_SPLITS, encodedSplits);
    } else {
        throw new IllegalArgumentException("Unexpected source kind: " + source.getClass());
    }
    cloudSource.setMetadata(metadata);
    return cloudSource;
}
Also used: BoundedSource(org.apache.beam.sdk.io.BoundedSource) SourceMetadata(com.google.api.services.dataflow.model.SourceMetadata) UnboundedSource(org.apache.beam.sdk.io.UnboundedSource) Source(org.apache.beam.sdk.io.Source) Structs.addStringList(org.apache.beam.runners.dataflow.util.Structs.addStringList) ArrayList(java.util.ArrayList) List(java.util.List)
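A usage sketch may help: CountingSource.upTo is a real bounded source in the Beam SDK, so serializing it should take the bounded branch above and populate the size-estimate metadata. The wrapper class, the direct main-method invocation, and access to the worker's CustomSources class are illustrative assumptions, not part of the Beam codebase.

import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class SerializeToCloudSourceSketch {
    public static void main(String[] args) throws Exception {
        PipelineOptions options = PipelineOptionsFactory.create();
        // CountingSource.upTo returns a BoundedSource<Long>, so the bounded
        // branch of serializeToCloudSource runs and fills in the estimate.
        com.google.api.services.dataflow.model.Source cloudSource =
            CustomSources.serializeToCloudSource(CountingSource.upTo(1000L), options);
        System.out.println(cloudSource.getMetadata().getEstimatedSizeBytes());
    }
}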

Example 7 with BoundedSource

Use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.

The class WorkerCustomSources, method performSplitWithApiLimit.

/**
 * A helper method like {@link #performSplit(SourceSplitRequest, PipelineOptions)} but that allows
 * overriding the API size limit for testing.
 */
static SourceOperationResponse performSplitWithApiLimit(SourceSplitRequest request, PipelineOptions options, int numBundlesLimit, long apiByteLimit) throws Exception {
    // Compute the desired bundle size given by the service, or default if none was provided.
    long desiredBundleSizeBytes = DEFAULT_DESIRED_BUNDLE_SIZE_BYTES;
    SourceSplitOptions splitOptions = request.getOptions();
    if (splitOptions != null && splitOptions.getDesiredBundleSizeBytes() != null) {
        desiredBundleSizeBytes = splitOptions.getDesiredBundleSizeBytes();
    }
    Source<?> anySource = deserializeFromCloudSource(request.getSource().getSpec());
    checkArgument(anySource instanceof BoundedSource, "Cannot split a non-Bounded source: %s", anySource);
    return performSplitTyped(options, (BoundedSource<?>) anySource, desiredBundleSizeBytes, numBundlesLimit, apiByteLimit);
}
Also used: BoundedSource(org.apache.beam.sdk.io.BoundedSource) SourceSplitOptions(com.google.api.services.dataflow.model.SourceSplitOptions)
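Since performSplitWithApiLimit is package-private, exercising it requires a caller in the same package; a hedged test-style sketch, assuming the generated Dataflow model setters and accessors behave as shown and a byte limit in the 20 MB range typical of the API cap:

import com.google.api.services.dataflow.model.SourceOperationResponse;
import com.google.api.services.dataflow.model.SourceSplitRequest;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Assumed to live in the same package as WorkerCustomSources.
public class PerformSplitSketch {
    public static void main(String[] args) throws Exception {
        PipelineOptions options = PipelineOptionsFactory.create();
        // No SourceSplitOptions are set, so the default bundle size applies.
        SourceSplitRequest request = new SourceSplitRequest()
            .setSource(CustomSources.serializeToCloudSource(CountingSource.upTo(10_000L), options));
        // At most 100 bundles, 20 MB response byte limit.
        SourceOperationResponse response =
            WorkerCustomSources.performSplitWithApiLimit(request, options, 100, 20L << 20);
        System.out.println(response.getSplit().getBundles().size());
    }
}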

Example 8 with BoundedSource

Use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.

The class WorkerCustomSources, method performSplitTyped.

private static <T> SourceOperationResponse performSplitTyped(PipelineOptions options, BoundedSource<T> source, long desiredBundleSizeBytes, int numBundlesLimit, long apiByteLimit) throws Exception {
    // Try to split normally
    List<BoundedSource<T>> bundles = splitAndValidate(source, desiredBundleSizeBytes, options);
    // If serialized size is too big, try splitting with a proportionally larger desiredBundleSize
    // to reduce the oversplitting.
    long serializedSize = DataflowApiUtils.computeSerializedSizeBytes(wrapIntoSourceSplitResponse(bundles));
    // If split response is too large, scale desired size for expected DATAFLOW_API_SIZE_BYTES/2.
    if (serializedSize > apiByteLimit) {
        double expansion = 2 * (double) serializedSize / apiByteLimit;
        long expandedBundleSizeBytes = (long) (desiredBundleSizeBytes * expansion);
        LOG.warn("Splitting source {} into bundles of estimated size {} bytes produced {} bundles, which" + " have total serialized size {} bytes. As this is too large for the Google Cloud" + " Dataflow API, retrying splitting once with increased desiredBundleSizeBytes {}" + " to reduce the number of splits.", source, desiredBundleSizeBytes, bundles.size(), serializedSize, expandedBundleSizeBytes);
        desiredBundleSizeBytes = expandedBundleSizeBytes;
        bundles = splitAndValidate(source, desiredBundleSizeBytes, options);
        serializedSize = DataflowApiUtils.computeSerializedSizeBytes(wrapIntoSourceSplitResponse(bundles));
        LOG.info("Splitting with desiredBundleSizeBytes {} produced {} bundles " + "with total serialized size {} bytes", desiredBundleSizeBytes, bundles.size(), serializedSize);
    }
    int numBundlesBeforeRebundling = bundles.size();
    // If the initial split produced too many bundles, rebundle the sources
    // into numBundlesLimit compressed serialized bundles.
    if (bundles.size() > numBundlesLimit) {
        LOG.warn("Splitting source {} into bundles of estimated size {} bytes produced {} bundles. " + "Rebundling into {} bundles.", source, desiredBundleSizeBytes, bundles.size(), numBundlesLimit);
        bundles = limitNumberOfBundles(bundles, numBundlesLimit);
    }
    SourceOperationResponse response = new SourceOperationResponse().setSplit(wrapIntoSourceSplitResponse(bundles));
    long finalResponseSize = DataflowApiUtils.computeSerializedSizeBytes(response);
    LOG.info("Splitting source {} produced {} bundles with total serialized response size {}", source, bundles.size(), finalResponseSize);
    if (finalResponseSize > apiByteLimit) {
        String message = String.format("Total size of the BoundedSource objects generated by split() operation is larger " + "than the allowable limit. When splitting %s into bundles of %d bytes " + "it generated %d BoundedSource objects with total serialized size of %d bytes " + "which is larger than the limit %d. " + "For more information, please check the corresponding FAQ entry at " + "https://cloud.google.com/dataflow/pipelines/troubleshooting-your-pipeline", source, desiredBundleSizeBytes, numBundlesBeforeRebundling, finalResponseSize, apiByteLimit);
        throw new IllegalArgumentException(message);
    }
    return response;
}
Also used: BoundedSource(org.apache.beam.sdk.io.BoundedSource) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Structs.addString(org.apache.beam.runners.dataflow.util.Structs.addString) Base64.encodeBase64String(com.google.api.client.util.Base64.encodeBase64String) SourceOperationResponse(com.google.api.services.dataflow.model.SourceOperationResponse)
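The proportional-growth arithmetic above can be read in isolation; a minimal sketch of the same heuristic (the standalone helper name is illustrative):

// If the serialized split response exceeds the API limit, grow the desired
// bundle size proportionally so a retried split should serialize to roughly
// half of the limit.
static long expandedBundleSizeBytes(long desiredBundleSizeBytes, long serializedSize, long apiByteLimit) {
    double expansion = 2 * (double) serializedSize / apiByteLimit;
    return (long) (desiredBundleSizeBytes * expansion);
}

For example, a 40 MB response against a 20 MB limit gives an expansion factor of 4, quadrupling the desired bundle size and cutting the bundle count to roughly a quarter.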

Example 9 with BoundedSource

Use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.

The class BoundedSourceWrapper, method run.

@Override
public void run(SourceContext<WindowedValue<OutputT>> ctx) throws Exception {
    // figure out which split sources we're responsible for
    int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
    int numSubtasks = getRuntimeContext().getNumberOfParallelSubtasks();
    List<BoundedSource<OutputT>> localSources = new ArrayList<>();
    for (int i = 0; i < splitSources.size(); i++) {
        if (i % numSubtasks == subtaskIndex) {
            localSources.add(splitSources.get(i));
        }
    }
    LOG.info("Bounded Flink Source {}/{} is reading from sources: {}", subtaskIndex, numSubtasks, localSources);
    FlinkMetricContainer metricContainer = new FlinkMetricContainer(getRuntimeContext());
    ReaderInvocationUtil<OutputT, BoundedSource.BoundedReader<OutputT>> readerInvoker = new ReaderInvocationUtil<>(stepName, serializedOptions.getPipelineOptions(), metricContainer);
    readers = new ArrayList<>();
    // initialize readers from scratch
    for (BoundedSource<OutputT> source : localSources) {
        readers.add(source.createReader(serializedOptions.getPipelineOptions()));
    }
    if (readers.size() == 1) {
        // the easy case, we just read from one reader
        BoundedSource.BoundedReader<OutputT> reader = readers.get(0);
        boolean dataAvailable = readerInvoker.invokeStart(reader);
        if (dataAvailable) {
            emitElement(ctx, reader);
        }
        while (isRunning) {
            dataAvailable = readerInvoker.invokeAdvance(reader);
            if (dataAvailable) {
                emitElement(ctx, reader);
            } else {
                break;
            }
        }
    } else {
        // a bit more complicated, we are responsible for several readers
        // loop through them and sleep if none of them had any data
        int currentReader = 0;
        // start each reader and emit data if immediately available
        for (BoundedSource.BoundedReader<OutputT> reader : readers) {
            boolean dataAvailable = readerInvoker.invokeStart(reader);
            if (dataAvailable) {
                emitElement(ctx, reader);
            }
        }
        // a flag telling us whether any of the readers had data
        // if no reader had data, sleep for a bit
        boolean hadData = false;
        while (isRunning && !readers.isEmpty()) {
            BoundedSource.BoundedReader<OutputT> reader = readers.get(currentReader);
            boolean dataAvailable = readerInvoker.invokeAdvance(reader);
            if (dataAvailable) {
                emitElement(ctx, reader);
                hadData = true;
            } else {
                readers.remove(currentReader);
                currentReader--;
                if (readers.isEmpty()) {
                    break;
                }
            }
            currentReader = (currentReader + 1) % readers.size();
            if (currentReader == 0 && !hadData) {
                Thread.sleep(50);
            } else if (currentReader == 0) {
                hadData = false;
            }
        }
    }
    // emit final Long.MAX_VALUE watermark, just to be sure
    ctx.emitWatermark(new Watermark(Long.MAX_VALUE));
}
Also used: ReaderInvocationUtil(org.apache.beam.runners.flink.metrics.ReaderInvocationUtil) BoundedSource(org.apache.beam.sdk.io.BoundedSource) ArrayList(java.util.ArrayList) Watermark(org.apache.flink.streaming.api.watermark.Watermark) FlinkMetricContainer(org.apache.beam.runners.flink.metrics.FlinkMetricContainer)
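The split assignment at the top of run() is plain round-robin by index; a standalone sketch of the same rule (class and method names are illustrative):

import java.util.ArrayList;
import java.util.List;

public class SplitAssignmentSketch {
    // Split i belongs to the subtask whose index equals i % numSubtasks,
    // which spreads the splits evenly across the parallel subtasks.
    static List<Integer> ownedSplits(int subtaskIndex, int numSubtasks, int numSplits) {
        List<Integer> owned = new ArrayList<>();
        for (int i = 0; i < numSplits; i++) {
            if (i % numSubtasks == subtaskIndex) {
                owned.add(i);
            }
        }
        return owned;
    }

    public static void main(String[] args) {
        // With 3 subtasks and 8 splits, subtask 1 owns splits 1, 4 and 7.
        System.out.println(ownedSplits(1, 3, 8));
    }
}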

Example 10 with BoundedSource

Use of org.apache.beam.sdk.io.BoundedSource in project beam by apache.

The class BigQuerySourceBase, method createSources.

private List<BoundedSource<TableRow>> createSources(List<ResourceId> files, TableSchema tableSchema) throws IOException, InterruptedException {
    final String jsonSchema = BigQueryIO.JSON_FACTORY.toString(tableSchema);
    SerializableFunction<GenericRecord, TableRow> function = new SerializableFunction<GenericRecord, TableRow>() {

        @Override
        public TableRow apply(GenericRecord input) {
            return BigQueryAvroUtils.convertGenericRecordToTableRow(input, BigQueryHelpers.fromJsonString(jsonSchema, TableSchema.class));
        }
    };
    List<BoundedSource<TableRow>> avroSources = Lists.newArrayList();
    for (ResourceId file : files) {
        avroSources.add(new TransformingSource<>(AvroSource.from(file.toString()), function, getDefaultOutputCoder()));
    }
    return ImmutableList.copyOf(avroSources);
}
Also used: BoundedSource(org.apache.beam.sdk.io.BoundedSource) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) TableSchema(com.google.api.services.bigquery.model.TableSchema) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) TableRow(com.google.api.services.bigquery.model.TableRow) GenericRecord(org.apache.avro.generic.GenericRecord)
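AvroSource.from is the public SDK entry point the loop above builds on; a hedged sketch with an illustrative file path (TransformingSource itself is internal to the BigQuery IO, so it is not used here):

import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.io.AvroSource;
import org.apache.beam.sdk.io.BoundedSource;

public class AvroSourceSketch {
    public static void main(String[] args) {
        // Each exported file becomes one bounded source of GenericRecord;
        // createSources wraps each with a record-to-TableRow function.
        BoundedSource<GenericRecord> source =
            AvroSource.from("gs://example-bucket/export-00000-of-00010.avro");
        System.out.println(source);
    }
}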

Aggregations

BoundedSource (org.apache.beam.sdk.io.BoundedSource) 16
ArrayList (java.util.ArrayList) 6
Test (org.junit.Test) 6
List (java.util.List) 3
UnboundedSource (org.apache.beam.sdk.io.UnboundedSource) 3
SourceMetadata (com.google.api.services.dataflow.model.SourceMetadata) 2
ByteString (com.google.protobuf.ByteString) 2
IOException (java.io.IOException) 2
GenericRecord (org.apache.avro.generic.GenericRecord) 2
Source (org.apache.beam.sdk.io.Source) 2
ResourceId (org.apache.beam.sdk.io.fs.ResourceId) 2
SerializableFunction (org.apache.beam.sdk.transforms.SerializableFunction) 2
WindowedValue (org.apache.beam.sdk.util.WindowedValue) 2
KV (org.apache.beam.sdk.values.KV) 2
Base64.encodeBase64String (com.google.api.client.util.Base64.encodeBase64String) 1
TableRow (com.google.api.services.bigquery.model.TableRow) 1
TableSchema (com.google.api.services.bigquery.model.TableSchema) 1
DerivedSource (com.google.api.services.dataflow.model.DerivedSource) 1
SourceOperationResponse (com.google.api.services.dataflow.model.SourceOperationResponse) 1
SourceSplitOptions (com.google.api.services.dataflow.model.SourceSplitOptions) 1