
Example 1 with Builder

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions.Builder in the project hadoop-connectors by GoogleCloudDataproc.

From the class DirectBigQueryInputFormat, method startSession:

private static ReadSession startSession(Configuration configuration, Table table, BigQueryStorageClient client) {
    // Extract relevant configuration settings.
    String jobProjectId = PROJECT_ID.get(configuration, configuration::get);
    String filter = SQL_FILTER.get(configuration, configuration::get);
    Collection<String> selectedFields = SELECTED_FIELDS.getStringCollection(configuration);
    Builder readOptions = TableReadOptions.newBuilder().setRowRestriction(filter);
    if (!selectedFields.isEmpty()) {
        readOptions.addAllSelectedFields(selectedFields);
    }
    CreateReadSessionRequest request =
        CreateReadSessionRequest.newBuilder()
            .setTableReference(
                TableReferenceProto.TableReference.newBuilder()
                    .setProjectId(table.getTableReference().getProjectId())
                    .setDatasetId(table.getTableReference().getDatasetId())
                    .setTableId(table.getTableReference().getTableId()))
            .setRequestedStreams(DIRECT_PARALLELISM.get(configuration, configuration::getInt))
            .setParent("projects/" + jobProjectId)
            .setReadOptions(readOptions)
            .setFormat(DataFormat.AVRO)
            .build();
    return client.createReadSession(request);
}
Also used : Builder(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions.Builder) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1beta1.Storage.CreateReadSessionRequest)
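
For orientation, here is a minimal sketch of how the session returned by startSession could then be consumed with the same v1beta1 client. The helper name readFirstStream and the choice to read only the first stream are illustrative assumptions, not part of the connector.

import com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient;
import com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsRequest;
import com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsResponse;
import com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession;
import com.google.cloud.bigquery.storage.v1beta1.Storage.StreamPosition;

// Hypothetical consumer: stream rows from the first stream of an existing read session.
private static void readFirstStream(BigQueryStorageClient client, ReadSession session) {
    ReadRowsRequest request =
        ReadRowsRequest.newBuilder()
            .setReadPosition(StreamPosition.newBuilder().setStream(session.getStreams(0)))
            .build();
    for (ReadRowsResponse response : client.readRowsCallable().call(request)) {
        // Each response carries a block of Avro-encoded rows; decoding them with the
        // session's Avro schema is left out of this sketch.
        System.out.println(
            "Received " + response.getAvroRows().getSerializedBinaryRows().size() + " Avro bytes");
    }
}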

Example 2 with Builder

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions.Builder in the project DataflowTemplates by GoogleCloudPlatform.

From the class BigQueryToParquet, method run:

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
private static PipelineResult run(BigQueryToParquetOptions options) {
    // Create the pipeline.
    Pipeline pipeline = Pipeline.create(options);
    TableReadOptions.Builder builder = TableReadOptions.newBuilder();
    /* Add fields to filter export on, if any. */
    if (options.getFields() != null) {
        builder.addAllSelectedFields(Arrays.asList(options.getFields().split(",\\s*")));
    }
    TableReadOptions tableReadOptions = builder.build();
    BigQueryStorageClient client = BigQueryStorageClientFactory.create();
    ReadSession session = ReadSessionFactory.create(client, options.getTableRef(), tableReadOptions);
    // Extract schema from ReadSession
    Schema schema = getTableSchema(session);
    client.close();
    TypedRead<GenericRecord> readFromBQ =
        BigQueryIO.read(SchemaAndRecord::getRecord)
            .from(options.getTableRef())
            .withTemplateCompatibility()
            .withMethod(Method.DIRECT_READ)
            .withCoder(AvroCoder.of(schema));
    if (options.getFields() != null) {
        List<String> selectedFields = Splitter.on(",").splitToList(options.getFields());
        readFromBQ = selectedFields.isEmpty() ? readFromBQ : readFromBQ.withSelectedFields(selectedFields);
    }
    /*
     * Steps: 1) Read records from BigQuery via BigQueryIO.
     *        2) Write records to Google Cloud Storage in Parquet format.
     */
    pipeline.apply("ReadFromBigQuery", readFromBQ).apply("WriteToParquet", FileIO.<GenericRecord>write().via(ParquetIO.sink(schema)).to(options.getBucket()).withNumShards(options.getNumShards()).withSuffix(FILE_SUFFIX));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used : BigQueryStorageClient(com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) Schema(org.apache.avro.Schema) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) GenericRecord(org.apache.avro.generic.GenericRecord) SchemaAndRecord(org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord) Pipeline(org.apache.beam.sdk.Pipeline)
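
For completeness, a short sketch of the kind of entry point that typically drives such a template, assuming BigQueryToParquetOptions follows the standard Beam PipelineOptions pattern; the example above does not show the main method itself.

import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Hypothetical entry point: parse command-line flags into the template options and hand
// them to the run(...) method shown above.
public static void main(String[] args) {
    BigQueryToParquetOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryToParquetOptions.class);
    run(options);
}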

Example 3 with Builder

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions.Builder in the project DataflowTemplates by GoogleCloudPlatform.

From the class BigQueryMetadataLoader, method loadTableMetadata:

/**
 * Populates {@code table} builder with additional metadata like partition names and schema.
 *
 * @param filter optional filter to skip a subset of tables
 * @return {@code true} if the table matches all filters and should be included in the results,
 *     {@code false} if it should be skipped
 */
private boolean loadTableMetadata(BigQueryTable.Builder table, Filter filter) throws InterruptedException {
    TableReadOptions.Builder readOptions = TableReadOptions.newBuilder();
    if (table.getPartitioningColumn() == null) {
        if (filter != null && filter.shouldSkipUnpartitionedTable(table)) {
            return false;
        }
    } else {
        List<BigQueryTablePartition> partitions = loadTablePartitions(table, filter);
        if (filter != null && filter.shouldSkipPartitionedTable(table, partitions)) {
            return false;
        }
        table.setPartitions(partitions);
        LOG.info("Loaded {} partitions for table {}: {}", partitions.size(), table.getTableName(), partitions);
        // Creating a ReadSession without a WHERE clause for a partitioned table that has
        // "require partition filter" param set to true would fail with the error:
        // "Cannot query over table ... without a filter over column(s) ...
        // that can be used for partition elimination".
        // The following is a hack that adds an "is null and is not null" filter over the
        // partitioning column, which shouldn't select any data but should make the query
        // analyzer happy and should be enough to extract the table schema.
        // TODO(an2x): do this only when "require partition filter" = true
        // or load schema differently?
        readOptions.setRowRestriction(
            String.format(
                "%s is null and %s is not null",
                table.getPartitioningColumn(), table.getPartitioningColumn()));
    }
    ReadSession session =
        BigQueryUtils.createReadSession(
            bqsClient,
            DatasetId.of(table.getProject(), table.getDataset()),
            table.getTableName(),
            readOptions.build());
    table.setSchema(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
    LOG.info("Loaded schema for table {}: {}", table.getTableName(), table.getSchema());
    return true;
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions)
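
The "is null and is not null" workaround described in the comment above can be isolated into a small helper; this is a sketch only, and the helper name schemaOnlyReadOptions is illustrative.

import com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions;

// Builds read options whose row restriction can never be satisfied. A read session created
// with them is accepted for a table that requires a partition filter, yet selects no data,
// which is enough to obtain the table's Avro schema.
private static TableReadOptions schemaOnlyReadOptions(String partitioningColumn) {
    String restriction =
        String.format("%s is null and %s is not null", partitioningColumn, partitioningColumn);
    return TableReadOptions.newBuilder().setRowRestriction(restriction).build();
}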

Example 4 with Builder

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions.Builder in the project java-bigquerystorage by googleapis.

From the class EnhancedBigQueryStorageStubSettingsTest, method testReadRowsSettings:

@Test
public void testReadRowsSettings() {
    ServerStreamingCallSettings.Builder<ReadRowsRequest, ReadRowsResponse> builder = EnhancedBigQueryStorageStubSettings.newBuilder().readRowsSettings();
    assertThat(builder.getRetryableCodes()).contains(Code.UNAVAILABLE);
    RetrySettings retrySettings = builder.getRetrySettings();
    assertThat(retrySettings.getInitialRetryDelay()).isEqualTo(Duration.ofMillis(100L));
    assertThat(retrySettings.getRetryDelayMultiplier()).isWithin(1e-6).of(1.3);
    assertThat(retrySettings.getMaxRetryDelay()).isEqualTo(Duration.ofMinutes(1L));
    assertThat(retrySettings.getInitialRpcTimeout()).isEqualTo(Duration.ofDays(1L));
    assertThat(retrySettings.getRpcTimeoutMultiplier()).isWithin(1e-6).of(1.0);
    assertThat(retrySettings.getMaxRpcTimeout()).isEqualTo(Duration.ofDays(1L));
    assertThat(retrySettings.getTotalTimeout()).isEqualTo(Duration.ofDays(1L));
    assertThat(builder.getIdleTimeout()).isEqualTo(Duration.ZERO);
}
Also used : ServerStreamingCallSettings(com.google.api.gax.rpc.ServerStreamingCallSettings) RetrySettings(com.google.api.gax.retrying.RetrySettings) ReadRowsResponse(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsResponse) ReadRowsRequest(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsRequest) Test(org.junit.Test)
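
As a follow-on, a hedged sketch of overriding the defaults the test asserts; it assumes EnhancedBigQueryStorageStubSettings exposes the usual gax builder pattern (newBuilder(), readRowsSettings(), build()), as the test suggests, and the override values are arbitrary.

import com.google.api.gax.retrying.RetrySettings;
import java.io.IOException;
import org.threeten.bp.Duration;

// Sketch: start from the default readRowsSettings, shorten the initial retry delay and the
// total timeout, then build the stub settings. Values are illustrative only.
private static EnhancedBigQueryStorageStubSettings customReadRowsSettings() throws IOException {
    EnhancedBigQueryStorageStubSettings.Builder builder =
        EnhancedBigQueryStorageStubSettings.newBuilder();
    RetrySettings retrySettings =
        builder.readRowsSettings().getRetrySettings().toBuilder()
            .setInitialRetryDelay(Duration.ofMillis(50L))
            .setTotalTimeout(Duration.ofHours(1L))
            .build();
    builder.readRowsSettings().setRetrySettings(retrySettings);
    return builder.build();
}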

Aggregations

TableReadOptions (com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions): 2
ReadSession (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession): 2
RetrySettings (com.google.api.gax.retrying.RetrySettings): 1
ServerStreamingCallSettings (com.google.api.gax.rpc.ServerStreamingCallSettings): 1
BigQueryStorageClient (com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient): 1
Builder (com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions.Builder): 1
CreateReadSessionRequest (com.google.cloud.bigquery.storage.v1beta1.Storage.CreateReadSessionRequest): 1
ReadRowsRequest (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsRequest): 1
ReadRowsResponse (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsResponse): 1
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 1
Schema (org.apache.avro.Schema): 1
GenericRecord (org.apache.avro.generic.GenericRecord): 1
Pipeline (org.apache.beam.sdk.Pipeline): 1
SchemaAndRecord (org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord): 1
Test (org.junit.Test): 1