Example 1 with TableReadOptions

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions in project DataflowTemplates by GoogleCloudPlatform.

From class BigQueryToParquet, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
private static PipelineResult run(BigQueryToParquetOptions options) {
    // Create the pipeline.
    Pipeline pipeline = Pipeline.create(options);
    TableReadOptions.Builder builder = TableReadOptions.newBuilder();
    /* Add fields to filter export on, if any. */
    if (options.getFields() != null) {
        builder.addAllSelectedFields(Arrays.asList(options.getFields().split(",\\s*")));
    }
    TableReadOptions tableReadOptions = builder.build();
    BigQueryStorageClient client = BigQueryStorageClientFactory.create();
    ReadSession session = ReadSessionFactory.create(client, options.getTableRef(), tableReadOptions);
    // Extract schema from ReadSession
    Schema schema = getTableSchema(session);
    client.close();
    TypedRead<GenericRecord> readFromBQ =
        BigQueryIO.read(SchemaAndRecord::getRecord)
            .from(options.getTableRef())
            .withTemplateCompatibility()
            .withMethod(Method.DIRECT_READ)
            .withCoder(AvroCoder.of(schema));
    if (options.getFields() != null) {
        List<String> selectedFields = Splitter.on(",").splitToList(options.getFields());
        readFromBQ = selectedFields.isEmpty() ? readFromBQ : readFromBQ.withSelectedFields(selectedFields);
    }
    /*
     * Steps: 1) Read records from BigQuery via BigQueryIO.
     *        2) Write records to Google Cloud Storage in Parquet format.
     */
    pipeline.apply("ReadFromBigQuery", readFromBQ).apply("WriteToParquet", FileIO.<GenericRecord>write().via(ParquetIO.sink(schema)).to(options.getBucket()).withNumShards(options.getNumShards()).withSuffix(FILE_SUFFIX));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used : BigQueryStorageClient(com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) Schema(org.apache.avro.Schema) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) GenericRecord(org.apache.avro.generic.GenericRecord) SchemaAndRecord(org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord) Pipeline(org.apache.beam.sdk.Pipeline)
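
To run the template, these options are typically bound from command-line flags before being handed to run. Below is a minimal sketch of such an entry point, assuming Beam's standard PipelineOptionsFactory and flag names matching the option getters above (tableRef, fields, bucket, numShards); it is illustrative, not the template's actual main method.

// Hypothetical entry point inside BigQueryToParquet, e.g. invoked with:
//   --tableRef=my-project:my_dataset.my_table --fields=word,word_count
//   --bucket=gs://my-bucket/output/ --numShards=1
public static void main(String[] args) {
    BigQueryToParquetOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(BigQueryToParquetOptions.class);
    run(options);
}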

Example 2 with TableReadOptions

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions in project java-bigquerystorage by googleapis.

From class ITBigQueryStorageTest, method testColumnSelection.

@Test
public void testColumnSelection() throws IOException {
    TableReference tableReference =
        TableReference.newBuilder()
            .setProjectId("bigquery-public-data")
            .setDatasetId("samples")
            .setTableId("shakespeare")
            .build();
    TableReadOptions options =
        TableReadOptions.newBuilder()
            .addSelectedFields("word")
            .addSelectedFields("word_count")
            .setRowRestriction("word_count > 100")
            .build();
    CreateReadSessionRequest request =
        CreateReadSessionRequest.newBuilder()
            .setParent(parentProjectId)
            .setRequestedStreams(1)
            .setTableReference(tableReference)
            .setReadOptions(options)
            .setFormat(DataFormat.AVRO)
            .build();
    ReadSession session = client.createReadSession(request);
    assertEquals(
        String.format(
            "Did not receive expected number of streams for table reference '%s' CreateReadSession response:%n%s",
            TextFormat.shortDebugString(tableReference), session.toString()),
        1,
        session.getStreamsCount());
    StreamPosition readPosition = StreamPosition.newBuilder().setStream(session.getStreams(0)).build();
    ReadRowsRequest readRowsRequest = ReadRowsRequest.newBuilder().setReadPosition(readPosition).build();
    Schema avroSchema = new Schema.Parser().parse(session.getAvroSchema().getSchema());
    String actualSchemaMessage =
        String.format("Unexpected schema. Actual schema:%n%s", avroSchema.toString(/* pretty = */ true));
    assertEquals(actualSchemaMessage, Schema.Type.RECORD, avroSchema.getType());
    assertEquals(actualSchemaMessage, "__root__", avroSchema.getName());
    assertEquals(actualSchemaMessage, 2, avroSchema.getFields().size());
    assertEquals(actualSchemaMessage, Schema.Type.STRING, avroSchema.getField("word").schema().getType());
    assertEquals(actualSchemaMessage, Schema.Type.LONG, avroSchema.getField("word_count").schema().getType());
    SimpleRowReader reader = new SimpleRowReader(avroSchema);
    long rowCount = 0;
    ServerStream<ReadRowsResponse> stream = client.readRowsCallable().call(readRowsRequest);
    for (ReadRowsResponse response : stream) {
        rowCount += response.getRowCount();
        reader.processRows(response.getAvroRows(), new SimpleRowReader.AvroRowConsumer() {

            @Override
            public void accept(GenericData.Record record) {
                String rowAssertMessage = String.format("Row not matching expectations: %s", record.toString());
                Long wordCount = (Long) record.get("word_count");
                assertWithMessage(rowAssertMessage).that(wordCount).isGreaterThan(100L);
                Utf8 word = (Utf8) record.get("word");
                assertWithMessage(rowAssertMessage).that(word.length()).isGreaterThan(0);
            }
        });
    }
    assertEquals(1_333, rowCount);
}
Also used : AvroRowConsumer(com.google.cloud.bigquery.storage.v1beta1.it.SimpleRowReader.AvroRowConsumer) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) Schema(org.apache.avro.Schema) StreamPosition(com.google.cloud.bigquery.storage.v1beta1.Storage.StreamPosition) ReadRowsRequest(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsRequest) GenericData(org.apache.avro.generic.GenericData) TableReference(com.google.cloud.bigquery.storage.v1beta1.TableReferenceProto.TableReference) ReadRowsResponse(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsResponse) Utf8(org.apache.avro.util.Utf8) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1beta1.Storage.CreateReadSessionRequest) Test(org.junit.Test)
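
The v1beta1 surface used here was later superseded by the GA v1 API, where the table and the read options are nested inside the ReadSession message and the table is addressed by resource name. Below is a minimal sketch of the same column selection and row restriction against v1; the classes are assumed to come from com.google.cloud.bigquery.storage.v1, and parentProjectId is assumed to already carry the "projects/" prefix, as in this test.

// Sketch only: BigQueryReadClient.create() may throw IOException.
try (BigQueryReadClient client = BigQueryReadClient.create()) {
    ReadSession.TableReadOptions options =
        ReadSession.TableReadOptions.newBuilder()
            .addSelectedFields("word")
            .addSelectedFields("word_count")
            .setRowRestriction("word_count > 100")
            .build();
    // In v1 the table reference and read options live on the ReadSession itself.
    ReadSession session =
        client.createReadSession(
            CreateReadSessionRequest.newBuilder()
                .setParent(parentProjectId) // e.g. "projects/my-project"
                .setReadSession(
                    ReadSession.newBuilder()
                        .setTable("projects/bigquery-public-data/datasets/samples/tables/shakespeare")
                        .setDataFormat(DataFormat.AVRO)
                        .setReadOptions(options))
                .setMaxStreamCount(1)
                .build());
}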

Example 3 with TableReadOptions

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions in project DataflowTemplates by GoogleCloudPlatform.

From class BigQueryToParquetTest, method testReadSessionFactoryBadTable.

/**
 * Tests that {@link ReadSessionFactory} throws an exception when an invalid table reference is
 * provided.
 */
@Test(expected = IllegalArgumentException.class)
public void testReadSessionFactoryBadTable() {
    // Test input
    final String badTableRef = "fantasmic-999999;great_data.table";
    final TableReadOptions tableReadOptions = TableReadOptions.newBuilder().build();
    ReadSessionFactory trsf = new ReadSessionFactory();
    ReadSession trs = trsf.create(client, badTableRef, tableReadOptions);
}
Also used : ReadSessionFactory(com.google.cloud.teleport.v2.templates.BigQueryToParquet.ReadSessionFactory) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) Test(org.junit.Test)
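
The reference above fails because the project is joined to the dataset with ';' rather than ':'. For contrast, here is a sketch of the same call with a well-formed spec, assuming the project:dataset.table convention that Beam's table-spec parsing accepts; the names are illustrative.

// Same factory call with a parseable "project:dataset.table" reference;
// this form gets past the spec validation that throws IllegalArgumentException above.
final String goodTableRef = "fantasmic-999999:great_data.table";
final TableReadOptions tableReadOptions = TableReadOptions.newBuilder().build();
ReadSessionFactory factory = new ReadSessionFactory();
ReadSession session = factory.create(client, goodTableRef, tableReadOptions);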

Example 4 with TableReadOptions

Use of com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions in project java-bigquerystorage by googleapis.

From class ITBigQueryStorageTest, method testFilter.

@Test
public void testFilter() throws IOException {
    TableReference tableReference =
        TableReference.newBuilder()
            .setProjectId("bigquery-public-data")
            .setDatasetId("samples")
            .setTableId("shakespeare")
            .build();
    TableReadOptions options =
        TableReadOptions.newBuilder().setRowRestriction("word_count > 100").build();
    CreateReadSessionRequest request =
        CreateReadSessionRequest.newBuilder()
            .setParent(parentProjectId)
            .setRequestedStreams(1)
            .setTableReference(tableReference)
            .setReadOptions(options)
            .setFormat(DataFormat.AVRO)
            .build();
    ReadSession session = client.createReadSession(request);
    assertEquals(
        String.format(
            "Did not receive expected number of streams for table reference '%s' CreateReadSession response:%n%s",
            TextFormat.shortDebugString(tableReference), session.toString()),
        1,
        session.getStreamsCount());
    StreamPosition readPosition = StreamPosition.newBuilder().setStream(session.getStreams(0)).build();
    ReadRowsRequest readRowsRequest = ReadRowsRequest.newBuilder().setReadPosition(readPosition).build();
    SimpleRowReader reader =
        new SimpleRowReader(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
    long rowCount = 0;
    ServerStream<ReadRowsResponse> stream = client.readRowsCallable().call(readRowsRequest);
    for (ReadRowsResponse response : stream) {
        rowCount += response.getRowCount();
        reader.processRows(response.getAvroRows(), new SimpleRowReader.AvroRowConsumer() {

            @Override
            public void accept(GenericData.Record record) {
                Long wordCount = (Long) record.get("word_count");
                assertWithMessage("Row not matching expectations: %s", record.toString()).that(wordCount).isGreaterThan(100L);
            }
        });
    }
    assertEquals(1_333, rowCount);
}
Also used : AvroRowConsumer(com.google.cloud.bigquery.storage.v1beta1.it.SimpleRowReader.AvroRowConsumer) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) StreamPosition(com.google.cloud.bigquery.storage.v1beta1.Storage.StreamPosition) ReadRowsRequest(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsRequest) GenericData(org.apache.avro.generic.GenericData) TableReference(com.google.cloud.bigquery.storage.v1beta1.TableReferenceProto.TableReference) ReadRowsResponse(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsResponse) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1beta1.Storage.CreateReadSessionRequest) Test(org.junit.Test)
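
The string passed to setRowRestriction is a SQL-style predicate over the table's columns, so filters compose the way a WHERE clause does. Two illustrative variants against the same shakespeare sample table follow; the corpus column exists in that table, but the literal values are only examples.

// Filter to a single word.
TableReadOptions exactWord =
    TableReadOptions.newBuilder().setRowRestriction("word = 'hamlet'").build();

// Combine predicates with AND, as in a WHERE clause.
TableReadOptions frequentInCorpus =
    TableReadOptions.newBuilder()
        .setRowRestriction("word_count > 100 AND corpus = 'kinghenryv'")
        .build();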

Aggregations

TableReadOptions (com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions): 4
ReadSession (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession): 4
Test (org.junit.Test): 3
CreateReadSessionRequest (com.google.cloud.bigquery.storage.v1beta1.Storage.CreateReadSessionRequest): 2
ReadRowsRequest (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsRequest): 2
ReadRowsResponse (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadRowsResponse): 2
StreamPosition (com.google.cloud.bigquery.storage.v1beta1.Storage.StreamPosition): 2
TableReference (com.google.cloud.bigquery.storage.v1beta1.TableReferenceProto.TableReference): 2
AvroRowConsumer (com.google.cloud.bigquery.storage.v1beta1.it.SimpleRowReader.AvroRowConsumer): 2
Schema (org.apache.avro.Schema): 2
GenericData (org.apache.avro.generic.GenericData): 2
BigQueryStorageClient (com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient): 1
ReadSessionFactory (com.google.cloud.teleport.v2.templates.BigQueryToParquet.ReadSessionFactory): 1
GenericRecord (org.apache.avro.generic.GenericRecord): 1
Utf8 (org.apache.avro.util.Utf8): 1
Pipeline (org.apache.beam.sdk.Pipeline): 1
SchemaAndRecord (org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord): 1