Search in sources :

Example 1 with CreateReadSessionRequest

use of com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest in project spark-bigquery-connector by GoogleCloudDataproc.

the class ReadSessionCreator method create.

/**
 * Creates a new ReadSession for parallel reads.
 *
 * <p>Some attributes are governed by the {@link ReadSessionCreatorConfig} that this object was
 * constructed with: an optional Base64-encoded base request, an optional trace id, the Arrow
 * compression codec, the read data format and the maximum parallelism.
 *
 * @param table The table to create the session for.
 * @param selectedFields The column names added to the session's selected fields.
 * @param filter An optional row restriction. It is set on the session only when the input table
 *     is not a view.
 * @return The created read session paired with the table the session was opened on (the result
 *     of {@code getActualTable}).
 * @throws RuntimeException if the configured base request is not a parsable
 *     {@code CreateReadSessionRequest}.
 */
public ReadSessionResponse create(TableId table, ImmutableList<String> selectedFields, Optional<String> filter) {
    TableInfo tableDetails = bigQueryClient.getTable(table);
    TableInfo actualTable = getActualTable(tableDetails, selectedFields, filter);
    StandardTableDefinition tableDefinition = actualTable.getDefinition();
    BigQueryReadClient bigQueryReadClient = bigQueryReadClientFactory.getBigQueryReadClient();
    String tablePath = toTablePath(actualTable.getTableId());

    // Start from the user-supplied base request when one is configured. orElseGet keeps the
    // empty fallback from being built when it is not needed.
    CreateReadSessionRequest baseRequest = config.getRequestEncodedBase().map(value -> {
        try {
            return com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest.parseFrom(java.util.Base64.getDecoder().decode(value));
        } catch (com.google.protobuf.InvalidProtocolBufferException e) {
            throw new RuntimeException("Couldn't decode:" + value, e);
        }
    }).orElseGet(() -> CreateReadSessionRequest.newBuilder().build());

    // Only the read session of the base request is carried over into the outgoing request.
    ReadSession.Builder requestedSession = baseRequest.getReadSession().toBuilder();
    config.getTraceId().ifPresent(requestedSession::setTraceId);

    TableReadOptions.Builder readOptions = requestedSession.getReadOptionsBuilder();
    // NOTE(review): the filter is skipped for views, presumably because getActualTable already
    // received it -- confirm against getActualTable.
    if (!isInputTableAView(tableDetails)) {
        filter.ifPresent(readOptions::setRowRestriction);
    }
    readOptions.addAllSelectedFields(selectedFields);
    readOptions.setArrowSerializationOptions(
        ArrowSerializationOptions.newBuilder()
            .setBufferCompression(config.getArrowCompressionCodec())
            .build());

    // newBuilder() is a static factory, so it is invoked on the class rather than on the
    // decoded instance: the outgoing request starts empty and parent, read session and max
    // stream count are all set explicitly below.
    // NOTE(review): if other fields of the base request are meant to survive, this would need
    // baseRequest.toBuilder() instead -- confirm intent before changing.
    CreateReadSessionRequest.Builder outgoingRequest = CreateReadSessionRequest.newBuilder()
        .setParent("projects/" + bigQueryClient.getProjectId())
        .setReadSession(
            requestedSession
                .setDataFormat(config.getReadDataFormat())
                .setReadOptions(readOptions)
                .setTable(tablePath)
                .build())
        .setMaxStreamCount(getMaxNumPartitionsRequested(config.getMaxParallelism(), tableDefinition));

    ReadSession readSession = bigQueryReadClient.createReadSession(outgoingRequest.build());
    return new ReadSessionResponse(readSession, actualTable);
}
Also used : TableDefinition(com.google.cloud.bigquery.TableDefinition) Logger(org.slf4j.Logger) StandardTableDefinition(com.google.cloud.bigquery.StandardTableDefinition) ReadSession(com.google.cloud.bigquery.storage.v1.ReadSession) TableId(com.google.cloud.bigquery.TableId) LoggerFactory(org.slf4j.LoggerFactory) BigQueryReadClient(com.google.cloud.bigquery.storage.v1.BigQueryReadClient) OptionalInt(java.util.OptionalInt) String.format(java.lang.String.format) UNSUPPORTED(com.google.cloud.bigquery.connector.common.BigQueryErrorCode.UNSUPPORTED) Stream(java.util.stream.Stream) ImmutableList(com.google.common.collect.ImmutableList) TableReadOptions(com.google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions) ArrowSerializationOptions(com.google.cloud.bigquery.storage.v1.ArrowSerializationOptions) Optional(java.util.Optional) TableInfo(com.google.cloud.bigquery.TableInfo) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest) ReadSession(com.google.cloud.bigquery.storage.v1.ReadSession) BigQueryReadClient(com.google.cloud.bigquery.storage.v1.BigQueryReadClient) TableInfo(com.google.cloud.bigquery.TableInfo) TableReadOptions(com.google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions) StandardTableDefinition(com.google.cloud.bigquery.StandardTableDefinition) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest)

Example 2 with CreateReadSessionRequest

use of com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest in project java-bigquerystorage by googleapis.

the class ITBigQueryStorageTest method testColumnSelection.

/**
 * Reads the public shakespeare sample table with two selected columns and a row restriction,
 * then checks the returned Avro schema, every decoded row and the total row count.
 */
@Test
public void testColumnSelection() throws IOException {
    String tableName = BigQueryResource.FormatTableResource(
        /* projectId = */ "bigquery-public-data",
        /* datasetId = */ "samples",
        /* tableId = */ "shakespeare");

    TableReadOptions readOptions = TableReadOptions.newBuilder()
        .addSelectedFields("word")
        .addSelectedFields("word_count")
        .setRowRestriction("word_count > 100")
        .build();
    ReadSession requestedSession = ReadSession.newBuilder()
        .setTable(tableName)
        .setReadOptions(readOptions)
        .setDataFormat(DataFormat.AVRO)
        .build();
    CreateReadSessionRequest sessionRequest = CreateReadSessionRequest.newBuilder()
        .setParent(parentProjectId)
        .setMaxStreamCount(1)
        .setReadSession(requestedSession)
        .build();

    ReadSession session = client.createReadSession(sessionRequest);
    String streamCountMessage = String.format("Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s", tableName, session.toString());
    assertEquals(streamCountMessage, 1, session.getStreamsCount());

    String streamName = session.getStreams(0).getName();
    ReadRowsRequest rowsRequest = ReadRowsRequest.newBuilder().setReadStream(streamName).build();

    // The session schema must expose exactly the two selected columns.
    Schema sessionSchema = new Schema.Parser().parse(session.getAvroSchema().getSchema());
    String schemaMessage = String.format("Unexpected schema. Actual schema:%n%s", sessionSchema.toString(/* pretty = */ true));
    assertEquals(schemaMessage, Schema.Type.RECORD, sessionSchema.getType());
    assertEquals(schemaMessage, "__root__", sessionSchema.getName());
    assertEquals(schemaMessage, 2, sessionSchema.getFields().size());
    assertEquals(schemaMessage, Schema.Type.STRING, sessionSchema.getField("word").schema().getType());
    assertEquals(schemaMessage, Schema.Type.LONG, sessionSchema.getField("word_count").schema().getType());

    SimpleRowReader rowReader = new SimpleRowReader(sessionSchema);
    long totalRows = 0;
    ServerStream<ReadRowsResponse> responses = client.readRowsCallable().call(rowsRequest);
    for (ReadRowsResponse batch : responses) {
        totalRows += batch.getRowCount();
        rowReader.processRows(batch.getAvroRows(), new AvroRowConsumer() {

            @Override
            public void accept(GenericData.Record row) {
                // Every row must satisfy the restriction and carry a non-empty word.
                String rowMessage = String.format("Row not matching expectations: %s", row.toString());
                Long count = (Long) row.get("word_count");
                assertWithMessage(rowMessage).that(count).isGreaterThan(100L);
                Utf8 wordValue = (Utf8) row.get("word");
                assertWithMessage(rowMessage).that(wordValue.length()).isGreaterThan(0);
            }
        });
    }
    assertEquals(1_333, totalRows);
}
Also used : AvroRowConsumer(com.google.cloud.bigquery.storage.v1beta2.it.SimpleRowReader.AvroRowConsumer) ReadSession(com.google.cloud.bigquery.storage.v1beta2.ReadSession) Schema(org.apache.avro.Schema) ReadRowsRequest(com.google.cloud.bigquery.storage.v1beta2.ReadRowsRequest) GenericData(org.apache.avro.generic.GenericData) ReadRowsResponse(com.google.cloud.bigquery.storage.v1beta2.ReadRowsResponse) Utf8(org.apache.avro.util.Utf8) TableReadOptions(com.google.cloud.bigquery.storage.v1beta2.ReadSession.TableReadOptions) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest) Test(org.junit.Test)

Example 3 with CreateReadSessionRequest

use of com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest in project java-bigquerystorage by googleapis.

the class ITBigQueryStorageTest method testFilter.

/**
 * Reads the public shakespeare sample table with a row restriction and checks that every
 * decoded row satisfies it and that the expected number of rows is returned.
 */
@Test
public void testFilter() throws IOException {
    String tableName = BigQueryResource.FormatTableResource(
        /* projectId = */ "bigquery-public-data",
        /* datasetId = */ "samples",
        /* tableId = */ "shakespeare");

    TableReadOptions readOptions = TableReadOptions.newBuilder()
        .setRowRestriction("word_count > 100")
        .build();
    ReadSession requestedSession = ReadSession.newBuilder()
        .setTable(tableName)
        .setReadOptions(readOptions)
        .setDataFormat(DataFormat.AVRO)
        .build();
    CreateReadSessionRequest sessionRequest = CreateReadSessionRequest.newBuilder()
        .setParent(parentProjectId)
        .setMaxStreamCount(1)
        .setReadSession(requestedSession)
        .build();

    ReadSession session = client.createReadSession(sessionRequest);
    String streamCountMessage = String.format("Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s", tableName, session.toString());
    assertEquals(streamCountMessage, 1, session.getStreamsCount());

    String streamName = session.getStreams(0).getName();
    ReadRowsRequest rowsRequest = ReadRowsRequest.newBuilder().setReadStream(streamName).build();

    Schema sessionSchema = new Schema.Parser().parse(session.getAvroSchema().getSchema());
    SimpleRowReader rowReader = new SimpleRowReader(sessionSchema);
    long totalRows = 0;
    ServerStream<ReadRowsResponse> responses = client.readRowsCallable().call(rowsRequest);
    for (ReadRowsResponse batch : responses) {
        totalRows += batch.getRowCount();
        rowReader.processRows(batch.getAvroRows(), new AvroRowConsumer() {

            @Override
            public void accept(GenericData.Record row) {
                // Every returned row must satisfy the row restriction.
                Long count = (Long) row.get("word_count");
                assertWithMessage("Row not matching expectations: %s", row.toString()).that(count).isGreaterThan(100L);
            }
        });
    }
    assertEquals(1_333, totalRows);
}
Also used : AvroRowConsumer(com.google.cloud.bigquery.storage.v1beta2.it.SimpleRowReader.AvroRowConsumer) ReadSession(com.google.cloud.bigquery.storage.v1beta2.ReadSession) ReadRowsRequest(com.google.cloud.bigquery.storage.v1beta2.ReadRowsRequest) GenericData(org.apache.avro.generic.GenericData) ReadRowsResponse(com.google.cloud.bigquery.storage.v1beta2.ReadRowsResponse) TableReadOptions(com.google.cloud.bigquery.storage.v1beta2.ReadSession.TableReadOptions) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest) Test(org.junit.Test)

Example 4 with CreateReadSessionRequest

use of com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest in project java-bigquerystorage by googleapis.

the class ITBigQueryStorageTest method testColumnSelection.

/**
 * Reads the public shakespeare sample table with two selected columns and a row restriction,
 * then checks the returned Avro schema, every decoded row and the total row count.
 */
@Test
public void testColumnSelection() throws IOException {
    String tableName = BigQueryResource.FormatTableResource(
        /* projectId = */ "bigquery-public-data",
        /* datasetId = */ "samples",
        /* tableId = */ "shakespeare");

    TableReadOptions readOptions = TableReadOptions.newBuilder()
        .addSelectedFields("word")
        .addSelectedFields("word_count")
        .setRowRestriction("word_count > 100")
        .build();
    ReadSession requestedSession = ReadSession.newBuilder()
        .setTable(tableName)
        .setReadOptions(readOptions)
        .setDataFormat(DataFormat.AVRO)
        .build();
    CreateReadSessionRequest sessionRequest = CreateReadSessionRequest.newBuilder()
        .setParent(parentProjectId)
        .setMaxStreamCount(1)
        .setReadSession(requestedSession)
        .build();

    ReadSession session = client.createReadSession(sessionRequest);
    String streamCountMessage = String.format("Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s", tableName, session.toString());
    assertEquals(streamCountMessage, 1, session.getStreamsCount());

    String streamName = session.getStreams(0).getName();
    ReadRowsRequest rowsRequest = ReadRowsRequest.newBuilder().setReadStream(streamName).build();

    // The session schema must expose exactly the two selected columns.
    Schema sessionSchema = new Schema.Parser().parse(session.getAvroSchema().getSchema());
    String schemaMessage = String.format("Unexpected schema. Actual schema:%n%s", sessionSchema.toString(/* pretty = */ true));
    assertEquals(schemaMessage, Schema.Type.RECORD, sessionSchema.getType());
    assertEquals(schemaMessage, "__root__", sessionSchema.getName());
    assertEquals(schemaMessage, 2, sessionSchema.getFields().size());
    assertEquals(schemaMessage, Schema.Type.STRING, sessionSchema.getField("word").schema().getType());
    assertEquals(schemaMessage, Schema.Type.LONG, sessionSchema.getField("word_count").schema().getType());

    SimpleRowReader rowReader = new SimpleRowReader(sessionSchema);
    long totalRows = 0;
    ServerStream<ReadRowsResponse> responses = client.readRowsCallable().call(rowsRequest);
    for (ReadRowsResponse batch : responses) {
        totalRows += batch.getRowCount();
        rowReader.processRows(batch.getAvroRows(), new AvroRowConsumer() {

            @Override
            public void accept(GenericData.Record row) {
                // Every row must satisfy the restriction and carry a non-empty word.
                String rowMessage = String.format("Row not matching expectations: %s", row.toString());
                Long count = (Long) row.get("word_count");
                assertWithMessage(rowMessage).that(count).isGreaterThan(100L);
                Utf8 wordValue = (Utf8) row.get("word");
                assertWithMessage(rowMessage).that(wordValue.length()).isGreaterThan(0);
            }
        });
    }
    assertEquals(1_333, totalRows);
}
Also used : AvroRowConsumer(com.google.cloud.bigquery.storage.v1.it.SimpleRowReader.AvroRowConsumer) ReadSession(com.google.cloud.bigquery.storage.v1.ReadSession) Schema(org.apache.avro.Schema) ReadRowsRequest(com.google.cloud.bigquery.storage.v1.ReadRowsRequest) GenericData(org.apache.avro.generic.GenericData) ReadRowsResponse(com.google.cloud.bigquery.storage.v1.ReadRowsResponse) Utf8(org.apache.avro.util.Utf8) TableReadOptions(com.google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest) Test(org.junit.Test)

Example 5 with CreateReadSessionRequest

use of com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest in project beam by apache.

the class BigQueryStorageSourceBase method split.

/**
 * Creates a BigQuery Storage read session for the target table and returns one
 * {@link BigQueryStorageStreamSource} per stream of that session.
 *
 * <p>When {@code desiredBundleSizeBytes} is positive, the requested stream count is the table
 * size divided by it, capped at {@code MAX_SPLIT_COUNT}; otherwise it starts at zero. In both
 * cases the count is then raised to at least {@code MIN_SPLIT_COUNT}.
 *
 * @param desiredBundleSizeBytes desired size of each bundle in bytes; non-positive values skip
 *     the size-based stream count computation
 * @param options pipeline options, viewed as {@link BigQueryOptions}
 * @return an immutable list with one source per read stream; empty when the session has no
 *     streams
 * @throws IllegalArgumentException if the session's data format is neither ARROW nor AVRO
 * @throws NullPointerException if the session has streams but the target table is unavailable
 */
@Override
public List<BigQueryStorageStreamSource<T>> split(long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
    BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
    Table targetTable = getTargetTable(bqOptions);
    ReadSession.Builder readSessionBuilder = ReadSession.newBuilder();
    if (targetTable != null) {
        readSessionBuilder.setTable(BigQueryHelpers.toTableResourceName(targetTable.getTableReference()));
    } else {
        // If the table does not exist targetTable will be null.
        // Construct the table id if we can generate it. For error recording/logging.
        @Nullable String tableReferenceId = getTargetTableId(bqOptions);
        if (tableReferenceId != null) {
            readSessionBuilder.setTable(tableReferenceId);
        }
    }
    // Read options are only attached when at least one provider is configured.
    if (selectedFieldsProvider != null || rowRestrictionProvider != null) {
        ReadSession.TableReadOptions.Builder tableReadOptionsBuilder = ReadSession.TableReadOptions.newBuilder();
        if (selectedFieldsProvider != null) {
            tableReadOptionsBuilder.addAllSelectedFields(selectedFieldsProvider.get());
        }
        if (rowRestrictionProvider != null) {
            tableReadOptionsBuilder.setRowRestriction(rowRestrictionProvider.get());
        }
        readSessionBuilder.setReadOptions(tableReadOptionsBuilder);
    }
    if (format != null) {
        readSessionBuilder.setDataFormat(format);
    }
    int streamCount = 0;
    if (desiredBundleSizeBytes > 0) {
        // A missing table is treated as zero bytes, which leaves only the minimum below.
        long tableSizeBytes = (targetTable != null) ? targetTable.getNumBytes() : 0;
        streamCount = (int) Math.min(tableSizeBytes / desiredBundleSizeBytes, MAX_SPLIT_COUNT);
    }
    streamCount = Math.max(streamCount, MIN_SPLIT_COUNT);
    // The BigQuery-specific project takes precedence over the general pipeline project.
    CreateReadSessionRequest createReadSessionRequest = CreateReadSessionRequest.newBuilder().setParent(BigQueryHelpers.toProjectResourceName(bqOptions.getBigQueryProject() == null ? bqOptions.getProject() : bqOptions.getBigQueryProject())).setReadSession(readSessionBuilder).setMaxStreamCount(streamCount).build();
    ReadSession readSession;
    try (StorageClient client = bqServices.getStorageClient(bqOptions)) {
        readSession = client.createReadSession(createReadSessionRequest);
        LOG.info("Sent BigQuery Storage API CreateReadSession request '{}'; received response '{}'.", createReadSessionRequest, readSession);
    }
    if (readSession.getStreamsList().isEmpty()) {
        // The underlying table is empty or all rows have been pruned.
        return ImmutableList.of();
    }
    Schema sessionSchema;
    if (readSession.getDataFormat() == DataFormat.ARROW) {
        // Arrow sessions are converted Arrow -> Beam schema -> Avro schema.
        org.apache.arrow.vector.types.pojo.Schema schema = ArrowConversion.arrowSchemaFromInput(readSession.getArrowSchema().getSerializedSchema().newInput());
        org.apache.beam.sdk.schemas.Schema beamSchema = ArrowConversion.ArrowSchemaTranslator.toBeamSchema(schema);
        sessionSchema = AvroUtils.toAvroSchema(beamSchema);
    } else if (readSession.getDataFormat() == DataFormat.AVRO) {
        sessionSchema = new Schema.Parser().parse(readSession.getAvroSchema().getSchema());
    } else {
        throw new IllegalArgumentException("data is not in a supported dataFormat: " + readSession.getDataFormat());
    }
    // targetTable was treated as nullable above but its schema is required from here on. Fail
    // with a descriptive message instead of an anonymous NullPointerException.
    java.util.Objects.requireNonNull(targetTable, "Target table is unavailable; cannot trim the table schema to the read session schema");
    TableSchema trimmedSchema = BigQueryAvroUtils.trimBigQueryTableSchema(targetTable.getSchema(), sessionSchema);
    List<BigQueryStorageStreamSource<T>> sources = Lists.newArrayList();
    for (ReadStream readStream : readSession.getStreamsList()) {
        sources.add(BigQueryStorageStreamSource.create(readSession, readStream, trimmedSchema, parseFn, outputCoder, bqServices));
    }
    return ImmutableList.copyOf(sources);
}
Also used : TableSchema(com.google.api.services.bigquery.model.TableSchema) Schema(org.apache.avro.Schema) TableSchema(com.google.api.services.bigquery.model.TableSchema) ReadStream(com.google.cloud.bigquery.storage.v1.ReadStream) Table(com.google.api.services.bigquery.model.Table) ReadSession(com.google.cloud.bigquery.storage.v1.ReadSession) StorageClient(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StorageClient) Nullable(org.checkerframework.checker.nullness.qual.Nullable) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest)

Aggregations

CreateReadSessionRequest (com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest)18 ReadSession (com.google.cloud.bigquery.storage.v1.ReadSession)16 Test (org.junit.Test)16 Table (com.google.api.services.bigquery.model.Table)12 StorageClient (org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StorageClient)12 TableReference (com.google.api.services.bigquery.model.TableReference)11 FakeBigQueryServices (org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices)11 TableRow (com.google.api.services.bigquery.model.TableRow)9 TableRowParser (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TableRowParser)8 ReadRowsRequest (com.google.cloud.bigquery.storage.v1.ReadRowsRequest)5 ReadRowsResponse (com.google.cloud.bigquery.storage.v1.ReadRowsResponse)5 ByteString (com.google.protobuf.ByteString)5 Schema (org.apache.avro.Schema)5 JobStatistics (com.google.api.services.bigquery.model.JobStatistics)4 JobStatistics2 (com.google.api.services.bigquery.model.JobStatistics2)4 TableSchema (com.google.api.services.bigquery.model.TableSchema)4 TableReadOptions (com.google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions)4 GenericData (org.apache.avro.generic.GenericData)4 BigQueryResourceNaming.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference)4 TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)2