use of com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest in project spark-bigquery-connector by GoogleCloudDataproc.
the class ReadSessionCreator method create.
/**
 * Creates a new ReadSession for parallel reads.
 *
 * <p>Some attributes are governed by the {@link ReadSessionCreatorConfig} that this object was
 * constructed with: an optional Base64-encoded base request, an optional trace id, the read data
 * format, the Arrow compression codec and the maximum parallelism.
 *
 * @param table The table to create the session for.
 * @param selectedFields The names of the columns to read; they are added to the session's read
 *     options.
 * @param filter An optional row restriction. It is set on the read options only when the input
 *     table is not a view; it is always passed to {@code getActualTable}.
 * @return A {@link ReadSessionResponse} wrapping the created session together with the table the
 *     session was actually created against.
 * @throws RuntimeException if the configured encoded base request cannot be parsed as a
 *     {@code CreateReadSessionRequest}.
 */
public ReadSessionResponse create(
    TableId table, ImmutableList<String> selectedFields, Optional<String> filter) {
  TableInfo tableDetails = bigQueryClient.getTable(table);
  // The session is created against the table returned by getActualTable, which may differ from
  // the requested one.
  TableInfo actualTable = getActualTable(tableDetails, selectedFields, filter);
  StandardTableDefinition tableDefinition = actualTable.getDefinition();
  BigQueryReadClient bigQueryReadClient = bigQueryReadClientFactory.getBigQueryReadClient();
  String tablePath = toTablePath(actualTable.getTableId());

  // Decode the optional user-supplied base request. orElseGet keeps the empty default from being
  // built when a base request is configured.
  CreateReadSessionRequest request =
      config
          .getRequestEncodedBase()
          .map(
              value -> {
                try {
                  return com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest.parseFrom(
                      java.util.Base64.getDecoder().decode(value));
                } catch (com.google.protobuf.InvalidProtocolBufferException e) {
                  throw new RuntimeException("Couldn't decode:" + value, e);
                }
              })
          .orElseGet(() -> CreateReadSessionRequest.newBuilder().build());

  ReadSession.Builder requestedSession = request.getReadSession().toBuilder();
  config.getTraceId().ifPresent(requestedSession::setTraceId);

  TableReadOptions.Builder readOptions = requestedSession.getReadOptionsBuilder();
  if (!isInputTableAView(tableDetails)) {
    filter.ifPresent(readOptions::setRowRestriction);
  }
  readOptions.addAllSelectedFields(selectedFields);
  readOptions.setArrowSerializationOptions(
      ArrowSerializationOptions.newBuilder()
          .setBufferCompression(config.getArrowCompressionCodec())
          .build());

  // newBuilder() is static: the outgoing request starts from an empty builder, so only the
  // ReadSession of the decoded base request is carried over. The call is spelled on the class
  // rather than on the `request` instance to make that explicit.
  CreateReadSessionRequest createRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent("projects/" + bigQueryClient.getProjectId())
          .setReadSession(
              requestedSession
                  .setDataFormat(config.getReadDataFormat())
                  .setReadOptions(readOptions)
                  .setTable(tablePath)
                  .build())
          .setMaxStreamCount(
              getMaxNumPartitionsRequested(config.getMaxParallelism(), tableDefinition))
          .build();

  ReadSession readSession = bigQueryReadClient.createReadSession(createRequest);
  return new ReadSessionResponse(readSession, actualTable);
}
use of com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest in project java-bigquerystorage by googleapis.
the class ITBigQueryStorageTest method testColumnSelection.
/**
 * Creates a single-stream Avro read session over the public Shakespeare sample table, selecting
 * only {@code word} and {@code word_count} with the row restriction {@code word_count > 100}.
 *
 * <p>Checks that the session's Avro schema is a record named {@code __root__} with exactly those
 * two fields, that every streamed row satisfies the restriction and has a non-empty word, and
 * that 1,333 rows are returned in total.
 */
@Test
public void testColumnSelection() throws IOException {
  String tableName =
      BigQueryResource.FormatTableResource(
          /* projectId = */ "bigquery-public-data",
          /* datasetId = */ "samples",
          /* tableId = */ "shakespeare");

  TableReadOptions.Builder optionsBuilder = TableReadOptions.newBuilder();
  optionsBuilder.addSelectedFields("word");
  optionsBuilder.addSelectedFields("word_count");
  optionsBuilder.setRowRestriction("word_count > 100");
  TableReadOptions readOptions = optionsBuilder.build();

  ReadSession requestedSession =
      ReadSession.newBuilder()
          .setTable(tableName)
          .setReadOptions(readOptions)
          .setDataFormat(DataFormat.AVRO)
          .build();
  CreateReadSessionRequest createRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent(parentProjectId)
          .setMaxStreamCount(1)
          .setReadSession(requestedSession)
          .build();

  ReadSession createdSession = client.createReadSession(createRequest);
  String streamCountMessage =
      String.format(
          "Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s",
          tableName, createdSession.toString());
  assertEquals(streamCountMessage, 1, createdSession.getStreamsCount());

  String streamName = createdSession.getStreams(0).getName();
  ReadRowsRequest rowsRequest = ReadRowsRequest.newBuilder().setReadStream(streamName).build();

  // Schema checks: only the two selected columns may be present.
  String schemaJson = createdSession.getAvroSchema().getSchema();
  Schema parsedSchema = new Schema.Parser().parse(schemaJson);
  String schemaMessage =
      String.format(
          "Unexpected schema. Actual schema:%n%s", parsedSchema.toString(/* pretty = */ true));
  assertEquals(schemaMessage, Schema.Type.RECORD, parsedSchema.getType());
  assertEquals(schemaMessage, "__root__", parsedSchema.getName());
  assertEquals(schemaMessage, 2, parsedSchema.getFields().size());
  assertEquals(schemaMessage, Schema.Type.STRING, parsedSchema.getField("word").schema().getType());
  assertEquals(
      schemaMessage, Schema.Type.LONG, parsedSchema.getField("word_count").schema().getType());

  SimpleRowReader rowReader = new SimpleRowReader(parsedSchema);
  long totalRows = 0;
  ServerStream<ReadRowsResponse> responses = client.readRowsCallable().call(rowsRequest);
  for (ReadRowsResponse batch : responses) {
    totalRows += batch.getRowCount();
    AvroRowConsumer rowChecker =
        new AvroRowConsumer() {
          @Override
          public void accept(GenericData.Record record) {
            String failureMessage =
                String.format("Row not matching expectations: %s", record.toString());
            Long count = (Long) record.get("word_count");
            assertWithMessage(failureMessage).that(count).isGreaterThan(100L);
            Utf8 text = (Utf8) record.get("word");
            assertWithMessage(failureMessage).that(text.length()).isGreaterThan(0);
          }
        };
    rowReader.processRows(batch.getAvroRows(), rowChecker);
  }

  assertEquals(1_333, totalRows);
}
use of com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest in project java-bigquerystorage by googleapis.
the class ITBigQueryStorageTest method testFilter.
/**
 * Creates a single-stream Avro read session over the public Shakespeare sample table with the
 * row restriction {@code word_count > 100} and no column selection.
 *
 * <p>Checks that every streamed row has a {@code word_count} above 100 and that 1,333 rows are
 * returned in total.
 */
@Test
public void testFilter() throws IOException {
  String tableName =
      BigQueryResource.FormatTableResource(
          /* projectId = */ "bigquery-public-data",
          /* datasetId = */ "samples",
          /* tableId = */ "shakespeare");

  TableReadOptions readOptions =
      TableReadOptions.newBuilder().setRowRestriction("word_count > 100").build();
  ReadSession requestedSession =
      ReadSession.newBuilder()
          .setTable(tableName)
          .setReadOptions(readOptions)
          .setDataFormat(DataFormat.AVRO)
          .build();
  CreateReadSessionRequest createRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent(parentProjectId)
          .setMaxStreamCount(1)
          .setReadSession(requestedSession)
          .build();

  ReadSession createdSession = client.createReadSession(createRequest);
  String streamCountMessage =
      String.format(
          "Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s",
          tableName, createdSession.toString());
  assertEquals(streamCountMessage, 1, createdSession.getStreamsCount());

  String streamName = createdSession.getStreams(0).getName();
  ReadRowsRequest rowsRequest = ReadRowsRequest.newBuilder().setReadStream(streamName).build();

  Schema parsedSchema = new Schema.Parser().parse(createdSession.getAvroSchema().getSchema());
  SimpleRowReader rowReader = new SimpleRowReader(parsedSchema);

  long totalRows = 0;
  ServerStream<ReadRowsResponse> responses = client.readRowsCallable().call(rowsRequest);
  for (ReadRowsResponse batch : responses) {
    totalRows += batch.getRowCount();
    AvroRowConsumer rowChecker =
        new AvroRowConsumer() {
          @Override
          public void accept(GenericData.Record record) {
            Long count = (Long) record.get("word_count");
            assertWithMessage("Row not matching expectations: %s", record.toString())
                .that(count)
                .isGreaterThan(100L);
          }
        };
    rowReader.processRows(batch.getAvroRows(), rowChecker);
  }

  assertEquals(1_333, totalRows);
}
use of com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest in project java-bigquerystorage by googleapis.
the class ITBigQueryStorageTest method testColumnSelection.
/**
 * Column-selection check against the public Shakespeare sample table.
 *
 * <p>Requests one Avro stream containing only {@code word} and {@code word_count}, restricted to
 * rows where {@code word_count > 100}. The test asserts the shape of the returned Avro schema
 * (a {@code __root__} record with a string and a long field), validates each row, and expects
 * 1,333 rows overall.
 */
@Test
public void testColumnSelection() throws IOException {
  final String shakespeareTable =
      BigQueryResource.FormatTableResource(
          /* projectId = */ "bigquery-public-data",
          /* datasetId = */ "samples",
          /* tableId = */ "shakespeare");

  final TableReadOptions selection =
      TableReadOptions.newBuilder()
          .addSelectedFields("word")
          .addSelectedFields("word_count")
          .setRowRestriction("word_count > 100")
          .build();

  ReadSession.Builder sessionSpec = ReadSession.newBuilder();
  sessionSpec.setTable(shakespeareTable);
  sessionSpec.setReadOptions(selection);
  sessionSpec.setDataFormat(DataFormat.AVRO);

  CreateReadSessionRequest.Builder requestSpec = CreateReadSessionRequest.newBuilder();
  requestSpec.setParent(parentProjectId);
  requestSpec.setMaxStreamCount(1);
  requestSpec.setReadSession(sessionSpec.build());

  final ReadSession openedSession = client.createReadSession(requestSpec.build());
  assertEquals(
      String.format(
          "Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s",
          shakespeareTable, openedSession.toString()),
      1,
      openedSession.getStreamsCount());

  final ReadRowsRequest fetchRows =
      ReadRowsRequest.newBuilder().setReadStream(openedSession.getStreams(0).getName()).build();

  // The returned schema must describe exactly the two requested columns.
  final Schema resultSchema = new Schema.Parser().parse(openedSession.getAvroSchema().getSchema());
  final String schemaFailure =
      String.format(
          "Unexpected schema. Actual schema:%n%s", resultSchema.toString(/* pretty = */ true));
  assertEquals(schemaFailure, Schema.Type.RECORD, resultSchema.getType());
  assertEquals(schemaFailure, "__root__", resultSchema.getName());
  assertEquals(schemaFailure, 2, resultSchema.getFields().size());
  assertEquals(schemaFailure, Schema.Type.STRING, resultSchema.getField("word").schema().getType());
  assertEquals(
      schemaFailure, Schema.Type.LONG, resultSchema.getField("word_count").schema().getType());

  final SimpleRowReader decoder = new SimpleRowReader(resultSchema);
  long seenRows = 0;
  final ServerStream<ReadRowsResponse> rowStream = client.readRowsCallable().call(fetchRows);
  for (ReadRowsResponse chunk : rowStream) {
    seenRows += chunk.getRowCount();
    decoder.processRows(
        chunk.getAvroRows(),
        new AvroRowConsumer() {
          @Override
          public void accept(GenericData.Record record) {
            final String rowFailure =
                String.format("Row not matching expectations: %s", record.toString());
            final Long occurrences = (Long) record.get("word_count");
            assertWithMessage(rowFailure).that(occurrences).isGreaterThan(100L);
            final Utf8 term = (Utf8) record.get("word");
            assertWithMessage(rowFailure).that(term.length()).isGreaterThan(0);
          }
        });
  }

  assertEquals(1_333, seenRows);
}
use of com.google.cloud.bigquery.storage.v1beta2.CreateReadSessionRequest in project beam by apache.
the class BigQueryStorageSourceBase method split.
/**
 * Splits this source into one {@link BigQueryStorageStreamSource} per stream of a newly created
 * BigQuery Storage read session.
 *
 * <p>The requested stream count is {@code tableSizeBytes / desiredBundleSizeBytes}, capped at
 * {@code MAX_SPLIT_COUNT} and raised to at least {@code MIN_SPLIT_COUNT}. When
 * {@code desiredBundleSizeBytes} is not positive, {@code MIN_SPLIT_COUNT} is requested.
 *
 * @param desiredBundleSizeBytes the desired size of each bundle in bytes; values {@code <= 0}
 *     disable size-based stream-count estimation.
 * @param options pipeline options, read as {@link BigQueryOptions}.
 * @return one source per stream of the created session, or an empty list when the session has no
 *     streams.
 * @throws IllegalArgumentException if the session reports a data format other than ARROW or AVRO.
 * @throws NullPointerException if the target table could not be resolved but a read session with
 *     streams was nevertheless created.
 */
@Override
public List<BigQueryStorageStreamSource<T>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  Table targetTable = getTargetTable(bqOptions);

  ReadSession.Builder readSessionBuilder = ReadSession.newBuilder();
  if (targetTable != null) {
    readSessionBuilder.setTable(
        BigQueryHelpers.toTableResourceName(targetTable.getTableReference()));
  } else {
    // If the table does not exist targetTable will be null.
    // Construct the table id if we can generate it. For error recording/logging.
    @Nullable String tableReferenceId = getTargetTableId(bqOptions);
    if (tableReferenceId != null) {
      readSessionBuilder.setTable(tableReferenceId);
    }
  }

  // Read options are only attached when a column selection or a row restriction is configured.
  if (selectedFieldsProvider != null || rowRestrictionProvider != null) {
    ReadSession.TableReadOptions.Builder tableReadOptionsBuilder =
        ReadSession.TableReadOptions.newBuilder();
    if (selectedFieldsProvider != null) {
      tableReadOptionsBuilder.addAllSelectedFields(selectedFieldsProvider.get());
    }
    if (rowRestrictionProvider != null) {
      tableReadOptionsBuilder.setRowRestriction(rowRestrictionProvider.get());
    }
    readSessionBuilder.setReadOptions(tableReadOptionsBuilder);
  }
  if (format != null) {
    readSessionBuilder.setDataFormat(format);
  }

  // Estimate the stream count from the table size, then clamp it from below.
  int streamCount = 0;
  if (desiredBundleSizeBytes > 0) {
    long tableSizeBytes = (targetTable != null) ? targetTable.getNumBytes() : 0;
    streamCount = (int) Math.min(tableSizeBytes / desiredBundleSizeBytes, MAX_SPLIT_COUNT);
  }
  streamCount = Math.max(streamCount, MIN_SPLIT_COUNT);

  CreateReadSessionRequest createReadSessionRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent(
              BigQueryHelpers.toProjectResourceName(
                  bqOptions.getBigQueryProject() == null
                      ? bqOptions.getProject()
                      : bqOptions.getBigQueryProject()))
          .setReadSession(readSessionBuilder)
          .setMaxStreamCount(streamCount)
          .build();

  ReadSession readSession;
  try (StorageClient client = bqServices.getStorageClient(bqOptions)) {
    readSession = client.createReadSession(createReadSessionRequest);
    LOG.info(
        "Sent BigQuery Storage API CreateReadSession request '{}'; received response '{}'.",
        createReadSessionRequest,
        readSession);
  }

  if (readSession.getStreamsList().isEmpty()) {
    // The underlying table is empty or all rows have been pruned.
    return ImmutableList.of();
  }

  // Derive an Avro schema for the session, converting from Arrow when necessary.
  Schema sessionSchema;
  if (readSession.getDataFormat() == DataFormat.ARROW) {
    org.apache.arrow.vector.types.pojo.Schema schema =
        ArrowConversion.arrowSchemaFromInput(
            readSession.getArrowSchema().getSerializedSchema().newInput());
    org.apache.beam.sdk.schemas.Schema beamSchema =
        ArrowConversion.ArrowSchemaTranslator.toBeamSchema(schema);
    sessionSchema = AvroUtils.toAvroSchema(beamSchema);
  } else if (readSession.getDataFormat() == DataFormat.AVRO) {
    sessionSchema = new Schema.Parser().parse(readSession.getAvroSchema().getSchema());
  } else {
    throw new IllegalArgumentException(
        "data is not in a supported dataFormat: " + readSession.getDataFormat());
  }

  // targetTable is treated as nullable above, so fail with a descriptive message instead of a
  // bare NullPointerException if it is dereferenced while missing.
  java.util.Objects.requireNonNull(
      targetTable,
      "Target table could not be resolved, so its schema is unavailable for trimming.");
  TableSchema trimmedSchema =
      BigQueryAvroUtils.trimBigQueryTableSchema(targetTable.getSchema(), sessionSchema);

  List<BigQueryStorageStreamSource<T>> sources = Lists.newArrayList();
  for (ReadStream readStream : readSession.getStreamsList()) {
    sources.add(
        BigQueryStorageStreamSource.create(
            readSession, readStream, trimmedSchema, parseFn, outputCoder, bqServices));
  }
  return ImmutableList.copyOf(sources);
}
Aggregations