Use of com.google.cloud.bigquery.storage.v1beta2.ReadSession in project spark-bigquery-connector by GoogleCloudDataproc.
In the class BigQueryDataSourceReaderContext, the method planInputPartitionContexts:
public Stream<InputPartitionContext<InternalRow>> planInputPartitionContexts() {
  if (isEmptySchema()) {
    // create empty projection
    return createEmptyProjectionPartitions();
  }
  ImmutableList<String> selectedFields =
      schema
          .map(requiredSchema -> ImmutableList.copyOf(requiredSchema.fieldNames()))
          .orElse(ImmutableList.copyOf(fields.keySet()));
  Optional<String> filter = getCombinedFilter();
  ReadSessionResponse readSessionResponse =
      readSessionCreator.create(tableId, selectedFields, filter);
  ReadSession readSession = readSessionResponse.getReadSession();
  logger.info(
      "Created read session for {}: {} for application id: {}",
      tableId.toString(),
      readSession.getName(),
      applicationId);
  // One input partition per read stream in the session.
  return readSession.getStreamsList().stream()
      .map(
          stream ->
              new BigQueryInputPartitionContext(
                  bigQueryReadClientFactory,
                  stream.getName(),
                  readSessionCreatorConfig.toReadRowsHelperOptions(),
                  createConverter(selectedFields, readSessionResponse, userProvidedSchema)));
}
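Each BigQueryInputPartitionContext later opens a ReadRows stream against the stream name it was given. A minimal sketch of that downstream read using the v1beta2 client directly (this is an illustrative stand-in, not the connector's own reader; the client and stream name are assumed to be available):

import com.google.api.gax.rpc.ServerStream;
import com.google.cloud.bigquery.storage.v1beta2.BigQueryReadClient;
import com.google.cloud.bigquery.storage.v1beta2.ReadRowsRequest;
import com.google.cloud.bigquery.storage.v1beta2.ReadRowsResponse;

// Illustrative helper: drain one of the session's streams and count rows.
static long countRows(BigQueryReadClient client, String streamName) {
  ReadRowsRequest request = ReadRowsRequest.newBuilder().setReadStream(streamName).build();
  long rows = 0;
  ServerStream<ReadRowsResponse> responses = client.readRowsCallable().call(request);
  for (ReadRowsResponse response : responses) {
    rows += response.getRowCount();
  }
  return rows;
}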
Use of com.google.cloud.bigquery.storage.v1beta2.ReadSession in project trino by trinodb.
In the class BigQuerySplitManager, the method readFromBigQuery:
private List<BigQuerySplit> readFromBigQuery(ConnectorSession session, TableId remoteTableId, Optional<List<ColumnHandle>> projectedColumns, int actualParallelism, Optional<String> filter) {
  log.debug("readFromBigQuery(tableId=%s, projectedColumns=%s, actualParallelism=%s, filter=[%s])", remoteTableId, projectedColumns, actualParallelism, filter);
  List<ColumnHandle> columns = projectedColumns.orElse(ImmutableList.of());
  List<String> projectedColumnsNames = columns.stream()
      .map(column -> ((BigQueryColumnHandle) column).getName())
      .collect(toImmutableList());
  ReadSession readSession =
      new ReadSessionCreator(bigQueryClientFactory, bigQueryReadClientFactory, viewEnabled, viewExpiration)
          .create(session, remoteTableId, projectedColumnsNames, filter, actualParallelism);
  return readSession.getStreamsList().stream()
      .map(stream -> BigQuerySplit.forStream(stream.getName(), readSession.getAvroSchema().getSchema(), columns))
      .collect(toImmutableList());
}
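Each split carries the session's Avro schema string, which a worker can parse to decode the serialized row blocks delivered in ReadRowsResponse.getAvroRows(). A minimal decoding sketch with Apache Avro (the helper name is illustrative):

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;

// Illustrative sketch: decode one serialized AvroRows block using the
// schema string from readSession.getAvroSchema().getSchema().
static void decodeAvroRows(String schemaJson, byte[] serializedRows) throws IOException {
  Schema schema = new Schema.Parser().parse(schemaJson);
  GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema);
  BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(serializedRows, null);
  while (!decoder.isEnd()) {
    GenericRecord record = reader.read(null, decoder);
    System.out.println(record);
  }
}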
Use of com.google.cloud.bigquery.storage.v1beta2.ReadSession in project trino by trinodb.
In the class ReadSessionCreator, the method create:
public ReadSession create(ConnectorSession session, TableId remoteTable, List<String> selectedFields, Optional<String> filter, int parallelism) {
  BigQueryClient client = bigQueryClientFactory.create(session);
  TableInfo tableDetails = client.getTable(remoteTable)
      .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(remoteTable.getDataset(), remoteTable.getTable())));
  TableInfo actualTable = getActualTable(client, tableDetails, selectedFields);
  List<String> filteredSelectedFields = selectedFields.stream()
      .filter(BigQueryUtil::validColumnName)
      .collect(toList());
  try (BigQueryReadClient bigQueryReadClient = bigQueryReadClientFactory.create(session)) {
    ReadSession.TableReadOptions.Builder readOptions =
        ReadSession.TableReadOptions.newBuilder().addAllSelectedFields(filteredSelectedFields);
    filter.ifPresent(readOptions::setRowRestriction);
    ReadSession readSession = bigQueryReadClient.createReadSession(
        CreateReadSessionRequest.newBuilder()
            .setParent("projects/" + client.getProjectId())
            .setReadSession(ReadSession.newBuilder()
                .setDataFormat(DataFormat.AVRO)
                .setTable(toTableResourceName(actualTable.getTableId()))
                .setReadOptions(readOptions))
            .setMaxStreamCount(parallelism)
            .build());
    return readSession;
  }
}
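A hypothetical invocation of the creator, assuming the factories, a ConnectorSession, and a remote TableId are already in scope (variable and column names are illustrative):

// Illustrative usage sketch; variable and column names are assumptions.
ReadSessionCreator creator =
    new ReadSessionCreator(bigQueryClientFactory, bigQueryReadClientFactory, viewEnabled, viewExpiration);
ReadSession readSession =
    creator.create(session, remoteTableId, ImmutableList.of("word", "word_count"), Optional.empty(), 4);
readSession.getStreamsList().forEach(stream -> System.out.println(stream.getName()));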
Use of com.google.cloud.bigquery.storage.v1beta2.ReadSession in project java-bigquerystorage by googleapis.
In the class ITBigQueryStorageLongRunningTest, the method testLongRunningReadSession:
@Test
public void testLongRunningReadSession() throws InterruptedException, ExecutionException {
// This test reads a larger table with the goal of doing a simple validation of timeout settings
// for a longer running session.
String table = BigQueryResource.FormatTableResource(
    /* projectId = */ "bigquery-public-data",
    /* datasetId = */ "samples",
    /* tableId = */ "wikipedia");
ReadSession session = client.createReadSession(
    /* parent = */ parentProjectId,
    /* readSession = */ ReadSession.newBuilder().setTable(table).setDataFormat(DataFormat.AVRO).build(),
    /* maxStreamCount = */ 5);
assertEquals(
    String.format(
        "Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s",
        table, session.toString()),
    5,
    session.getStreamsCount());
List<Callable<Long>> tasks = new ArrayList<>(session.getStreamsCount());
for (final ReadStream stream : session.getStreamsList()) {
tasks.add(new Callable<Long>() {
@Override
public Long call() throws Exception {
return readAllRowsFromStream(stream);
}
});
}
ExecutorService executor = Executors.newFixedThreadPool(tasks.size());
List<Future<Long>> results = executor.invokeAll(tasks);
long rowCount = 0;
for (Future<Long> result : results) {
rowCount += result.get();
}
assertEquals(313_797_035, rowCount);
}
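The helper readAllRowsFromStream is not shown in this excerpt; a plausible sketch, assuming it simply drains the given stream with the test's client and sums the per-response row counts:

// Assumed shape of the helper used above; the real implementation may also
// apply the retry/timeout settings that this test is meant to exercise.
private long readAllRowsFromStream(ReadStream stream) {
  ReadRowsRequest request = ReadRowsRequest.newBuilder().setReadStream(stream.getName()).build();
  long rowCount = 0;
  for (ReadRowsResponse response : client.readRowsCallable().call(request)) {
    rowCount += response.getRowCount();
  }
  return rowCount;
}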
Use of com.google.cloud.bigquery.storage.v1beta2.ReadSession in project java-bigquerystorage by googleapis.
In the class ITBigQueryStorageTest, the method testSimpleReadAndResume:
@Test
public void testSimpleReadAndResume() {
String table = BigQueryResource.FormatTableResource(
    /* projectId = */ "bigquery-public-data",
    /* datasetId = */ "samples",
    /* tableId = */ "shakespeare");
ReadSession session = client.createReadSession(
    /* parent = */ parentProjectId,
    /* readSession = */ ReadSession.newBuilder().setTable(table).setDataFormat(DataFormat.AVRO).build(),
    /* maxStreamCount = */ 1);
assertEquals(
    String.format(
        "Did not receive expected number of streams for table '%s' CreateReadSession response:%n%s",
        table, session.toString()),
    1,
    session.getStreamsCount());
// We have to read some number of rows in order to be able to resume. More details:
long rowCount = ReadStreamToOffset(session.getStreams(0), /* rowOffset = */ 34_846);
ReadRowsRequest readRowsRequest = ReadRowsRequest.newBuilder()
    .setReadStream(session.getStreams(0).getName())
    .setOffset(rowCount)
    .build();
ServerStream<ReadRowsResponse> stream = client.readRowsCallable().call(readRowsRequest);
for (ReadRowsResponse response : stream) {
rowCount += response.getRowCount();
}
// Verifies that the number of rows skipped and read equals the total number of rows in the table.
assertEquals(164_656, rowCount);
}
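ReadStreamToOffset is likewise not shown; a plausible sketch, assuming it reads from the start of the stream until at least rowOffset rows have been consumed and then cancels the server stream:

// Assumed shape of the helper used above; names follow the test's convention.
private long ReadStreamToOffset(ReadStream readStream, long rowOffset) {
  ReadRowsRequest request =
      ReadRowsRequest.newBuilder().setReadStream(readStream.getName()).build();
  long rowCount = 0;
  ServerStream<ReadRowsResponse> responses = client.readRowsCallable().call(request);
  for (ReadRowsResponse response : responses) {
    rowCount += response.getRowCount();
    if (rowCount >= rowOffset) {
      responses.cancel();
      break;
    }
  }
  return rowCount;
}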