Use of com.google.cloud.bigquery.storage.v1.TableSchema in project beam by apache.
The class BigQueryIOStorageQueryTest, method testQuerySourceInitialSplit_NoReferencedTables.
/**
* This test simulates the scenario where the SQL text which is executed by the query job doesn't
* by itself refer to any tables (e.g. "SELECT 17 AS value"), and thus there are no referenced
* tables when the dry run of the query is performed.
*/
@Test
public void testQuerySourceInitialSplit_NoReferencedTables() throws Exception {
  Table queryResultTable = new Table()
      .setSchema(new TableSchema().setFields(ImmutableList.of(
          new TableFieldSchema().setName("name").setType("STRING"),
          new TableFieldSchema().setName("number").setType("INTEGER"))))
      .setNumBytes(1024L * 1024L);
  String encodedQuery = FakeBigQueryServices.encodeQueryResult(queryResultTable);
  fakeJobService.expectDryRunQuery(options.getProject(), encodedQuery,
      new JobStatistics().setQuery(new JobStatistics2()
          .setTotalBytesProcessed(1024L * 1024L)
          .setReferencedTables(ImmutableList.of())));
  String stepUuid = "testStepUuid";
  TableReference tempTableReference = createTempTableReference(
      options.getProject(),
      BigQueryResourceNaming.createJobIdPrefix(options.getJobName(), stepUuid, JobType.QUERY),
      Optional.empty());
  CreateReadSessionRequest expectedRequest = CreateReadSessionRequest.newBuilder()
      .setParent("projects/" + options.getProject())
      .setReadSession(ReadSession.newBuilder()
          .setTable(BigQueryHelpers.toTableResourceName(tempTableReference)))
      .setMaxStreamCount(1024)
      .build();
  Schema sessionSchema = SchemaBuilder.record("__root__").fields()
      .name("name").type().nullable().stringType().noDefault()
      .name("number").type().nullable().longType().noDefault()
      .endRecord();
  ReadSession.Builder builder = ReadSession.newBuilder()
      .setAvroSchema(AvroSchema.newBuilder().setSchema(sessionSchema.toString()))
      .setDataFormat(DataFormat.AVRO);
  for (int i = 0; i < 1024; i++) {
    builder.addStreams(ReadStream.newBuilder().setName("stream-" + i));
  }
  StorageClient fakeStorageClient = mock(StorageClient.class);
  when(fakeStorageClient.createReadSession(expectedRequest)).thenReturn(builder.build());
  BigQueryStorageQuerySource<TableRow> querySource = BigQueryStorageQuerySource.create(
      stepUuid, ValueProvider.StaticValueProvider.of(encodedQuery),
      /* flattenResults = */ true, /* useLegacySql = */ true,
      /* priority = */ QueryPriority.BATCH, /* location = */ null,
      /* queryTempDataset = */ null, /* kmsKey = */ null,
      null, new TableRowParser(), TableRowJsonCoder.of(),
      new FakeBigQueryServices()
          .withDatasetService(fakeDatasetService)
          .withJobService(fakeJobService)
          .withStorageClient(fakeStorageClient));
  List<? extends BoundedSource<TableRow>> sources = querySource.split(1024, options);
  assertEquals(1024, sources.size());
}
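Note that this Beam test builds the REST-model TableSchema (com.google.api.services.bigquery.model) for the fake query result. For comparison, here is a minimal sketch of the same two-field schema expressed with the com.google.cloud.bigquery.storage.v1 protobuf builders named in the heading; only the field names and types come from the test, the NULLABLE modes and class wrapper are assumptions for illustration.

import com.google.cloud.bigquery.storage.v1.TableFieldSchema;
import com.google.cloud.bigquery.storage.v1.TableSchema;

public class StorageTableSchemaSketch {
  public static void main(String[] args) {
    // The same "name"/"number" schema as the fake query result above,
    // expressed with the storage.v1 protobuf builders instead of the REST model classes.
    TableSchema storageSchema = TableSchema.newBuilder()
        .addFields(TableFieldSchema.newBuilder()
            .setName("name")
            .setType(TableFieldSchema.Type.STRING)
            .setMode(TableFieldSchema.Mode.NULLABLE))
        .addFields(TableFieldSchema.newBuilder()
            .setName("number")
            .setType(TableFieldSchema.Type.INT64)
            .setMode(TableFieldSchema.Mode.NULLABLE))
        .build();
    System.out.println(storageSchema);
  }
}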
Use of com.google.cloud.bigquery.storage.v1.TableSchema in project spark-bigquery-connector by GoogleCloudDataproc.
The class BigQueryDataSourceReaderContext, method planBatchInputPartitionContexts.
public Stream<InputPartitionContext<ColumnarBatch>> planBatchInputPartitionContexts() {
  if (!enableBatchRead()) {
    throw new IllegalStateException("Batch reads should not be enabled");
  }
  ImmutableList<String> selectedFields = schema
      .map(requiredSchema -> ImmutableList.copyOf(requiredSchema.fieldNames()))
      .orElse(ImmutableList.copyOf(fields.keySet()));
  Optional<String> filter = getCombinedFilter();
  ReadSessionResponse readSessionResponse = readSessionCreator.create(tableId, selectedFields, filter);
  ReadSession readSession = readSessionResponse.getReadSession();
  logger.info(
      "Created read session for {}: {} for application id: {}",
      tableId.toString(), readSession.getName(), applicationId);
  if (selectedFields.isEmpty()) {
    // means select *
    Schema tableSchema =
        SchemaConverters.getSchemaWithPseudoColumns(readSessionResponse.getReadTableInfo());
    selectedFields =
        tableSchema.getFields().stream().map(Field::getName).collect(ImmutableList.toImmutableList());
  }
  ImmutableList<String> partitionSelectedFields = selectedFields;
  return Streams.stream(Iterables.partition(
          readSession.getStreamsList(), readSessionCreatorConfig.streamsPerPartition()))
      .map(streams -> new ArrowInputPartitionContext(
          bigQueryReadClientFactory,
          bigQueryTracerFactory,
          streams.stream().map(ReadStream::getName).collect(Collectors.toCollection(ArrayList::new)),
          readSessionCreatorConfig.toReadRowsHelperOptions(),
          partitionSelectedFields,
          readSessionResponse,
          userProvidedSchema));
}
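The return statement above fans the session's read streams out into Spark input partitions, streamsPerPartition streams per partition. A small self-contained sketch of that grouping step using Guava's Iterables.partition; the stream names and chunk size here are made up for illustration and do not come from the connector.

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import java.util.List;

public class StreamPartitionSketch {
  public static void main(String[] args) {
    // Hypothetical stream names; a real ReadSession exposes these via getStreamsList().
    List<String> streams = ImmutableList.of(
        "stream-0", "stream-1", "stream-2", "stream-3", "stream-4", "stream-5", "stream-6");
    int streamsPerPartition = 3; // stands in for readSessionCreatorConfig.streamsPerPartition()
    // Iterables.partition yields consecutive chunks of at most streamsPerPartition elements:
    // [stream-0, stream-1, stream-2], [stream-3, stream-4, stream-5], [stream-6]
    for (List<String> chunk : Iterables.partition(streams, streamsPerPartition)) {
      System.out.println(chunk);
    }
  }
}

Running the sketch prints three chunks: two with three streams each and one holding the leftover stream, which is exactly how the connector decides how many input partitions it creates.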
Use of com.google.cloud.bigquery.storage.v1.TableSchema in project spark-bigquery-connector by GoogleCloudDataproc.
The class BigQueryDirectDataSourceWriterContext, method getOrCreateTable.
/**
* This function determines whether the destination table exists: if it doesn't, we will create a
* table and Spark will directly write to it.
*
* @param saveMode the SaveMode supplied by the user.
* @param destinationTableId the TableId, as was supplied by the user
* @param bigQuerySchema the bigQuery schema
* @return The TableId to which Spark will write: either the destinationTableId or the
* temporaryTableId.
*/
private BigQueryTable getOrCreateTable(SaveMode saveMode, TableId destinationTableId, Schema bigQuerySchema) throws IllegalArgumentException {
  if (bigQueryClient.tableExists(destinationTableId)) {
    TableInfo destinationTable = bigQueryClient.getTable(destinationTableId);
    Schema tableSchema = destinationTable.getDefinition().getSchema();
    Preconditions.checkArgument(
        BigQueryUtil.schemaEquals(tableSchema, bigQuerySchema, /* regardFieldOrder */ false),
        new BigQueryConnectorException.InvalidSchemaException(
            "Destination table's schema is not compatible with dataframe's schema"));
    switch (saveMode) {
      case Append:
        break;
      case Overwrite:
        writingMode = WritingMode.OVERWRITE;
        return new BigQueryTable(
            bigQueryClient.createTempTable(destinationTableId, bigQuerySchema).getTableId(), true);
      case Ignore:
        writingMode = WritingMode.IGNORE_INPUTS;
        break;
      case ErrorIfExists:
        throw new IllegalArgumentException("Table already exists in BigQuery");
    }
    return new BigQueryTable(destinationTable.getTableId(), false);
  } else {
    return new BigQueryTable(
        bigQueryClient.createTable(destinationTableId, bigQuerySchema).getTableId(), true);
  }
}
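getOrCreateTable leans on connector-internal helpers (bigQueryClient.tableExists, createTable, createTempTable). As a rough sketch of the same exists-or-create decision written directly against the public google-cloud-bigquery client: the dataset and table names below are placeholders and this is not the connector's implementation.

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.Schema;
import com.google.cloud.bigquery.StandardTableDefinition;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TableInfo;

public class EnsureTableSketch {
  // Returns the TableId of an existing table, or creates the table first if it is missing.
  static TableId ensureTable(BigQuery bigquery, TableId tableId, Schema schema) {
    Table existing = bigquery.getTable(tableId); // getTable returns null when the table does not exist
    if (existing != null) {
      return existing.getTableId();
    }
    return bigquery.create(TableInfo.of(tableId, StandardTableDefinition.of(schema))).getTableId();
  }

  public static void main(String[] args) {
    BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
    // Hypothetical dataset and table names for illustration only.
    TableId tableId = TableId.of("my_dataset", "my_table");
    Schema schema = Schema.of(); // replace with the schema derived from the dataframe
    System.out.println(ensureTable(bigquery, tableId, schema));
  }
}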
Use of com.google.cloud.bigquery.storage.v1.TableSchema in project java-bigquerystorage by googleapis.
The class AppendCompleteCallback, method writeToDefaultStream.
// writeToDefaultStream: Writes records from the source file to the destination table.
public static void writeToDefaultStream(
    String projectId, String datasetName, String tableName, String dataFile)
    throws DescriptorValidationException, InterruptedException, IOException {
  BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
  // Get the schema of the destination table and convert to the equivalent BigQueryStorage type.
  Table table = bigquery.getTable(datasetName, tableName);
  Schema schema = table.getDefinition().getSchema();
  TableSchema tableSchema = BqToBqStorageSchemaConverter.convertTableSchema(schema);
  // Use the JSON stream writer to send records in JSON format.
  TableName parentTable = TableName.of(projectId, datasetName, tableName);
  try (JsonStreamWriter writer =
      JsonStreamWriter.newBuilder(parentTable.toString(), tableSchema).build()) {
    // Read JSON data from the source file and send it to the Write API.
    BufferedReader reader = new BufferedReader(new FileReader(dataFile));
    String line = reader.readLine();
    while (line != null) {
      // As a best practice, send batches of records instead of single records at a time.
      JSONArray jsonArr = new JSONArray();
      for (int i = 0; i < 100; i++) {
        JSONObject record = new JSONObject(line);
        jsonArr.put(record);
        line = reader.readLine();
        if (line == null) {
          break;
        }
      }
      // Send this batch of rows to the Write API.
      ApiFuture<AppendRowsResponse> future = writer.append(jsonArr);
      // The append method is asynchronous. Rather than waiting for the method to complete,
      // which can hurt performance, register a completion callback and continue streaming.
      ApiFutures.addCallback(future, new AppendCompleteCallback(), MoreExecutors.directExecutor());
    }
  }
}
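The AppendCompleteCallback registered above is not shown in this excerpt. A minimal sketch of such a completion callback, assuming it implements ApiFutureCallback<AppendRowsResponse> (the type ApiFutures.addCallback expects); the logging is illustrative only and the sample's real callback may track in-flight requests and retry failures.

import com.google.api.core.ApiFutureCallback;
import com.google.cloud.bigquery.storage.v1.AppendRowsResponse;

// Minimal sketch of a completion callback for the asynchronous append above.
class AppendCompleteCallback implements ApiFutureCallback<AppendRowsResponse> {
  @Override
  public void onSuccess(AppendRowsResponse response) {
    System.out.println("Append succeeded");
  }

  @Override
  public void onFailure(Throwable throwable) {
    // In production code, inspect the error here and decide whether to retry or surface it.
    System.err.println("Append failed: " + throwable);
  }
}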
Use of com.google.cloud.bigquery.storage.v1.TableSchema in project java-bigquerystorage by googleapis.
The class ITBigQueryStorageTest, method testReadAtSnapshot.
@Test
public void testReadAtSnapshot() throws InterruptedException, IOException {
  Field intFieldSchema = Field.newBuilder("col", LegacySQLTypeName.INTEGER)
      .setMode(Mode.REQUIRED).setDescription("IntegerDescription").build();
  com.google.cloud.bigquery.Schema tableSchema = com.google.cloud.bigquery.Schema.of(intFieldSchema);
  TableId testTableId = TableId.of(/* dataset = */ DATASET, /* table = */ "test_read_snapshot");
  bigquery.create(TableInfo.of(testTableId, StandardTableDefinition.of(tableSchema)));
  testTableId.toString();
  Job firstJob = RunQueryAppendJobAndExpectSuccess(
      /* destinationTableId = */ testTableId, /* query = */ "SELECT 1 AS col");
  Job secondJob = RunQueryAppendJobAndExpectSuccess(
      /* destinationTableId = */ testTableId, /* query = */ "SELECT 2 AS col");
  String table = BigQueryResource.FormatTableResource(
      /* projectId = */ ServiceOptions.getDefaultProjectId(),
      /* datasetId = */ DATASET,
      /* tableId = */ testTableId.getTable());
  final List<Long> rowsAfterFirstSnapshot = new ArrayList<>();
  ProcessRowsAtSnapshot(
      /* table = */ table,
      /* snapshotInMillis = */ firstJob.getStatistics().getEndTime(),
      /* filter = */ null,
      /* consumer = */
      new AvroRowConsumer() {
        @Override
        public void accept(GenericData.Record record) {
          rowsAfterFirstSnapshot.add((Long) record.get("col"));
        }
      });
  assertEquals(Arrays.asList(1L), rowsAfterFirstSnapshot);
  final List<Long> rowsAfterSecondSnapshot = new ArrayList<>();
  ProcessRowsAtSnapshot(
      /* table = */ table,
      /* snapshotInMillis = */ secondJob.getStatistics().getEndTime(),
      /* filter = */ null,
      /* consumer = */
      new AvroRowConsumer() {
        @Override
        public void accept(GenericData.Record record) {
          rowsAfterSecondSnapshot.add((Long) record.get("col"));
        }
      });
  Collections.sort(rowsAfterSecondSnapshot);
  assertEquals(Arrays.asList(1L, 2L), rowsAfterSecondSnapshot);
}
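ProcessRowsAtSnapshot and RunQueryAppendJobAndExpectSuccess are test helpers that are not shown in this excerpt. As a rough sketch of the underlying mechanism, a snapshot time can be attached to a Storage Read API session through ReadSession.TableModifiers; the method below is an illustration under those assumptions, not the test's actual helper, and the single-stream setup is chosen only to keep it short.

import com.google.cloud.bigquery.storage.v1.BigQueryReadClient;
import com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest;
import com.google.cloud.bigquery.storage.v1.DataFormat;
import com.google.cloud.bigquery.storage.v1.ReadSession;
import com.google.protobuf.util.Timestamps;

public class SnapshotReadSketch {
  // Creates a read session that observes the table as of snapshotTimeMillis.
  static ReadSession createSnapshotSession(
      BigQueryReadClient client, String projectId, String tableResourceName, long snapshotTimeMillis) {
    ReadSession.TableModifiers modifiers = ReadSession.TableModifiers.newBuilder()
        .setSnapshotTime(Timestamps.fromMillis(snapshotTimeMillis))
        .build();
    ReadSession session = ReadSession.newBuilder()
        .setTable(tableResourceName) // e.g. "projects/{project}/datasets/{dataset}/tables/{table}"
        .setDataFormat(DataFormat.AVRO)
        .setTableModifiers(modifiers)
        .build();
    CreateReadSessionRequest request = CreateReadSessionRequest.newBuilder()
        .setParent("projects/" + projectId)
        .setReadSession(session)
        .setMaxStreamCount(1)
        .build();
    return client.createReadSession(request);
  }
}

Rows for the chosen snapshot are then read from the returned session's streams with the usual ReadRows calls, which is what the test's consumer callbacks receive above.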