
Example 11 with TableSchema

use of com.google.cloud.bigquery.storage.v1.TableSchema in project beam by apache.

the class BigQueryIOStorageQueryTest method testQuerySourceInitialSplit_NoReferencedTables.

/**
 * This test simulates the scenario where the SQL text which is executed by the query job doesn't
 * by itself refer to any tables (e.g. "SELECT 17 AS value"), and thus there are no referenced
 * tables when the dry run of the query is performed.
 */
@Test
public void testQuerySourceInitialSplit_NoReferencedTables() throws Exception {
    Table queryResultTable =
        new Table()
            .setSchema(
                new TableSchema()
                    .setFields(
                        ImmutableList.of(
                            new TableFieldSchema().setName("name").setType("STRING"),
                            new TableFieldSchema().setName("number").setType("INTEGER"))))
            .setNumBytes(1024L * 1024L);
    String encodedQuery = FakeBigQueryServices.encodeQueryResult(queryResultTable);
    fakeJobService.expectDryRunQuery(
        options.getProject(),
        encodedQuery,
        new JobStatistics()
            .setQuery(
                new JobStatistics2()
                    .setTotalBytesProcessed(1024L * 1024L)
                    .setReferencedTables(ImmutableList.of())));
    String stepUuid = "testStepUuid";
    TableReference tempTableReference =
        createTempTableReference(
            options.getProject(),
            BigQueryResourceNaming.createJobIdPrefix(options.getJobName(), stepUuid, JobType.QUERY),
            Optional.empty());
    CreateReadSessionRequest expectedRequest =
        CreateReadSessionRequest.newBuilder()
            .setParent("projects/" + options.getProject())
            .setReadSession(
                ReadSession.newBuilder().setTable(BigQueryHelpers.toTableResourceName(tempTableReference)))
            .setMaxStreamCount(1024)
            .build();
    Schema sessionSchema =
        SchemaBuilder.record("__root__").fields()
            .name("name").type().nullable().stringType().noDefault()
            .name("number").type().nullable().longType().noDefault()
            .endRecord();
    ReadSession.Builder builder =
        ReadSession.newBuilder()
            .setAvroSchema(AvroSchema.newBuilder().setSchema(sessionSchema.toString()))
            .setDataFormat(DataFormat.AVRO);
    for (int i = 0; i < 1024; i++) {
        builder.addStreams(ReadStream.newBuilder().setName("stream-" + i));
    }
    StorageClient fakeStorageClient = mock(StorageClient.class);
    when(fakeStorageClient.createReadSession(expectedRequest)).thenReturn(builder.build());
    BigQueryStorageQuerySource<TableRow> querySource =
        BigQueryStorageQuerySource.create(
            stepUuid,
            ValueProvider.StaticValueProvider.of(encodedQuery),
            /* flattenResults = */ true,
            /* useLegacySql = */ true,
            /* priority = */ QueryPriority.BATCH,
            /* location = */ null,
            /* queryTempDataset = */ null,
            /* kmsKey = */ null,
            null,
            new TableRowParser(),
            TableRowJsonCoder.of(),
            new FakeBigQueryServices()
                .withDatasetService(fakeDatasetService)
                .withJobService(fakeJobService)
                .withStorageClient(fakeStorageClient));
    List<? extends BoundedSource<TableRow>> sources = querySource.split(1024, options);
    assertEquals(1024, sources.size());
}
Also used : JobStatistics(com.google.api.services.bigquery.model.JobStatistics) JobStatistics2(com.google.api.services.bigquery.model.JobStatistics2) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) AvroSchema(com.google.cloud.bigquery.storage.v1.AvroSchema) Schema(org.apache.avro.Schema) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) ReadSession(com.google.cloud.bigquery.storage.v1.ReadSession) StorageClient(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StorageClient) ByteString(com.google.protobuf.ByteString) TableRowParser(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TableRowParser) TableReference(com.google.api.services.bigquery.model.TableReference) BigQueryResourceNaming.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference) TableRow(com.google.api.services.bigquery.model.TableRow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) CreateReadSessionRequest(com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest) Test(org.junit.Test)
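
Note: in the snippet above, BigQueryHelpers.toTableResourceName turns the temporary query-results table into the fully qualified resource name expected by the Storage Read API, and the mocked read session advertises 1024 streams, which is why split(1024, options) returns exactly 1024 bounded sources. A minimal sketch of that name format (illustrative only, not part of the Beam test):

String tableResourceName =
    String.format(
        "projects/%s/datasets/%s/tables/%s",
        tempTableReference.getProjectId(),
        tempTableReference.getDatasetId(),
        tempTableReference.getTableId());
// e.g. "projects/my-project/datasets/temp_dataset/tables/temp_table"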

Example 12 with TableSchema

use of com.google.cloud.bigquery.storage.v1.TableSchema in project spark-bigquery-connector by GoogleCloudDataproc.

the class BigQueryDataSourceReaderContext method planBatchInputPartitionContexts.

public Stream<InputPartitionContext<ColumnarBatch>> planBatchInputPartitionContexts() {
    if (!enableBatchRead()) {
        throw new IllegalStateException("Batch reads should not be enabled");
    }
    ImmutableList<String> selectedFields = schema.map(requiredSchema -> ImmutableList.copyOf(requiredSchema.fieldNames())).orElse(ImmutableList.copyOf(fields.keySet()));
    Optional<String> filter = getCombinedFilter();
    ReadSessionResponse readSessionResponse = readSessionCreator.create(tableId, selectedFields, filter);
    ReadSession readSession = readSessionResponse.getReadSession();
    logger.info("Created read session for {}: {} for application id: {}", tableId.toString(), readSession.getName(), applicationId);
    if (selectedFields.isEmpty()) {
        // means select *
        Schema tableSchema = SchemaConverters.getSchemaWithPseudoColumns(readSessionResponse.getReadTableInfo());
        selectedFields = tableSchema.getFields().stream().map(Field::getName).collect(ImmutableList.toImmutableList());
    }
    ImmutableList<String> partitionSelectedFields = selectedFields;
    return Streams.stream(
            Iterables.partition(
                readSession.getStreamsList(), readSessionCreatorConfig.streamsPerPartition()))
        .map(
            streams ->
                new ArrowInputPartitionContext(
                    bigQueryReadClientFactory,
                    bigQueryTracerFactory,
                    streams.stream()
                        .map(ReadStream::getName)
                        .collect(Collectors.toCollection(ArrayList::new)),
                    readSessionCreatorConfig.toReadRowsHelperOptions(),
                    partitionSelectedFields,
                    readSessionResponse,
                    userProvidedSchema));
}
Also used : IntStream(java.util.stream.IntStream) Iterables(com.google.common.collect.Iterables) InternalRow(org.apache.spark.sql.catalyst.InternalRow) TableId(com.google.cloud.bigquery.TableId) LoggerFactory(org.slf4j.LoggerFactory) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) OptionalLong(java.util.OptionalLong) ImmutableList(com.google.common.collect.ImmutableList) Schema(com.google.cloud.bigquery.Schema) Map(java.util.Map) ReadSessionResponse(com.google.cloud.bigquery.connector.common.ReadSessionResponse) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) Field(com.google.cloud.bigquery.Field) TableDefinition(com.google.cloud.bigquery.TableDefinition) ReadSessionCreator(com.google.cloud.bigquery.connector.common.ReadSessionCreator) JavaConversions(scala.collection.JavaConversions) ReadStream(com.google.cloud.bigquery.storage.v1.ReadStream) ImmutableSet(com.google.common.collect.ImmutableSet) Logger(org.slf4j.Logger) ReadSessionCreatorConfig(com.google.cloud.bigquery.connector.common.ReadSessionCreatorConfig) ReadSession(com.google.cloud.bigquery.storage.v1.ReadSession) BigQueryClient(com.google.cloud.bigquery.connector.common.BigQueryClient) Set(java.util.Set) SchemaConverters(com.google.cloud.spark.bigquery.SchemaConverters) Streams(com.google.common.collect.Streams) Collectors(java.util.stream.Collectors) DataFormat(com.google.cloud.bigquery.storage.v1.DataFormat) List(java.util.List) Stream(java.util.stream.Stream) ColumnarBatch(org.apache.spark.sql.vectorized.ColumnarBatch) ReadRowsResponseToInternalRowIteratorConverter(com.google.cloud.spark.bigquery.ReadRowsResponseToInternalRowIteratorConverter) BigQueryClientFactory(com.google.cloud.bigquery.connector.common.BigQueryClientFactory) SparkFilterUtils(com.google.cloud.spark.bigquery.SparkFilterUtils) Optional(java.util.Optional) Filter(org.apache.spark.sql.sources.Filter) TableInfo(com.google.cloud.bigquery.TableInfo) BigQueryUtil(com.google.cloud.bigquery.connector.common.BigQueryUtil) BigQueryTracerFactory(com.google.cloud.bigquery.connector.common.BigQueryTracerFactory)
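
Note: the final return statement fans the read session's streams out into Spark input partitions via Iterables.partition, one ArrowInputPartitionContext per chunk of streams. A small illustration with made-up numbers (the real chunk size comes from readSessionCreatorConfig.streamsPerPartition()):

List<String> streamNames =
    readSession.getStreamsList().stream()
        .map(ReadStream::getName)
        .collect(Collectors.toList());
// With 10 streams and streamsPerPartition = 4 this produces chunks of size 4, 4 and 2,
// i.e. three input partitions.
Iterable<List<String>> chunks = Iterables.partition(streamNames, 4);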

Example 13 with TableSchema

use of com.google.cloud.bigquery.storage.v1.TableSchema in project spark-bigquery-connector by GoogleCloudDataproc.

the class BigQueryDirectDataSourceWriterContext method getOrCreateTable.

/**
 * This function determines whether the destination table exists: if it doesn't, we will create a
 * table and Spark will directly write to it.
 *
 * @param saveMode the SaveMode supplied by the user.
 * @param destinationTableId the TableId, as was supplied by the user
 * @param bigQuerySchema the bigQuery schema
 * @return The TableId to which Spark will do the writing: whether that is the destinationTableID
 *     or the temporaryTableId.
 */
private BigQueryTable getOrCreateTable(SaveMode saveMode, TableId destinationTableId, Schema bigQuerySchema) throws IllegalArgumentException {
    if (bigQueryClient.tableExists(destinationTableId)) {
        TableInfo destinationTable = bigQueryClient.getTable(destinationTableId);
        Schema tableSchema = destinationTable.getDefinition().getSchema();
        Preconditions.checkArgument(
            BigQueryUtil.schemaEquals(tableSchema, bigQuerySchema, /* regardFieldOrder */ false),
            new BigQueryConnectorException.InvalidSchemaException(
                "Destination table's schema is not compatible with dataframe's schema"));
        switch(saveMode) {
            case Append:
                break;
            case Overwrite:
                writingMode = WritingMode.OVERWRITE;
                return new BigQueryTable(bigQueryClient.createTempTable(destinationTableId, bigQuerySchema).getTableId(), true);
            case Ignore:
                writingMode = WritingMode.IGNORE_INPUTS;
                break;
            case ErrorIfExists:
                throw new IllegalArgumentException("Table already exists in BigQuery");
        }
        return new BigQueryTable(destinationTable.getTableId(), false);
    } else {
        return new BigQueryTable(bigQueryClient.createTable(destinationTableId, bigQuerySchema).getTableId(), true);
    }
}
Also used : BigQueryConnectorException(com.google.cloud.bigquery.connector.common.BigQueryConnectorException) ProtobufUtils.toProtoSchema(com.google.cloud.spark.bigquery.ProtobufUtils.toProtoSchema) SchemaConverters.toBigQuerySchema(com.google.cloud.spark.bigquery.SchemaConverters.toBigQuerySchema) Schema(com.google.cloud.bigquery.Schema) ProtoSchema(com.google.cloud.bigquery.storage.v1.ProtoSchema) TableInfo(com.google.cloud.bigquery.TableInfo)
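
In short, the SaveMode handling above works out to: Append writes into the existing destination table; Overwrite redirects the write to a freshly created temporary table (writingMode = OVERWRITE, so the temporary table can later replace the destination); Ignore keeps the destination and suppresses the write (IGNORE_INPUTS); ErrorIfExists fails immediately; and if the destination table does not exist at all, it is created and written to directly.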

Example 14 with TableSchema

use of com.google.cloud.bigquery.storage.v1.TableSchema in project java-bigquerystorage by googleapis.

the class AppendCompleteCallback method writeToDefaultStream.

// writeToDefaultStream: Writes records from the source file to the destination table.
public static void writeToDefaultStream(String projectId, String datasetName, String tableName, String dataFile) throws DescriptorValidationException, InterruptedException, IOException {
    BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
    // Get the schema of the destination table and convert to the equivalent BigQueryStorage type.
    Table table = bigquery.getTable(datasetName, tableName);
    Schema schema = table.getDefinition().getSchema();
    TableSchema tableSchema = BqToBqStorageSchemaConverter.convertTableSchema(schema);
    // Use the JSON stream writer to send records in JSON format.
    TableName parentTable = TableName.of(projectId, datasetName, tableName);
    try (JsonStreamWriter writer = JsonStreamWriter.newBuilder(parentTable.toString(), tableSchema).build()) {
        // Read JSON data from the source file and send it to the Write API.
        BufferedReader reader = new BufferedReader(new FileReader(dataFile));
        String line = reader.readLine();
        while (line != null) {
            // As a best practice, send batches of records, instead of single records at a time.
            JSONArray jsonArr = new JSONArray();
            for (int i = 0; i < 100; i++) {
                JSONObject record = new JSONObject(line);
                jsonArr.put(record);
                line = reader.readLine();
                if (line == null) {
                    break;
                }
            }
            // batch
            ApiFuture<AppendRowsResponse> future = writer.append(jsonArr);
            // The append method is asynchronous. Rather than waiting for the method to complete,
            // which can hurt performance, register a completion callback and continue streaming.
            ApiFutures.addCallback(future, new AppendCompleteCallback(), MoreExecutors.directExecutor());
        }
    }
}
Also used : BigQuery(com.google.cloud.bigquery.BigQuery) Table(com.google.cloud.bigquery.Table) TableSchema(com.google.cloud.bigquery.storage.v1.TableSchema) Schema(com.google.cloud.bigquery.Schema) JSONArray(org.json.JSONArray) AppendRowsResponse(com.google.cloud.bigquery.storage.v1.AppendRowsResponse) TableName(com.google.cloud.bigquery.storage.v1.TableName) JSONObject(org.json.JSONObject) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) JsonStreamWriter(com.google.cloud.bigquery.storage.v1.JsonStreamWriter)
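
Note: the sample registers an AppendCompleteCallback on every append future, but the callback class itself is not shown in this snippet. A minimal sketch of what such a callback could look like (the real class in java-bigquerystorage also tracks in-flight requests; this version only reports the outcome):

static class AppendCompleteCallback implements ApiFutureCallback<AppendRowsResponse> {

    @Override
    public void onSuccess(AppendRowsResponse response) {
        System.out.println("Append succeeded");
    }

    @Override
    public void onFailure(Throwable throwable) {
        System.out.println("Append failed: " + throwable);
    }
}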

Example 15 with TableSchema

use of com.google.cloud.bigquery.storage.v1.TableSchema in project java-bigquerystorage by googleapis.

the class ITBigQueryStorageTest method testReadAtSnapshot.

@Test
public void testReadAtSnapshot() throws InterruptedException, IOException {
    Field intFieldSchema = Field.newBuilder("col", LegacySQLTypeName.INTEGER).setMode(Mode.REQUIRED).setDescription("IntegerDescription").build();
    com.google.cloud.bigquery.Schema tableSchema = com.google.cloud.bigquery.Schema.of(intFieldSchema);
    TableId testTableId = TableId.of(/* dataset = */ DATASET, /* table = */ "test_read_snapshot");
    bigquery.create(TableInfo.of(testTableId, StandardTableDefinition.of(tableSchema)));
    testTableId.toString();
    Job firstJob =
        RunQueryAppendJobAndExpectSuccess(
            /* destinationTableId = */ testTableId, /* query = */ "SELECT 1 AS col");
    Job secondJob =
        RunQueryAppendJobAndExpectSuccess(
            /* destinationTableId = */ testTableId, /* query = */ "SELECT 2 AS col");
    String table =
        BigQueryResource.FormatTableResource(
            /* projectId = */ ServiceOptions.getDefaultProjectId(),
            /* datasetId = */ DATASET,
            /* tableId = */ testTableId.getTable());
    final List<Long> rowsAfterFirstSnapshot = new ArrayList<>();
    ProcessRowsAtSnapshot(
        /* table = */ table,
        /* snapshotInMillis = */ firstJob.getStatistics().getEndTime(),
        /* filter = */ null,
        /* consumer = */ new AvroRowConsumer() {

        @Override
        public void accept(GenericData.Record record) {
            rowsAfterFirstSnapshot.add((Long) record.get("col"));
        }
    });
    assertEquals(Arrays.asList(1L), rowsAfterFirstSnapshot);
    final List<Long> rowsAfterSecondSnapshot = new ArrayList<>();
    ProcessRowsAtSnapshot(
        /* table = */ table,
        /* snapshotInMillis = */ secondJob.getStatistics().getEndTime(),
        /* filter = */ null,
        /* consumer = */ new AvroRowConsumer() {

        @Override
        public void accept(GenericData.Record record) {
            rowsAfterSecondSnapshot.add((Long) record.get("col"));
        }
    });
    Collections.sort(rowsAfterSecondSnapshot);
    assertEquals(Arrays.asList(1L, 2L), rowsAfterSecondSnapshot);
}
Also used : TableId(com.google.cloud.bigquery.TableId) AvroRowConsumer(com.google.cloud.bigquery.storage.v1.it.SimpleRowReader.AvroRowConsumer) ArrayList(java.util.ArrayList) GenericData(org.apache.avro.generic.GenericData) Field(com.google.cloud.bigquery.Field) Job(com.google.cloud.bigquery.Job) Test(org.junit.Test)
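
Note: ProcessRowsAtSnapshot is a test helper whose body is not shown here; presumably the snapshotInMillis argument ends up as the read session's snapshot time. A rough sketch of that conversion (an assumption about the helper, not code from the test):

long snapshotInMillis = firstJob.getStatistics().getEndTime();
com.google.protobuf.Timestamp snapshotTime =
    com.google.protobuf.Timestamp.newBuilder()
        .setSeconds(snapshotInMillis / 1_000)
        .setNanos((int) ((snapshotInMillis % 1_000) * 1_000_000))
        .build();
ReadSession.Builder session =
    ReadSession.newBuilder()
        .setTable(table)
        .setDataFormat(DataFormat.AVRO)
        .setTableModifiers(ReadSession.TableModifiers.newBuilder().setSnapshotTime(snapshotTime));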

Aggregations

Test (org.junit.Test)9 Table (com.google.api.services.bigquery.model.Table)6 TableSchema (com.google.api.services.bigquery.model.TableSchema)6 TableReference (com.google.api.services.bigquery.model.TableReference)5 TableRow (com.google.api.services.bigquery.model.TableRow)5 ReadSession (com.google.cloud.bigquery.storage.v1.ReadSession)5 JSONArray (org.json.JSONArray)5 JSONObject (org.json.JSONObject)5 TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)4 FieldValueList (com.google.cloud.bigquery.FieldValueList)4 Schema (com.google.cloud.bigquery.Schema)4 TableResult (com.google.cloud.bigquery.TableResult)4 CreateReadSessionRequest (com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest)4 AppendRowsResponse (com.google.cloud.bigquery.storage.v1beta2.AppendRowsResponse)4 JsonStreamWriter (com.google.cloud.bigquery.storage.v1beta2.JsonStreamWriter)4 TableFieldSchema (com.google.cloud.bigquery.storage.v1beta2.TableFieldSchema)4 TableName (com.google.cloud.bigquery.storage.v1beta2.TableName)4 TableSchema (com.google.cloud.bigquery.storage.v1beta2.TableSchema)4 DataFormat (com.google.cloud.bigquery.storage.v1.DataFormat)3 StorageClient (org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StorageClient)3