Search in sources :

Example 26 with TableSchema

use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.

the class WriteToBigQuery method getSchema.

/**
 * Builds the BigQuery schema of the output table.
 *
 * <p>One column is produced per entry of {@code fieldInfo}, in the iteration order of its
 * entry set: the entry key is used as the column name and the value's
 * {@code getFieldType()} result is used as the column type.
 *
 * @return a new {@link TableSchema} containing one {@link TableFieldSchema} per entry
 */
protected TableSchema getSchema() {
    List<TableFieldSchema> columns = new ArrayList<>();
    for (Map.Entry<String, FieldInfo<InputT>> column : fieldInfo.entrySet()) {
        TableFieldSchema columnSchema = new TableFieldSchema();
        columnSchema.setName(column.getKey());
        columnSchema.setType(column.getValue().getFieldType());
        columns.add(columnSchema);
    }
    TableSchema schema = new TableSchema();
    schema.setFields(columns);
    return schema;
}
Also used : TableSchema(com.google.api.services.bigquery.model.TableSchema) ArrayList(java.util.ArrayList) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Map(java.util.Map)

Example 27 with TableSchema

use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.

the class MaxPerKeyExamples method main.

/**
 * Builds and runs the pipeline: reads rows from the BigQuery input named by the options,
 * applies {@code MaxMeanTemp}, and writes the result to the BigQuery output table.
 *
 * @param args command-line arguments, parsed and validated into {@code Options}
 * @throws Exception if building or running the pipeline fails
 */
public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);

    // Schema of the output table: an INTEGER "month" and a FLOAT "max_mean_temp".
    TableFieldSchema monthField = new TableFieldSchema().setName("month").setType("INTEGER");
    TableFieldSchema maxMeanTempField =
        new TableFieldSchema().setName("max_mean_temp").setType("FLOAT");
    List<TableFieldSchema> outputFields = new ArrayList<>();
    outputFields.add(monthField);
    outputFields.add(maxMeanTempField);
    TableSchema outputSchema = new TableSchema().setFields(outputFields);

    // Read -> transform -> write; the output uses CREATE_IF_NEEDED and WRITE_TRUNCATE.
    pipeline
        .apply(BigQueryIO.read().from(options.getInput()))
        .apply(new MaxMeanTemp())
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(outputSchema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    pipeline.run().waitUntilFinish();
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TableSchema(com.google.api.services.bigquery.model.TableSchema) ArrayList(java.util.ArrayList) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Pipeline(org.apache.beam.sdk.Pipeline)

Example 28 with TableSchema

use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.

the class BigQuerySourceBase method split.

/**
 * Returns the sources produced by extracting the table, computing them only on the first call.
 *
 * <p>On the first call this runs an extract into a temp directory, fetches the table schema,
 * cleans up the temp resource, and builds the sources from the extracted files. The result
 * is stored in {@code cachedSplitResult} and returned as-is by later calls.
 *
 * @param desiredBundleSizeBytes not used by this implementation
 * @param options pipeline options, viewed as {@code BigQueryOptions}
 * @return the cached list of sources
 * @throws Exception if the extract, schema lookup, cleanup, or source creation fails
 */
@Override
public List<BoundedSource<TableRow>> split(long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
    // The result is cached so that repeated split() calls do not start
    // another BigQuery extract job.
    if (cachedSplitResult != null) {
        return cachedSplitResult;
    }

    BigQueryOptions bigQueryOptions = options.as(BigQueryOptions.class);
    TableReference sourceTable = getTableToExtract(bigQueryOptions);
    JobService jobs = bqServices.getJobService(bigQueryOptions);

    String extractDir =
        resolveTempLocation(bigQueryOptions.getTempLocation(), "BigQueryExtractTemp", stepUuid);
    String jobId = getExtractJobId(createJobIdToken(options.getJobName(), stepUuid));
    List<ResourceId> extractedFiles =
        executeExtract(jobId, sourceTable, jobs, bigQueryOptions.getProject(), extractDir);

    TableSchema schema =
        bqServices.getDatasetService(bigQueryOptions).getTable(sourceTable).getSchema();
    cleanupTempResource(bigQueryOptions);

    cachedSplitResult = checkNotNull(createSources(extractedFiles, schema));
    return cachedSplitResult;
}
Also used : TableReference(com.google.api.services.bigquery.model.TableReference) TableSchema(com.google.api.services.bigquery.model.TableSchema) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) JobService(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService)

Example 29 with TableSchema

use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.

the class BigQuerySourceBase method createSources.

/**
 * Wraps each extracted file in a source that yields {@link TableRow}s.
 *
 * <p>The schema is converted to a JSON string up front; the conversion function captures that
 * string and parses it back into a {@link TableSchema} for every record it converts.
 *
 * @param files the files to read, one source is created per file
 * @param tableSchema the schema used to convert each Avro record into a {@link TableRow}
 * @return an immutable list with one source per entry of {@code files}, in the same order
 */
private List<BoundedSource<TableRow>> createSources(List<ResourceId> files, TableSchema tableSchema) throws IOException, InterruptedException {
    final String schemaJson = BigQueryIO.JSON_FACTORY.toString(tableSchema);
    SerializableFunction<GenericRecord, TableRow> toTableRow =
        new SerializableFunction<GenericRecord, TableRow>() {

            @Override
            public TableRow apply(GenericRecord input) {
                TableSchema parsedSchema = BigQueryHelpers.fromJsonString(schemaJson, TableSchema.class);
                return BigQueryAvroUtils.convertGenericRecordToTableRow(input, parsedSchema);
            }
        };
    ImmutableList.Builder<BoundedSource<TableRow>> sources = ImmutableList.builder();
    for (ResourceId extractedFile : files) {
        sources.add(
            new TransformingSource<>(
                AvroSource.from(extractedFile.toString()), toTableRow, getDefaultOutputCoder()));
    }
    return sources.build();
}
Also used : BoundedSource(org.apache.beam.sdk.io.BoundedSource) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) TableSchema(com.google.api.services.bigquery.model.TableSchema) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) TableRow(com.google.api.services.bigquery.model.TableRow) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 30 with TableSchema

use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.

the class WriteTables method processElement.

/**
 * Loads one partition of files into its destination table and emits the table it was
 * written to.
 *
 * <p>The schema for the destination is read from the {@code schemasView} side input. If the
 * table reference has no project ID, the project from the pipeline's {@code BigQueryOptions}
 * is filled in. When {@code singlePartition} is false, the load targets a table whose ID is
 * the generated job ID prefix rather than the destination's own table ID. After the load,
 * the destination and the JSON form of the table reference are output and the partition's
 * files are removed.
 *
 * @param c the process context supplying the element, side inputs, and pipeline options
 * @throws Exception if the load or the file cleanup fails
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    dynamicDestinations.setSideInputAccessorFromProcessContext(c);
    DestinationT destination = c.element().getKey().getKey();
    String schemaJson = c.sideInput(schemasView).get(destination);
    TableSchema tableSchema = BigQueryHelpers.fromJsonString(schemaJson, TableSchema.class);

    BigQueryOptions bqOptions = c.getPipelineOptions().as(BigQueryOptions.class);
    TableDestination tableDestination = dynamicDestinations.getTable(destination);
    TableReference tableReference = tableDestination.getTableReference();
    if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
        // Fall back to the pipeline's project and rebuild the destination around it.
        tableReference.setProjectId(bqOptions.getProject());
        tableDestination = new TableDestination(tableReference, tableDestination.getTableDescription());
    }

    Integer partition = c.element().getKey().getShardNumber();
    List<String> partitionFiles = Lists.newArrayList(c.element().getValue());
    String jobIdPrefix = BigQueryHelpers.createJobId(c.sideInput(jobIdToken), tableDestination, partition);
    if (!singlePartition) {
        tableReference.setTableId(jobIdPrefix);
    }

    JobService jobService = bqServices.getJobService(bqOptions);
    DatasetService datasetService = bqServices.getDatasetService(bqOptions);
    String tableDescription = tableDestination.getTableDescription();
    load(
        jobService,
        datasetService,
        jobIdPrefix,
        tableReference,
        tableSchema,
        partitionFiles,
        writeDisposition,
        createDisposition,
        tableDescription);

    c.output(KV.of(tableDestination, BigQueryHelpers.toJsonString(tableReference)));
    removeTemporaryFiles(partitionFiles);
}
Also used : TableReference(com.google.api.services.bigquery.model.TableReference) TableSchema(com.google.api.services.bigquery.model.TableSchema)

Aggregations

TableSchema (com.google.api.services.bigquery.model.TableSchema): 31, TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 20, TableRow (com.google.api.services.bigquery.model.TableRow): 18, JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema): 13, Test (org.junit.Test): 13, TableReference (com.google.api.services.bigquery.model.TableReference): 12, Pipeline (org.apache.beam.sdk.Pipeline): 12, ArrayList (java.util.ArrayList): 10, BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 9, TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 8, Table (com.google.api.services.bigquery.model.Table): 7, BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 7, PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 7, HashBasedTable (com.google.common.collect.HashBasedTable): 6, JobStatus (com.google.api.services.bigquery.model.JobStatus): 5, JobStatistics (com.google.api.services.bigquery.model.JobStatistics): 4, JobStatistics4 (com.google.api.services.bigquery.model.JobStatistics4): 4, Path (java.nio.file.Path): 4, Map (java.util.Map): 4, Job (com.google.api.services.bigquery.model.Job): 3