Search in sources :

Example 61 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOWriteTest method testExtendedErrorRetrieval.

@Test
public void testExtendedErrorRetrieval() throws Exception {
    if (useStorageApi) {
        return;
    }
    TableRow row1 = new TableRow().set("name", "a").set("number", "1");
    TableRow row2 = new TableRow().set("name", "b").set("number", "2");
    TableRow row3 = new TableRow().set("name", "c").set("number", "3");
    String tableSpec = "project-id:dataset-id.table-id";
    TableDataInsertAllResponse.InsertErrors ephemeralError = new TableDataInsertAllResponse.InsertErrors().setErrors(ImmutableList.of(new ErrorProto().setReason("timeout")));
    TableDataInsertAllResponse.InsertErrors persistentError = new TableDataInsertAllResponse.InsertErrors().setErrors(Lists.newArrayList(new ErrorProto().setReason("invalidQuery")));
    fakeDatasetService.failOnInsert(ImmutableMap.of(row1, ImmutableList.of(ephemeralError, ephemeralError), row2, ImmutableList.of(ephemeralError, ephemeralError, persistentError)));
    PCollection<BigQueryInsertError> failedRows = p.apply(Create.of(row1, row2, row3)).apply(BigQueryIO.writeTableRows().to(tableSpec).withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED).withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS).withSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))).withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()).withTestServices(fakeBqServices).withoutValidation().withExtendedErrorInfo()).getFailedInsertsWithErr();
    // row2 finally fails with a non-retryable error, so we expect to see it in the collection of
    // failed rows.
    PAssert.that(failedRows).containsInAnyOrder(new BigQueryInsertError(row2, persistentError, BigQueryHelpers.parseTableSpec(tableSpec)));
    p.run();
    // Only row1 and row3 were successfully inserted.
    assertThat(fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"), containsInAnyOrder(row1, row3));
}
Also used : ErrorProto(com.google.api.services.bigquery.model.ErrorProto) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) TableDataInsertAllResponse(com.google.api.services.bigquery.model.TableDataInsertAllResponse) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Test(org.junit.Test)

Example 62 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOWriteTest method testClusteringTableFunction.

@Test
public void testClusteringTableFunction() throws Exception {
    TableRow row1 = new TableRow().set("date", "2018-01-01").set("number", "1");
    TableRow row2 = new TableRow().set("date", "2018-01-02").set("number", "2");
    TimePartitioning timePartitioning = new TimePartitioning().setType("DAY").setField("date");
    Clustering clustering = new Clustering().setFields(ImmutableList.of("date"));
    TableSchema schema = new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("date").setType("DATE"), new TableFieldSchema().setName("number").setType("INTEGER")));
    // withMethod overrides the pipeline option, so we need to explicitly request
    // STORAGE_API_WRITES.
    BigQueryIO.Write.Method method = useStorageApi ? (useStorageApiApproximate ? Method.STORAGE_API_AT_LEAST_ONCE : Method.STORAGE_WRITE_API) : Method.FILE_LOADS;
    p.apply(Create.of(row1, row2)).apply(BigQueryIO.writeTableRows().to((ValueInSingleWindow<TableRow> vsw) -> {
        String tableSpec = "project-id:dataset-id.table-" + vsw.getValue().get("number");
        return new TableDestination(tableSpec, null, new TimePartitioning().setType("DAY").setField("date"), new Clustering().setFields(ImmutableList.of("date")));
    }).withTestServices(fakeBqServices).withMethod(method).withSchema(schema).withClustering().withoutValidation());
    p.run();
    Table table = fakeDatasetService.getTable(BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-1"));
    assertEquals(schema, table.getSchema());
    assertEquals(timePartitioning, table.getTimePartitioning());
    assertEquals(clustering, table.getClustering());
}
Also used : Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) ValueInSingleWindow(org.apache.beam.sdk.values.ValueInSingleWindow) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) Clustering(com.google.api.services.bigquery.model.Clustering) Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Test(org.junit.Test)

Example 63 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOWriteTest method testTriggeredFileLoadsWithAutoSharding.

@Test
public void testTriggeredFileLoadsWithAutoSharding() throws Exception {
    if (useStorageApi || !useStreaming) {
        // This test does not make sense for the storage API.
        return;
    }
    List<TableRow> elements = Lists.newArrayList();
    for (int i = 0; i < 30; ++i) {
        elements.add(new TableRow().set("number", i));
    }
    Instant startInstant = new Instant(0L);
    TestStream<TableRow> testStream = TestStream.create(TableRowJsonCoder.of()).advanceWatermarkTo(startInstant).addElements(elements.get(0), Iterables.toArray(elements.subList(1, 10), TableRow.class)).advanceProcessingTime(Duration.standardMinutes(1)).advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(10))).addElements(elements.get(10), Iterables.toArray(elements.subList(11, 20), TableRow.class)).advanceProcessingTime(Duration.standardMinutes(1)).advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(30))).addElements(elements.get(20), Iterables.toArray(elements.subList(21, 30), TableRow.class)).advanceProcessingTime(Duration.standardMinutes(2)).advanceWatermarkToInfinity();
    int numTables = 3;
    p.apply(testStream).apply(BigQueryIO.writeTableRows().to((ValueInSingleWindow<TableRow> vsw) -> {
        String tableSpec = "project-id:dataset-id.table-" + ((int) vsw.getValue().get("number") % numTables);
        return new TableDestination(tableSpec, null);
    }).withSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("number").setType("INTEGER")))).withTestServices(fakeBqServices).withTriggeringFrequency(Duration.standardSeconds(100)).withAutoSharding().withMaxBytesPerPartition(1000).withMaxFilesPerPartition(10).withMethod(BigQueryIO.Write.Method.FILE_LOADS).withoutValidation());
    p.run();
    Map<Integer, List<TableRow>> elementsByTableIdx = new HashMap<>();
    for (int i = 0; i < elements.size(); i++) {
        elementsByTableIdx.computeIfAbsent(i % numTables, k -> new ArrayList<>()).add(elements.get(i));
    }
    for (Map.Entry<Integer, List<TableRow>> entry : elementsByTableIdx.entrySet()) {
        assertThat(fakeDatasetService.getAllRows("project-id", "dataset-id", "table-" + entry.getKey()), containsInAnyOrder(Iterables.toArray(entry.getValue(), TableRow.class)));
    }
    // For each table destination, it's expected to create two load jobs based on the triggering
    // frequency and processing time intervals.
    assertEquals(2 * numTables, fakeDatasetService.getInsertCount());
}
Also used : ExpectedLogs(org.apache.beam.sdk.testing.ExpectedLogs) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) ValueInSingleWindow(org.apache.beam.sdk.values.ValueInSingleWindow) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Encoder(org.apache.avro.io.Encoder) ResultCoder(org.apache.beam.sdk.io.gcp.bigquery.WritePartition.ResultCoder) Matcher(java.util.regex.Matcher) DoFnTester(org.apache.beam.sdk.transforms.DoFnTester) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) EnumSet(java.util.EnumSet) ValueProvider(org.apache.beam.sdk.options.ValueProvider) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) KvCoder(org.apache.beam.sdk.coders.KvCoder) Matchers.allOf(org.hamcrest.Matchers.allOf) Set(java.util.Set) WindowFn(org.apache.beam.sdk.transforms.windowing.WindowFn) FieldType(org.apache.beam.sdk.schemas.Schema.FieldType) Serializable(java.io.Serializable) IncompatibleWindowException(org.apache.beam.sdk.transforms.windowing.IncompatibleWindowException) Assert.assertFalse(org.junit.Assert.assertFalse) AutoValue(com.google.auto.value.AutoValue) TestStream(org.apache.beam.sdk.testing.TestStream) Matchers.is(org.hamcrest.Matchers.is) DisplayDataMatchers.hasDisplayItem(org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem) Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method) Preconditions.checkNotNull(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull) KV(org.apache.beam.sdk.values.KV) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) View(org.apache.beam.sdk.transforms.View) ArrayList(java.util.ArrayList) GenericData(org.apache.avro.generic.GenericData) Distinct(org.apache.beam.sdk.transforms.Distinct) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) TupleTag(org.apache.beam.sdk.values.TupleTag) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) StreamSupport(java.util.stream.StreamSupport) JavaFieldSchema(org.apache.beam.sdk.schemas.JavaFieldSchema) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Row(org.apache.beam.sdk.values.Row) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteTables.Result) Before(org.junit.Before) TableReference(com.google.api.services.bigquery.model.TableReference) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Files(java.nio.file.Files) PAssert(org.apache.beam.sdk.testing.PAssert) NonMergingWindowFn(org.apache.beam.sdk.transforms.windowing.NonMergingWindowFn) Parameter(org.junit.runners.Parameterized.Parameter) Assert.assertTrue(org.junit.Assert.assertTrue) IOException(java.io.IOException) ShardedKeyCoder(org.apache.beam.sdk.coders.ShardedKeyCoder) Test(org.junit.Test) Schema(org.apache.beam.sdk.schemas.Schema) File(java.io.File) Assert.assertNull(org.junit.Assert.assertNull) Paths(java.nio.file.Paths) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) AtomicCoder(org.apache.beam.sdk.coders.AtomicCoder) DefaultSchema(org.apache.beam.sdk.schemas.annotations.DefaultSchema) FakeJobService(org.apache.beam.sdk.io.gcp.testing.FakeJobService) Assert.assertEquals(org.junit.Assert.assertEquals) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) After(org.junit.After) TableRow(com.google.api.services.bigquery.model.TableRow) Assert.fail(org.junit.Assert.fail) TableSchema(com.google.api.services.bigquery.model.TableSchema) ArrayListMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ArrayListMultimap) ShardedKey(org.apache.beam.sdk.values.ShardedKey) Parameterized(org.junit.runners.Parameterized) MapElements(org.apache.beam.sdk.transforms.MapElements) DatumWriter(org.apache.avro.io.DatumWriter) Collection(java.util.Collection) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) Description(org.junit.runner.Description) Collectors(java.util.stream.Collectors) List(java.util.List) Clustering(com.google.api.services.bigquery.model.Clustering) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) TableDataInsertAllResponse(com.google.api.services.bigquery.model.TableDataInsertAllResponse) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pattern(java.util.regex.Pattern) ErrorProto(com.google.api.services.bigquery.model.ErrorProto) Statement(org.junit.runners.model.Statement) TestRule(org.junit.rules.TestRule) Parameters(org.junit.runners.Parameterized.Parameters) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) SerializableFunctions(org.apache.beam.sdk.transforms.SerializableFunctions) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) WindowMappingFn(org.apache.beam.sdk.transforms.windowing.WindowMappingFn) SchemaCreate(org.apache.beam.sdk.schemas.annotations.SchemaCreate) Job(com.google.api.services.bigquery.model.Job) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ExpectedException(org.junit.rules.ExpectedException) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) OutputStream(java.io.OutputStream) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) GenericRecord(org.apache.avro.generic.GenericRecord) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) Matchers(org.hamcrest.Matchers) PCollection(org.apache.beam.sdk.values.PCollection) Table(com.google.api.services.bigquery.model.Table) Rule(org.junit.Rule) Instant(org.joda.time.Instant) Collections(java.util.Collections) JobConfigurationLoad(com.google.api.services.bigquery.model.JobConfigurationLoad) TemporaryFolder(org.junit.rules.TemporaryFolder) InputStream(java.io.InputStream) TableSchema(com.google.api.services.bigquery.model.TableSchema) HashMap(java.util.HashMap) Instant(org.joda.time.Instant) ArrayList(java.util.ArrayList) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TableRow(com.google.api.services.bigquery.model.TableRow) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 64 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOWriteTest method streamingWrite.

private void streamingWrite(boolean autoSharding) throws Exception {
    if (!useStreaming) {
        return;
    }
    BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows().to("project-id:dataset-id.table-id").withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED).withSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))).withTestServices(fakeBqServices).withoutValidation();
    if (autoSharding) {
        write = write.withAutoSharding();
    }
    p.apply(Create.of(new TableRow().set("name", "a").set("number", "1"), new TableRow().set("name", "b").set("number", "2"), new TableRow().set("name", "c").set("number", "3"), new TableRow().set("name", "d").set("number", "4")).withCoder(TableRowJsonCoder.of())).setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED).apply("WriteToBQ", write);
    p.run();
    assertThat(fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"), containsInAnyOrder(new TableRow().set("name", "a").set("number", "1"), new TableRow().set("name", "b").set("number", "2"), new TableRow().set("name", "c").set("number", "3"), new TableRow().set("name", "d").set("number", "4")));
}
Also used : TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema)

Example 65 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOWriteTest method testTriggeredFileLoads.

@Test
public void testTriggeredFileLoads() throws Exception {
    if (useStorageApi || !useStreaming) {
        return;
    }
    List<TableRow> elements = Lists.newArrayList();
    for (int i = 0; i < 30; ++i) {
        elements.add(new TableRow().set("number", i));
    }
    TestStream<TableRow> testStream = TestStream.create(TableRowJsonCoder.of()).addElements(elements.get(0), Iterables.toArray(elements.subList(1, 10), TableRow.class)).advanceProcessingTime(Duration.standardMinutes(1)).addElements(elements.get(10), Iterables.toArray(elements.subList(11, 20), TableRow.class)).advanceProcessingTime(Duration.standardMinutes(1)).addElements(elements.get(20), Iterables.toArray(elements.subList(21, 30), TableRow.class)).advanceWatermarkToInfinity();
    BigQueryIO.Write.Method method = Method.FILE_LOADS;
    p.apply(testStream).apply(BigQueryIO.writeTableRows().to("project-id:dataset-id.table-id").withSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("number").setType("INTEGER")))).withTestServices(fakeBqServices).withTriggeringFrequency(Duration.standardSeconds(30)).withNumFileShards(2).withMethod(method).withoutValidation());
    p.run();
    assertThat(fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"), containsInAnyOrder(Iterables.toArray(elements, TableRow.class)));
}
Also used : Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Test(org.junit.Test)

Aggregations

TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)80 TableSchema (com.google.api.services.bigquery.model.TableSchema)71 TableRow (com.google.api.services.bigquery.model.TableRow)56 Test (org.junit.Test)45 Table (com.google.api.services.bigquery.model.Table)25 TableReference (com.google.api.services.bigquery.model.TableReference)23 ArrayList (java.util.ArrayList)17 BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString)16 List (java.util.List)15 Map (java.util.Map)15 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)14 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)13 Pipeline (org.apache.beam.sdk.Pipeline)12 ByteString (com.google.protobuf.ByteString)10 JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema)10 Write (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write)10 Method (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method)10 BigQueryResourceNaming.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference)9 FakeBigQueryServices (org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices)9 ErrorProto (com.google.api.services.bigquery.model.ErrorProto)8