Search in sources:

Example 26 with TableRow

use of com.google.api.services.bigquery.model.TableRow in project beam by apache.

the class BigQueryIOTest method testWriteTables.

/**
 * Feeds file partitions through {@code WriteTables} (via {@link DoFnTester}) and checks that the
 * output, grouped by destination table, equals the expected list of JSON table references.
 *
 * <p>The fixture is 3 destination tables x 3 partitions x 10 files. Every file holds a single
 * JSON-encoded {@link TableRow} followed by a newline. For each partition the expected output is
 * a JSON table reference whose {@code tableId} is the id returned by
 * {@code BigQueryHelpers.createJobId(jobIdToken, tableDestination, partitionIndex)}.
 *
 * @throws Exception if creating the temp files or running the DoFn fails
 */
@Test
public void testWriteTables() throws Exception {
    p.enableAbandonedNodeEnforcement(false);
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(datasetService);
    datasetService.createDataset("project-id", "dataset-id", "", "");
    long numTables = 3;
    long numPartitions = 3;
    long numFilesPerPartition = 10;
    String jobIdToken = "jobIdToken";
    Map<TableDestination, List<String>> expectedTempTables = Maps.newHashMap();
    Path baseDir = Files.createTempDirectory(tempFolder, "testWriteTables");
    List<KV<ShardedKey<String>, List<String>>> partitions = Lists.newArrayList();
    for (int i = 0; i < numTables; ++i) {
        String tableName = String.format("project-id:dataset-id.table%05d", i);
        TableDestination tableDestination = new TableDestination(tableName, tableName);
        for (int j = 0; j < numPartitions; ++j) {
            String tempTableId = BigQueryHelpers.createJobId(jobIdToken, tableDestination, j);
            List<String> filesPerPartition = Lists.newArrayList();
            for (int k = 0; k < numFilesPerPartition; ++k) {
                String filename = Paths.get(baseDir.toString(), String.format("files0x%08x_%05d", tempTableId.hashCode(), k)).toString();
                ResourceId fileResource = FileSystems.matchNewResource(filename, false);
                // Resources are closed in reverse declaration order: the stream first, then the
                // channel it wraps.
                try (WritableByteChannel channel = FileSystems.create(fileResource, MimeTypes.TEXT);
                    OutputStream output = Channels.newOutputStream(channel)) {
                    TableRow tableRow = new TableRow().set("name", tableName);
                    TableRowJsonCoder.of().encode(tableRow, output, Context.OUTER);
                    output.write("\n".getBytes(StandardCharsets.UTF_8));
                }
                filesPerPartition.add(filename);
            }
            partitions.add(KV.of(ShardedKey.of(tableDestination.getTableSpec(), j), filesPerPartition));
            // Record the table reference expected for this partition under its destination.
            List<String> expectedTables = expectedTempTables.get(tableDestination);
            if (expectedTables == null) {
                expectedTables = Lists.newArrayList();
                expectedTempTables.put(tableDestination, expectedTables);
            }
            String json = String.format("{\"datasetId\":\"dataset-id\",\"projectId\":\"project-id\",\"tableId\":\"%s\"}", tempTableId);
            expectedTables.add(json);
        }
    }
    PCollectionView<String> jobIdTokenView = p.apply("CreateJobId", Create.of("jobId")).apply(View.<String>asSingleton());
    PCollectionView<Map<String, String>> schemaMapView = p.apply("CreateEmptySchema", Create.empty(new TypeDescriptor<KV<String, String>>() {
    })).apply(View.<String, String>asMap());
    WriteTables<String> writeTables = new WriteTables<>(false, fakeBqServices, jobIdTokenView, schemaMapView, WriteDisposition.WRITE_EMPTY, CreateDisposition.CREATE_IF_NEEDED, new IdentityDynamicTables());
    DoFnTester<KV<ShardedKey<String>, List<String>>, KV<TableDestination, String>> tester = DoFnTester.of(writeTables);
    // The tester is given the side-input values directly; the job id token set here is the same
    // one used above to compute the expected temp table ids.
    tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
    tester.setSideInput(schemaMapView, GlobalWindow.INSTANCE, ImmutableMap.<String, String>of());
    tester.getPipelineOptions().setTempLocation("tempLocation");
    for (KV<ShardedKey<String>, List<String>> partition : partitions) {
        tester.processElement(partition);
    }
    // Group the emitted (destination, table reference) pairs by destination for comparison.
    Map<TableDestination, List<String>> tempTablesResult = Maps.newHashMap();
    for (KV<TableDestination, String> element : tester.takeOutputElements()) {
        List<String> tables = tempTablesResult.get(element.getKey());
        if (tables == null) {
            tables = Lists.newArrayList();
            tempTablesResult.put(element.getKey(), tables);
        }
        tables.add(element.getValue());
    }
    assertEquals(expectedTempTables, tempTablesResult);
}
Also used : OutputStream(java.io.OutputStream) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Path(java.nio.file.Path) WritableByteChannel(java.nio.channels.WritableByteChannel) KV(org.apache.beam.sdk.values.KV) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) TableRow(com.google.api.services.bigquery.model.TableRow) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)

Example 27 with TableRow

use of com.google.api.services.bigquery.model.TableRow in project beam by apache.

the class BigQueryIOTest method testWriteFailedJobs.

/**
 * Writes three rows to "dataset-id.table-id" with {@code CREATE_NEVER} against fake BigQuery
 * services and expects the pipeline run to throw a {@link RuntimeException} whose message
 * mentions the failed load job and exhausted retries.
 *
 * @throws Exception if the temp folder cannot be created
 */
@Test
public void testWriteFailedJobs() throws Exception {
    BigQueryOptions options = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    options.setProject("defaultproject");
    options.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());

    FakeDatasetService fakeDatasets = new FakeDatasetService();
    FakeBigQueryServices fakeServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(fakeDatasets);

    Pipeline pipeline = TestPipeline.create(options);
    TableRow rowA = new TableRow().set("name", "a").set("number", 1);
    TableRow rowB = new TableRow().set("name", "b").set("number", 2);
    TableRow rowC = new TableRow().set("name", "c").set("number", 3);
    BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
        .to("dataset-id.table-id")
        .withCreateDisposition(CreateDisposition.CREATE_NEVER)
        .withTestServices(fakeServices)
        .withoutValidation();
    pipeline.apply(Create.of(rowA, rowB, rowC).withCoder(TableRowJsonCoder.of())).apply(write);

    thrown.expect(RuntimeException.class);
    thrown.expectMessage("Failed to create load job with id prefix");
    thrown.expectMessage("reached max retries");
    thrown.expectMessage("last failed load job");

    try {
        pipeline.run();
    } finally {
        // Runs even though run() is expected to throw; presumably testNumFiles checks that the
        // temp location holds the given number of files (0 here) - confirm against its definition.
        testNumFiles(new File(options.getTempLocation()), 0);
    }
}
Also used : TableRow(com.google.api.services.bigquery.model.TableRow) File(java.io.File) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 28 with TableRow

use of com.google.api.services.bigquery.model.TableRow in project beam by apache.

the class BigQueryIOTest method convertBigDecimaslToLong.

/**
 * Returns clones of the given rows in which a {@link BigDecimal} "number" field is replaced by
 * its {@code long} value.
 *
 * <p>Numbers come back as {@link BigDecimal} after JSON serialization; converting them back to
 * longs lets callers assert against rows built with {@code long} literals. The input rows are
 * not modified, and rows whose "number" is not a {@link BigDecimal} are copied unchanged.
 *
 * @param toConvert rows to copy and normalize
 * @return a new list with one converted clone per input row, in input order
 */
List<TableRow> convertBigDecimaslToLong(List<TableRow> toConvert) {
    List<TableRow> result = Lists.newArrayList();
    for (TableRow source : toConvert) {
        TableRow copy = source.clone();
        Object number = copy.get("number");
        if (number instanceof BigDecimal) {
            BigDecimal decimal = (BigDecimal) number;
            copy.set("number", decimal.longValue());
        }
        result.add(copy);
    }
    return result;
}
Also used : TableRow(com.google.api.services.bigquery.model.TableRow) BigDecimal(java.math.BigDecimal)

Example 29 with TableRow

use of com.google.api.services.bigquery.model.TableRow in project beam by apache.

the class BigQueryIOTest method testRetryPolicy.

/**
 * Exercises {@code InsertRetryPolicy.retryTransientErrors()} on an unbounded write of three rows
 * against a fake dataset service configured with per-row insert errors.
 *
 * <p>row1 is given two "timeout" errors, row2 two "timeout" errors followed by an
 * "invalidQuery" error, and row3 none. The test expects row2 to be the only failed insert and
 * row1 and row3 to be the only rows stored in the table.
 *
 * @throws Exception if the temp folder or dataset cannot be created
 */
@Test
public void testRetryPolicy() throws Exception {
    BigQueryOptions options = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    options.setProject("project-id");
    options.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());

    FakeDatasetService fakeDatasets = new FakeDatasetService();
    FakeBigQueryServices fakeServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(fakeDatasets);
    fakeDatasets.createDataset("project-id", "dataset-id", "", "");

    TableRow row1 = new TableRow().set("name", "a").set("number", "1");
    TableRow row2 = new TableRow().set("name", "b").set("number", "2");
    TableRow row3 = new TableRow().set("name", "c").set("number", "3");

    TableDataInsertAllResponse.InsertErrors transientFailure = new TableDataInsertAllResponse.InsertErrors()
        .setErrors(ImmutableList.of(new ErrorProto().setReason("timeout")));
    TableDataInsertAllResponse.InsertErrors permanentFailure = new TableDataInsertAllResponse.InsertErrors()
        .setErrors(ImmutableList.of(new ErrorProto().setReason("invalidQuery")));
    List<TableDataInsertAllResponse.InsertErrors> row1Failures = ImmutableList.of(transientFailure, transientFailure);
    List<TableDataInsertAllResponse.InsertErrors> row2Failures = ImmutableList.of(transientFailure, transientFailure, permanentFailure);
    fakeDatasets.failOnInsert(ImmutableMap.<TableRow, List<TableDataInsertAllResponse.InsertErrors>>of(row1, row1Failures, row2, row2Failures));

    TableSchema schema = new TableSchema().setFields(ImmutableList.of(
        new TableFieldSchema().setName("name").setType("STRING"),
        new TableFieldSchema().setName("number").setType("INTEGER")));
    BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
        .to("project-id:dataset-id.table-id")
        .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
        .withSchema(schema)
        .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
        .withTestServices(fakeServices)
        .withoutValidation();

    Pipeline pipeline = TestPipeline.create(options);
    // The input is explicitly marked unbounded before the write is applied.
    PCollection<TableRow> failedRows = pipeline
        .apply(Create.of(row1, row2, row3))
        .setIsBoundedInternal(IsBounded.UNBOUNDED)
        .apply(write)
        .getFailedInserts();

    // The last error configured for row2 is not retryable, so row2 must show up among the
    // failed rows.
    PAssert.that(failedRows).containsInAnyOrder(row2);
    pipeline.run();

    // row1 and row3 are the only rows that ended up in the table.
    assertThat(fakeDatasets.getAllRows("project-id", "dataset-id", "table-id"), containsInAnyOrder(row1, row3));
}
Also used : ErrorProto(com.google.api.services.bigquery.model.ErrorProto) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) TableDataInsertAllResponse(com.google.api.services.bigquery.model.TableDataInsertAllResponse) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) TableRow(com.google.api.services.bigquery.model.TableRow) Test(org.junit.Test)

Example 30 with TableRow

use of com.google.api.services.bigquery.model.TableRow in project beam by apache.

the class BigQueryIOTest method testBigQueryNoTableQuerySourceInitSplit.

/**
 * Reads from a {@code BigQueryQuerySource} backed by fake BigQuery services and checks reading
 * and splitting.
 *
 * <p>The test creates the temp table reference's dataset and table in the fake dataset service,
 * registers a dry-run expectation for the encoded query, and then verifies that:
 * <ul>
 *   <li>reading the source yields the six expected rows (after converting numbers to longs),</li>
 *   <li>split-at-fraction behavior is consistent when it succeeds,</li>
 *   <li>{@code split(100, options)} returns two sources, the first a {@code TransformingSource}.</li>
 * </ul>
 *
 * @throws Exception if fixture setup, reading, or splitting fails
 */
@Test
public void testBigQueryNoTableQuerySourceInitSplit() throws Exception {
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeJobService jobService = new FakeJobService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(jobService).withDatasetService(datasetService);
    PipelineOptions options = PipelineOptionsFactory.create();
    BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
    bqOptions.setProject("project");
    String stepUuid = "testStepUuid";
    TableReference tempTableReference = createTempTableReference(bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));
    List<TableRow> expected = ImmutableList.of(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L), new TableRow().set("name", "d").set("number", 4L), new TableRow().set("name", "e").set("number", 5L), new TableRow().set("name", "f").set("number", 6L));
    datasetService.createDataset(tempTableReference.getProjectId(), tempTableReference.getDatasetId(), "", "");
    Table table = new Table().setTableReference(tempTableReference).setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER"))));
    datasetService.createTable(table);
    // The fake services take the expected rows encoded as the query string itself.
    String query = FakeBigQueryServices.encodeQuery(expected);
    jobService.expectDryRunQuery("project", query, new JobStatistics().setQuery(new JobStatistics2().setTotalBytesProcessed(100L).setReferencedTables(ImmutableList.of(table.getTableReference()))));
    Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryNoTableQuerySourceInitSplit");
    BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(
        stepUuid,
        StaticValueProvider.of(query),
        true /* flattenResults */,
        true /* useLegacySql */,
        fakeBqServices);
    options.setTempLocation(baseDir.toString());
    // Numbers are read back as BigDecimal; convert them so they compare equal to the long
    // literals used in the expected rows.
    List<TableRow> read = convertBigDecimaslToLong(SourceTestUtils.readFromSource(bqSource, options));
    assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
    SourceTestUtils.assertSplitAtFractionBehavior(bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
    List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
    assertEquals(2, sources.size());
    BoundedSource<TableRow> actual = sources.get(0);
    assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
Also used : Path(java.nio.file.Path) JobStatistics(com.google.api.services.bigquery.model.JobStatistics) JobStatistics2(com.google.api.services.bigquery.model.JobStatistics2) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) JobStatistics4(com.google.api.services.bigquery.model.JobStatistics4) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) JobStatus(com.google.api.services.bigquery.model.JobStatus) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TableRow(com.google.api.services.bigquery.model.TableRow) Job(com.google.api.services.bigquery.model.Job) Test(org.junit.Test)

Aggregations

TableRow (com.google.api.services.bigquery.model.TableRow): 73; Test (org.junit.Test): 43; TableReference (com.google.api.services.bigquery.model.TableReference): 24; TableSchema (com.google.api.services.bigquery.model.TableSchema): 18; Pipeline (org.apache.beam.sdk.Pipeline): 16; KV (org.apache.beam.sdk.values.KV): 15; TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 14; JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema): 14; BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 13; TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 12; BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 11; Table (com.google.api.services.bigquery.model.Table): 10; HashBasedTable (com.google.common.collect.HashBasedTable): 10; JobStatus (com.google.api.services.bigquery.model.JobStatus): 9; TableDataInsertAllResponse (com.google.api.services.bigquery.model.TableDataInsertAllResponse): 8; ArrayList (java.util.ArrayList): 8; List (java.util.List): 8; Map (java.util.Map): 8; ValueInSingleWindow (org.apache.beam.sdk.values.ValueInSingleWindow): 7; JobStatistics (com.google.api.services.bigquery.model.JobStatistics): 6