Example 6 with ResourceId

Use of org.apache.beam.sdk.io.fs.ResourceId in the Apache Beam project.

From the class FileBasedSinkTest, method testCopyToOutputFiles.

/** Output files are copied to the destination location with the correct names and contents. */
@Test
public void testCopyToOutputFiles() throws Exception {
    SimpleSink.SimpleWriteOperation writeOp = buildWriteOperation();
    ResourceId outputDirectory = writeOp.getSink().getBaseOutputDirectoryProvider().get();
    List<String> inputFilenames = Arrays.asList("input-1", "input-2", "input-3");
    List<String> inputContents = Arrays.asList("1", "2", "3");
    List<String> expectedOutputFilenames = Arrays.asList("file-00-of-03.test", "file-01-of-03.test", "file-02-of-03.test");
    Map<ResourceId, ResourceId> inputFilePaths = new HashMap<>();
    List<ResourceId> expectedOutputPaths = new ArrayList<>();
    for (int i = 0; i < inputFilenames.size(); i++) {
        // Generate output paths.
        expectedOutputPaths.add(getBaseOutputDirectory().resolve(expectedOutputFilenames.get(i), StandardResolveOptions.RESOLVE_FILE));
        // Generate and write to input paths.
        File inputTmpFile = tmpFolder.newFile(inputFilenames.get(i));
        List<String> lines = Collections.singletonList(inputContents.get(i));
        writeFile(lines, inputTmpFile);
        inputFilePaths.put(LocalResources.fromFile(inputTmpFile, false), writeOp.getSink().getFilenamePolicy().unwindowedFilename(outputDirectory, new Context(i, inputFilenames.size()), ""));
    }
    // Copy input files to output files.
    writeOp.copyToOutputFiles(inputFilePaths);
    // Assert that the contents were copied.
    for (int i = 0; i < expectedOutputPaths.size(); i++) {
        assertFileContains(Collections.singletonList(inputContents.get(i)), expectedOutputPaths.get(i));
    }
}
Also used: Context (org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy.Context), ResourceId (org.apache.beam.sdk.io.fs.ResourceId), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), File (java.io.File), Test (org.junit.Test).
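The core ResourceId pattern in this test is resolving child file names against a directory and wrapping local files via LocalResources. A minimal, self-contained sketch of those two calls (the /tmp path is illustrative, not from the test):

import java.io.File;
import org.apache.beam.sdk.io.LocalResources;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;

public class ResolveSketch {
    public static void main(String[] args) {
        // Wrap a local directory as a ResourceId (isDirectory = true).
        ResourceId outputDir = LocalResources.fromFile(new File("/tmp/out"), true);
        // Resolve a child file name against the directory, as the test does
        // for each expected output path.
        ResourceId outputFile =
            outputDir.resolve("file-00-of-03.test", StandardResolveOptions.RESOLVE_FILE);
        System.out.println(outputFile); // /tmp/out/file-00-of-03.test
    }
}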

Example 7 with ResourceId

Use of org.apache.beam.sdk.io.fs.ResourceId in the Apache Beam project.

From the class LocalResourceIdTest, method testResolveInvalidNotDirectory.

@Test
public void testResolveInvalidNotDirectory() throws Exception {
    ResourceId tmp = toResourceIdentifier("/root/").resolve("tmp", StandardResolveOptions.RESOLVE_FILE);
    thrown.expect(IllegalStateException.class);
    thrown.expectMessage("Expected the path is a directory, but had [/root/tmp].");
    tmp.resolve("aa", StandardResolveOptions.RESOLVE_FILE);
}
Also used: ResourceId (org.apache.beam.sdk.io.fs.ResourceId), Test (org.junit.Test).
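A minimal sketch of the contract this test pins down: a ResourceId created as a file (isDirectory = false) rejects further resolve() calls with an IllegalStateException. Standalone usage, assuming only the Beam core SDK on the classpath:

import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;

public class ResolveContractSketch {
    public static void main(String[] args) {
        // isDirectory = false: "/root/tmp" is treated as a file.
        ResourceId file = FileSystems.matchNewResource("/root/tmp", false);
        try {
            file.resolve("aa", StandardResolveOptions.RESOLVE_FILE);
        } catch (IllegalStateException expected) {
            // Only directory ResourceIds may be resolved against.
            System.out.println("Rejected as expected: " + expected.getMessage());
        }
    }
}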

Example 8 with ResourceId

Use of org.apache.beam.sdk.io.fs.ResourceId in the Apache Beam project.

From the class GcsResourceIdTest, method testResolveInvalidNotDirectory.

@Test
public void testResolveInvalidNotDirectory() throws Exception {
    ResourceId tmpDir = toResourceIdentifier("gs://my_bucket/").resolve("tmp dir", StandardResolveOptions.RESOLVE_FILE);
    thrown.expect(IllegalStateException.class);
    thrown.expectMessage("Expected the gcsPath is a directory, but had [gs://my_bucket/tmp dir].");
    tmpDir.resolve("aa", StandardResolveOptions.RESOLVE_FILE);
}
Also used: ResourceId (org.apache.beam.sdk.io.fs.ResourceId), Test (org.junit.Test).
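The GCS test asserts the same contract as the local one; only the error message differs per filesystem. What decides legality is the directory flag baked into the ResourceId at creation time, which isDirectory() exposes. A sketch (assumes the beam-sdks-java-io-google-cloud-platform module is on the classpath so the gs:// scheme is registered; the bucket name is illustrative):

import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResourceId;

public class DirectoryFlagSketch {
    public static void main(String[] args) {
        ResourceId dir = FileSystems.matchNewResource("gs://my_bucket/tmp", true);
        ResourceId file = FileSystems.matchNewResource("gs://my_bucket/tmp", false);
        System.out.println(dir.isDirectory());  // true: resolve() is legal
        System.out.println(file.isDirectory()); // false: resolve() throws
    }
}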

Example 9 with ResourceId

Use of org.apache.beam.sdk.io.fs.ResourceId in the Apache Beam project.

From the class FakeJobService, method runLoadJob.

private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load) throws InterruptedException, IOException {
    TableReference destination = load.getDestinationTable();
    TableSchema schema = load.getSchema();
    List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
    WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
    CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());
    checkArgument(load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON"));
    Table existingTable = datasetService.getTable(destination);
    if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
    }
    datasetService.createTable(new Table().setTableReference(destination).setSchema(schema));
    List<TableRow> rows = Lists.newArrayList();
    for (ResourceId filename : sourceFiles) {
        rows.addAll(readRows(filename.toString()));
    }
    datasetService.insertAll(destination, rows, null);
    return new JobStatus().setState("DONE");
}
Also used: JobStatus (com.google.api.services.bigquery.model.JobStatus), TableReference (com.google.api.services.bigquery.model.TableReference), CreateDisposition (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition), HashBasedTable (com.google.common.collect.HashBasedTable), Table (com.google.api.services.bigquery.model.Table), ErrorProto (com.google.api.services.bigquery.model.ErrorProto), TableSchema (com.google.api.services.bigquery.model.TableSchema), ResourceId (org.apache.beam.sdk.io.fs.ResourceId), TableRow (com.google.api.services.bigquery.model.TableRow), WriteDisposition (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition).
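readRows is not shown in this excerpt. Assuming the fake service reads the newline-delimited JSON that the load job declares (NEWLINE_DELIMITED_JSON), a hedged sketch of what such a helper could look like; it uses only FileSystems and TableRowJsonCoder calls that exist in the SDK, but the method name and shape here are hypothetical:

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.channels.Channels;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.coders.Coder.Context;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;

class ReadRowsSketch {
    // Hypothetical stand-in for FakeJobService.readRows.
    static List<TableRow> readRows(String filename) throws IOException {
        List<TableRow> rows = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                Channels.newInputStream(
                    FileSystems.open(FileSystems.matchSingleFileSpec(filename).resourceId())),
                StandardCharsets.UTF_8))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                // One JSON-encoded TableRow per line, matching how
                // testWriteTables (below) writes its files.
                rows.add(TableRowJsonCoder.of().decode(
                        new ByteArrayInputStream(line.getBytes(StandardCharsets.UTF_8)),
                        Context.OUTER));
            }
        }
        return rows;
    }
}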

Example 10 with ResourceId

Use of org.apache.beam.sdk.io.fs.ResourceId in the Apache Beam project.

From the class BigQueryIOTest, method testWriteTables.

@Test
public void testWriteTables() throws Exception {
    p.enableAbandonedNodeEnforcement(false);
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(datasetService);
    datasetService.createDataset("project-id", "dataset-id", "", "");
    long numTables = 3;
    long numPartitions = 3;
    long numFilesPerPartition = 10;
    String jobIdToken = "jobIdToken";
    String stepUuid = "stepUuid";
    Map<TableDestination, List<String>> expectedTempTables = Maps.newHashMap();
    Path baseDir = Files.createTempDirectory(tempFolder, "testWriteTables");
    List<KV<ShardedKey<String>, List<String>>> partitions = Lists.newArrayList();
    for (int i = 0; i < numTables; ++i) {
        String tableName = String.format("project-id:dataset-id.table%05d", i);
        TableDestination tableDestination = new TableDestination(tableName, tableName);
        for (int j = 0; j < numPartitions; ++j) {
            String tempTableId = BigQueryHelpers.createJobId(jobIdToken, tableDestination, j);
            List<String> filesPerPartition = Lists.newArrayList();
            for (int k = 0; k < numFilesPerPartition; ++k) {
                String filename = Paths.get(baseDir.toString(), String.format("files0x%08x_%05d", tempTableId.hashCode(), k)).toString();
                ResourceId fileResource = FileSystems.matchNewResource(filename, false);
                try (WritableByteChannel channel = FileSystems.create(fileResource, MimeTypes.TEXT)) {
                    try (OutputStream output = Channels.newOutputStream(channel)) {
                        TableRow tableRow = new TableRow().set("name", tableName);
                        TableRowJsonCoder.of().encode(tableRow, output, Context.OUTER);
                        output.write("\n".getBytes(StandardCharsets.UTF_8));
                    }
                }
                filesPerPartition.add(filename);
            }
            partitions.add(KV.of(ShardedKey.of(tableDestination.getTableSpec(), j), filesPerPartition));
            List<String> expectedTables = expectedTempTables.get(tableDestination);
            if (expectedTables == null) {
                expectedTables = Lists.newArrayList();
                expectedTempTables.put(tableDestination, expectedTables);
            }
            String json = String.format("{\"datasetId\":\"dataset-id\",\"projectId\":\"project-id\",\"tableId\":\"%s\"}", tempTableId);
            expectedTables.add(json);
        }
    }
    PCollectionView<String> jobIdTokenView = p.apply("CreateJobId", Create.of("jobId")).apply(View.<String>asSingleton());
    PCollectionView<Map<String, String>> schemaMapView = p.apply("CreateEmptySchema", Create.empty(new TypeDescriptor<KV<String, String>>() {
    })).apply(View.<String, String>asMap());
    WriteTables<String> writeTables = new WriteTables<>(false, fakeBqServices, jobIdTokenView, schemaMapView, WriteDisposition.WRITE_EMPTY, CreateDisposition.CREATE_IF_NEEDED, new IdentityDynamicTables());
    DoFnTester<KV<ShardedKey<String>, List<String>>, KV<TableDestination, String>> tester = DoFnTester.of(writeTables);
    tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
    tester.setSideInput(schemaMapView, GlobalWindow.INSTANCE, ImmutableMap.<String, String>of());
    tester.getPipelineOptions().setTempLocation("tempLocation");
    for (KV<ShardedKey<String>, List<String>> partition : partitions) {
        tester.processElement(partition);
    }
    Map<TableDestination, List<String>> tempTablesResult = Maps.newHashMap();
    for (KV<TableDestination, String> element : tester.takeOutputElements()) {
        List<String> tables = tempTablesResult.get(element.getKey());
        if (tables == null) {
            tables = Lists.newArrayList();
            tempTablesResult.put(element.getKey(), tables);
        }
        tables.add(element.getValue());
    }
    assertEquals(expectedTempTables, tempTablesResult);
}
Also used: OutputStream (java.io.OutputStream), BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString), ArrayList (java.util.ArrayList), List (java.util.List), ImmutableList (com.google.common.collect.ImmutableList), Path (java.nio.file.Path), WritableByteChannel (java.nio.channels.WritableByteChannel), KV (org.apache.beam.sdk.values.KV), ResourceId (org.apache.beam.sdk.io.fs.ResourceId), TableRow (com.google.api.services.bigquery.model.TableRow), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), Test (org.junit.Test).
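Both grouping loops in this test (building expectedTempTables and collecting tempTablesResult) use the pre-Java-8 get/null-check/put pattern. On Java 8+, Map.computeIfAbsent expresses the same grouping in one line; a minimal, self-contained sketch:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupingSketch {
    public static void main(String[] args) {
        Map<String, List<String>> grouped = new HashMap<>();
        // Creates the list on first sight of the key, then appends to it.
        grouped.computeIfAbsent("table-0", k -> new ArrayList<>()).add("temp-table-json");
        grouped.computeIfAbsent("table-0", k -> new ArrayList<>()).add("temp-table-json-2");
        System.out.println(grouped); // {table-0=[temp-table-json, temp-table-json-2]}
    }
}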

Aggregations

Types used across these examples (type: number of usages):

ResourceId (org.apache.beam.sdk.io.fs.ResourceId): 23
Test (org.junit.Test): 12
ArrayList (java.util.ArrayList): 7
File (java.io.File): 4
FileResult (org.apache.beam.sdk.io.FileBasedSink.FileResult): 4
FilenamePolicy (org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy): 4
TableRow (com.google.api.services.bigquery.model.TableRow): 3
TableSchema (com.google.api.services.bigquery.model.TableSchema): 3
ImmutableList (com.google.common.collect.ImmutableList): 3
JobStatus (com.google.api.services.bigquery.model.JobStatus): 2
TableReference (com.google.api.services.bigquery.model.TableReference): 2
DefaultFilenamePolicy (org.apache.beam.sdk.io.DefaultFilenamePolicy): 2
Context (org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy.Context): 2
TextIO (org.apache.beam.sdk.io.TextIO): 2
GoogleJsonResponseException (com.google.api.client.googleapis.json.GoogleJsonResponseException): 1
ErrorProto (com.google.api.services.bigquery.model.ErrorProto): 1
Job (com.google.api.services.bigquery.model.Job): 1
JobConfiguration (com.google.api.services.bigquery.model.JobConfiguration): 1
JobStatistics (com.google.api.services.bigquery.model.JobStatistics): 1
Table (com.google.api.services.bigquery.model.Table): 1