Search in sources :

Example 6 with GcsPath

use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.

the class GcsUtilTest method testNonExistentObjectReturnsEmptyResult.

// GcsUtil.expand() must NOT fail when the pattern names a single object that does not exist:
// the object GET is stubbed to throw a 404 and the test expects an empty result instead.
// (Rationale given by the original comment: GCS get-object is strongly consistent.)
@Test
public void testNonExistentObjectReturnsEmptyResult() throws IOException {
    GcsOptions pipelineOptions = gcsOptionsWithTestCredential();
    GcsUtil gcsUtil = pipelineOptions.getGcsUtil();
    // Swap in a mocked Storage client so the test controls what the GET call does.
    Storage mockStorage = Mockito.mock(Storage.class);
    gcsUtil.setStorageClient(mockStorage);
    Storage.Objects mockStorageObjects = Mockito.mock(Storage.Objects.class);
    Storage.Objects.Get mockStorageGet = Mockito.mock(Storage.Objects.Get.class);
    GcsPath pattern = GcsPath.fromUri("gs://testbucket/testdirectory/nonexistentfile");
    GoogleJsonResponseException expectedException = googleJsonResponseException(HttpStatusCodes.STATUS_CODE_NOT_FOUND, "It don't exist", "Nothing here to see");
    // Stub the chain storage.objects().get(bucket, object).execute() to throw the NOT_FOUND exception.
    when(mockStorage.objects()).thenReturn(mockStorageObjects);
    when(mockStorageObjects.get(pattern.getBucket(), pattern.getObject())).thenReturn(mockStorageGet);
    when(mockStorageGet.execute()).thenThrow(expectedException);
    // Collections.emptyList() is the type-safe equivalent of the raw Collections.EMPTY_LIST constant.
    assertEquals(Collections.emptyList(), gcsUtil.expand(pattern));
}
Also used : GoogleJsonResponseException(com.google.api.client.googleapis.json.GoogleJsonResponseException) Storage(com.google.api.services.storage.Storage) Objects(com.google.api.services.storage.model.Objects) GcsPath(org.apache.beam.sdk.util.gcsfs.GcsPath) GcsOptions(org.apache.beam.sdk.extensions.gcp.options.GcsOptions) Test(org.junit.Test)

Example 7 with GcsPath

use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.

the class GcsUtil method makeGetBatches.

/**
 * Builds the {@link BatchRequest BatchRequests} used to look up the given paths.
 *
 * <p>The paths are copied into a list and partitioned into consecutive groups of at most
 * {@code MAX_REQUESTS_PER_BATCH}. One batch is created per group via {@code createBatchRequest()},
 * and every path in the group is passed to {@code enqueueGetFileSize} together with that batch.
 *
 * @param paths {@link GcsPath GcsPaths} to look up.
 * @param results mutable {@link List} that receives, in path order, the array returned by
 *     {@code enqueueGetFileSize} for each path.
 * @return the {@link BatchRequest BatchRequests} to execute, one per group of paths.
 * @throws IOException declared because the helper calls may throw it.
 */
@VisibleForTesting
List<BatchRequest> makeGetBatches(Collection<GcsPath> paths, List<StorageObjectOrIOException[]> results) throws IOException {
    List<BatchRequest> requests = new LinkedList<>();
    List<List<GcsPath>> groups = Lists.partition(Lists.newArrayList(paths), MAX_REQUESTS_PER_BATCH);
    for (List<GcsPath> group : groups) {
        BatchRequest request = createBatchRequest();
        for (GcsPath gcsPath : group) {
            StorageObjectOrIOException[] slot = enqueueGetFileSize(gcsPath, request);
            results.add(slot);
        }
        requests.add(request);
    }
    return requests;
}
Also used : BatchRequest(com.google.api.client.googleapis.batch.BatchRequest) GcsPath(org.apache.beam.sdk.util.gcsfs.GcsPath) LinkedList(java.util.LinkedList) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 8 with GcsPath

use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.

the class GcsFileSystemTest method testGlobExpansion.

/**
 * Verifies that {@code gcsFileSystem.expand} returns exactly the listed objects that match a glob,
 * in listing order, for several wildcard forms ({@code *}, {@code [1-3]}, {@code ?} and a wildcard
 * in the directory component).
 */
@Test
public void testGlobExpansion() throws IOException {
    Objects modelObjects = new Objects();
    List<StorageObject> items = new ArrayList<>();
    // A directory
    items.add(new StorageObject().setBucket("testbucket").setName("testdirectory/"));
    // Files within the directory
    items.add(createStorageObject("gs://testbucket/testdirectory/file1name", 1L));
    items.add(createStorageObject("gs://testbucket/testdirectory/file2name", 2L));
    items.add(createStorageObject("gs://testbucket/testdirectory/file3name", 3L));
    items.add(createStorageObject("gs://testbucket/testdirectory/otherfile", 4L));
    items.add(createStorageObject("gs://testbucket/testdirectory/anotherfile", 5L));
    items.add(createStorageObject("gs://testbucket/testotherdirectory/file4name", 6L));
    modelObjects.setItems(items);
    // Any listing of "testbucket" with a null third argument returns the whole fixture above.
    when(mockGcsUtil.listObjects(eq("testbucket"), anyString(), isNull(String.class))).thenReturn(modelObjects);

    String file1 = "gs://testbucket/testdirectory/file1name";
    String file2 = "gs://testbucket/testdirectory/file2name";
    String file3 = "gs://testbucket/testdirectory/file3name";
    String file4 = "gs://testbucket/testotherdirectory/file4name";

    // Test patterns. (The original repeated the "file*" case twice verbatim; it is checked once here.)
    assertGlobExpandsTo("gs://testbucket/testdirectory/file*", file1, file2, file3);
    assertGlobExpandsTo("gs://testbucket/testdirectory/file[1-3]*", file1, file2, file3);
    assertGlobExpandsTo("gs://testbucket/testdirectory/file?name", file1, file2, file3);
    assertGlobExpandsTo("gs://testbucket/test*ectory/fi*name", file1, file2, file3, file4);
}

/**
 * Expands {@code glob} through {@code gcsFileSystem} and asserts that the resulting filenames are
 * exactly {@code expectedFiles}, in order.
 *
 * <p>The expansion result is passed as the actual value and the expected names form the matcher,
 * so a failure message reports the expected names as expected and the expansion result as actual.
 */
private void assertGlobExpandsTo(String glob, String... expectedFiles) throws IOException {
    GcsPath pattern = GcsPath.fromUri(glob);
    assertThat(toFilenames(gcsFileSystem.expand(pattern)), contains(expectedFiles));
}
Also used : StorageObject(com.google.api.services.storage.model.StorageObject) Objects(com.google.api.services.storage.model.Objects) ArrayList(java.util.ArrayList) GcsPath(org.apache.beam.sdk.util.gcsfs.GcsPath) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) Matchers.anyString(org.mockito.Matchers.anyString) Test(org.junit.Test)

Example 9 with GcsPath

use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.

the class TfIdf method listInputDocuments.

/**
 * Lists documents contained beneath the {@code options.input} prefix/directory.
 *
 * <p>An input without a URI scheme is treated as a {@code file} URI. For {@code file} inputs the
 * direct entries of the directory are returned; for {@code gs} inputs the path is suffixed with
 * {@code *} and expanded through {@link GcsUtil}. Any other scheme yields an empty set.
 *
 * @param options pipeline options providing the input location via {@code getInput()}.
 * @return the URIs of the documents found; empty if none are found or the scheme is not handled.
 * @throws URISyntaxException if the input, or a URI derived from it, cannot be constructed.
 * @throws IOException if a {@code file} input cannot be listed (for example it does not exist or
 *     is not a directory), or if the GCS expansion fails.
 */
public static Set<URI> listInputDocuments(Options options) throws URISyntaxException, IOException {
    URI baseUri = new URI(options.getInput());
    // List all documents in the directory or GCS prefix.
    URI absoluteUri;
    if (baseUri.getScheme() != null) {
        absoluteUri = baseUri;
    } else {
        absoluteUri = new URI("file", baseUri.getAuthority(), baseUri.getPath(), baseUri.getQuery(), baseUri.getFragment());
    }
    Set<URI> uris = new HashSet<>();
    String scheme = absoluteUri.getScheme();
    if ("file".equals(scheme)) {
        File directory = new File(absoluteUri);
        // File.list() returns null rather than throwing when the path is not a directory or an
        // I/O error occurs; report that explicitly instead of failing with a NullPointerException.
        String[] entries = directory.list();
        if (entries == null) {
            throw new IOException("Unable to list input directory: " + directory);
        }
        for (String entry : entries) {
            File path = new File(directory, entry);
            uris.add(path.toURI());
        }
    } else if ("gs".equals(scheme)) {
        GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
        // Append "*" so the expansion matches everything under the given prefix.
        URI gcsUriGlob = new URI(absoluteUri.getScheme(), absoluteUri.getAuthority(), absoluteUri.getPath() + "*", absoluteUri.getQuery(), absoluteUri.getFragment());
        for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
            uris.add(entry.toUri());
        }
    }
    return uris;
}
Also used : GcsPath(org.apache.beam.sdk.util.gcsfs.GcsPath) URI(java.net.URI) File(java.io.File) GcsUtil(org.apache.beam.sdk.util.GcsUtil) HashSet(java.util.HashSet)

Example 10 with GcsPath

use of org.apache.beam.sdk.util.gcsfs.GcsPath in project DataflowJavaSDK-examples by GoogleCloudPlatform.

the class MinimalWordCountJava8Test method buildMockGcsUtil.

/**
 * Creates a Mockito mock of {@link GcsUtil} with two stubbed behaviours: {@code open} returns a
 * channel over a newly created temporary file, and {@code expand} returns a one-element list
 * holding the path it was called with.
 */
private GcsUtil buildMockGcsUtil() throws IOException {
    GcsUtil stubbedUtil = Mockito.mock(GcsUtil.class);

    // Any request to open gets a new bogus channel
    Answer<SeekableByteChannel> freshTempChannel =
        invocation ->
            FileChannel.open(
                Files.createTempFile("channel-", ".tmp"),
                StandardOpenOption.CREATE,
                StandardOpenOption.DELETE_ON_CLOSE);
    Mockito.when(stubbedUtil.open(Mockito.any(GcsPath.class))).then(freshTempChannel);

    // Any request for expansion returns a list containing the original GcsPath
    // This is required to pass validation that occurs in TextIO during apply()
    Answer<List<GcsPath>> echoRequestedPath =
        invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]);
    Mockito.when(stubbedUtil.expand(Mockito.any(GcsPath.class))).then(echoRequestedPath);

    return stubbedUtil;
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) InvocationOnMock(org.mockito.invocation.InvocationOnMock) GcsPath(org.apache.beam.sdk.util.gcsfs.GcsPath) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) GcsUtil(org.apache.beam.sdk.util.GcsUtil)

Aggregations

GcsPath (org.apache.beam.sdk.util.gcsfs.GcsPath)19 ImmutableList (com.google.common.collect.ImmutableList)9 List (java.util.List)8 Objects (com.google.api.services.storage.model.Objects)7 StorageObject (com.google.api.services.storage.model.StorageObject)6 ArrayList (java.util.ArrayList)6 LinkedList (java.util.LinkedList)6 GcsUtil (org.apache.beam.sdk.util.GcsUtil)6 Test (org.junit.Test)6 InvocationOnMock (org.mockito.invocation.InvocationOnMock)5 Storage (com.google.api.services.storage.Storage)4 GcsOptions (org.apache.beam.sdk.extensions.gcp.options.GcsOptions)4 SeekableByteChannel (java.nio.channels.SeekableByteChannel)3 BatchRequest (com.google.api.client.googleapis.batch.BatchRequest)2 GoogleJsonResponseException (com.google.api.client.googleapis.json.GoogleJsonResponseException)2 FileNotFoundException (java.io.FileNotFoundException)2 DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)2 TestCredential (org.apache.beam.sdk.extensions.gcp.auth.TestCredential)2 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)2 TupleTagList (org.apache.beam.sdk.values.TupleTagList)2