Search in sources :

Example 6 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class GcsFileSystem method match.

/**
 * Matches every spec and returns one {@link MatchResult} per spec, in the same order as
 * {@code specs}.
 *
 * <p>The specs are converted to {@link GcsPath}s and partitioned into wildcard and non-wildcard
 * paths (per {@code GcsUtil.isWildcard}). Each partition is resolved with a single call to
 * {@code matchGlobs} / {@code matchNonGlobs}, and the two result sequences are then interleaved
 * back into the original input order.
 *
 * @param specs the path specs to match
 * @return the match results, positionally aligned with {@code specs}
 * @throws IOException declared for the underlying conversion and matching calls
 */
@Override
protected List<MatchResult> match(List<String> specs) throws IOException {
    List<GcsPath> gcsPaths = toGcsPaths(specs);
    List<GcsPath> globs = Lists.newArrayList();
    List<GcsPath> nonGlobs = Lists.newArrayList();
    // Remembers, per input position, which partition the path went to so that the
    // results can be merged back in input order below.
    List<Boolean> isGlobBooleans = Lists.newArrayList();
    for (GcsPath path : gcsPaths) {
        if (GcsUtil.isWildcard(path)) {
            globs.add(path);
            isGlobBooleans.add(true);
        } else {
            nonGlobs.add(path);
            isGlobBooleans.add(false);
        }
    }
    Iterator<MatchResult> globsMatchResults = matchGlobs(globs).iterator();
    Iterator<MatchResult> nonGlobsMatchResults = matchNonGlobs(nonGlobs).iterator();
    ImmutableList.Builder<MatchResult> ret = ImmutableList.builder();
    for (Boolean isGlob : isGlobBooleans) {
        if (isGlob) {
            checkState(globsMatchResults.hasNext(), "Expect globsMatchResults has next: %s", globs);
            ret.add(globsMatchResults.next());
        } else {
            checkState(nonGlobsMatchResults.hasNext(), "Expect nonGlobsMatchResults has next: %s", nonGlobs);
            ret.add(nonGlobsMatchResults.next());
        }
    }
    // Both iterators must be fully consumed; leftovers mean a matcher returned more
    // results than it was given paths.
    checkState(!globsMatchResults.hasNext(), "Internal error encountered in GcsFilesystem: expected no more elements in globsMatchResults.");
    checkState(!nonGlobsMatchResults.hasNext(), "Internal error encountered in GcsFilesystem: expected no more elements in nonGlobsMatchResults.");
    return ret.build();
}
Also used : ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) MatchResult(org.apache.beam.sdk.io.fs.MatchResult)

Example 7 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class GcsResourceId method getCurrentDirectory.

/**
 * Returns the directory of this resource: the resource itself when {@code isDirectory()} is
 * true, otherwise the resource built from the parent of its GCS path.
 *
 * <p>A {@code null} parent fails the {@code checkState} precondition.
 */
@Override
public GcsResourceId getCurrentDirectory() {
    // A directory is its own current directory.
    if (isDirectory()) {
        return this;
    }
    GcsPath parentPath = gcsPath.getParent();
    checkState(parentPath != null, String.format("Failed to get the current directory for path: [%s].", gcsPath));
    return fromGcsPath(parentPath);
}
Also used : GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath)

Example 8 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class TfIdf method listInputDocuments.

/**
 * Collects the URIs of the documents found under the {@code options.input} prefix/directory.
 *
 * <p>An input without a URI scheme is treated as a {@code file} URI. For {@code file} inputs the
 * entries of the directory are listed; for {@code gs} inputs the path is suffixed with
 * {@code *} and expanded through {@code GcsUtil.expand}. Any other scheme yields an empty set.
 *
 * @param options pipeline options supplying the input location
 * @return the URIs of the discovered documents
 * @throws URISyntaxException if the input, or a URI derived from it, is not a valid URI
 * @throws IOException declared for the GCS expansion call
 */
public static Set<URI> listInputDocuments(Options options) throws URISyntaxException, IOException {
    URI inputUri = new URI(options.getInput());
    // Inputs without a scheme are interpreted as local file paths.
    URI resolvedUri = inputUri.getScheme() != null
            ? inputUri
            : new URI("file", inputUri.getAuthority(), inputUri.getPath(), inputUri.getQuery(), inputUri.getFragment());
    String scheme = resolvedUri.getScheme();
    Set<URI> documents = new HashSet<>();
    if ("file".equals(scheme)) {
        File dir = new File(resolvedUri);
        // File.list() yields null when the path cannot be listed; treat that as "no entries".
        String[] names = dir.list();
        if (names != null) {
            for (String name : names) {
                documents.add(new File(dir, name).toURI());
            }
        }
    } else if ("gs".equals(scheme)) {
        GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
        // Turn the prefix into a glob so that expand() enumerates what lies beneath it.
        String globPath = resolvedUri.getPath() + "*";
        URI globUri = new URI(resolvedUri.getScheme(), resolvedUri.getAuthority(), globPath, resolvedUri.getQuery(), resolvedUri.getFragment());
        for (GcsPath match : gcsUtil.expand(GcsPath.fromUri(globUri))) {
            documents.add(match.toUri());
        }
    }
    return documents;
}
Also used : GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) URI(java.net.URI) File(java.io.File) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil) HashSet(java.util.HashSet)

Example 9 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class DataflowRunnerTest method buildMockGcsUtil.

/**
 * Builds a Mockito mock of {@link GcsUtil} with the stubbing used by these tests.
 *
 * <ul>
 *   <li>{@code create} opens a write channel on a fresh temporary file that is deleted on close.
 *   <li>{@code expand} returns a single-element list containing its argument.
 *   <li>{@code bucketAccessible} returns {@code true} for the staging, temp, temp "/staging/",
 *       profile and {@code gs://bucket/object} locations, and {@code false} for
 *       {@code NON_EXISTENT_BUCKET}.
 *   <li>{@code getObjects} returns a {@link StorageObject} for each requested path whose bucket
 *       equals {@code VALID_BUCKET}; paths in other buckets contribute no entry.
 * </ul>
 *
 * @return the configured mock
 * @throws IOException declared by the stubbed {@code GcsUtil} methods
 */
private static GcsUtil buildMockGcsUtil() throws IOException {
    GcsUtil mockGcsUtil = mock(GcsUtil.class);
    when(mockGcsUtil.create(any(GcsPath.class), any(GcsUtil.CreateOptions.class))).then(invocation -> FileChannel.open(Files.createTempFile("channel-", ".tmp"), StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.DELETE_ON_CLOSE));
    when(mockGcsUtil.expand(any(GcsPath.class))).then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_STAGING_BUCKET))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_TEMP_BUCKET))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_TEMP_BUCKET + "/staging/"))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_PROFILE_BUCKET))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(NON_EXISTENT_BUCKET))).thenReturn(false);
    // Let every valid path be matched
    when(mockGcsUtil.getObjects(anyListOf(GcsPath.class))).thenAnswer(invocationOnMock -> {
        List<GcsPath> gcsPaths = (List<GcsPath>) invocationOnMock.getArguments()[0];
        List<GcsUtil.StorageObjectOrIOException> results = new ArrayList<>();
        for (GcsPath gcsPath : gcsPaths) {
            if (gcsPath.getBucket().equals(VALID_BUCKET)) {
                StorageObject resultObject = new StorageObject();
                resultObject.setBucket(gcsPath.getBucket());
                resultObject.setName(gcsPath.getObject());
                results.add(GcsUtil.StorageObjectOrIOException.create(resultObject));
            }
        }
        return results;
    });
    // The dataflow pipeline attempts to output to this location.
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri("gs://bucket/object"))).thenReturn(true);
    return mockGcsUtil;
}
Also used : StorageObject(com.google.api.services.storage.model.StorageObject) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil)

Example 10 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class GcsUtilTest method testGlobExpansion.

/**
 * Checks {@code GcsUtil.expand} against a mocked {@code Storage} client: a non-wildcard path is
 * resolved through the stubbed object {@code get}, and several wildcard patterns are resolved
 * against the stubbed bucket listing.
 */
@Test
public void testGlobExpansion() throws IOException {
    GcsOptions pipelineOptions = gcsOptionsWithTestCredential();
    GcsUtil gcsUtil = pipelineOptions.getGcsUtil();
    Storage mockStorage = Mockito.mock(Storage.class);
    gcsUtil.setStorageClient(mockStorage);
    Storage.Objects mockStorageObjects = Mockito.mock(Storage.Objects.class);
    Storage.Objects.Get mockStorageGet = Mockito.mock(Storage.Objects.Get.class);
    Storage.Objects.List mockStorageList = Mockito.mock(Storage.Objects.List.class);

    // Bucket listing: one directory placeholder followed by the files inside it.
    String[] objectNames = {
        "testdirectory/",
        "testdirectory/file1name",
        "testdirectory/file2name",
        "testdirectory/file3name",
        "testdirectory/otherfile",
        "testdirectory/anotherfile"
    };
    List<StorageObject> listing = new ArrayList<>();
    for (String objectName : objectNames) {
        listing.add(new StorageObject().setBucket("testbucket").setName(objectName));
    }
    Objects modelObjects = new Objects();
    modelObjects.setItems(listing);

    when(mockStorage.objects()).thenReturn(mockStorageObjects);
    when(mockStorageObjects.get("testbucket", "testdirectory/otherfile")).thenReturn(mockStorageGet);
    when(mockStorageObjects.list("testbucket")).thenReturn(mockStorageList);
    when(mockStorageGet.execute()).thenReturn(new StorageObject().setBucket("testbucket").setName("testdirectory/otherfile"));
    when(mockStorageList.execute()).thenReturn(modelObjects);

    // Test a single file.
    GcsPath singleFile = GcsPath.fromUri("gs://testbucket/testdirectory/otherfile");
    List<GcsPath> expectedSingle = ImmutableList.of(GcsPath.fromUri("gs://testbucket/testdirectory/otherfile"));
    assertThat(expectedSingle, contains(gcsUtil.expand(singleFile).toArray()));

    // Test patterns: each one is expected to select exactly the three numbered files.
    String[] globs = {
        "gs://testbucket/testdirectory/file*",
        "gs://testbucket/testdirectory/file[1-3]*",
        "gs://testbucket/testdirectory/file?name",
        "gs://testbucket/test*ectory/fi*name"
    };
    for (String glob : globs) {
        GcsPath pattern = GcsPath.fromUri(glob);
        List<GcsPath> expectedFiles = ImmutableList.of(GcsPath.fromUri("gs://testbucket/testdirectory/file1name"), GcsPath.fromUri("gs://testbucket/testdirectory/file2name"), GcsPath.fromUri("gs://testbucket/testdirectory/file3name"));
        assertThat(expectedFiles, contains(gcsUtil.expand(pattern).toArray()));
    }
}
Also used : Storage(com.google.api.services.storage.Storage) GoogleCloudStorage(com.google.cloud.hadoop.gcsio.GoogleCloudStorage) StorageObject(com.google.api.services.storage.model.StorageObject) Objects(com.google.api.services.storage.model.Objects) ArrayList(java.util.ArrayList) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) LinkedList(java.util.LinkedList) GcsOptions(org.apache.beam.sdk.extensions.gcp.options.GcsOptions) Test(org.junit.Test)

Aggregations

GcsPath (org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath)20 ArrayList (java.util.ArrayList)8 Objects (com.google.api.services.storage.model.Objects)7 StorageObject (com.google.api.services.storage.model.StorageObject)7 Test (org.junit.Test)7 GcsUtil (org.apache.beam.sdk.extensions.gcp.util.GcsUtil)6 ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)6 GoogleCloudStorage (com.google.cloud.hadoop.gcsio.GoogleCloudStorage)5 List (java.util.List)5 GcsOptions (org.apache.beam.sdk.extensions.gcp.options.GcsOptions)5 Storage (com.google.api.services.storage.Storage)4 GoogleJsonResponseException (com.google.api.client.googleapis.json.GoogleJsonResponseException)2 FileNotFoundException (java.io.FileNotFoundException)2 LinkedList (java.util.LinkedList)2 DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)2 TestCredential (org.apache.beam.sdk.extensions.gcp.auth.TestCredential)2 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)2 ArgumentMatchers.anyString (org.mockito.ArgumentMatchers.anyString)2 StorageResourceId (com.google.cloud.hadoop.gcsio.StorageResourceId)1 File (java.io.File)1