Search in sources :

Example 16 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class BatchStatefulParDoOverridesTest method buildPipelineOptions.

/**
 * Builds {@link DataflowPipelineOptions} for tests from the given command-line style arguments.
 *
 * <p>The options are wired to a mocked {@link GcsUtil} that echoes back any path passed to
 * {@code expand} and reports every bucket as accessible. The resulting options are also
 * installed as the default for {@link FileSystems}.
 *
 * @param args arguments forwarded to {@code PipelineOptionsFactory.fromArgs}
 * @return the configured options
 * @throws IOException declared because the stubbed {@link GcsUtil} methods declare it
 */
private static DataflowPipelineOptions buildPipelineOptions(String... args) throws IOException {
    // Stub GCS access: expansion returns the requested path unchanged, buckets are always reachable.
    GcsUtil gcsUtilMock = mock(GcsUtil.class);
    when(gcsUtilMock.expand(any(GcsPath.class))).then(invocation -> {
        GcsPath requested = (GcsPath) invocation.getArguments()[0];
        return ImmutableList.of(requested);
    });
    when(gcsUtilMock.bucketAccessible(any(GcsPath.class))).thenReturn(true);

    DataflowPipelineOptions dataflowOptions =
        PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
    dataflowOptions.setRunner(DataflowRunner.class);
    dataflowOptions.setGcpCredential(new TestCredential());
    dataflowOptions.setJobName("some-job-name");
    dataflowOptions.setProject("some-project");
    dataflowOptions.setRegion("some-region");
    GcsPath tempLocation = GcsPath.fromComponents("somebucket", "some/path");
    dataflowOptions.setTempLocation(tempLocation.toString());
    dataflowOptions.setFilesToStage(new ArrayList<>());
    dataflowOptions.setGcsUtil(gcsUtilMock);

    // Enable the FileSystems API to know about gs:// URIs in this test.
    FileSystems.setDefaultPipelineOptions(dataflowOptions);
    return dataflowOptions;
}
Also used : TestCredential(org.apache.beam.sdk.extensions.gcp.auth.TestCredential) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil)

Example 17 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class GcsUtil method makeRewriteOps.

/**
 * Pairs each source filename with the destination filename at the same position and builds one
 * {@code RewriteOp} per pair, in order.
 *
 * @param srcFilenames source URIs, parsed with {@code GcsPath.fromUri}
 * @param destFilenames destination URIs, parsed with {@code GcsPath.fromUri}
 * @param deleteSource forwarded to each {@code RewriteOp}
 * @param ignoreMissingSource forwarded to each {@code RewriteOp}
 * @param ignoreExistingDest when true, every pair must stay within a single bucket
 * @return the rewrite operations, in input order
 * @throws IllegalArgumentException (via {@code checkArgument}) if the two inputs differ in length
 * @throws UnsupportedOperationException if {@code ignoreExistingDest} is set and a pair spans buckets
 */
LinkedList<RewriteOp> makeRewriteOps(Iterable<String> srcFilenames, Iterable<String> destFilenames, boolean deleteSource, boolean ignoreMissingSource, boolean ignoreExistingDest) throws IOException {
    List<String> sources = Lists.newArrayList(srcFilenames);
    List<String> destinations = Lists.newArrayList(destFilenames);
    int sourceCount = sources.size();
    int destinationCount = destinations.size();
    checkArgument(sourceCount == destinationCount, "Number of source files %s must equal number of destination files %s", sourceCount, destinationCount);

    LinkedList<RewriteOp> ops = Lists.newLinkedList();
    int position = 0;
    for (String sourceUri : sources) {
        final GcsPath from = GcsPath.fromUri(sourceUri);
        final GcsPath to = GcsPath.fromUri(destinations.get(position));
        position++;
        // The bucket comparison is only evaluated when the caller asked to ignore existing destinations.
        if (ignoreExistingDest) {
            boolean sameBucket = from.getBucket().equals(to.getBucket());
            if (!sameBucket) {
                throw new UnsupportedOperationException("Skipping dest existence is only supported within a bucket.");
            }
        }
        ops.add(new RewriteOp(from, to, deleteSource, ignoreMissingSource));
    }
    return ops;
}
Also used : GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath)

Example 18 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class GcsUtil method expand.

/**
 * Resolves a GCS pattern to the list of paths it matches.
 *
 * <p>A pattern without wildcards is checked for existence and yields either a one-element list
 * containing that same path or an empty list. A wildcard pattern lists the bucket under the
 * pattern's literal prefix, page by page, and keeps every non-directory object whose name matches
 * the pattern.
 *
 * @param gcsPattern the path or glob pattern to expand
 * @return the matching paths; empty when nothing matches
 * @throws IOException if a lookup or listing fails for a reason other than the object being absent
 */
public List<GcsPath> expand(GcsPath gcsPattern) throws IOException {
    if (!isWildcard(gcsPattern)) {
        // Plain path: fetch the object's metadata purely as an existence probe; the result is unused.
        try {
            getObject(gcsPattern);
            return ImmutableList.of(gcsPattern);
        } catch (FileNotFoundException e) {
            // A missing object is reported as "no matches" rather than as an error.
            return ImmutableList.of();
        }
    }

    // Glob: list everything under the portion of the name that precedes the first wildcard.
    String objectGlob = gcsPattern.getObject();
    String prefix = getNonWildcardPrefix(objectGlob);
    Pattern p = Pattern.compile(wildcardToRegexp(objectGlob));
    String bucket = gcsPattern.getBucket();
    LOG.debug("matching files in bucket {}, prefix {} against pattern {}", bucket, prefix, p.toString());

    List<GcsPath> matches = new ArrayList<>();
    String nextPageToken = null;
    while (true) {
        Objects page = listObjects(bucket, prefix, nextPageToken);
        List<StorageObject> items = page.getItems();
        if (items == null) {
            // A page with no item list ends the scan.
            break;
        }
        for (StorageObject candidate : items) {
            String objectName = candidate.getName();
            // Names ending with a slash are directories and are never returned.
            boolean isDirectory = objectName.endsWith("/");
            if (p.matcher(objectName).matches() && !isDirectory) {
                LOG.debug("Matched object: {}", objectName);
                matches.add(GcsPath.fromObject(candidate));
            }
        }
        nextPageToken = page.getNextPageToken();
        if (nextPageToken == null) {
            break;
        }
    }
    return matches;
}
Also used : Pattern(java.util.regex.Pattern) StorageObject(com.google.api.services.storage.model.StorageObject) FileNotFoundException(java.io.FileNotFoundException) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) ArrayList(java.util.ArrayList) Objects(com.google.api.services.storage.model.Objects)

Example 19 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class GcsFileSystemTest method createStorageObject.

/**
 * Builds a {@link StorageObject} whose bucket and name are taken from the given GCS URI.
 *
 * @param gcsFilename URI parsed with {@code GcsPath.fromUri}
 * @param fileSize size to record; a value of 0 is stored as a {@code null} size
 * @return the populated storage object
 */
private StorageObject createStorageObject(String gcsFilename, long fileSize) {
    GcsPath parsedPath = GcsPath.fromUri(gcsFilename);
    // Google APIs will use null for empty files.
    @Nullable BigInteger recordedSize = null;
    if (fileSize != 0) {
        recordedSize = BigInteger.valueOf(fileSize);
    }
    StorageObject storageObject = new StorageObject();
    storageObject.setBucket(parsedPath.getBucket());
    storageObject.setName(parsedPath.getObject());
    storageObject.setSize(recordedSize);
    return storageObject;
}
Also used : StorageObject(com.google.api.services.storage.model.StorageObject) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) BigInteger(java.math.BigInteger) Nullable(org.checkerframework.checker.nullness.qual.Nullable)

Example 20 with GcsPath

use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.

the class GcsFileSystemTest method testMatch.

/**
 * Matches three specs in one call: a glob, a missing single file, and an existing single file.
 * Expects the glob to return only the three files it covers, the missing file to be reported as
 * NOT_FOUND, and the existing file to be returned as-is.
 */
@Test
public void testMatch() throws Exception {
    // Bucket listing served by the mock: a directory placeholder followed by six files.
    List<StorageObject> listing = new ArrayList<>();
    StorageObject directoryMarker = new StorageObject();
    directoryMarker.setBucket("testbucket");
    directoryMarker.setName("testdirectory/");
    listing.add(directoryMarker);
    listing.add(createStorageObject("gs://testbucket/testdirectory/file1name", 1L));
    listing.add(createStorageObject("gs://testbucket/testdirectory/file2name", 2L));
    listing.add(createStorageObject("gs://testbucket/testdirectory/file3name", 3L));
    listing.add(createStorageObject("gs://testbucket/testdirectory/file4name", 4L));
    listing.add(createStorageObject("gs://testbucket/testdirectory/otherfile", 5L));
    listing.add(createStorageObject("gs://testbucket/testdirectory/anotherfile", 6L));
    Objects modelObjects = new Objects();
    modelObjects.setItems(listing);
    when(mockGcsUtil.listObjects(eq("testbucket"), anyString(), isNull(String.class)))
        .thenReturn(modelObjects);

    // Direct lookups for the two non-glob specs: the first is missing, the second exists.
    GcsPath missingPath = GcsPath.fromUri("gs://testbucket/testdirectory/non-exist-file");
    GcsPath presentPath = GcsPath.fromUri("gs://testbucket/testdirectory/otherfile");
    List<GcsPath> gcsPaths = ImmutableList.of(missingPath, presentPath);
    when(mockGcsUtil.getObjects(eq(gcsPaths)))
        .thenReturn(
            ImmutableList.of(
                StorageObjectOrIOException.create(new FileNotFoundException()),
                StorageObjectOrIOException.create(
                    createStorageObject("gs://testbucket/testdirectory/otherfile", 4L))));

    List<String> specs =
        ImmutableList.of(
            "gs://testbucket/testdirectory/file[1-3]*",
            "gs://testbucket/testdirectory/non-exist-file",
            "gs://testbucket/testdirectory/otherfile");
    List<MatchResult> matchResults = gcsFileSystem.match(specs);
    assertEquals(3, matchResults.size());

    // Glob spec: only file1name..file3name fall inside [1-3].
    MatchResult globResult = matchResults.get(0);
    assertEquals(Status.OK, globResult.status());
    List<String> expectedGlobFiles =
        ImmutableList.of(
            "gs://testbucket/testdirectory/file1name",
            "gs://testbucket/testdirectory/file2name",
            "gs://testbucket/testdirectory/file3name");
    assertThat(expectedGlobFiles, contains(toFilenames(globResult).toArray()));

    // Missing single file.
    assertEquals(Status.NOT_FOUND, matchResults.get(1).status());

    // Existing single file.
    MatchResult singleFileResult = matchResults.get(2);
    assertEquals(Status.OK, singleFileResult.status());
    List<String> expectedSingleFile = ImmutableList.of("gs://testbucket/testdirectory/otherfile");
    assertThat(expectedSingleFile, contains(toFilenames(singleFileResult).toArray()));
}
Also used : StorageObject(com.google.api.services.storage.model.StorageObject) Objects(com.google.api.services.storage.model.Objects) ArrayList(java.util.ArrayList) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) FileNotFoundException(java.io.FileNotFoundException) ArgumentMatchers.anyString(org.mockito.ArgumentMatchers.anyString) MatchResult(org.apache.beam.sdk.io.fs.MatchResult) Test(org.junit.Test)

Aggregations

GcsPath (org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath)20 ArrayList (java.util.ArrayList)8 Objects (com.google.api.services.storage.model.Objects)7 StorageObject (com.google.api.services.storage.model.StorageObject)7 Test (org.junit.Test)7 GcsUtil (org.apache.beam.sdk.extensions.gcp.util.GcsUtil)6 ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)6 GoogleCloudStorage (com.google.cloud.hadoop.gcsio.GoogleCloudStorage)5 List (java.util.List)5 GcsOptions (org.apache.beam.sdk.extensions.gcp.options.GcsOptions)5 Storage (com.google.api.services.storage.Storage)4 GoogleJsonResponseException (com.google.api.client.googleapis.json.GoogleJsonResponseException)2 FileNotFoundException (java.io.FileNotFoundException)2 LinkedList (java.util.LinkedList)2 DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)2 TestCredential (org.apache.beam.sdk.extensions.gcp.auth.TestCredential)2 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)2 ArgumentMatchers.anyString (org.mockito.ArgumentMatchers.anyString)2 StorageResourceId (com.google.cloud.hadoop.gcsio.StorageResourceId)1 File (java.io.File)1