use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.
the class GcsFileSystem method match.
/**
 * Matches every spec and returns one {@link MatchResult} per spec, in the same order as
 * {@code specs}.
 *
 * <p>The specs are converted to {@link GcsPath}s and split into wildcard and non-wildcard
 * paths (per {@code GcsUtil.isWildcard}). Each group is resolved in a single call to
 * {@code matchGlobs} / {@code matchNonGlobs}, and the two result sequences are then
 * interleaved back into the original order.
 *
 * @param specs the specs to match
 * @return an immutable list holding one result per input spec, in input order
 * @throws IOException propagated from the path conversion or matching helpers
 */
@Override
protected List<MatchResult> match(List<String> specs) throws IOException {
  List<GcsPath> gcsPaths = toGcsPaths(specs);

  List<GcsPath> globs = Lists.newArrayList();
  List<GcsPath> nonGlobs = Lists.newArrayList();
  // Remembers, per input position, which group the path went to so the results can be
  // merged back in the caller's order.
  List<Boolean> isGlobBooleans = Lists.newArrayList();
  for (GcsPath path : gcsPaths) {
    if (GcsUtil.isWildcard(path)) {
      globs.add(path);
      isGlobBooleans.add(true);
    } else {
      nonGlobs.add(path);
      isGlobBooleans.add(false);
    }
  }

  Iterator<MatchResult> globsMatchResults = matchGlobs(globs).iterator();
  Iterator<MatchResult> nonGlobsMatchResults = matchNonGlobs(nonGlobs).iterator();

  ImmutableList.Builder<MatchResult> ret = ImmutableList.builder();
  for (Boolean isGlob : isGlobBooleans) {
    if (isGlob) {
      checkState(globsMatchResults.hasNext(), "Expect globsMatchResults has next: %s", globs);
      ret.add(globsMatchResults.next());
    } else {
      checkState(
          nonGlobsMatchResults.hasNext(), "Expect nonGlobsMatchResults has next: %s", nonGlobs);
      ret.add(nonGlobsMatchResults.next());
    }
  }

  // Both helpers must have produced exactly one result per path they were given.
  checkState(
      !globsMatchResults.hasNext(),
      "Internal error encountered in GcsFilesystem: expected no more elements in globsMatchResults.");
  checkState(
      !nonGlobsMatchResults.hasNext(),
      "Internal error encountered in GcsFilesystem: expected no more elements in nonGlobsMatchResults.");
  return ret.build();
}
use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.
the class GcsResourceId method getCurrentDirectory.
/**
 * Returns the directory this resource lives in.
 *
 * <p>When {@code isDirectory()} is true the resource itself is returned; otherwise the parent
 * of {@code gcsPath} is wrapped via {@code fromGcsPath}.
 *
 * @return this resource if it is a directory, else a resource for the parent path
 */
@Override
public GcsResourceId getCurrentDirectory() {
  if (isDirectory()) {
    return this;
  }
  GcsPath parent = gcsPath.getParent();
  // Pass the template and argument separately so the message is only formatted when the
  // check actually fails, instead of running String.format on every call.
  checkState(parent != null, "Failed to get the current directory for path: [%s].", gcsPath);
  return fromGcsPath(parent);
}
use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.
the class TfIdf method listInputDocuments.
/**
 * Enumerates the documents found under the location named by {@code options.getInput()}.
 *
 * <p>An input without a scheme is re-interpreted as a {@code file} URI. For {@code file}
 * inputs every entry returned by listing the directory is added; for {@code gs} inputs a
 * trailing {@code *} is appended to the path and the resulting glob is expanded through
 * {@link GcsUtil}. Any other scheme yields an empty set.
 */
public static Set<URI> listInputDocuments(Options options) throws URISyntaxException, IOException {
  URI inputUri = new URI(options.getInput());
  // Inputs given without a scheme are treated as local files.
  URI resolvedUri =
      inputUri.getScheme() == null
          ? new URI(
              "file",
              inputUri.getAuthority(),
              inputUri.getPath(),
              inputUri.getQuery(),
              inputUri.getFragment())
          : inputUri;

  Set<URI> documents = new HashSet<>();
  String scheme = resolvedUri.getScheme();
  if ("file".equals(scheme)) {
    File root = new File(resolvedUri);
    // File.list() returns null when the path cannot be listed; treat that as "no entries".
    String[] names = root.list();
    if (names != null) {
      for (String name : names) {
        documents.add(new File(root, name).toURI());
      }
    }
  } else if ("gs".equals(scheme)) {
    GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
    // Turn the prefix into a glob so that expand() returns everything beneath it.
    URI prefixGlob =
        new URI(
            scheme,
            resolvedUri.getAuthority(),
            resolvedUri.getPath() + "*",
            resolvedUri.getQuery(),
            resolvedUri.getFragment());
    for (GcsPath match : gcsUtil.expand(GcsPath.fromUri(prefixGlob))) {
      documents.add(match.toUri());
    }
  }
  return documents;
}
use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.
the class DataflowRunnerTest method buildMockGcsUtil.
/**
 * Builds a Mockito mock of {@link GcsUtil} for the tests in this class.
 *
 * <p>The mock is stubbed so that:
 *
 * <ul>
 *   <li>{@code create} answers with a writable {@link FileChannel} over a fresh temp file
 *       that is deleted when the channel is closed;
 *   <li>{@code expand} echoes back the path it was given as a single-element list;
 *   <li>{@code bucketAccessible} is true for the staging, temp, temp + "/staging/", profile
 *       and {@code gs://bucket/object} locations, and false for {@code NON_EXISTENT_BUCKET};
 *   <li>{@code getObjects} returns a {@link StorageObject} result for each requested path
 *       whose bucket equals {@code VALID_BUCKET}; other paths produce no entry.
 * </ul>
 *
 * @return the configured mock
 * @throws IOException declared because the stubbed calls declare it
 */
private static GcsUtil buildMockGcsUtil() throws IOException {
  GcsUtil mockGcsUtil = mock(GcsUtil.class);

  // Stubbed exactly once: repeating when(...) on an already stubbed call would run the
  // answer below and open a temp-file channel that nothing ever closes.
  when(mockGcsUtil.create(any(GcsPath.class), any(GcsUtil.CreateOptions.class)))
      .then(
          invocation ->
              FileChannel.open(
                  Files.createTempFile("channel-", ".tmp"),
                  StandardOpenOption.CREATE,
                  StandardOpenOption.WRITE,
                  StandardOpenOption.DELETE_ON_CLOSE));

  when(mockGcsUtil.expand(any(GcsPath.class)))
      .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));

  when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_STAGING_BUCKET))).thenReturn(true);
  when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_TEMP_BUCKET))).thenReturn(true);
  when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_TEMP_BUCKET + "/staging/")))
      .thenReturn(true);
  when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_PROFILE_BUCKET))).thenReturn(true);
  when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(NON_EXISTENT_BUCKET))).thenReturn(false);

  // Let every valid path be matched
  when(mockGcsUtil.getObjects(anyListOf(GcsPath.class)))
      .thenAnswer(
          invocationOnMock -> {
            // Safe: the stub only matches calls whose argument is a list of GcsPath.
            @SuppressWarnings("unchecked")
            List<GcsPath> gcsPaths = (List<GcsPath>) invocationOnMock.getArguments()[0];
            List<GcsUtil.StorageObjectOrIOException> results = new ArrayList<>();
            for (GcsPath gcsPath : gcsPaths) {
              if (gcsPath.getBucket().equals(VALID_BUCKET)) {
                StorageObject resultObject = new StorageObject();
                resultObject.setBucket(gcsPath.getBucket());
                resultObject.setName(gcsPath.getObject());
                results.add(GcsUtil.StorageObjectOrIOException.create(resultObject));
              }
            }
            return results;
          });

  // The dataflow pipeline attempts to output to this location.
  when(mockGcsUtil.bucketAccessible(GcsPath.fromUri("gs://bucket/object"))).thenReturn(true);
  return mockGcsUtil;
}
use of org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath in project beam by apache.
the class GcsUtilTest method testGlobExpansion.
/**
 * Checks that {@code GcsUtil.expand} resolves a plain object path and several glob patterns
 * against a mocked bucket listing.
 */
@Test
public void testGlobExpansion() throws IOException {
  GcsOptions pipelineOptions = gcsOptionsWithTestCredential();
  GcsUtil gcsUtil = pipelineOptions.getGcsUtil();

  Storage storage = Mockito.mock(Storage.class);
  gcsUtil.setStorageClient(storage);
  Storage.Objects objectsApi = Mockito.mock(Storage.Objects.class);
  Storage.Objects.Get getRequest = Mockito.mock(Storage.Objects.Get.class);
  Storage.Objects.List listRequest = Mockito.mock(Storage.Objects.List.class);

  // Mocked bucket listing: the directory entry first, then the files within it.
  String[] objectNames = {
    "testdirectory/",
    "testdirectory/file1name",
    "testdirectory/file2name",
    "testdirectory/file3name",
    "testdirectory/otherfile",
    "testdirectory/anotherfile",
  };
  List<StorageObject> listing = new ArrayList<>();
  for (String objectName : objectNames) {
    listing.add(new StorageObject().setBucket("testbucket").setName(objectName));
  }
  Objects listResponse = new Objects();
  listResponse.setItems(listing);

  when(storage.objects()).thenReturn(objectsApi);
  when(objectsApi.get("testbucket", "testdirectory/otherfile")).thenReturn(getRequest);
  when(objectsApi.list("testbucket")).thenReturn(listRequest);
  when(getRequest.execute())
      .thenReturn(new StorageObject().setBucket("testbucket").setName("testdirectory/otherfile"));
  when(listRequest.execute()).thenReturn(listResponse);

  // A path without wildcards expands to exactly itself.
  GcsPath singleFile = GcsPath.fromUri("gs://testbucket/testdirectory/otherfile");
  List<GcsPath> expectedSingle =
      ImmutableList.of(GcsPath.fromUri("gs://testbucket/testdirectory/otherfile"));
  assertThat(expectedSingle, contains(gcsUtil.expand(singleFile).toArray()));

  // Each of these globs must expand to the three numbered files, in listing order.
  List<GcsPath> expectedNumbered =
      ImmutableList.of(
          GcsPath.fromUri("gs://testbucket/testdirectory/file1name"),
          GcsPath.fromUri("gs://testbucket/testdirectory/file2name"),
          GcsPath.fromUri("gs://testbucket/testdirectory/file3name"));
  String[] globs = {
    "gs://testbucket/testdirectory/file*",
    "gs://testbucket/testdirectory/file[1-3]*",
    "gs://testbucket/testdirectory/file?name",
    "gs://testbucket/test*ectory/fi*name",
  };
  for (String glob : globs) {
    GcsPath pattern = GcsPath.fromUri(glob);
    assertThat(expectedNumbered, contains(gcsUtil.expand(pattern).toArray()));
  }
}
Aggregations