use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.
the class GcsUtilTest method testNonExistentObjectReturnsEmptyResult.
// GCSUtil.expand() should fail when matching a single object when that object does not exist.
// We should return the empty result since GCS get object is strongly consistent.
@Test
public void testNonExistentObjectReturnsEmptyResult() throws IOException {
GcsOptions pipelineOptions = gcsOptionsWithTestCredential();
GcsUtil gcsUtil = pipelineOptions.getGcsUtil();
Storage mockStorage = Mockito.mock(Storage.class);
gcsUtil.setStorageClient(mockStorage);
Storage.Objects mockStorageObjects = Mockito.mock(Storage.Objects.class);
Storage.Objects.Get mockStorageGet = Mockito.mock(Storage.Objects.Get.class);
GcsPath pattern = GcsPath.fromUri("gs://testbucket/testdirectory/nonexistentfile");
GoogleJsonResponseException expectedException = googleJsonResponseException(HttpStatusCodes.STATUS_CODE_NOT_FOUND, "It don't exist", "Nothing here to see");
when(mockStorage.objects()).thenReturn(mockStorageObjects);
when(mockStorageObjects.get(pattern.getBucket(), pattern.getObject())).thenReturn(mockStorageGet);
when(mockStorageGet.execute()).thenThrow(expectedException);
assertEquals(Collections.EMPTY_LIST, gcsUtil.expand(pattern));
}
use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.
the class GcsUtil method makeGetBatches.
/**
* Makes get {@link BatchRequest BatchRequests}.
*
* @param paths {@link GcsPath GcsPaths}.
* @param results mutable {@link List} for return values.
* @return {@link BatchRequest BatchRequests} to execute.
* @throws IOException
*/
@VisibleForTesting
List<BatchRequest> makeGetBatches(Collection<GcsPath> paths, List<StorageObjectOrIOException[]> results) throws IOException {
List<BatchRequest> batches = new LinkedList<>();
for (List<GcsPath> filesToGet : Lists.partition(Lists.newArrayList(paths), MAX_REQUESTS_PER_BATCH)) {
BatchRequest batch = createBatchRequest();
for (GcsPath path : filesToGet) {
results.add(enqueueGetFileSize(path, batch));
}
batches.add(batch);
}
return batches;
}
use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.
the class GcsFileSystemTest method testGlobExpansion.
@Test
public void testGlobExpansion() throws IOException {
Objects modelObjects = new Objects();
List<StorageObject> items = new ArrayList<>();
// A directory
items.add(new StorageObject().setBucket("testbucket").setName("testdirectory/"));
// Files within the directory
items.add(createStorageObject("gs://testbucket/testdirectory/file1name", 1L));
items.add(createStorageObject("gs://testbucket/testdirectory/file2name", 2L));
items.add(createStorageObject("gs://testbucket/testdirectory/file3name", 3L));
items.add(createStorageObject("gs://testbucket/testdirectory/otherfile", 4L));
items.add(createStorageObject("gs://testbucket/testdirectory/anotherfile", 5L));
items.add(createStorageObject("gs://testbucket/testotherdirectory/file4name", 6L));
modelObjects.setItems(items);
when(mockGcsUtil.listObjects(eq("testbucket"), anyString(), isNull(String.class))).thenReturn(modelObjects);
// Test patterns.
{
GcsPath pattern = GcsPath.fromUri("gs://testbucket/testdirectory/file*");
List<String> expectedFiles = ImmutableList.of("gs://testbucket/testdirectory/file1name", "gs://testbucket/testdirectory/file2name", "gs://testbucket/testdirectory/file3name");
assertThat(expectedFiles, contains(toFilenames(gcsFileSystem.expand(pattern)).toArray()));
}
{
GcsPath pattern = GcsPath.fromUri("gs://testbucket/testdirectory/file*");
List<String> expectedFiles = ImmutableList.of("gs://testbucket/testdirectory/file1name", "gs://testbucket/testdirectory/file2name", "gs://testbucket/testdirectory/file3name");
assertThat(expectedFiles, contains(toFilenames(gcsFileSystem.expand(pattern)).toArray()));
}
{
GcsPath pattern = GcsPath.fromUri("gs://testbucket/testdirectory/file[1-3]*");
List<String> expectedFiles = ImmutableList.of("gs://testbucket/testdirectory/file1name", "gs://testbucket/testdirectory/file2name", "gs://testbucket/testdirectory/file3name");
assertThat(expectedFiles, contains(toFilenames(gcsFileSystem.expand(pattern)).toArray()));
}
{
GcsPath pattern = GcsPath.fromUri("gs://testbucket/testdirectory/file?name");
List<String> expectedFiles = ImmutableList.of("gs://testbucket/testdirectory/file1name", "gs://testbucket/testdirectory/file2name", "gs://testbucket/testdirectory/file3name");
assertThat(expectedFiles, contains(toFilenames(gcsFileSystem.expand(pattern)).toArray()));
}
{
GcsPath pattern = GcsPath.fromUri("gs://testbucket/test*ectory/fi*name");
List<String> expectedFiles = ImmutableList.of("gs://testbucket/testdirectory/file1name", "gs://testbucket/testdirectory/file2name", "gs://testbucket/testdirectory/file3name", "gs://testbucket/testotherdirectory/file4name");
assertThat(expectedFiles, contains(toFilenames(gcsFileSystem.expand(pattern)).toArray()));
}
}
use of org.apache.beam.sdk.util.gcsfs.GcsPath in project beam by apache.
the class TfIdf method listInputDocuments.
/**
* Lists documents contained beneath the {@code options.input} prefix/directory.
*/
public static Set<URI> listInputDocuments(Options options) throws URISyntaxException, IOException {
URI baseUri = new URI(options.getInput());
// List all documents in the directory or GCS prefix.
URI absoluteUri;
if (baseUri.getScheme() != null) {
absoluteUri = baseUri;
} else {
absoluteUri = new URI("file", baseUri.getAuthority(), baseUri.getPath(), baseUri.getQuery(), baseUri.getFragment());
}
Set<URI> uris = new HashSet<>();
if (absoluteUri.getScheme().equals("file")) {
File directory = new File(absoluteUri);
for (String entry : directory.list()) {
File path = new File(directory, entry);
uris.add(path.toURI());
}
} else if (absoluteUri.getScheme().equals("gs")) {
GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
URI gcsUriGlob = new URI(absoluteUri.getScheme(), absoluteUri.getAuthority(), absoluteUri.getPath() + "*", absoluteUri.getQuery(), absoluteUri.getFragment());
for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
uris.add(entry.toUri());
}
}
return uris;
}
use of org.apache.beam.sdk.util.gcsfs.GcsPath in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class MinimalWordCountJava8Test method buildMockGcsUtil.
private GcsUtil buildMockGcsUtil() throws IOException {
GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);
// Any request to open gets a new bogus channel
Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class))).then(new Answer<SeekableByteChannel>() {
@Override
public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
return FileChannel.open(Files.createTempFile("channel-", ".tmp"), StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE);
}
});
// Any request for expansion returns a list containing the original GcsPath
// This is required to pass validation that occurs in TextIO during apply()
Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class))).then(new Answer<List<GcsPath>>() {
@Override
public List<GcsPath> answer(InvocationOnMock invocation) throws Throwable {
return ImmutableList.of((GcsPath) invocation.getArguments()[0]);
}
});
return mockGcsUtil;
}
Aggregations