Search in sources:

Example 1 with GcsUtil

use of org.apache.beam.sdk.extensions.gcp.util.GcsUtil in project beam by apache.

the class GcsKmsKeyIT method testGcsWriteWithKmsKey.

/**
 * Runs a small read/write text pipeline whose temp location sits under the test temp root, with
 * --dataflowKmsKey expected to be set on the command line. Asserts that the pipeline finishes in
 * {@code DONE}, that the sharded output matches the expected checksum, and that every output
 * object reports a non-null KMS key name. Temporary files are not verified.
 *
 * <p>This test verifies that GCS file copies work with CMEK-enabled files.
 */
@Test
public void testGcsWriteWithKmsKey() {
    TestPipelineOptions options = TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
    assertNotNull(options.getTempRoot());
    options.setTempLocation(options.getTempRoot() + "/testGcsWriteWithKmsKey");
    GcsOptions gcsOptions = options.as(GcsOptions.class);

    // Timestamped output name under the GCP temp directory.
    ResourceId tempDirectory = FileSystems.matchNewResource(gcsOptions.getGcpTempLocation(), true);
    String outputName = String.format("GcsKmsKeyIT-%tF-%<tH-%<tM-%<tS-%<tL.output", new Date());
    ResourceId filenamePrefix = tempDirectory.resolve(outputName, StandardResolveOptions.RESOLVE_FILE);

    Pipeline pipeline = Pipeline.create(options);
    pipeline
        .apply("ReadLines", TextIO.read().from(INPUT_FILE))
        .apply("WriteLines", TextIO.write().to(filenamePrefix));
    State finalState = pipeline.run().waitUntilFinish();
    assertThat(finalState, equalTo(State.DONE));

    String filePattern = filenamePrefix + "*-of-*";
    assertThat(new NumberedShardedFile(filePattern), fileContentsHaveChecksum(EXPECTED_CHECKSUM));

    // Verify objects have KMS key set.
    try {
        MatchResult shards = Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(filePattern)));
        GcsUtil gcsUtil = gcsOptions.getGcsUtil();
        for (Metadata shard : shards.metadata()) {
            GcsPath objectPath = GcsPath.fromUri(shard.resourceId().toString());
            assertNotNull(gcsUtil.getObject(objectPath).getKmsKeyName());
        }
    } catch (IOException e) {
        // Surface I/O problems as a test failure while keeping the cause.
        throw new AssertionError(e);
    }
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) PipelineResult(org.apache.beam.sdk.PipelineResult) IOException(java.io.IOException) TestPipelineOptions(org.apache.beam.sdk.testing.TestPipelineOptions) MatchResult(org.apache.beam.sdk.io.fs.MatchResult) Date(java.util.Date) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) State(org.apache.beam.sdk.PipelineResult.State) NumberedShardedFile(org.apache.beam.sdk.util.NumberedShardedFile) GcsOptions(org.apache.beam.sdk.extensions.gcp.options.GcsOptions) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil) Test(org.junit.Test)

Example 2 with GcsUtil

use of org.apache.beam.sdk.extensions.gcp.util.GcsUtil in project beam by apache.

the class TfIdf method listInputDocuments.

/**
 * Collects the URIs of the documents found beneath the {@code options.input} prefix/directory.
 *
 * <p>An input without a scheme is treated as a {@code file} URI. For {@code file} inputs each
 * entry of the directory listing is returned; for {@code gs} inputs the path is suffixed with
 * {@code *} and expanded through {@link GcsUtil}. Any other scheme yields an empty set.
 *
 * @param options pipeline options supplying the input location
 * @return the set of document URIs found (possibly empty)
 * @throws URISyntaxException if the input cannot be parsed or rebuilt as a URI
 * @throws IOException declared for the GCS expansion path
 */
public static Set<URI> listInputDocuments(Options options) throws URISyntaxException, IOException {
    URI inputUri = new URI(options.getInput());
    // Inputs with no scheme are interpreted as local file paths.
    URI absoluteUri = inputUri.getScheme() != null
        ? inputUri
        : new URI("file", inputUri.getAuthority(), inputUri.getPath(), inputUri.getQuery(), inputUri.getFragment());

    Set<URI> uris = new HashSet<>();
    String scheme = absoluteUri.getScheme();
    if ("file".equals(scheme)) {
        File directory = new File(absoluteUri);
        // File.list() returns null when the path cannot be listed; treat that as no entries.
        String[] entries = directory.list();
        if (entries != null) {
            for (String entry : entries) {
                uris.add(new File(directory, entry).toURI());
            }
        }
    } else if ("gs".equals(scheme)) {
        GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
        String globPath = absoluteUri.getPath() + "*";
        URI gcsUriGlob = new URI(scheme, absoluteUri.getAuthority(), globPath, absoluteUri.getQuery(), absoluteUri.getFragment());
        for (GcsPath match : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
            uris.add(match.toUri());
        }
    }
    return uris;
}
Also used : GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) URI(java.net.URI) File(java.io.File) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil) HashSet(java.util.HashSet)

Example 3 with GcsUtil

use of org.apache.beam.sdk.extensions.gcp.util.GcsUtil in project beam by apache.

the class DataflowRunnerTest method buildMockGcsUtil.

/**
 * Builds a Mockito mock of {@link GcsUtil} with the stubbing configured below.
 *
 * <ul>
 *   <li>{@code create} answers with a write channel on a fresh temp file that is deleted on close.
 *   <li>{@code expand} answers with a single-element list holding the path it was given.
 *   <li>{@code bucketAccessible} is {@code true} for the valid staging/temp/profile buckets and
 *       for {@code gs://bucket/object}, and {@code false} for {@code NON_EXISTENT_BUCKET}.
 *   <li>{@code getObjects} answers with one {@link StorageObject} result per requested path whose
 *       bucket equals {@code VALID_BUCKET}; other paths are omitted from the result.
 * </ul>
 *
 * @return the configured mock
 * @throws IOException declared because the stubbed {@code GcsUtil} methods declare it
 */
private static GcsUtil buildMockGcsUtil() throws IOException {
    GcsUtil mockGcsUtil = mock(GcsUtil.class);
    // Stub create() exactly once. Repeating the identical when(...) would call the already-stubbed
    // method, running the answer and opening a temp-file channel that nothing ever closes.
    when(mockGcsUtil.create(any(GcsPath.class), any(GcsUtil.CreateOptions.class))).then(invocation -> FileChannel.open(Files.createTempFile("channel-", ".tmp"), StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.DELETE_ON_CLOSE));
    when(mockGcsUtil.expand(any(GcsPath.class))).then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_STAGING_BUCKET))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_TEMP_BUCKET))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_TEMP_BUCKET + "/staging/"))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(VALID_PROFILE_BUCKET))).thenReturn(true);
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri(NON_EXISTENT_BUCKET))).thenReturn(false);
    // Let every valid path be matched
    when(mockGcsUtil.getObjects(anyListOf(GcsPath.class))).thenAnswer(invocationOnMock -> {
        // The stubbed method's only argument is the list of paths, so the cast is safe.
        @SuppressWarnings("unchecked")
        List<GcsPath> gcsPaths = (List<GcsPath>) invocationOnMock.getArguments()[0];
        List<GcsUtil.StorageObjectOrIOException> results = new ArrayList<>();
        for (GcsPath gcsPath : gcsPaths) {
            if (gcsPath.getBucket().equals(VALID_BUCKET)) {
                StorageObject resultObject = new StorageObject();
                resultObject.setBucket(gcsPath.getBucket());
                resultObject.setName(gcsPath.getObject());
                results.add(GcsUtil.StorageObjectOrIOException.create(resultObject));
            }
        }
        return results;
    });
    // The dataflow pipeline attempts to output to this location.
    when(mockGcsUtil.bucketAccessible(GcsPath.fromUri("gs://bucket/object"))).thenReturn(true);
    return mockGcsUtil;
}
Also used : StorageObject(com.google.api.services.storage.model.StorageObject) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil)

Example 4 with GcsUtil

use of org.apache.beam.sdk.extensions.gcp.util.GcsUtil in project beam by apache.

the class MinimalWordCountTest method buildMockGcsUtil.

/**
 * Creates a {@link GcsUtil} mock whose {@code open} answers with a channel on a fresh temp file
 * and whose {@code expand} answers with a list containing only the requested path.
 */
private GcsUtil buildMockGcsUtil() throws IOException {
    GcsUtil gcsUtilMock = Mockito.mock(GcsUtil.class);

    // Every open() call is answered with a new throwaway channel backed by a temp file.
    Mockito.when(gcsUtilMock.open(Mockito.any(GcsPath.class))).then(invocation -> {
        return FileChannel.open(
            Files.createTempFile("channel-", ".tmp"),
            StandardOpenOption.CREATE,
            StandardOpenOption.DELETE_ON_CLOSE);
    });

    // Expansion echoes the original GcsPath back as a one-element list.
    // This is required to pass validation that occurs in TextIO during apply()
    Mockito.when(gcsUtilMock.expand(Mockito.any(GcsPath.class))).then(invocation -> {
        GcsPath requested = (GcsPath) invocation.getArguments()[0];
        return ImmutableList.of(requested);
    });

    return gcsUtilMock;
}
Also used : GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil)

Example 5 with GcsUtil

use of org.apache.beam.sdk.extensions.gcp.util.GcsUtil in project beam by apache.

the class ExampleEchoPipelineTest method buildMockGcsUtil.

/**
 * Creates a {@link GcsUtil} mock whose {@code open} answers with a channel on a fresh temp file
 * and whose {@code expand} answers with a list containing only the requested path.
 */
private GcsUtil buildMockGcsUtil() throws IOException {
    // Any request to open gets a new bogus channel
    Answer<SeekableByteChannel> bogusChannelAnswer = new Answer<SeekableByteChannel>() {

        @Override
        public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
            return FileChannel.open(Files.createTempFile("channel-", ".tmp"), StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE);
        }
    };
    // Any request for expansion returns a list containing the original GcsPath
    // This is required to pass validation that occurs in TextIO during apply()
    Answer<List<GcsPath>> echoPathAnswer = new Answer<List<GcsPath>>() {

        @Override
        public List<GcsPath> answer(InvocationOnMock invocation) throws Throwable {
            GcsPath requested = (GcsPath) invocation.getArguments()[0];
            return ImmutableList.of(requested);
        }
    };

    GcsUtil gcsUtilMock = Mockito.mock(GcsUtil.class);
    Mockito.when(gcsUtilMock.open(Mockito.any(GcsPath.class))).then(bogusChannelAnswer);
    Mockito.when(gcsUtilMock.expand(Mockito.any(GcsPath.class))).then(echoPathAnswer);
    return gcsUtilMock;
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) InvocationOnMock(org.mockito.invocation.InvocationOnMock) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) GcsUtil(org.apache.beam.sdk.extensions.gcp.util.GcsUtil)

Aggregations

GcsUtil (org.apache.beam.sdk.extensions.gcp.util.GcsUtil)7 GcsPath (org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath)6 ArrayList (java.util.ArrayList)2 List (java.util.List)2 DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)2 TestCredential (org.apache.beam.sdk.extensions.gcp.auth.TestCredential)2 ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)2 StorageObject (com.google.api.services.storage.model.StorageObject)1 File (java.io.File)1 IOException (java.io.IOException)1 URI (java.net.URI)1 SeekableByteChannel (java.nio.channels.SeekableByteChannel)1 Date (java.util.Date)1 HashSet (java.util.HashSet)1 Pipeline (org.apache.beam.sdk.Pipeline)1 PipelineResult (org.apache.beam.sdk.PipelineResult)1 State (org.apache.beam.sdk.PipelineResult.State)1 GcsOptions (org.apache.beam.sdk.extensions.gcp.options.GcsOptions)1 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)1 Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata)1