Search in sources :

Example 41 with Metadata

Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.

From the class GcsFileSystem, method toMetadata.

/**
 * Converts a GCS {@link StorageObject} into a {@link Metadata} entry.
 *
 * <p>The entry is always flagged as read-seek-efficient and carries the resource id derived
 * from the object's GCS path. The MD5 hash is recorded as the checksum only when the object
 * reports one. A missing size is recorded as zero bytes, and a missing update time is recorded
 * as the value of {@code new DateTime(0L)}.
 *
 * @param storageObject the GCS object description to convert
 * @return the populated metadata
 */
private Metadata toMetadata(StorageObject storageObject) {
    // TODO: Address https://issues.apache.org/jira/browse/BEAM-1494
    // It is incorrect to set IsReadSeekEfficient true for files with content encoding set to gzip.
    Metadata.Builder builder = Metadata.builder();
    builder.setIsReadSeekEfficient(true);
    builder.setResourceId(GcsResourceId.fromGcsPath(GcsPath.fromObject(storageObject)));

    // The checksum is optional: leave it unset when the object carries no MD5 hash.
    String md5 = storageObject.getMd5Hash();
    if (md5 != null) {
        builder.setChecksum(md5);
    }

    // Fall back to a zero size when the object does not report one.
    BigInteger reportedSize = storageObject.getSize();
    BigInteger effectiveSize = (reportedSize != null) ? reportedSize : BigInteger.ZERO;
    builder.setSizeBytes(effectiveSize.longValue());

    // Fall back to DateTime(0L) when the object does not report an update time.
    DateTime reportedUpdate = storageObject.getUpdated();
    DateTime effectiveUpdate = (reportedUpdate != null) ? reportedUpdate : new DateTime(0L);
    builder.setLastModifiedMillis(effectiveUpdate.getValue());

    return builder.build();
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) BigInteger(java.math.BigInteger) DateTime(com.google.api.client.util.DateTime)

Example 42 with Metadata

Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.

From the class GcsFileSystem, method expand.

/**
 * Expands a glob pattern into a {@link MatchResult} by listing the bucket page by page and
 * keeping every object whose full name matches the pattern.
 *
 * <p>Objects whose name ends with {@code "/"} are treated as directories and skipped. Listing
 * stops at the first page that carries no items, or once no further page token is returned.
 *
 * @param gcsPattern the bucket and object glob to expand
 * @return a match result with status {@code OK} holding the metadata of every matched object
 * @throws IOException declared by this method; presumably propagated from the bucket listing
 *     call (not confirmed from the code shown here)
 * @throws IllegalArgumentException if {@code gcsPattern} does not contain globs. NOTE(review):
 *     this method performs no such check itself; presumably a helper it calls enforces it.
 */
@VisibleForTesting
MatchResult expand(GcsPath gcsPattern) throws IOException {
    String listingPrefix = GcsUtil.getNonWildcardPrefix(gcsPattern.getObject());
    Pattern nameRegex = Pattern.compile(wildcardToRegexp(gcsPattern.getObject()));
    String bucket = gcsPattern.getBucket();
    LOG.debug("matching files in bucket {}, prefix {} against pattern {}", bucket, listingPrefix, nameRegex.toString());

    List<Metadata> matched = new ArrayList<>();
    String nextPage = null;
    while (true) {
        Objects page = options.getGcsUtil().listObjects(bucket, listingPrefix, nextPage);
        List<StorageObject> candidates = page.getItems();
        if (candidates == null) {
            // A page without items ends the listing.
            break;
        }
        for (StorageObject candidate : candidates) {
            String objectName = candidate.getName();
            // Keep only names that match the regex and are not directories (trailing slash).
            if (!nameRegex.matcher(objectName).matches() || objectName.endsWith("/")) {
                continue;
            }
            LOG.debug("Matched object: {}", objectName);
            matched.add(toMetadata(candidate));
        }
        nextPage = page.getNextPageToken();
        if (nextPage == null) {
            break;
        }
    }
    return MatchResult.create(Status.OK, matched);
}
Also used : Pattern(java.util.regex.Pattern) StorageObject(com.google.api.services.storage.model.StorageObject) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) ArrayList(java.util.ArrayList) Objects(com.google.api.services.storage.model.Objects) VisibleForTesting(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting)

Aggregations

Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 42, Test (org.junit.Test): 22, File (java.io.File): 16, ArrayList (java.util.ArrayList): 14, PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 9, MatchResult (org.apache.beam.sdk.io.fs.MatchResult): 6, VisibleForTesting (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting): 6, IOException (java.io.IOException): 5, Path (java.nio.file.Path): 5, AvroMetadata (org.apache.beam.sdk.io.AvroSource.AvroMetadata): 5, Reader (java.io.Reader): 4, List (java.util.List): 4, ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList): 4, BufferedReader (java.io.BufferedReader): 3, Matcher (java.util.regex.Matcher): 3, Pattern (java.util.regex.Pattern): 3, GenericRecord (org.apache.avro.generic.GenericRecord): 3, FileReader (java.io.FileReader): 2, HashSet (java.util.HashSet): 2, FileStatus (org.apache.hadoop.fs.FileStatus): 2