Search in sources :

Example 16 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class LocalFileSystem method matchOne.

/**
 * Matches a single spec against the local file system.
 *
 * <p>When {@code spec} names something that exists, the result is {@code Status.OK} with the
 * metadata of exactly that entry. Otherwise the spec is interpreted as a glob and every regular
 * file found beneath the spec's parent directory is tested against it.
 *
 * @param spec a path or glob pattern to resolve
 * @return an {@code OK} result carrying the matched metadata, or a {@code NOT_FOUND} result when
 *     the parent directory does not exist or nothing matches
 * @throws IOException declared by the signature (NOTE(review): presumably raised by
 *     {@code toMetadata}; confirm against its definition)
 */
private MatchResult matchOne(String spec) throws IOException {
    File specFile = Paths.get(spec).toFile();

    // Fast path: the spec is a literal path to something that exists.
    if (specFile.exists()) {
        return MatchResult.create(Status.OK, ImmutableList.of(toMetadata(specFile)));
    }

    // Without an existing parent directory there is nothing to walk.
    File searchRoot = specFile.getAbsoluteFile().getParentFile();
    if (!searchRoot.exists()) {
        return MatchResult.create(Status.NOT_FOUND, Collections.<Metadata>emptyList());
    }

    // On Windows getAbsolutePath() may yield something like "c:\temp\file.txt", and the glob
    // parser behind getPathMatcher() reads '\' as an escape character rather than a directory
    // separator. Doubling every backslash works around that. The doubling is applied on every
    // platform, including those where a backslash is a legal filename character, because
    // Globs.toRegexPattern consumes one backslash.
    String absoluteSpec = specFile.getAbsolutePath();
    String escapedGlob = absoluteSpec.replace("\\", "\\\\");
    PathMatcher globMatcher = java.nio.file.FileSystems.getDefault().getPathMatcher("glob:" + escapedGlob);

    // TODO: Avoid iterating all files: https://issues.apache.org/jira/browse/BEAM-1309
    List<Metadata> matches = Lists.newLinkedList();
    for (File candidate : com.google.common.io.Files.fileTreeTraverser().preOrderTraversal(searchRoot)) {
        // Only regular files are eligible; directories are walked but never reported.
        if (candidate.isFile() && globMatcher.matches(candidate.toPath())) {
            matches.add(toMetadata(candidate));
        }
    }

    if (!matches.isEmpty()) {
        return MatchResult.create(Status.OK, matches);
    }
    // TODO: consider to return Status.OK for globs.
    return MatchResult.create(Status.NOT_FOUND, new FileNotFoundException(String.format("No files found for spec: %s.", spec)));
}
Also used : PathMatcher(java.nio.file.PathMatcher) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) FileNotFoundException(java.io.FileNotFoundException) File(java.io.File) Predicate(com.google.common.base.Predicate)

Example 17 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class FileBasedSource method getEstimatedSizeBytes.

/**
 * Estimates the size in bytes of this source.
 *
 * <p>This implementation is provided to simplify subclasses. In {@code FILEPATTERN} mode the
 * file or pattern spec is matched through {@code FileSystems} and the sizes of all matched
 * files are summed. In any other mode the estimate is the distance between the start offset
 * and the smaller of the end offset and {@code getMaxEndOffset(options)}.
 *
 * @param options pipeline options, forwarded to {@code getMaxEndOffset} in single-file mode
 * @return the estimated number of bytes covered by this source
 * @throws IOException if matching the pattern or reading the file metadata fails
 */
@Override
public final long getEstimatedSizeBytes(PipelineOptions options) throws IOException {
    // Precondition message templates only substitute "%s"; a "{}" placeholder would be printed
    // literally and the spec would merely be appended in brackets at the end of the message.
    checkState(fileOrPatternSpec.isAccessible(), "Cannot estimate size of a FileBasedSource with inaccessible file pattern: %s.", fileOrPatternSpec);
    String fileOrPattern = fileOrPatternSpec.get();
    if (mode == Mode.FILEPATTERN) {
        long totalSize = 0;
        // A single spec is submitted, so exactly one MatchResult is expected back.
        List<MatchResult> inputs = FileSystems.match(Collections.singletonList(fileOrPattern));
        MatchResult result = Iterables.getOnlyElement(inputs);
        checkArgument(result.status() == Status.OK, "Error matching the pattern or glob %s: status %s", fileOrPattern, result.status());
        List<Metadata> allMatches = result.metadata();
        for (Metadata metadata : allMatches) {
            totalSize += metadata.sizeBytes();
        }
        LOG.info("Filepattern {} matched {} files with total size {}", fileOrPattern, allMatches.size(), totalSize);
        return totalSize;
    } else {
        // Single-file mode: clamp the end offset to the maximum end offset of the file.
        long start = getStartOffset();
        long end = Math.min(getEndOffset(), getMaxEndOffset(options));
        return end - start;
    }
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) MatchResult(org.apache.beam.sdk.io.fs.MatchResult)

Example 18 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class FileBasedSource method getMaxEndOffset.

/**
 * Returns the maximum end offset of this source, taken from the size in bytes reported by the
 * single file's metadata.
 *
 * <p>Only meaningful for a single file; a source in {@code FILEPATTERN} mode is rejected by
 * the {@code checkArgument} precondition.
 *
 * @param options pipeline options (not consulted by this implementation)
 * @return the size in bytes of the single file backing this source
 * @throws IOException declared by the signature (NOTE(review): presumably raised while fetching
 *     the file metadata; confirm against {@code getSingleFileMetadata})
 */
@Override
public final long getMaxEndOffset(PipelineOptions options) throws IOException {
    boolean isSingleFile = mode != Mode.FILEPATTERN;
    checkArgument(isSingleFile, "Cannot determine the exact end offset of a file pattern");
    return getSingleFileMetadata().sizeBytes();
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata)

Example 19 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class FileBasedSource method createReader.

/**
 * Creates a reader for this source.
 *
 * <p>The source is validated first. In {@code FILEPATTERN} mode the pattern is expanded and one
 * single-file reader is created per matched file, each covering the file from offset 0 to its
 * full size. If exactly one file matched, its reader is returned directly; otherwise the
 * readers are wrapped in a {@code FilePatternReader}. In any other mode a single-file reader
 * for this source is returned.
 *
 * @param options pipeline options passed through to the single-file readers
 * @return a reader over the file(s) described by this source
 * @throws IOException if expanding the pattern or creating a reader fails
 */
@Override
public final BoundedReader<T> createReader(PipelineOptions options) throws IOException {
    // Validate the current source prior to creating a reader for it.
    this.validate();
    // Precondition message templates only substitute "%s"; a "{}" placeholder would be printed
    // literally and the spec would merely be appended in brackets at the end of the message.
    checkState(fileOrPatternSpec.isAccessible(), "Cannot create a file reader without access to the file or pattern specification: %s.", fileOrPatternSpec);
    String fileOrPattern = fileOrPatternSpec.get();
    if (mode == Mode.FILEPATTERN) {
        long startTime = System.currentTimeMillis();
        List<Metadata> fileMetadata = FileBasedSource.expandFilePattern(fileOrPattern);
        List<FileBasedReader<T>> fileReaders = new ArrayList<>();
        for (Metadata metadata : fileMetadata) {
            // Each matched file is read in full: from offset 0 up to its size in bytes.
            long endOffset = metadata.sizeBytes();
            fileReaders.add(createForSubrangeOfFile(metadata, 0, endOffset).createSingleFileReader(options));
        }
        LOG.debug("Creating a reader for file pattern {} took {} ms", fileOrPattern, System.currentTimeMillis() - startTime);
        // A lone match needs no composite wrapper.
        if (fileReaders.size() == 1) {
            return fileReaders.get(0);
        }
        return new FilePatternReader(this, fileReaders);
    } else {
        return createSingleFileReader(options);
    }
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) ArrayList(java.util.ArrayList)

Example 20 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class GcsFileSystem method toMetadata.

/**
 * Builds a {@code Metadata} record from a GCS {@code StorageObject}.
 *
 * <p>The resource id is derived from the object's GCS path, and the size falls back to zero
 * when the object reports no size.
 *
 * @param storageObject the storage object to describe
 * @return metadata carrying the resource id, the size in bytes and the read-seek-efficient flag
 */
private Metadata toMetadata(StorageObject storageObject) {
    GcsResourceId resourceId = GcsResourceId.fromGcsPath(GcsPath.fromObject(storageObject));
    // A missing size is treated as an empty object.
    BigInteger reportedSize = firstNonNull(storageObject.getSize(), BigInteger.ZERO);
    // TODO: Address https://issues.apache.org/jira/browse/BEAM-1494
    // It is incorrect to set IsReadSeekEfficient true for files with content encoding set to gzip.
    return Metadata.builder()
        .setIsReadSeekEfficient(true)
        .setResourceId(resourceId)
        .setSizeBytes(reportedSize.longValue())
        .build();
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) BigInteger(java.math.BigInteger)

Aggregations

Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata)29 Test (org.junit.Test)15 File (java.io.File)14 ArrayList (java.util.ArrayList)12 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)9 VisibleForTesting (com.google.common.annotations.VisibleForTesting)4 AvroMetadata (org.apache.beam.sdk.io.AvroSource.AvroMetadata)4 ImmutableList (com.google.common.collect.ImmutableList)3 FileReader (java.io.FileReader)3 Reader (java.io.Reader)3 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)3 BufferedReader (java.io.BufferedReader)2 IOException (java.io.IOException)2 List (java.util.List)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 Objects (com.google.api.services.storage.model.Objects)1 StorageObject (com.google.api.services.storage.model.StorageObject)1 Predicate (com.google.common.base.Predicate)1 FileNotFoundException (java.io.FileNotFoundException)1 BigInteger (java.math.BigInteger)1