Search in sources :

Example 1 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class WriteFilesTest method checkFileContents.

/**
 * Asserts that the files whose names start with {@code baseName} together contain exactly the
 * given input lines, in any order.
 *
 * <p>The files are located by matching the glob {@code baseName + "*"}. Lines equal to
 * {@code SimpleWriter.HEADER} or {@code SimpleWriter.FOOTER} are skipped when collecting the
 * actual contents.
 *
 * @param baseName common prefix of the output files to inspect
 * @param inputs the lines expected to be found across all matched files
 * @param numExpectedShards when present, the exact number of files the glob must match
 * @throws IOException if matching the glob or reading a matched file fails
 */
static void checkFileContents(String baseName, List<String> inputs, Optional<Integer> numExpectedShards) throws IOException {
    final String pattern = baseName + "*";
    List<Metadata> metadata = FileSystems.match(Collections.singletonList(pattern)).get(0).metadata();
    List<File> outputFiles = Lists.newArrayList();
    for (Metadata meta : metadata) {
        outputFiles.add(new File(meta.resourceId().toString()));
    }
    if (numExpectedShards.isPresent()) {
        assertEquals(numExpectedShards.get().intValue(), outputFiles.size());
    }
    List<String> actual = Lists.newArrayList();
    for (File outputFile : outputFiles) {
        // Decode explicitly as UTF-8 rather than with the platform default charset that
        // FileReader would use, so the comparison does not depend on the JVM's file.encoding.
        // Note that this reader reports malformed input as an IOException instead of silently
        // substituting replacement characters.
        try (BufferedReader reader = java.nio.file.Files.newBufferedReader(outputFile.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                boolean isHeaderOrFooter = line.equals(SimpleWriter.HEADER) || line.equals(SimpleWriter.FOOTER);
                if (!isHeaderOrFooter) {
                    actual.add(line);
                }
            }
        }
    }
    assertThat(actual, containsInAnyOrder(inputs.toArray()));
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) File(java.io.File)

Example 2 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class LocalFileSystem method matchOne.

/**
 * Matches a single file spec (a literal path or a glob) against the local file system.
 *
 * <p>A leading {@code file:} scheme (any letter case) is stripped first; on Windows, a leading
 * {@code ///} or {@code /} is stripped as well. A relative spec is resolved against
 * {@code baseDir}.
 *
 * @param baseDir directory against which a non-absolute {@code spec} is resolved
 * @param spec the path or glob to match
 * @return an {@code OK} result holding one entry when the resolved path exists as-is; a
 *     {@code NOT_FOUND} result with no entries when the non-glob parent directory does not
 *     exist; otherwise an {@code OK} result with every regular file under that parent matching
 *     the glob, or a {@code NOT_FOUND} result carrying a {@link FileNotFoundException} when
 *     nothing matches
 */
private MatchResult matchOne(String baseDir, String spec) {
    // Lower-case with Locale.ROOT: the default-locale variant maps 'I' to a dotless 'ı' under
    // e.g. a Turkish locale, so "FILE:" would no longer be recognised as the file scheme.
    if (spec.toLowerCase(java.util.Locale.ROOT).startsWith("file:")) {
        spec = spec.substring("file:".length());
    }
    if (SystemUtils.IS_OS_WINDOWS) {
        // The prefixes consist of slashes only, so no case folding is needed to test for them.
        List<String> prefixes = Arrays.asList("///", "/");
        for (String prefix : prefixes) {
            if (spec.startsWith(prefix)) {
                spec = spec.substring(prefix.length());
            }
        }
    }
    // BEAM-6213: Windows breaks on Paths.get(spec).toFile() with a glob because
    // it considers it an invalid file system pattern. We should use
    // new File(spec) to avoid such validation.
    // See https://bugs.openjdk.java.net/browse/JDK-8197918
    // However, new File(parent, child) resolves absolute `child` in a system-dependent
    // way that is generally incorrect, for example new File($PWD, "/tmp/foo") resolves
    // to $PWD/tmp/foo on many systems, unlike Paths.get($PWD).resolve("/tmp/foo") which
    // correctly resolves to "/tmp/foo". We add just this one piece of logic here, without
    // switching to Paths which could require a rewrite of this module to support
    // both Windows and correct file resolution.
    // The root cause is that globs are not files but we are using file manipulation libraries
    // to work with them.
    final File specAsFile = new File(spec);
    final File absoluteFile = specAsFile.isAbsolute() ? specAsFile : new File(baseDir, spec);
    if (absoluteFile.exists()) {
        // The spec names an existing path literally; no glob expansion is needed.
        return MatchResult.create(Status.OK, ImmutableList.of(toMetadata(absoluteFile)));
    }
    File parent = getSpecNonGlobPrefixParentFile(absoluteFile.getAbsolutePath());
    if (!parent.exists()) {
        return MatchResult.create(Status.NOT_FOUND, Collections.emptyList());
    }
    // Method getAbsolutePath() on Windows platform may return something like
    // "c:\temp\file.txt". FileSystem.getPathMatcher() call below will treat
    // '\' (backslash) as an escape character, instead of a directory
    // separator. Replacing backslash with double-backslash solves the problem.
    // We perform the replacement on all platforms, even those that allow
    // backslash as a part of the filename, because Globs.toRegexPattern will
    // eat one backslash.
    // String.replace is a literal (non-regex) replacement: each single backslash is doubled.
    String pathToMatch = absoluteFile.getAbsolutePath().replace("\\", "\\\\");
    final PathMatcher matcher = java.nio.file.FileSystems.getDefault().getPathMatcher("glob:" + pathToMatch);
    // TODO: Avoid iterating all files: https://issues.apache.org/jira/browse/BEAM-1309
    Iterable<File> files = fileTraverser().depthFirstPreOrder(parent);
    // Keep only regular files whose path matches the glob.
    List<File> matchedFiles = StreamSupport.stream(files.spliterator(), false).filter(candidate -> candidate.isFile() && matcher.matches(candidate.toPath())).collect(Collectors.toList());
    List<Metadata> result = Lists.newArrayListWithCapacity(matchedFiles.size());
    for (File match : matchedFiles) {
        result.add(toMetadata(match));
    }
    if (result.isEmpty()) {
        // TODO: consider to return Status.OK for globs.
        return MatchResult.create(Status.NOT_FOUND, new FileNotFoundException(String.format("No files found for spec: %s in working directory %s", spec, baseDir)));
    }
    return MatchResult.create(Status.OK, result);
}
Also used : NoSuchFileException(java.nio.file.NoSuchFileException) Arrays(java.util.Arrays) MatchResult(org.apache.beam.sdk.io.fs.MatchResult) LoggerFactory(org.slf4j.LoggerFactory) Status(org.apache.beam.sdk.io.fs.MatchResult.Status) BufferedOutputStream(java.io.BufferedOutputStream) StandardCopyOption(java.nio.file.StandardCopyOption) Matcher(java.util.regex.Matcher) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) PathMatcher(java.nio.file.PathMatcher) StreamSupport(java.util.stream.StreamSupport) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) Predicates(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Predicates) Path(java.nio.file.Path) ReadableByteChannel(java.nio.channels.ReadableByteChannel) Logger(org.slf4j.Logger) Files(java.nio.file.Files) CreateOptions(org.apache.beam.sdk.io.fs.CreateOptions) SystemUtils(org.apache.commons.lang3.SystemUtils) Channels(java.nio.channels.Channels) Collection(java.util.Collection) FileOutputStream(java.io.FileOutputStream) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) Files.fileTraverser(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.Files.fileTraverser) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) List(java.util.List) Paths(java.nio.file.Paths) MoveOptions(org.apache.beam.sdk.io.fs.MoveOptions) VisibleForTesting(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting) WritableByteChannel(java.nio.channels.WritableByteChannel) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pattern(java.util.regex.Pattern) Collections(java.util.Collections) PathMatcher(java.nio.file.PathMatcher) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) 
FileNotFoundException(java.io.FileNotFoundException) File(java.io.File)

Example 3 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class FileBasedSource method split.

/**
 * Splits this source into smaller sources.
 *
 * <p>In {@code FILEPATTERN} mode the pattern is expanded, one single-file source is created per
 * matched file, and each of those is split again. In any other mode the source is split into
 * sub-ranges via the superclass when {@link #isSplittable()} is true, and returned unchanged
 * otherwise.
 *
 * @param desiredBundleSizeBytes desired bundle size, forwarded to the nested split calls
 * @param options pipeline options, forwarded to the nested split calls
 * @return the sources resulting from the split
 * @throws Exception if expanding the pattern or splitting an individual source fails
 */
@Override
public final List<? extends FileBasedSource<T>> split(long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
    // This implementation of method split is provided to simplify subclasses. Here we
    // split a FileBasedSource based on a file pattern to FileBasedSources based on full single
    // files. For files that can be efficiently seeked, we further split FileBasedSources based on
    // those files to FileBasedSources based on sub ranges of single files.
    String fileOrPattern = fileOrPatternSpec.get();

    if (mode != Mode.FILEPATTERN) {
        if (!isSplittable()) {
            LOG.debug("The source for file {} is not split into sub-range based sources since " + "the file is not seekable", fileOrPattern);
            return ImmutableList.of(this);
        }
        @SuppressWarnings("unchecked") List<FileBasedSource<T>> subRanges = (List<FileBasedSource<T>>) super.split(desiredBundleSizeBytes, options);
        return subRanges;
    }

    long startTime = System.currentTimeMillis();
    List<Metadata> matchedFiles = FileSystems.match(fileOrPattern, emptyMatchTreatment).metadata();
    List<FileBasedSource<T>> bundles = new ArrayList<>(matchedFiles.size());
    for (Metadata fileMetadata : matchedFiles) {
        FileBasedSource<T> wholeFile = createForSubrangeOfFile(fileMetadata, 0, fileMetadata.sizeBytes());
        verify(wholeFile.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE, "%s.createForSubrangeOfFile must return a source in mode %s", wholeFile, Mode.SINGLE_FILE_OR_SUBRANGE);
        // The per-file source is NOT in FILEPATTERN mode, so calling its split cannot recurse
        // back into this branch. This breaks a single file into multiple splits when the file
        // is splittable and larger than the desired bundle size.
        bundles.addAll(wholeFile.split(desiredBundleSizeBytes, options));
    }
    long elapsedMillis = System.currentTimeMillis() - startTime;
    LOG.info("Splitting filepattern {} into bundles of size {} took {} ms " + "and produced {} files and {} bundles", fileOrPattern, desiredBundleSizeBytes, elapsedMillis, matchedFiles.size(), bundles.size());
    return bundles;
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)

Example 4 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class ReadableFileCoder method decode.

/**
 * Decodes a {@link FileIO.ReadableFile} from the stream.
 *
 * <p>Reads the file metadata with the metadata coder, then a var-int that is interpreted as
 * the index of a {@link Compression} constant in {@code Compression.values()}.
 *
 * @param inStream the stream to read the encoded value from
 * @return the decoded readable file
 * @throws CoderException if the decoded compression index does not identify a
 *     {@link Compression} constant
 * @throws IOException if reading from the stream fails
 */
@Override
public FileIO.ReadableFile decode(@UnknownKeyFor @NonNull @Initialized InputStream inStream) throws CoderException, IOException {
    MatchResult.Metadata metadata = getMetadataCoder().decode(inStream);
    int compressionIndex = VarIntCoder.of().decode(inStream);
    Compression[] compressions = Compression.values();
    // Reject an out-of-range index explicitly: corrupt or incompatible input should surface as
    // a CoderException rather than as a bare ArrayIndexOutOfBoundsException.
    if (compressionIndex < 0 || compressionIndex >= compressions.length) {
        throw new CoderException(String.format("Invalid Compression index %d; expected a value in [0, %d)", compressionIndex, compressions.length));
    }
    return new FileIO.ReadableFile(metadata, compressions[compressionIndex]);
}
Also used : MatchResult(org.apache.beam.sdk.io.fs.MatchResult) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata)

Example 5 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class WriteFilesTest method checkFileContents.

/**
 * Asserts that the files whose names start with {@code baseName} together contain exactly the
 * given input lines, in any order, and optionally verifies shard naming, per-shard contents and
 * the absence of temporary directories.
 *
 * <p>The files are located by matching the glob {@code baseName + "*"}; at least one file must
 * match. Lines equal to {@code SimpleWriter.HEADER} or {@code SimpleWriter.FOOTER} are skipped
 * when collecting the actual contents.
 *
 * @param baseName common prefix of the output files to inspect
 * @param inputs the lines expected to be found across all matched files
 * @param numExpectedShards when present, the exact number of files expected; the file names
 *     must then carry the shard specs {@code 0000-of-NNNN} up to {@code (N-1)-of-NNNN}, each
 *     exactly once
 * @param expectRemovedTempDirectory when true, asserts that no entry in the parent directory
 *     of {@code baseName} contains {@code FileBasedSink.TEMP_DIRECTORY_PREFIX}
 * @param shardContentChecker when non-null, invoked once per file with the shard number parsed
 *     from the file name and the lines read from that file
 * @throws IOException if matching the glob or reading a matched file fails
 */
static void checkFileContents(String baseName, List<String> inputs, Optional<Integer> numExpectedShards, boolean expectRemovedTempDirectory, BiFunction<Integer, List<String>, Void> shardContentChecker) throws IOException {
    final String pattern = baseName + "*";
    List<Metadata> metadata = FileSystems.match(Collections.singletonList(pattern)).get(0).metadata();
    List<File> outputFiles = Lists.newArrayList();
    for (Metadata meta : metadata) {
        outputFiles.add(new File(meta.resourceId().toString()));
    }
    assertFalse("Should have produced at least 1 output file", outputFiles.isEmpty());
    Pattern shardPattern = Pattern.compile("(\\d{4})-of-\\d{4}");
    if (numExpectedShards.isPresent()) {
        int expectedShardCount = numExpectedShards.get();
        assertEquals(expectedShardCount, outputFiles.size());
        DecimalFormat df = new DecimalFormat("0000");
        // The "of NNNN" part is the same for every shard, so format it once.
        String formattedTotal = df.format(expectedShardCount);
        Set<String> expectedShards = Sets.newHashSet();
        for (int i = 0; i < expectedShardCount; i++) {
            expectedShards.add(String.format("%s-of-%s", df.format(i), formattedTotal));
        }
        Set<String> outputShards = Sets.newHashSet();
        for (File file : outputFiles) {
            Matcher matcher = shardPattern.matcher(file.getName());
            assertTrue(matcher.find());
            // add() returning false would mean two files carry the same shard spec.
            assertTrue(outputShards.add(matcher.group()));
        }
        assertEquals(expectedShards, outputShards);
    }
    List<String> actual = Lists.newArrayList();
    for (File outputFile : outputFiles) {
        List<String> actualShard = Lists.newArrayList();
        try (BufferedReader reader = Files.newBufferedReader(outputFile.toPath(), Charsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                boolean isHeaderOrFooter = line.equals(SimpleWriter.HEADER) || line.equals(SimpleWriter.FOOTER);
                if (!isHeaderOrFooter) {
                    actualShard.add(line);
                }
            }
        }
        if (shardContentChecker != null) {
            Matcher matcher = shardPattern.matcher(outputFile.getName());
            // Check the match explicitly: calling group(1) after a failed find() would throw an
            // IllegalStateException that does not say which file was at fault.
            assertTrue("Output file name should contain a shard spec of the form NNNN-of-NNNN: " + outputFile.getName(), matcher.find());
            int shardNumber = Integer.parseInt(matcher.group(1));
            shardContentChecker.apply(shardNumber, actualShard);
        }
        actual.addAll(actualShard);
    }
    assertThat(actual, containsInAnyOrder(inputs.toArray()));
    if (expectRemovedTempDirectory) {
        assertThat(Lists.newArrayList(new File(baseName).getParentFile().list()), Matchers.everyItem(not(containsString(FileBasedSink.TEMP_DIRECTORY_PREFIX))));
    }
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) DecimalFormat(java.text.DecimalFormat) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) BufferedReader(java.io.BufferedReader) Matchers.containsString(org.hamcrest.Matchers.containsString) File(java.io.File)

Aggregations

Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata)42 Test (org.junit.Test)22 File (java.io.File)16 ArrayList (java.util.ArrayList)14 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)9 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)6 VisibleForTesting (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting)6 IOException (java.io.IOException)5 Path (java.nio.file.Path)5 AvroMetadata (org.apache.beam.sdk.io.AvroSource.AvroMetadata)5 Reader (java.io.Reader)4 List (java.util.List)4 ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)4 BufferedReader (java.io.BufferedReader)3 Matcher (java.util.regex.Matcher)3 Pattern (java.util.regex.Pattern)3 GenericRecord (org.apache.avro.generic.GenericRecord)3 FileReader (java.io.FileReader)2 HashSet (java.util.HashSet)2 FileStatus (org.apache.hadoop.fs.FileStatus)2