Search in sources :

Example 6 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class WriteFilesTest method checkFileContents.

/**
 * Verifies the contents of every file matching {@code baseName + "*"}.
 *
 * <p>When {@code numExpectedShards} is present, the number of matched files must equal it. All
 * lines read from the matched files, excluding lines equal to {@code SimpleWriter.HEADER} or
 * {@code SimpleWriter.FOOTER}, must then equal {@code inputs} in any order.
 *
 * @param baseName prefix used to build the match pattern
 * @param inputs the lines expected across all matched files
 * @param numExpectedShards expected number of matched files, checked only when present
 * @throws IOException if matching or reading a file fails
 */
static void checkFileContents(String baseName, List<String> inputs, Optional<Integer> numExpectedShards) throws IOException {
    final String glob = baseName + "*";
    List<File> shardFiles = Lists.newArrayList();
    for (Metadata matched : FileSystems.match(Collections.singletonList(glob)).get(0).metadata()) {
        shardFiles.add(new File(matched.resourceId().toString()));
    }
    if (numExpectedShards.isPresent()) {
        assertEquals(numExpectedShards.get().intValue(), shardFiles.size());
    }
    List<String> contents = Lists.newArrayList();
    for (File shardFile : shardFiles) {
        try (BufferedReader in = new BufferedReader(new FileReader(shardFile))) {
            String current;
            while ((current = in.readLine()) != null) {
                // Header and footer lines are writer decoration, not payload.
                boolean isHeaderOrFooter = current.equals(SimpleWriter.HEADER) || current.equals(SimpleWriter.FOOTER);
                if (!isHeaderOrFooter) {
                    contents.add(current);
                }
            }
        }
    }
    assertThat(contents, containsInAnyOrder(inputs.toArray()));
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) File(java.io.File)

Example 7 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class ExplicitShardedFile method readLines.

/**
 * Reads each of the given files as UTF-8 text and returns all of their lines in one list, in
 * iteration order of {@code files}.
 *
 * <p>Intended only for tests over small data: everything is read serially, by a single thread,
 * and held in memory, which is not reasonable for large inputs.
 *
 * @param files the files to read
 * @return the lines of all files, concatenated
 * @throws IOException if a file cannot be opened or read
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
    List<String> collected = Lists.newArrayList();
    int fileNumber = 0;
    for (Metadata shard : files) {
        // 1-based position of the current file, used only for the progress log.
        fileNumber++;
        try (Reader in = Channels.newReader(FileSystems.open(shard.resourceId()), StandardCharsets.UTF_8.name())) {
            List<String> shardLines = CharStreams.readLines(in);
            collected.addAll(shardLines);
            LOG.debug("[{} of {}] Read {} lines from file: {}", fileNumber, files.size(), shardLines.size(), shard);
        }
    }
    return collected;
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) Reader(java.io.Reader) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 8 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class NumberedShardedFile method readLines.

/**
 * Returns the lines of all given files, decoded as UTF-8, as a single list.
 *
 * <p>Only appropriate for testing with small data; the files are processed one after another on
 * the calling thread and their full contents are kept in memory.
 *
 * @param files the files whose lines are read
 * @return every line of every file, in the order the files are iterated
 * @throws IOException if opening or reading any file fails
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
    List<String> result = Lists.newArrayList();
    int position = 0;
    for (Metadata current : files) {
        try (Reader source = Channels.newReader(FileSystems.open(current.resourceId()), StandardCharsets.UTF_8.name())) {
            List<String> fileLines = CharStreams.readLines(source);
            result.addAll(fileLines);
            // position is 0-based; the log message reports a 1-based index.
            LOG.debug("[{} of {}] Read {} lines from file: {}", position + 1, files.size(), fileLines.size(), current);
        }
        position++;
    }
    return result;
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) Reader(java.io.Reader) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 9 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class NumberedShardedFile method checkTotalNumOfFiles.

/**
 * Checks that the number of given files equals the total shard count parsed from a shard name
 * using the name template. If no template is specified, "SSSS-of-NNNN" is used as the default,
 * and "NNNN" is the expected total number of files.
 *
 * <p>Only the first file whose name matches the template decides the result; files with no
 * file name or a non-matching name are skipped.
 *
 * @return {@code true} if at least one shard name matches the template and the total number of
 *     given files equals the number parsed from that shard name.
 */
@VisibleForTesting
boolean checkTotalNumOfFiles(Collection<Metadata> files) {
    for (Metadata fileMetadata : files) {
        String shardName = fileMetadata.resourceId().getFilename();
        // A null file name means the path has zero elements; nothing to parse.
        if (shardName != null) {
            Matcher shardMatcher = shardTemplate.matcher(shardName);
            if (shardMatcher.matches()) {
                // First matching shard decides: its embedded total must equal the file count.
                int expectedTotal = Integer.parseInt(shardMatcher.group("numshards"));
                return expectedTotal == files.size();
            }
        }
    }
    return false;
}
Also used : Matcher(java.util.regex.Matcher) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 10 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class HadoopFileSystem method match.

/**
 * Resolves each spec as a glob against the underlying file system and returns one
 * {@link MatchResult} per spec, in the same order as {@code specs}.
 *
 * <p>A {@code null} glob result is reported as {@code Status.NOT_FOUND} with no metadata. An
 * {@link IOException} while handling a spec is reported as {@code Status.ERROR} for that spec
 * only. Otherwise the result is {@code Status.OK} with metadata for the matched entries that
 * are files; entries that are not files are skipped.
 */
@Override
protected List<MatchResult> match(List<String> specs) {
    ImmutableList.Builder<MatchResult> results = ImmutableList.builder();
    for (String spec : specs) {
        try {
            FileStatus[] globbed = fileSystem.globStatus(new Path(spec));
            if (globbed != null) {
                List<Metadata> matchedFiles = new ArrayList<>();
                for (FileStatus candidate : globbed) {
                    if (!candidate.isFile()) {
                        continue;
                    }
                    URI uri = dropEmptyAuthority(candidate.getPath().toUri().toString());
                    Metadata entry = Metadata.builder()
                        .setResourceId(new HadoopResourceId(uri))
                        .setIsReadSeekEfficient(true)
                        .setSizeBytes(candidate.getLen())
                        .build();
                    matchedFiles.add(entry);
                }
                results.add(MatchResult.create(Status.OK, matchedFiles));
            } else {
                results.add(MatchResult.create(Status.NOT_FOUND, Collections.<Metadata>emptyList()));
            }
        } catch (IOException e) {
            results.add(MatchResult.create(Status.ERROR, e));
        }
    }
    return results.build();
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ImmutableList(com.google.common.collect.ImmutableList) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) ArrayList(java.util.ArrayList) IOException(java.io.IOException) MatchResult(org.apache.beam.sdk.io.fs.MatchResult) URI(java.net.URI)

Aggregations

Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 29, Test (org.junit.Test): 15, File (java.io.File): 14, ArrayList (java.util.ArrayList): 12, PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 9, VisibleForTesting (com.google.common.annotations.VisibleForTesting): 4, AvroMetadata (org.apache.beam.sdk.io.AvroSource.AvroMetadata): 4, ImmutableList (com.google.common.collect.ImmutableList): 3, FileReader (java.io.FileReader): 3, Reader (java.io.Reader): 3, MatchResult (org.apache.beam.sdk.io.fs.MatchResult): 3, BufferedReader (java.io.BufferedReader): 2, IOException (java.io.IOException): 2, List (java.util.List): 2, GenericRecord (org.apache.avro.generic.GenericRecord): 2, Objects (com.google.api.services.storage.model.Objects): 1, StorageObject (com.google.api.services.storage.model.StorageObject): 1, Predicate (com.google.common.base.Predicate): 1, FileNotFoundException (java.io.FileNotFoundException): 1, BigInteger (java.math.BigInteger): 1