Search in sources :

Example 31 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

The method getEstimatedSizeBytes of the class FileBasedSource.

/**
 * Estimates the size, in bytes, of the data covered by this source.
 *
 * <p>Provided here so subclasses do not have to implement size estimation themselves. In
 * {@code FILEPATTERN} mode the estimate is the sum of {@code sizeBytes()} over every match that
 * {@code FileSystems.match} returns for the pattern. In any other mode it is the distance from
 * the start offset to the smaller of the end offset and {@code getMaxEndOffset(options)}.
 *
 * @param options pipeline options, forwarded to {@code getMaxEndOffset}
 * @return the estimated size in bytes
 * @throws IOException propagated from the underlying file-system calls
 */
@Override
public final long getEstimatedSizeBytes(PipelineOptions options) throws IOException {
    // The spec is resolved first, whatever the mode, exactly as callers have always observed.
    String spec = fileOrPatternSpec.get();

    if (mode != Mode.FILEPATTERN) {
        // Single-file case: clamp the end offset to the maximum end offset before measuring.
        long rangeStart = getStartOffset();
        long rangeEnd = Math.min(getEndOffset(), getMaxEndOffset(options));
        return rangeEnd - rangeStart;
    }

    // Pattern case: add up the reported size of every matched file.
    List<Metadata> matched = FileSystems.match(spec, emptyMatchTreatment).metadata();
    long byteCount = matched.stream().mapToLong(Metadata::sizeBytes).sum();
    LOG.info("Filepattern {} matched {} files with total size {}", spec, matched.size(), byteCount);
    return byteCount;
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata)

Example 32 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

The method getMaxEndOffset of the class FileBasedSource.

/**
 * Returns the size in bytes reported by the metadata of the single file behind this source.
 *
 * <p>Only valid when this source is not in {@code FILEPATTERN} mode; the precondition is checked
 * before the metadata is looked up.
 *
 * @param options pipeline options (not read by this implementation)
 * @return {@code sizeBytes()} of the metadata returned by {@code getSingleFileMetadata()}
 * @throws IOException declared by the overridden signature
 */
@Override
public final long getMaxEndOffset(PipelineOptions options) throws IOException {
    boolean notAPattern = mode != Mode.FILEPATTERN;
    checkArgument(notAPattern, "Cannot determine the exact end offset of a file pattern");
    return getSingleFileMetadata().sizeBytes();
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata)

Example 33 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

The method readFilesWithRetries of the class FilePatternMatchingShardedFile.

/**
 * Discovers all shards of this file using the provided {@link Sleeper} and {@link BackOff}.
 *
 * <p>Each attempt matches {@code filePattern}. An empty match or an {@link IOException} leads to
 * another attempt for as long as {@code BackOffUtils.next(sleeper, backOff)} returns true.
 *
 * @param sleeper passed to {@code BackOffUtils.next} between attempts
 * @param backOff passed to {@code BackOffUtils.next}; decides whether another attempt is made
 * @return the result of {@code readLines} applied to the matched files
 * @throws IOException when no attempt succeeded; its cause is the last {@link IOException}
 *     caught, or {@code null} if every attempt only produced an empty match
 * @throws InterruptedException declared by the signature; presumably raised by the back-off wait
 */
@Override
public List<String> readFilesWithRetries(Sleeper sleeper, BackOff backOff) throws IOException, InterruptedException {
    IOException lastException = null;
    do {
        try {
            Collection<Metadata> files = FileSystems.match(filePattern).metadata();
            // Joining every file name walks the whole match result, so only pay for it when the
            // message will actually be emitted.
            if (LOG.isDebugEnabled()) {
                LOG.debug("Found file(s) {} by matching the path: {}", files.stream().map(Metadata::resourceId).map(ResourceId::getFilename).collect(Collectors.joining(",")), filePattern);
            }
            if (files.isEmpty()) {
                // Nothing matched yet; fall through to the back-off check and try again.
                continue;
            }
            // Read data from file paths
            return readLines(files);
        } catch (IOException e) {
            // Remember the failure for the final exception and retry. The exception is logged
            // here as well, because only the last one is attached to the final throw.
            lastException = e;
            LOG.warn("Error in file reading. Ignore and retry.", e);
        }
    } while (BackOffUtils.next(sleeper, backOff));
    // Failed after max retries
    throw new IOException(String.format("Unable to read file(s) after retrying %d times", MAX_READ_RETRIES), lastException);
}
Also used : ResourceId(org.apache.beam.sdk.io.fs.ResourceId) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) IOException(java.io.IOException)

Example 34 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

The method readFilesWithRetries of the class NumberedShardedFile.

/**
 * Discovers all shards of this file using the provided {@link Sleeper} and {@link BackOff}.
 *
 * <p>Because of eventual consistency, reads may discover no files or fewer files than the shard
 * template implies. In this case, the read is considered to have failed.
 *
 * <p>Each failed attempt (empty match, {@code checkTotalNumOfFiles} returning false, or an
 * {@link IOException}) leads to another attempt for as long as
 * {@code BackOffUtils.next(sleeper, backOff)} returns true.
 *
 * @param sleeper passed to {@code BackOffUtils.next} between attempts
 * @param backOff passed to {@code BackOffUtils.next}; decides whether another attempt is made
 * @return the result of {@code readLines} applied to the matched files
 * @throws IOException when no attempt succeeded; its cause is the last {@link IOException}
 *     caught, or {@code null} if no attempt threw one
 * @throws InterruptedException declared by the signature; presumably raised by the back-off wait
 */
@Override
public List<String> readFilesWithRetries(Sleeper sleeper, BackOff backOff) throws IOException, InterruptedException {
    IOException lastException = null;
    do {
        try {
            // Match the file pattern, which may contain a glob
            Collection<Metadata> files = Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(filePattern))).metadata();
            LOG.debug("Found {} file(s) by matching the path: {}", files.size(), filePattern);
            if (files.isEmpty() || !checkTotalNumOfFiles(files)) {
                // Incomplete view of the shards; fall through to the back-off check and retry.
                continue;
            }
            // Read data from file paths
            return readLines(files);
        } catch (IOException e) {
            // Remember the failure for the final exception and retry. The exception is logged
            // here as well, because only the last one is attached to the final throw.
            lastException = e;
            LOG.warn("Error in file reading. Ignore and retry.", e);
        }
    } while (BackOffUtils.next(sleeper, backOff));
    // Failed after max retries
    throw new IOException(String.format("Unable to read file(s) after retrying %d times", MAX_READ_RETRIES), lastException);
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) IOException(java.io.IOException)

Example 35 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

The method testReadRangeFromFileWithSplitsFromMiddleOfHeader of the class FileBasedSourceTest.

/**
 * Reads a file of ten header-prefixed groups from start offsets 1, 2 and 3 and checks that each
 * read yields every entry from index 10 onward, with the header entries removed.
 */
@Test
public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    String header = "<h>";

    // Ten groups, each a header followed by a generated string dataset.
    List<String> data = new ArrayList<>();
    for (int group = 0; group < 10; group++) {
        data.add(header);
        data.addAll(createStringDataset(3, 9));
    }
    File file = createFileWithData("file", data);

    // Expected output: everything from index 10 onward, with all header entries stripped.
    List<String> expectedResults = new ArrayList<>(data.subList(10, data.size()));
    expectedResults.removeAll(Collections.singletonList(header));

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());

    // Start offsets 1, 2 and 3 begin the split after "<", "<h" and "<h>" of the header.
    for (int startOffset = 1; startOffset <= 3; startOffset++) {
        TestFileBasedSource source = new TestFileBasedSource(metadata, 64, startOffset, Long.MAX_VALUE, header);
        assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));
    }
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) File(java.io.File) Test(org.junit.Test)

Aggregations

Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata)42 Test (org.junit.Test)22 File (java.io.File)16 ArrayList (java.util.ArrayList)14 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)9 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)6 VisibleForTesting (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting)6 IOException (java.io.IOException)5 Path (java.nio.file.Path)5 AvroMetadata (org.apache.beam.sdk.io.AvroSource.AvroMetadata)5 Reader (java.io.Reader)4 List (java.util.List)4 ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)4 BufferedReader (java.io.BufferedReader)3 Matcher (java.util.regex.Matcher)3 Pattern (java.util.regex.Pattern)3 GenericRecord (org.apache.avro.generic.GenericRecord)3 FileReader (java.io.FileReader)2 HashSet (java.util.HashSet)2 FileStatus (org.apache.hadoop.fs.FileStatus)2