Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project: class FileBasedSource, method getEstimatedSizeBytes.
@Override
public final long getEstimatedSizeBytes(PipelineOptions options) throws IOException {
  // Implemented here so subclasses need not provide their own size estimation;
  // both single files and file patterns are sized via the FileSystem interface.
  String spec = fileOrPatternSpec.get();
  if (mode != Mode.FILEPATTERN) {
    // Single-file mode: the estimate is the length of the tracked range,
    // clamped to the actual end of the file.
    long rangeStart = getStartOffset();
    long rangeEnd = Math.min(getEndOffset(), getMaxEndOffset(options));
    return rangeEnd - rangeStart;
  }
  // File-pattern mode: sum the reported sizes of every matched file.
  List<Metadata> matches = FileSystems.match(spec, emptyMatchTreatment).metadata();
  long sizeSum = 0;
  for (Metadata match : matches) {
    sizeSum += match.sizeBytes();
  }
  LOG.info("Filepattern {} matched {} files with total size {}", spec, matches.size(), sizeSum);
  return sizeSum;
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project: class FileBasedSource, method getMaxEndOffset.
@Override
public final long getMaxEndOffset(PipelineOptions options) throws IOException {
  // A file pattern has no single well-defined end offset, so reject that mode up front.
  checkArgument(mode != Mode.FILEPATTERN, "Cannot determine the exact end offset of a file pattern");
  // For a single file, the maximum end offset is simply the file's size in bytes.
  return getSingleFileMetadata().sizeBytes();
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project: class FilePatternMatchingShardedFile, method readFilesWithRetries.
/**
 * Discovers all shards of this file using the provided {@link Sleeper} and {@link BackOff}.
 */
@Override
public List<String> readFilesWithRetries(Sleeper sleeper, BackOff backOff)
    throws IOException, InterruptedException {
  IOException lastSeen = null;
  while (true) {
    try {
      // Expand the pattern into the currently visible set of shard files.
      Collection<Metadata> matched = FileSystems.match(filePattern).metadata();
      LOG.debug(
          "Found file(s) {} by matching the path: {}",
          matched.stream()
              .map(Metadata::resourceId)
              .map(ResourceId::getFilename)
              .collect(Collectors.joining(",")),
          filePattern);
      if (!matched.isEmpty()) {
        // Read data from file paths
        return readLines(matched);
      }
      // Nothing matched yet; fall through to the backoff check and retry.
    } catch (IOException e) {
      // Remember the failure so it can be chained onto the final exception.
      lastSeen = e;
      LOG.warn("Error in file reading. Ignore and retry.");
    }
    if (!BackOffUtils.next(sleeper, backOff)) {
      break;
    }
  }
  // Failed after max retries
  throw new IOException(
      String.format("Unable to read file(s) after retrying %d times", MAX_READ_RETRIES), lastSeen);
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project: class NumberedShardedFile, method readFilesWithRetries.
/**
 * Discovers all shards of this file using the provided {@link Sleeper} and {@link BackOff}.
 *
 * <p>Because of eventual consistency, reads may discover no files or fewer files than the shard
 * template implies. In this case, the read is considered to have failed.
 */
@Override
public List<String> readFilesWithRetries(Sleeper sleeper, BackOff backOff)
    throws IOException, InterruptedException {
  IOException mostRecentFailure = null;
  do {
    try {
      // Match inputPath, which may contain a glob.
      Collection<Metadata> shardFiles =
          Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(filePattern)))
              .metadata();
      LOG.debug("Found {} file(s) by matching the path: {}", shardFiles.size(), filePattern);
      // Due to eventual consistency we may see too few shards; only read once the
      // full expected set is visible.
      boolean allShardsVisible = !shardFiles.isEmpty() && checkTotalNumOfFiles(shardFiles);
      if (allShardsVisible) {
        // Read data from file paths
        return readLines(shardFiles);
      }
    } catch (IOException e) {
      // Remember the failure so it can be chained onto the final exception, then retry.
      mostRecentFailure = e;
      LOG.warn("Error in file reading. Ignore and retry.");
    }
  } while (BackOffUtils.next(sleeper, backOff));
  // Failed after max retries
  throw new IOException(
      String.format("Unable to read file(s) after retrying %d times", MAX_READ_RETRIES),
      mostRecentFailure);
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project: class FileBasedSourceTest, method testReadRangeFromFileWithSplitsFromMiddleOfHeader.
@Test
public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  final String header = "<h>";
  // Build ten blocks, each consisting of the header line followed by nine data lines.
  List<String> lines = new ArrayList<>();
  for (int block = 0; block < 10; block++) {
    lines.add(header);
    lines.addAll(createStringDataset(3, 9));
  }
  File file = createFileWithData("file", lines);
  // Expect everything after the first block (10 lines), with headers filtered out.
  List<String> expected = new ArrayList<>(lines.subList(10, lines.size()));
  // Remove all occurrences of header from expected results.
  expected.removeAll(Collections.singletonList(header));
  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  // Start the read after "<", "<h", and "<h>" respectively: any split point that
  // lands inside the header must produce the same set of records.
  for (int startOffset = 1; startOffset <= 3; startOffset++) {
    TestFileBasedSource source =
        new TestFileBasedSource(metadata, 64, startOffset, Long.MAX_VALUE, header);
    assertThat(expected, containsInAnyOrder(readFromSource(source, options).toArray()));
  }
}
Aggregations