Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Class LocalFileSystem, method matchOne.
/**
 * Matches a single spec against the local file system.
 *
 * <p>If {@code spec} names an existing path, that path is the only match. Otherwise
 * {@code spec} is interpreted as a glob and every regular file found by walking the spec's
 * parent directory is tested against it.
 *
 * @param spec a file path or glob pattern
 * @return a result with {@code Status.OK} and the matched metadata, or a result with
 *     {@code Status.NOT_FOUND} when the parent directory is missing or nothing matched
 * @throws IOException declared by the signature; presumably propagated from
 *     {@code toMetadata} -- confirm against that helper
 */
private MatchResult matchOne(String spec) throws IOException {
  File file = Paths.get(spec).toFile();
  if (file.exists()) {
    return MatchResult.create(Status.OK, ImmutableList.of(toMetadata(file)));
  }

  File parent = file.getAbsoluteFile().getParentFile();
  // getParentFile() returns null when the absolute path has no parent (for example a
  // non-existent drive root on Windows); treat that the same as a missing directory instead
  // of failing with a NullPointerException.
  if (parent == null || !parent.exists()) {
    return MatchResult.create(Status.NOT_FOUND, Collections.<Metadata>emptyList());
  }

  // Method getAbsolutePath() on Windows platform may return something like
  // "c:\temp\file.txt". FileSystem.getPathMatcher() call below will treat
  // '\' (backslash) as an escape character, instead of a directory
  // separator. Replacing backslash with double-backslash solves the problem.
  // We perform the replacement on all platforms, even those that allow
  // backslash as a part of the filename, because Globs.toRegexPattern will
  // eat one backslash.
  // Note: the first argument of replaceAll is a regex; the quoted single backslash yields
  // an escaped backslash, which matches exactly one literal backslash.
  String pathToMatch =
      file.getAbsolutePath()
          .replaceAll(Matcher.quoteReplacement("\\"), Matcher.quoteReplacement("\\\\"));
  final PathMatcher matcher =
      java.nio.file.FileSystems.getDefault().getPathMatcher("glob:" + pathToMatch);

  // TODO: Avoid iterating all files: https://issues.apache.org/jira/browse/BEAM-1309
  Iterable<File> files =
      com.google.common.io.Files.fileTreeTraverser().preOrderTraversal(parent);
  Iterable<File> matchedFiles =
      Iterables.filter(
          files,
          Predicates.and(
              com.google.common.io.Files.isFile(),
              new Predicate<File>() {
                @Override
                public boolean apply(File input) {
                  return matcher.matches(input.toPath());
                }
              }));

  // Append-only accumulation: an array-backed list is sufficient here.
  List<Metadata> result = Lists.newArrayList();
  for (File match : matchedFiles) {
    result.add(toMetadata(match));
  }

  if (result.isEmpty()) {
    // TODO: consider to return Status.OK for globs.
    return MatchResult.create(
        Status.NOT_FOUND,
        new FileNotFoundException(String.format("No files found for spec: %s.", spec)));
  } else {
    return MatchResult.create(Status.OK, result);
  }
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Class FileBasedSource, method getEstimatedSizeBytes.
/**
 * Estimates the size in bytes of the data this source reads.
 *
 * <p>This implementation is provided to simplify subclasses. For a file pattern it matches
 * the pattern through {@code FileSystems} and sums the sizes of all matched files; for a
 * single file (or sub-range of one) it returns the distance between the start offset and the
 * smaller of the end offset and the maximum end offset.
 *
 * @param options pipeline options, forwarded to {@code getMaxEndOffset}
 * @return the estimated size in bytes
 * @throws IOException declared by the signature; presumably raised by the file-system
 *     calls made here -- confirm against those helpers
 */
@Override
public final long getEstimatedSizeBytes(PipelineOptions options) throws IOException {
  // The precondition message uses %s: the checkState/checkArgument helpers do not
  // substitute "{}" placeholders.
  checkState(
      fileOrPatternSpec.isAccessible(),
      "Cannot estimate size of a FileBasedSource with inaccessible file pattern: %s.",
      fileOrPatternSpec);
  String fileOrPattern = fileOrPatternSpec.get();

  if (mode == Mode.FILEPATTERN) {
    long totalSize = 0;
    List<MatchResult> inputs = FileSystems.match(Collections.singletonList(fileOrPattern));
    // Exactly one spec was submitted, so exactly one result is expected back.
    MatchResult result = Iterables.getOnlyElement(inputs);
    checkArgument(
        result.status() == Status.OK,
        "Error matching the pattern or glob %s: status %s",
        fileOrPattern,
        result.status());
    List<Metadata> allMatches = result.metadata();
    for (Metadata metadata : allMatches) {
      totalSize += metadata.sizeBytes();
    }
    LOG.info(
        "Filepattern {} matched {} files with total size {}",
        fileOrPattern,
        allMatches.size(),
        totalSize);
    return totalSize;
  } else {
    long start = getStartOffset();
    // Clamp the configured end offset to the maximum end offset.
    long end = Math.min(getEndOffset(), getMaxEndOffset(options));
    return end - start;
  }
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Class FileBasedSource, method getMaxEndOffset.
/**
 * Returns the largest valid end offset for this source, which is the size in bytes reported
 * by the metadata of the single file it reads.
 *
 * <p>Only meaningful for single-file sources: the {@code checkArgument} precondition rejects
 * sources whose mode is {@code Mode.FILEPATTERN}.
 *
 * @param options pipeline options (not used by this implementation)
 * @return the size in bytes of the underlying file
 * @throws IOException declared by the signature; presumably raised while obtaining the file
 *     metadata -- confirm against {@code getSingleFileMetadata}
 */
@Override
public final long getMaxEndOffset(PipelineOptions options) throws IOException {
  boolean isSingleFile = mode != Mode.FILEPATTERN;
  checkArgument(isSingleFile, "Cannot determine the exact end offset of a file pattern");
  return getSingleFileMetadata().sizeBytes();
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Class FileBasedSource, method createReader.
/**
 * Creates a reader for this source.
 *
 * <p>The source is validated first. For a file pattern, the pattern is expanded and one
 * single-file reader is created per matched file, each covering the whole file; if exactly
 * one file matched, its reader is returned directly, otherwise the readers are wrapped in a
 * {@code FilePatternReader}. For a single-file source, a single-file reader is returned.
 *
 * @param options pipeline options, forwarded to {@code createSingleFileReader}
 * @return a reader over the file(s) described by this source
 * @throws IOException declared by the signature; presumably raised while expanding the
 *     pattern -- confirm against {@code expandFilePattern}
 */
@Override
public final BoundedReader<T> createReader(PipelineOptions options) throws IOException {
  // Validate the current source prior to creating a reader for it.
  this.validate();
  // The precondition message uses %s: checkState does not substitute "{}" placeholders.
  checkState(
      fileOrPatternSpec.isAccessible(),
      "Cannot create a file reader without access to the file or pattern specification: %s.",
      fileOrPatternSpec);
  String fileOrPattern = fileOrPatternSpec.get();

  if (mode == Mode.FILEPATTERN) {
    long startTime = System.currentTimeMillis();
    List<Metadata> fileMetadata = FileBasedSource.expandFilePattern(fileOrPattern);
    List<FileBasedReader<T>> fileReaders = new ArrayList<>();
    for (Metadata metadata : fileMetadata) {
      // Each matched file is read in full: from offset 0 to its reported size.
      long endOffset = metadata.sizeBytes();
      fileReaders.add(
          createForSubrangeOfFile(metadata, 0, endOffset).createSingleFileReader(options));
    }
    LOG.debug(
        "Creating a reader for file pattern {} took {} ms",
        fileOrPattern,
        System.currentTimeMillis() - startTime);

    // A pattern that matched exactly one file needs no wrapping reader.
    if (fileReaders.size() == 1) {
      return fileReaders.get(0);
    }
    return new FilePatternReader(this, fileReaders);
  } else {
    return createSingleFileReader(options);
  }
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Class GcsFileSystem, method toMetadata.
/**
 * Builds a {@code Metadata} record for the given storage object.
 *
 * <p>The resource id is derived from the object's GCS path, and the size is the object's
 * reported size, falling back to zero when the object reports no size.
 *
 * @param storageObject the storage object to describe
 * @return metadata carrying the object's resource id and size in bytes
 */
private Metadata toMetadata(StorageObject storageObject) {
  // A missing size is reported as zero bytes.
  BigInteger objectSize = firstNonNull(storageObject.getSize(), BigInteger.ZERO);

  // TODO: Address https://issues.apache.org/jira/browse/BEAM-1494
  // It is incorrect to set IsReadSeekEfficient true for files with content encoding set to gzip.
  Metadata.Builder builder =
      Metadata.builder()
          .setIsReadSeekEfficient(true)
          .setResourceId(GcsResourceId.fromGcsPath(GcsPath.fromObject(storageObject)));
  builder.setSizeBytes(objectSize.longValue());
  return builder.build();
}
Aggregations