use of org.apache.beam.sdk.io.fs.MatchResult in project beam by apache.
the class HadoopFileSystem method match.
/**
 * Expands every spec through {@code fileSystem.globStatus} and reports one {@link MatchResult}
 * per spec, in the same order as {@code specs}.
 *
 * <p>A {@code null} glob result is reported as {@code NOT_FOUND} with no metadata. Otherwise the
 * result is {@code OK} and carries metadata only for the entries for which
 * {@code FileStatus.isFile()} is true. An {@link IOException} raised while handling a spec is
 * captured in an {@code ERROR} result for that spec instead of being propagated.
 *
 * @param specs the path or glob specifications to resolve.
 * @return one match result for each entry of {@code specs}.
 */
@Override
protected List<MatchResult> match(List<String> specs) {
  ImmutableList.Builder<MatchResult> results = ImmutableList.builder();
  for (String spec : specs) {
    try {
      FileStatus[] statuses = fileSystem.globStatus(new Path(spec));
      if (statuses != null) {
        List<Metadata> files = new ArrayList<>();
        for (FileStatus status : statuses) {
          // Anything that is not a plain file is left out of the result.
          if (!status.isFile()) {
            continue;
          }
          URI uri = dropEmptyAuthority(status.getPath().toUri().toString());
          Metadata entry =
              Metadata.builder()
                  .setResourceId(new HadoopResourceId(uri))
                  .setIsReadSeekEfficient(true)
                  .setSizeBytes(status.getLen())
                  .build();
          files.add(entry);
        }
        results.add(MatchResult.create(Status.OK, files));
      } else {
        results.add(MatchResult.create(Status.NOT_FOUND, Collections.<Metadata>emptyList()));
      }
    } catch (IOException e) {
      // Keep going with the remaining specs; the failure is attached to this spec's result.
      results.add(MatchResult.create(Status.ERROR, e));
    }
  }
  return results.build();
}
use of org.apache.beam.sdk.io.fs.MatchResult in project beam by apache.
the class GcsFileSystem method match.
/**
 * Matches the given specs by splitting them into wildcard and non-wildcard paths, resolving the
 * two groups separately via {@code matchGlobs} and {@code matchNonGlobs}, and then interleaving
 * the two result sequences back into the original order of {@code specs}.
 *
 * @param specs the specifications to resolve; converted with {@code toGcsPaths}.
 * @return one match result per spec, in input order.
 * @throws IOException declared by this method's signature; presumably raised by the matching
 *     helpers (their bodies are not visible here).
 */
@Override
protected List<MatchResult> match(List<String> specs) throws IOException {
  List<GcsPath> wildcardPaths = Lists.newArrayList();
  List<GcsPath> plainPaths = Lists.newArrayList();
  // Remembers, position by position, which group each input path was routed to.
  List<Boolean> routedToGlob = Lists.newArrayList();
  for (GcsPath path : toGcsPaths(specs)) {
    boolean wildcard = GcsUtil.isWildcard(path);
    (wildcard ? wildcardPaths : plainPaths).add(path);
    routedToGlob.add(wildcard);
  }

  Iterator<MatchResult> globResults = matchGlobs(wildcardPaths).iterator();
  Iterator<MatchResult> plainResults = matchNonGlobs(plainPaths).iterator();

  ImmutableList.Builder<MatchResult> merged = ImmutableList.builder();
  for (boolean fromGlob : routedToGlob) {
    Iterator<MatchResult> source = fromGlob ? globResults : plainResults;
    String missingMessage =
        fromGlob ? "Expect globsMatchResults has next." : "Expect nonGlobsMatchResults has next.";
    checkState(source.hasNext(), missingMessage);
    merged.add(source.next());
  }

  // Both groups must be fully consumed, otherwise a helper returned too many results.
  checkState(!globResults.hasNext(), "Expect no more elements in globsMatchResults.");
  checkState(!plainResults.hasNext(), "Expect no more elements in nonGlobsMatchResults.");
  return merged.build();
}
use of org.apache.beam.sdk.io.fs.MatchResult in project beam by apache.
the class FileSystems method matchSingleFileSpec.
/**
 * Looks up the {@link Metadata} of the one file resource that {@code spec} refers to.
 *
 * <p>The spec is passed to {@link #match} as a single-element list, and the single
 * {@link MatchResult} that comes back must have status {@code OK} and carry exactly one
 * metadata entry.
 *
 * @param spec a resource specification that matches exactly one result.
 * @return the {@link Metadata} for the specified resource.
 * @throws FileNotFoundException if the match status is {@code NOT_FOUND}.
 * @throws IOException if the inner call to {@link #match} fails, if the match status is
 *     neither {@code OK} nor {@code NOT_FOUND}, or if the spec does not match exactly 1 result.
 */
public static Metadata matchSingleFileSpec(String spec) throws IOException {
  MatchResult onlyResult =
      Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(spec)));
  Status status = onlyResult.status();

  if (status == Status.NOT_FOUND) {
    throw new FileNotFoundException(String.format("File spec %s not found", spec));
  }
  if (status != Status.OK) {
    throw new IOException(
        String.format("Error matching file spec %s: status %s", spec, status));
  }

  List<Metadata> found = onlyResult.metadata();
  if (found.size() == 1) {
    return found.get(0);
  }
  throw new IOException(
      String.format(
          "Expecting spec %s to match exactly one file, but matched %s: %s",
          spec, found.size(), found));
}
use of org.apache.beam.sdk.io.fs.MatchResult in project beam by apache.
the class FileBasedSource method getEstimatedSizeBytes.
/**
 * Estimates the size in bytes of the data this source reads.
 *
 * <p>This implementation is provided to simplify subclasses. In {@code Mode.FILEPATTERN} the
 * file pattern is expanded with {@code FileSystems.match} and the sizes of all matched files
 * are summed. In any other mode the estimate is the span between {@code getStartOffset()} and
 * the smaller of {@code getEndOffset()} and {@code getMaxEndOffset(options)}.
 *
 * @param options pipeline options, forwarded to {@code getMaxEndOffset}.
 * @return the summed size of the matched files, or the offset span of this source.
 * @throws IOException declared by this method's signature; presumably raised when expanding
 *     the pattern or computing the maximum end offset fails.
 */
@Override
public final long getEstimatedSizeBytes(PipelineOptions options) throws IOException {
  // The precondition template must use %s, as the checkArgument call below does. The "{}"
  // placeholder belongs to the logger and would be printed literally here, with the argument
  // appended after the message instead of substituted into it.
  checkState(
      fileOrPatternSpec.isAccessible(),
      "Cannot estimate size of a FileBasedSource with inaccessible file pattern: %s.",
      fileOrPatternSpec);
  String fileOrPattern = fileOrPatternSpec.get();

  if (mode == Mode.FILEPATTERN) {
    // Size estimation of files and file patterns goes through the FileSystems interface.
    List<MatchResult> inputs = FileSystems.match(Collections.singletonList(fileOrPattern));
    MatchResult result = Iterables.getOnlyElement(inputs);
    checkArgument(
        result.status() == Status.OK,
        "Error matching the pattern or glob %s: status %s",
        fileOrPattern,
        result.status());

    List<Metadata> allMatches = result.metadata();
    long totalSize = 0;
    for (Metadata metadata : allMatches) {
      totalSize += metadata.sizeBytes();
    }
    LOG.info(
        "Filepattern {} matched {} files with total size {}",
        fileOrPattern,
        allMatches.size(),
        totalSize);
    return totalSize;
  } else {
    // The end is clamped to the maximum end offset before taking the span.
    long start = getStartOffset();
    long end = Math.min(getEndOffset(), getMaxEndOffset(options));
    return end - start;
  }
}
use of org.apache.beam.sdk.io.fs.MatchResult in project beam by apache.
the class FileBasedSource method expandFilePattern.
/**
 * Expands a single file or pattern spec through {@code FileSystems.match}, logs how many files
 * it matched, and returns an immutable copy of the matched metadata.
 *
 * @param fileOrPatternSpec the file or pattern specification to expand.
 * @return an immutable list of the metadata for every match.
 * @throws IOException declared by this method's signature; presumably raised when matching or
 *     reading the match metadata fails.
 */
private static List<Metadata> expandFilePattern(String fileOrPatternSpec) throws IOException {
  List<MatchResult> results = FileSystems.match(Collections.singletonList(fileOrPatternSpec));
  // Exactly one spec was submitted, so exactly one result is expected back.
  MatchResult onlyResult = Iterables.getOnlyElement(results);
  List<Metadata> matched = onlyResult.metadata();
  LOG.info("Matched {} files for pattern {}", matched.size(), fileOrPatternSpec);
  return ImmutableList.copyOf(matched);
}
Aggregations