use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class WriteFilesTest method checkFileContents.
/**
 * Checks the contents of the output files whose names start with {@code baseName}.
 *
 * <p>Every file matched by the pattern {@code baseName + "*"} is read line by line. Lines equal to
 * {@code SimpleWriter.HEADER} or {@code SimpleWriter.FOOTER} are dropped, and the remaining lines
 * across all files must equal {@code inputs} in any order.
 *
 * @param baseName prefix of the output files; a trailing {@code *} is appended to form the pattern
 * @param inputs the lines expected across all matched files, in any order
 * @param numExpectedShards when present, the exact number of files the pattern must match
 * @throws IOException if matching the pattern or reading a file fails
 */
static void checkFileContents(String baseName, List<String> inputs, Optional<Integer> numExpectedShards) throws IOException {
  final String pattern = baseName + "*";
  List<Metadata> metadata = FileSystems.match(Collections.singletonList(pattern)).get(0).metadata();
  List<File> outputFiles = Lists.newArrayList();
  for (Metadata meta : metadata) {
    outputFiles.add(new File(meta.resourceId().toString()));
  }
  if (numExpectedShards.isPresent()) {
    assertEquals(numExpectedShards.get().intValue(), outputFiles.size());
  }
  List<String> actual = Lists.newArrayList();
  for (File outputFile : outputFiles) {
    // Decode explicitly as UTF-8 instead of the platform default charset that FileReader uses, so
    // the comparison does not depend on the JVM's file.encoding setting. This matches the
    // five-argument overload of this method.
    // NOTE(review): assumes the files are written as UTF-8 - confirm against SimpleWriter.
    try (BufferedReader reader = java.nio.file.Files.newBufferedReader(outputFile.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
      String line;
      while ((line = reader.readLine()) != null) {
        // Header and footer lines are not part of the written elements.
        if (!line.equals(SimpleWriter.HEADER) && !line.equals(SimpleWriter.FOOTER)) {
          actual.add(line);
        }
      }
    }
  }
  assertThat(actual, containsInAnyOrder(inputs.toArray()));
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class LocalFileSystem method matchOne.
/**
 * Matches a single file spec, which may be a plain path or a glob, against the local file system.
 *
 * <p>A leading {@code file:} scheme is stripped (case-insensitively). On Windows, leading
 * {@code ///} and {@code /} prefixes are stripped as well. A relative spec is resolved against
 * {@code baseDir}. If the resulting path exists it is returned as the single match; otherwise the
 * path is treated as a glob and every regular file below the spec's non-glob parent directory is
 * tested against it.
 *
 * @param baseDir directory that a relative {@code spec} is resolved against
 * @param spec file path or glob, optionally prefixed with {@code file:}
 * @return an {@code OK} result with the matched metadata, or a {@code NOT_FOUND} result when the
 *     non-glob parent directory does not exist or no file matches
 */
private MatchResult matchOne(String baseDir, String spec) {
  // Lower-case with Locale.ROOT so the scheme check does not depend on the JVM default locale
  // (for example, a Turkish default locale lower-cases 'I' to a dotless 'i', which would make
  // "FILE:" fail this check).
  if (spec.toLowerCase(java.util.Locale.ROOT).startsWith("file:")) {
    spec = spec.substring("file:".length());
  }
  if (SystemUtils.IS_OS_WINDOWS) {
    // The prefixes are tried in order against the progressively stripped spec. They consist only
    // of slashes, so no case folding is needed for the comparison.
    List<String> prefixes = Arrays.asList("///", "/");
    for (String prefix : prefixes) {
      if (spec.startsWith(prefix)) {
        spec = spec.substring(prefix.length());
      }
    }
  }
  // BEAM-6213: Windows breaks on Paths.get(spec).toFile() with a glob because
  // it considers it an invalid file system pattern. We should use
  // new File(spec) to avoid such validation.
  // See https://bugs.openjdk.java.net/browse/JDK-8197918
  // However, new File(parent, child) resolves absolute `child` in a system-dependent
  // way that is generally incorrect, for example new File($PWD, "/tmp/foo") resolves
  // to $PWD/tmp/foo on many systems, unlike Paths.get($PWD).resolve("/tmp/foo") which
  // correctly resolves to "/tmp/foo". We add just this one piece of logic here, without
  // switching to Paths which could require a rewrite of this module to support
  // both Windows and correct file resolution.
  // The root cause is that globs are not files but we are using file manipulation libraries
  // to work with them.
  final File specAsFile = new File(spec);
  final File absoluteFile = specAsFile.isAbsolute() ? specAsFile : new File(baseDir, spec);
  if (absoluteFile.exists()) {
    // The spec names an existing file or directory, so no glob expansion is needed.
    return MatchResult.create(Status.OK, ImmutableList.of(toMetadata(absoluteFile)));
  }
  File parent = getSpecNonGlobPrefixParentFile(absoluteFile.getAbsolutePath());
  if (!parent.exists()) {
    return MatchResult.create(Status.NOT_FOUND, Collections.emptyList());
  }
  // Method getAbsolutePath() on Windows platform may return something like
  // "c:\temp\file.txt". FileSystem.getPathMatcher() call below will treat
  // '\' (backslash) as an escape character, instead of a directory
  // separator. Replacing backslash with double-backslash solves the problem.
  // We perform the replacement on all platforms, even those that allow
  // backslash as a part of the filename, because Globs.toRegexPattern will
  // eat one backslash.
  String pathToMatch = absoluteFile.getAbsolutePath().replaceAll(Matcher.quoteReplacement("\\"), Matcher.quoteReplacement("\\\\"));
  final PathMatcher matcher = java.nio.file.FileSystems.getDefault().getPathMatcher("glob:" + pathToMatch);
  // TODO: Avoid iterating all files: https://issues.apache.org/jira/browse/BEAM-1309
  Iterable<File> files = fileTraverser().depthFirstPreOrder(parent);
  // Keep only regular files whose path satisfies the glob, converting each to metadata.
  List<Metadata> result =
      StreamSupport.stream(files.spliterator(), false)
          .filter(candidate -> candidate.isFile() && matcher.matches(candidate.toPath()))
          .map(candidate -> toMetadata(candidate))
          .collect(Collectors.toList());
  if (result.isEmpty()) {
    // TODO: consider to return Status.OK for globs.
    return MatchResult.create(Status.NOT_FOUND, new FileNotFoundException(String.format("No files found for spec: %s in working directory %s", spec, baseDir)));
  }
  return MatchResult.create(Status.OK, result);
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class FileBasedSource method split.
/**
 * Splits this source into sources over single files or sub-ranges of single files.
 *
 * <p>In {@code FILEPATTERN} mode the pattern is expanded, one single-file source is created per
 * matched file, and each of those is split again. Otherwise the source is split by the superclass
 * when {@code isSplittable()} is true, and returned unchanged when it is not.
 *
 * @param desiredBundleSizeBytes the desired size of each resulting bundle, in bytes
 * @param options the pipeline options, passed through to nested {@code split} calls
 * @return the resulting sources
 * @throws Exception if expanding the pattern or splitting a file fails
 */
@Override
public final List<? extends FileBasedSource<T>> split(long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  String fileOrPattern = fileOrPatternSpec.get();

  if (mode != Mode.FILEPATTERN) {
    // Single file or sub-range: only the superclass knows how to cut it further.
    if (!isSplittable()) {
      LOG.debug("The source for file {} is not split into sub-range based sources since " + "the file is not seekable", fileOrPattern);
      return ImmutableList.of(this);
    }
    @SuppressWarnings("unchecked")
    List<FileBasedSource<T>> subRanges = (List<FileBasedSource<T>>) super.split(desiredBundleSizeBytes, options);
    return subRanges;
  }

  // File pattern: expand it to concrete files, then split each file on its own.
  long startMillis = System.currentTimeMillis();
  List<Metadata> matchedFiles = FileSystems.match(fileOrPattern, emptyMatchTreatment).metadata();
  List<FileBasedSource<T>> bundles = new ArrayList<>(matchedFiles.size());
  for (Metadata fileMetadata : matchedFiles) {
    FileBasedSource<T> wholeFile = createForSubrangeOfFile(fileMetadata, 0, fileMetadata.sizeBytes());
    verify(
        wholeFile.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE,
        "%s.createForSubrangeOfFile must return a source in mode %s",
        wholeFile,
        Mode.SINGLE_FILE_OR_SUBRANGE);
    // The per-file source is not in FILEPATTERN mode, so this nested call cannot recurse back
    // into the pattern branch; it takes the single-file branch above.
    bundles.addAll(wholeFile.split(desiredBundleSizeBytes, options));
  }
  LOG.info(
      "Splitting filepattern {} into bundles of size {} took {} ms " + "and produced {} files and {} bundles",
      fileOrPattern,
      desiredBundleSizeBytes,
      System.currentTimeMillis() - startMillis,
      matchedFiles.size(),
      bundles.size());
  return bundles;
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class ReadableFileCoder method decode.
/**
 * Decodes a {@link FileIO.ReadableFile} from {@code inStream}.
 *
 * <p>The stream is read in two steps: the file metadata via {@code getMetadataCoder()}, then a
 * variable-length integer used as an index into {@code Compression.values()}.
 *
 * @param inStream the stream to read the encoded value from
 * @return the decoded readable file
 * @throws CoderException if the decoded compression index does not identify a {@code Compression}
 *     constant, or if a nested coder fails to decode
 * @throws IOException if reading from the stream fails
 */
@Override
public FileIO.ReadableFile decode(@UnknownKeyFor @NonNull @Initialized InputStream inStream) throws CoderException, IOException {
  MatchResult.Metadata metadata = getMetadataCoder().decode(inStream);
  int compressionIndex = VarIntCoder.of().decode(inStream);
  Compression[] compressions = Compression.values();
  // Reject an out-of-range index with the declared CoderException rather than letting the array
  // access fail with an unchecked ArrayIndexOutOfBoundsException.
  if (compressionIndex < 0 || compressionIndex >= compressions.length) {
    throw new CoderException(String.format("Invalid compression index %d; expected a value in [0, %d)", compressionIndex, compressions.length));
  }
  return new FileIO.ReadableFile(metadata, compressions[compressionIndex]);
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class WriteFilesTest method checkFileContents.
/**
 * Checks the names and contents of the output files whose names start with {@code baseName}.
 *
 * <p>At least one file must match the pattern {@code baseName + "*"}. When
 * {@code numExpectedShards} is present, the number of files and the set of
 * {@code NNNN-of-NNNN} shard markers in their names must match it exactly. Every file is then read
 * as UTF-8, lines equal to {@code SimpleWriter.HEADER} or {@code SimpleWriter.FOOTER} are dropped,
 * and the remaining lines across all files must equal {@code inputs} in any order.
 *
 * @param baseName prefix of the output files; a trailing {@code *} is appended to form the pattern
 * @param inputs the lines expected across all matched files, in any order
 * @param numExpectedShards when present, the exact number of shards expected
 * @param expectRemovedTempDirectory when true, asserts that no entry in the parent directory of
 *     {@code baseName} has a name containing {@code FileBasedSink.TEMP_DIRECTORY_PREFIX}
 * @param shardContentChecker when non-null, called once per file with the shard number parsed
 *     from the file name and the lines read from that file
 * @throws IOException if matching the pattern or reading a file fails
 */
static void checkFileContents(String baseName, List<String> inputs, Optional<Integer> numExpectedShards, boolean expectRemovedTempDirectory, BiFunction<Integer, List<String>, Void> shardContentChecker) throws IOException {
  List<File> outputFiles = Lists.newArrayList();
  final String pattern = baseName + "*";
  List<Metadata> metadata = FileSystems.match(Collections.singletonList(pattern)).get(0).metadata();
  for (Metadata meta : metadata) {
    outputFiles.add(new File(meta.resourceId().toString()));
  }
  assertFalse("Should have produced at least 1 output file", outputFiles.isEmpty());
  // Group 1 captures the four-digit shard number.
  Pattern shardPattern = Pattern.compile("(\\d{4})-of-\\d{4}");
  if (numExpectedShards.isPresent()) {
    assertEquals(numExpectedShards.get().intValue(), outputFiles.size());
    Set<String> expectedShards = Sets.newHashSet();
    DecimalFormat df = new DecimalFormat("0000");
    for (int i = 0; i < numExpectedShards.get(); i++) {
      expectedShards.add(String.format("%s-of-%s", df.format(i), df.format(numExpectedShards.get())));
    }
    Set<String> outputShards = Sets.newHashSet();
    for (File file : outputFiles) {
      Matcher matcher = shardPattern.matcher(file.getName());
      assertTrue(matcher.find());
      // add() returning false would mean two files carry the same shard marker.
      assertTrue(outputShards.add(matcher.group()));
    }
    assertEquals(expectedShards, outputShards);
  }
  List<String> actual = Lists.newArrayList();
  for (File outputFile : outputFiles) {
    List<String> actualShard = Lists.newArrayList();
    try (BufferedReader reader = Files.newBufferedReader(outputFile.toPath(), Charsets.UTF_8)) {
      String line;
      while ((line = reader.readLine()) != null) {
        // Header and footer lines are not part of the written elements.
        if (!line.equals(SimpleWriter.HEADER) && !line.equals(SimpleWriter.FOOTER)) {
          actualShard.add(line);
        }
      }
    }
    if (shardContentChecker != null) {
      Matcher matcher = shardPattern.matcher(outputFile.getName());
      // Fail with a descriptive assertion rather than the IllegalStateException that group(1)
      // throws when find() did not match (possible when numExpectedShards is absent, because the
      // name check above is skipped in that case).
      assertTrue("Output file name does not contain a shard number: " + outputFile.getName(), matcher.find());
      int shardNumber = Integer.parseInt(matcher.group(1));
      shardContentChecker.apply(shardNumber, actualShard);
    }
    actual.addAll(actualShard);
  }
  assertThat(actual, containsInAnyOrder(inputs.toArray()));
  if (expectRemovedTempDirectory) {
    assertThat(Lists.newArrayList(new File(baseName).getParentFile().list()), Matchers.everyItem(not(containsString(FileBasedSink.TEMP_DIRECTORY_PREFIX))));
  }
}
Aggregations