Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
The class WriteFilesTest, method checkFileContents.
static void checkFileContents(
    String baseName, List<String> inputs, Optional<Integer> numExpectedShards) throws IOException {
  List<File> outputFiles = Lists.newArrayList();
  final String pattern = baseName + "*";
  List<Metadata> metadata =
      FileSystems.match(Collections.singletonList(pattern)).get(0).metadata();
  for (Metadata meta : metadata) {
    outputFiles.add(new File(meta.resourceId().toString()));
  }
  if (numExpectedShards.isPresent()) {
    assertEquals(numExpectedShards.get().intValue(), outputFiles.size());
  }

  List<String> actual = Lists.newArrayList();
  for (File outputFile : outputFiles) {
    try (BufferedReader reader = new BufferedReader(new FileReader(outputFile))) {
      for (;;) {
        String line = reader.readLine();
        if (line == null) {
          break;
        }
        if (!line.equals(SimpleWriter.HEADER) && !line.equals(SimpleWriter.FOOTER)) {
          actual.add(line);
        }
      }
    }
  }
  assertThat(actual, containsInAnyOrder(inputs.toArray()));
}
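For orientation, a minimal sketch of how a test might invoke this helper; the base name, the input list, and the JUnit TemporaryFolder field are illustrative assumptions, not taken from WriteFilesTest itself.

  // Hypothetical usage; assumes a JUnit TemporaryFolder field named tmpFolder and that the
  // pipeline under test has already written its shards under the given base name.
  List<String> inputs = Arrays.asList("line-1", "line-2", "line-3");
  String baseName = tmpFolder.getRoot().toPath().resolve("output").toString();
  checkFileContents(baseName, inputs, Optional.of(1)); // assert exactly one shard was produced
  // Passing an absent/empty Optional skips the shard-count assertion entirely.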
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
The class ExplicitShardedFile, method readLines.
/**
* Reads all the lines of all the files.
*
* <p>Not suitable for use except in testing of small data, since the data size may be far more
* than can be reasonably processed serially, in-memory, by a single thread.
*/
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> allLines = Lists.newArrayList();
  int i = 1;
  for (Metadata file : files) {
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> lines = CharStreams.readLines(reader);
      allLines.addAll(lines);
      LOG.debug("[{} of {}] Read {} lines from file: {}", i, files.size(), lines.size(), file);
    }
    i++;
  }
  return allLines;
}
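A minimal sketch of where the Collection<Metadata> argument typically comes from, assuming the caller resolves a glob through FileSystems.match first; the pattern string is hypothetical.

  // Hypothetical glob; FileSystems.match returns one MatchResult per spec, in order.
  MatchResult result =
      FileSystems.match(Collections.singletonList("/tmp/results/output-*")).get(0);
  if (result.status() == MatchResult.Status.OK) {
    List<String> allLines = readLines(result.metadata()); // metadata() throws for ERROR results
  }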
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
The class NumberedShardedFile, method readLines.
/**
* Reads all the lines of all the files.
*
* <p>Not suitable for use except in testing of small data, since the data size may be far more
* than can be reasonably processed serially, in-memory, by a single thread.
*/
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> allLines = Lists.newArrayList();
  int i = 1;
  for (Metadata file : files) {
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> lines = CharStreams.readLines(reader);
      allLines.addAll(lines);
      LOG.debug("[{} of {}] Read {} lines from file: {}", i, files.size(), lines.size(), file);
    }
    i++;
  }
  return allLines;
}
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
The class NumberedShardedFile, method checkTotalNumOfFiles.
/**
 * Checks whether the total number of files is correct by comparing it with the number parsed
 * from a shard name using a name template. If no template is specified, "SSSS-of-NNNN" is used
 * as the default, and "NNNN" is taken as the expected total number of files.
 *
 * @return {@code true} if at least one shard name matches the template and the total number of
 *     given files equals the number parsed from that shard name.
 */
@VisibleForTesting
boolean checkTotalNumOfFiles(Collection<Metadata> files) {
  for (Metadata fileMetadata : files) {
    String fileName = fileMetadata.resourceId().getFilename();

    if (fileName == null) {
      // this path has zero elements
      continue;
    }
    Matcher matcher = shardTemplate.matcher(fileName);
    if (!matcher.matches()) {
      // shard name doesn't match the pattern, check the next shard
      continue;
    }
    // once matched, extract the total number of shards and compare it to the file count
    return files.size() == Integer.parseInt(matcher.group("numshards"));
  }
  return false;
}
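To make the named-group lookup concrete, a standalone sketch of a shard-template match using java.util.regex; the regular expression below is an illustrative template containing a "numshards" group, not necessarily the default template this class compiles.

  // Illustrative template: "results-0002-of-0004" -> group "numshards" captures "0004".
  Pattern shardTemplate = Pattern.compile(".*-\\d+-of-(?<numshards>\\d+)");
  Matcher matcher = shardTemplate.matcher("results-0002-of-0004");
  if (matcher.matches()) {
    int expectedTotal = Integer.parseInt(matcher.group("numshards")); // 4
  }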
Use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
The class HadoopFileSystem, method match.
@Override
protected List<MatchResult> match(List<String> specs) {
  ImmutableList.Builder<MatchResult> resultsBuilder = ImmutableList.builder();
  for (String spec : specs) {
    try {
      FileStatus[] fileStatuses = fileSystem.globStatus(new Path(spec));
      if (fileStatuses == null) {
        resultsBuilder.add(MatchResult.create(Status.NOT_FOUND, Collections.<Metadata>emptyList()));
        continue;
      }

      List<Metadata> metadata = new ArrayList<>();
      for (FileStatus fileStatus : fileStatuses) {
        if (fileStatus.isFile()) {
          URI uri = dropEmptyAuthority(fileStatus.getPath().toUri().toString());
          metadata.add(
              Metadata.builder()
                  .setResourceId(new HadoopResourceId(uri))
                  .setIsReadSeekEfficient(true)
                  .setSizeBytes(fileStatus.getLen())
                  .build());
        }
      }
      resultsBuilder.add(MatchResult.create(Status.OK, metadata));
    } catch (IOException e) {
      resultsBuilder.add(MatchResult.create(Status.ERROR, e));
    }
  }
  return resultsBuilder.build();
}
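For completeness, a hedged sketch of how these results are typically consumed: callers normally reach this protected method through FileSystems.match once the Hadoop scheme is registered, and should check each result's status before reading its metadata. The HDFS spec string below is illustrative.

  // Hypothetical consumer; assumes the hdfs:// scheme is registered with FileSystems.
  for (MatchResult result : FileSystems.match(Collections.singletonList("hdfs://namenode/data/part-*"))) {
    if (result.status() != MatchResult.Status.OK) {
      continue; // NOT_FOUND yields an empty result; metadata() throws for ERROR results
    }
    for (Metadata fileMetadata : result.metadata()) {
      System.out.println(fileMetadata.resourceId() + " (" + fileMetadata.sizeBytes() + " bytes)");
    }
  }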