use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class GcsFileSystem method expand.
/**
 * Resolves a glob pattern to the GCS objects it matches, packaged as a {@link MatchResult}.
 *
 * <p>Objects are listed page by page under the pattern's non-wildcard prefix, and each object
 * name is tested against the regular expression derived from the pattern. Listing stops when a
 * page carries no items or when no next-page token is returned. Names that end with a slash are
 * never reported.
 *
 * @param gcsPattern the bucket and object pattern to expand
 * @return a {@link MatchResult} with status {@code OK} holding metadata for every matched object
 * @throws IllegalArgumentException if {@code gcsPattern} does not contain globs.
 * @throws IOException if listing the bucket contents fails
 */
@VisibleForTesting
MatchResult expand(GcsPath gcsPattern) throws IOException {
  String listingPrefix = GcsUtil.getNonWildcardPrefix(gcsPattern.getObject());
  Pattern globRegex = Pattern.compile(GcsUtil.wildcardToRegexp(gcsPattern.getObject()));
  LOG.debug(
      "matching files in bucket {}, prefix {} against pattern {}",
      gcsPattern.getBucket(),
      listingPrefix,
      globRegex.pattern());

  List<Metadata> matched = new LinkedList<>();
  String nextPage = null;
  while (true) {
    Objects page = options.getGcsUtil().listObjects(gcsPattern.getBucket(), listingPrefix, nextPage);
    if (page.getItems() == null) {
      // A page without items ends the listing.
      break;
    }
    for (StorageObject candidate : page.getItems()) {
      String objectName = candidate.getName();
      // Names ending with a slash are directories; anything not matching the regex is ignored.
      if (objectName.endsWith("/") || !globRegex.matcher(objectName).matches()) {
        continue;
      }
      LOG.debug("Matched object: {}", objectName);
      matched.add(toMetadata(candidate));
    }
    nextPage = page.getNextPageToken();
    if (nextPage == null) {
      break;
    }
  }
  return MatchResult.create(Status.OK, matched);
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class AvroSourceTest method testSchemaIsInterned.
/**
 * Checks that sources built from two distinct schema string instances end up sharing a single
 * schema object, both as read schema and as file schema.
 */
@Test
public void testSchemaIsInterned() throws Exception {
  List<Bird> records = createRandomRecords(100);
  String path =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata metadata = FileSystems.matchSingleFileSpec(path);

  // Read the schema string twice; the two reads must not hand back the same String instance,
  // otherwise the identity checks below would prove nothing.
  String firstSchema = AvroSource.readMetadataFromFile(metadata.resourceId()).getSchemaString();
  String secondSchema = AvroSource.readMetadataFromFile(metadata.resourceId()).getSchemaString();
  assertNotSame(firstSchema, secondSchema);

  AvroSource<GenericRecord> firstSource =
      (AvroSource<GenericRecord>)
          AvroSource.from(path).withSchema(firstSchema).createForSubrangeOfFile(metadata, 0L, 0L);
  AvroSource<GenericRecord> secondSource =
      (AvroSource<GenericRecord>)
          AvroSource.from(path).withSchema(secondSchema).createForSubrangeOfFile(metadata, 0L, 0L);

  // Every schema reachable from either source must be the very same object.
  assertSame(firstSource.getReadSchema(), firstSource.getFileSchema());
  assertSame(firstSource.getReadSchema(), secondSource.getReadSchema());
  assertSame(firstSource.getReadSchema(), secondSource.getFileSchema());
  // Schemas are transient and not serialized, so interning after deserialization does not
  // need to be covered here.
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class WriteWithShardingFactoryTest method dynamicallyReshardedWrite.
/**
 * Writes {@code INPUT_SIZE} random UUID strings through {@link TextIO} without specifying a
 * shard count, then reads back every file matching the output glob.
 *
 * <p>Asserts that the non-empty lines read back are exactly the input strings (in any order) and
 * that the number of output files is greater than one and below
 * {@code log10(INPUT_SIZE) + WriteWithShardingFactory.MAX_RANDOM_EXTRA_SHARDS}.
 */
@Test
public void dynamicallyReshardedWrite() throws Exception {
  List<String> strs = new ArrayList<>(INPUT_SIZE);
  for (int i = 0; i < INPUT_SIZE; i++) {
    strs.add(UUID.randomUUID().toString());
  }
  Collections.shuffle(strs);

  String fileName = "resharded_write";
  String targetLocation = tmp.getRoot().toPath().resolve(fileName).toString();
  String targetLocationGlob = targetLocation + '*';

  // TextIO is implemented in terms of the WriteFiles PTransform. When sharding is not specified,
  // resharding should be automatically applied
  p.apply(Create.of(strs)).apply(TextIO.write().to(targetLocation));
  p.run();

  List<Metadata> matches = FileSystems.match(targetLocationGlob).metadata();
  List<String> actuals = new ArrayList<>(strs.size());
  List<String> files = new ArrayList<>(strs.size());
  for (Metadata match : matches) {
    String filename = match.resourceId().toString();
    files.add(filename);
    // The byte length of the file is an upper bound on the number of chars it decodes to.
    CharBuffer buf = CharBuffer.allocate((int) new File(filename).length());
    try (Reader reader = new FileReader(filename)) {
      // A single read(CharBuffer) call may return before the buffer is full, so keep reading
      // until the buffer is filled or end-of-stream is reached; otherwise content could be
      // silently truncated.
      while (buf.hasRemaining() && reader.read(buf) != -1) {
        // Keep draining the reader.
      }
      buf.flip();
    }
    String[] readStrs = buf.toString().split("\n");
    for (String read : readStrs) {
      // Skip the empty fragments left over from splitting on newlines.
      if (read.length() > 0) {
        actuals.add(read);
      }
    }
  }

  assertThat(actuals, containsInAnyOrder(strs.toArray()));
  assertThat(
      files,
      hasSize(
          allOf(
              greaterThan(1),
              lessThan(
                  (int)
                      (Math.log10(INPUT_SIZE)
                          + WriteWithShardingFactory.MAX_RANDOM_EXTRA_SHARDS)))));
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class TextIOTest method assertOutputFiles.
/**
 * Asserts that the output files of a text write contain exactly the expected elements and that
 * every file satisfies the header/footer check.
 *
 * <p>Both the files and the expected elements are decoded as UTF-8 so that the comparison does
 * not depend on the platform default charset.
 *
 * @param elems the elements expected across all output files, in any order
 * @param header header value handed to the header/footer helpers
 * @param footer footer value handed to the header/footer helpers
 * @param numShards number of shards; {@code 0} means the files are discovered by matching
 *     {@code outputName + "*"} under {@code rootLocation}, otherwise the file names are built
 *     with {@link DefaultFilenamePolicy#constructName} for each shard index
 * @param rootLocation directory holding the output files
 * @param outputName file name prefix of the output
 * @param shardNameTemplate shard template used to build file names when {@code numShards != 0}
 * @throws Exception if matching, reading or encoding fails
 */
public static void assertOutputFiles(String[] elems, final String header, final String footer, int numShards, Path rootLocation, String outputName, String shardNameTemplate) throws Exception {
  List<File> expectedFiles = new ArrayList<>();
  if (numShards == 0) {
    String pattern = rootLocation.toAbsolutePath().resolve(outputName + "*").toString();
    List<MatchResult> matches = FileSystems.match(Collections.singletonList(pattern));
    for (Metadata expectedFile : Iterables.getOnlyElement(matches).metadata()) {
      expectedFiles.add(new File(expectedFile.resourceId().toString()));
    }
  } else {
    for (int i = 0; i < numShards; i++) {
      expectedFiles.add(
          new File(
              rootLocation.toString(),
              DefaultFilenamePolicy.constructName(outputName, shardNameTemplate, "", i, numShards)));
    }
  }

  // Collect the lines of each file separately so header/footer can be checked per file.
  List<List<String>> actual = new ArrayList<>();
  for (File tmpFile : expectedFiles) {
    // Read explicitly as UTF-8; fully-qualified names avoid clashing with any other
    // "Files" class the enclosing file may import.
    try (BufferedReader reader =
        java.nio.file.Files.newBufferedReader(
            tmpFile.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
      List<String> currentFile = new ArrayList<>();
      String line;
      while ((line = reader.readLine()) != null) {
        currentFile.add(line);
      }
      actual.add(currentFile);
    }
  }

  List<String> expectedElements = new ArrayList<>(elems.length);
  for (String elem : elems) {
    byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem);
    // The bytes come from the UTF-8 coder, so they must be decoded as UTF-8 as well.
    String line = new String(encodedElem, java.nio.charset.StandardCharsets.UTF_8);
    expectedElements.add(line);
  }

  List<String> actualElements =
      Lists.newArrayList(
          Iterables.concat(
              FluentIterable.from(actual)
                  .transform(removeHeaderAndFooter(header, footer))
                  .toList()));
  assertThat(actualElements, containsInAnyOrder(expectedElements.toArray()));
  assertTrue(Iterables.all(actual, haveProperHeaderAndFooter(header, footer)));
}
use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.
the class FileBasedSourceTest method testReadRangeFromFileWithSplitsFromMiddleOfHeader.
/**
 * Reads a file made of ten header-prefixed groups using sources whose start offset falls
 * inside, or right at the end of, the first header, and checks the records read against the
 * expected data with all header lines removed.
 */
@Test
public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException {
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  String header = "<h>";

  // Ten groups, each consisting of the header followed by a small string dataset.
  List<String> contents = new ArrayList<>();
  for (int group = 0; group < 10; group++) {
    contents.add(header);
    contents.addAll(createStringDataset(3, 9));
  }
  File file = createFileWithData("file", contents);

  List<String> expected = new ArrayList<String>(contents.subList(10, contents.size()));
  // Drop every header line from the expectation.
  expected.removeAll(Collections.singletonList(header));

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());

  // Start the split after "<" (offset 1), after "<h" (offset 2) and after "<h>" (offset 3) of
  // the header; the same expectation applies to each start offset.
  for (int startOffset = 1; startOffset <= 3; startOffset++) {
    TestFileBasedSource source =
        new TestFileBasedSource(metadata, 64, startOffset, Long.MAX_VALUE, header);
    assertThat(expected, containsInAnyOrder(readFromSource(source, pipelineOptions).toArray()));
  }
}
Aggregations