Search in sources :

Example 21 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class GcsFileSystem method expand.

/**
 * Lists the objects of the pattern's bucket page by page and collects, as {@link Metadata},
 * every object whose name matches the pattern's object glob.
 *
 * <p>Object names ending with a slash are treated as directories and are left out of the result.
 *
 * @param gcsPattern bucket plus object pattern to match against
 * @return a {@link MatchResult} with status {@code OK} holding the matched entries, possibly none
 * @throws IOException declared by this method; presumably propagated from the listing call
 * @throws IllegalArgumentException if {@code gcsPattern} does not contain globs (not checked in
 *     this method body; presumably raised by a helper -- TODO confirm)
 */
@VisibleForTesting
MatchResult expand(GcsPath gcsPattern) throws IOException {
    String objectGlob = gcsPattern.getObject();
    String bucket = gcsPattern.getBucket();
    // Only objects under the non-wildcard prefix are listed; the regex does the real filtering.
    String listingPrefix = GcsUtil.getNonWildcardPrefix(objectGlob);
    Pattern nameRegex = Pattern.compile(GcsUtil.wildcardToRegexp(objectGlob));
    LOG.debug("matching files in bucket {}, prefix {} against pattern {}", bucket, listingPrefix, nameRegex.toString());

    List<Metadata> matched = new LinkedList<>();
    String nextToken = null;
    do {
        Objects page = options.getGcsUtil().listObjects(bucket, listingPrefix, nextToken);
        List<StorageObject> items = page.getItems();
        if (items == null) {
            // A page without items ends the listing, regardless of any page token.
            break;
        }
        for (StorageObject candidate : items) {
            String objectName = candidate.getName();
            // Keep only names that match the regex and are not directories (trailing slash).
            if (!nameRegex.matcher(objectName).matches() || objectName.endsWith("/")) {
                continue;
            }
            LOG.debug("Matched object: {}", objectName);
            matched.add(toMetadata(candidate));
        }
        nextToken = page.getNextPageToken();
    } while (nextToken != null);
    return MatchResult.create(Status.OK, matched);
}
Also used : Pattern(java.util.regex.Pattern) StorageObject(com.google.api.services.storage.model.StorageObject) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) Objects(com.google.api.services.storage.model.Objects) LinkedList(java.util.LinkedList) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 22 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class AvroSourceTest method testSchemaIsInterned.

/**
 * Verifies that sources built from two distinct-but-equal schema strings end up sharing the
 * very same schema instances for both their read schema and their file schema.
 */
@Test
public void testSchemaIsInterned() throws Exception {
    // Write 100 random records to an Avro file and look up its metadata.
    List<Bird> records = createRandomRecords(100);
    String avroPath = generateTestFile("tmp.avro", records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), DataFileConstants.NULL_CODEC);
    Metadata avroFile = FileSystems.matchSingleFileSpec(avroPath);

    // Read the schema string twice; the two results must be different String instances.
    String firstSchema = AvroSource.readMetadataFromFile(avroFile.resourceId()).getSchemaString();
    String secondSchema = AvroSource.readMetadataFromFile(avroFile.resourceId()).getSchemaString();
    assertNotSame(firstSchema, secondSchema);

    // Build one source per schema string.
    AvroSource<GenericRecord> firstSource =
        (AvroSource<GenericRecord>)
            AvroSource.from(avroPath).withSchema(firstSchema).createForSubrangeOfFile(avroFile, 0L, 0L);
    AvroSource<GenericRecord> secondSource =
        (AvroSource<GenericRecord>)
            AvroSource.from(avroPath).withSchema(secondSchema).createForSubrangeOfFile(avroFile, 0L, 0L);

    // All four schema objects must be the identical instance.
    assertSame(firstSource.getReadSchema(), firstSource.getFileSchema());
    assertSame(firstSource.getReadSchema(), secondSource.getReadSchema());
    assertSame(firstSource.getReadSchema(), secondSource.getFileSchema());
    // Schemas are transient and not serialized, so interning after deserialization
    // does not need to be checked here.
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) AvroMetadata(org.apache.beam.sdk.io.AvroSource.AvroMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 23 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class WriteWithShardingFactoryTest method dynamicallyReshardedWrite.

/**
 * Writes {@code INPUT_SIZE} random UUID strings with {@code TextIO.write()} without specifying
 * sharding, then reads every produced file back and checks that all strings were written and
 * that the number of output files lies within the expected bounds.
 */
@Test
public void dynamicallyReshardedWrite() throws Exception {
    List<String> strs = new ArrayList<>(INPUT_SIZE);
    for (int i = 0; i < INPUT_SIZE; i++) {
        strs.add(UUID.randomUUID().toString());
    }
    Collections.shuffle(strs);
    String fileName = "resharded_write";
    String targetLocation = tmp.getRoot().toPath().resolve(fileName).toString();
    String targetLocationGlob = targetLocation + '*';
    // TextIO is implemented in terms of the WriteFiles PTransform. When sharding is not specified,
    // resharding should be automatically applied
    p.apply(Create.of(strs)).apply(TextIO.write().to(targetLocation));
    p.run();
    List<Metadata> matches = FileSystems.match(targetLocationGlob).metadata();
    List<String> actuals = new ArrayList<>(strs.size());
    List<String> files = new ArrayList<>(strs.size());
    for (Metadata match : matches) {
        String filename = match.resourceId().toString();
        files.add(filename);
        // The buffer is sized by the file's byte length; a UTF-8 decode never yields more chars
        // than there are bytes, so the whole file fits.
        CharBuffer buf = CharBuffer.allocate((int) new File(filename).length());
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        try (Reader reader =
            new java.io.InputStreamReader(
                new java.io.FileInputStream(filename), java.nio.charset.StandardCharsets.UTF_8)) {
            // A single read() may return fewer chars than remain in the buffer, so keep reading
            // until the buffer is full or the end of the file is reached.
            while (buf.hasRemaining() && reader.read(buf) != -1) {
                // Intentionally empty: the loop condition performs the read.
            }
        }
        buf.flip();
        String[] readStrs = buf.toString().split("\n");
        for (String read : readStrs) {
            // Skip empty fragments produced by the split.
            if (read.length() > 0) {
                actuals.add(read);
            }
        }
    }
    assertThat(actuals, containsInAnyOrder(strs.toArray()));
    assertThat(files, hasSize(allOf(greaterThan(1), lessThan((int) (Math.log10(INPUT_SIZE) + WriteWithShardingFactory.MAX_RANDOM_EXTRA_SHARDS)))));
}
Also used : ArrayList(java.util.ArrayList) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) CharBuffer(java.nio.CharBuffer) Reader(java.io.Reader) FileReader(java.io.FileReader) FileReader(java.io.FileReader) File(java.io.File) Test(org.junit.Test)

Example 24 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class TextIOTest method assertOutputFiles.

/**
 * Asserts that the files produced by a text write contain exactly the expected elements and
 * that every file passes the header/footer check.
 *
 * <p>When {@code numShards} is 0 the output files are discovered by matching
 * {@code outputName + "*"} under {@code rootLocation}; otherwise one file name per shard is
 * built with {@code DefaultFilenamePolicy.constructName}.
 *
 * @param elems elements expected in the output, in any order
 * @param header header passed to the header/footer helpers
 * @param footer footer passed to the header/footer helpers
 * @param numShards number of shards, or 0 to discover the output files by glob
 * @param rootLocation directory holding the output files
 * @param outputName output file name prefix
 * @param shardNameTemplate template used to construct per-shard file names
 * @throws Exception if matching, reading or encoding fails
 */
public static void assertOutputFiles(String[] elems, final String header, final String footer, int numShards, Path rootLocation, String outputName, String shardNameTemplate) throws Exception {
    List<File> expectedFiles = new ArrayList<>();
    if (numShards == 0) {
        String pattern = rootLocation.toAbsolutePath().resolve(outputName + "*").toString();
        List<MatchResult> matches = FileSystems.match(Collections.singletonList(pattern));
        for (Metadata expectedFile : Iterables.getOnlyElement(matches).metadata()) {
            expectedFiles.add(new File(expectedFile.resourceId().toString()));
        }
    } else {
        for (int i = 0; i < numShards; i++) {
            expectedFiles.add(new File(rootLocation.toString(), DefaultFilenamePolicy.constructName(outputName, shardNameTemplate, "", i, numShards)));
        }
    }
    // Read every file into its own list of lines so header/footer can be checked per file.
    List<List<String>> actual = new ArrayList<>();
    for (File tmpFile : expectedFiles) {
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        try (BufferedReader reader =
            new BufferedReader(
                new java.io.InputStreamReader(
                    new java.io.FileInputStream(tmpFile), java.nio.charset.StandardCharsets.UTF_8))) {
            List<String> currentFile = new ArrayList<>();
            String line;
            while ((line = reader.readLine()) != null) {
                currentFile.add(line);
            }
            actual.add(currentFile);
        }
    }
    List<String> expectedElements = new ArrayList<>(elems.length);
    for (String elem : elems) {
        byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem);
        // The bytes come from a UTF-8 coder, so decode them as UTF-8 rather than with the
        // platform default charset.
        String line = new String(encodedElem, java.nio.charset.StandardCharsets.UTF_8);
        expectedElements.add(line);
    }
    List<String> actualElements = Lists.newArrayList(Iterables.concat(FluentIterable.from(actual).transform(removeHeaderAndFooter(header, footer)).toList()));
    assertThat(actualElements, containsInAnyOrder(expectedElements.toArray()));
    assertTrue(Iterables.all(actual, haveProperHeaderAndFooter(header, footer)));
}
Also used : ArrayList(java.util.ArrayList) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) MatchResult(org.apache.beam.sdk.io.fs.MatchResult) BufferedReader(java.io.BufferedReader) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) FileReader(java.io.FileReader) File(java.io.File)

Example 25 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class FileBasedSourceTest method testReadRangeFromFileWithSplitsFromMiddleOfHeader.

/**
 * Reads a file made of ten header-prefixed groups from start offsets 1, 2 and 3, i.e. after
 * "<", "<h" and "<h>" of the first header, and checks each read against the same expected
 * records.
 */
@Test
public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    final String headerLine = "<h>";

    // Ten groups, each a header line followed by a generated dataset.
    List<String> lines = new ArrayList<>();
    for (int group = 0; group < 10; group++) {
        lines.add(headerLine);
        lines.addAll(createStringDataset(3, 9));
    }
    File dataFile = createFileWithData("file", lines);

    // Expected output: everything from index 10 onwards, with every header occurrence removed.
    List<String> expected = new ArrayList<String>(lines.subList(10, lines.size()));
    expected.removeAll(Collections.singletonList(headerLine));

    Metadata fileMetadata = FileSystems.matchSingleFileSpec(dataFile.getPath());
    // Start offsets 1, 2 and 3 place the split after "<", "<h" and "<h>" of the header.
    for (int startOffset = 1; startOffset <= 3; startOffset++) {
        TestFileBasedSource rangeSource = new TestFileBasedSource(fileMetadata, 64, startOffset, Long.MAX_VALUE, headerLine);
        assertThat(expected, containsInAnyOrder(readFromSource(rangeSource, pipelineOptions).toArray()));
    }
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) File(java.io.File) Test(org.junit.Test)

Aggregations

Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata)29 Test (org.junit.Test)15 File (java.io.File)14 ArrayList (java.util.ArrayList)12 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)9 VisibleForTesting (com.google.common.annotations.VisibleForTesting)4 AvroMetadata (org.apache.beam.sdk.io.AvroSource.AvroMetadata)4 ImmutableList (com.google.common.collect.ImmutableList)3 FileReader (java.io.FileReader)3 Reader (java.io.Reader)3 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)3 BufferedReader (java.io.BufferedReader)2 IOException (java.io.IOException)2 List (java.util.List)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 Objects (com.google.api.services.storage.model.Objects)1 StorageObject (com.google.api.services.storage.model.StorageObject)1 Predicate (com.google.common.base.Predicate)1 FileNotFoundException (java.io.FileNotFoundException)1 BigInteger (java.math.BigInteger)1