Search in sources :

Example 11 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class FileBasedSourceTest method testReadRangeFromFileWithSplitsFromStart.

/**
 * Writes a file of repeated header-plus-dataset groups, reads it back through two
 * {@code TestFileBasedSource} instances built over the same file metadata, and checks that the
 * combined records equal the written lines with every header line removed.
 *
 * <p>NOTE(review): the numeric constructor arguments (64, then 0/60 and 60/Long.MAX_VALUE) look
 * like a minimum bundle size followed by start/end offsets - confirm against TestFileBasedSource.
 */
@Test
public void testReadRangeFromFileWithSplitsFromStart() throws IOException {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    String headerLine = "<h>";
    // Ten groups, each a header line followed by a generated string dataset.
    List<String> fileLines = new ArrayList<>();
    int group = 0;
    while (group < 10) {
        fileLines.add(headerLine);
        fileLines.addAll(createStringDataset(3, 9));
        group++;
    }
    File dataFile = createFileWithData("file", fileLines);
    Metadata fileMetadata = FileSystems.matchSingleFileSpec(dataFile.getPath());
    TestFileBasedSource headSource = new TestFileBasedSource(fileMetadata, 64, 0, 60, headerLine);
    TestFileBasedSource tailSource = new TestFileBasedSource(fileMetadata, 64, 60, Long.MAX_VALUE, headerLine);
    // Expected output is every written line except the header lines.
    List<String> expectedResults = new ArrayList<>(fileLines);
    expectedResults.removeAll(Arrays.asList(headerLine));
    // Read the first source, then the second, and pool what they return.
    List<String> actualResults = new ArrayList<>(readFromSource(headSource, pipelineOptions));
    actualResults.addAll(readFromSource(tailSource, pipelineOptions));
    assertThat(expectedResults, containsInAnyOrder(actualResults.toArray()));
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) File(java.io.File) Test(org.junit.Test)

Example 12 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class AvroSourceTest method testReadSchemaString.

/**
 * Generates a test file with the null codec, reads its Avro metadata back, and checks that the
 * schema string stored there parses into a schema with four fields.
 */
@Test
public void testReadSchemaString() throws Exception {
    final String nullCodec = DataFileConstants.NULL_CODEC;
    final List<Bird> records = createRandomRecords(DEFAULT_RECORD_COUNT);
    final String path =
        generateTestFile(nullCodec, records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), nullCodec);
    final Metadata matched = FileSystems.matchSingleFileSpec(path);
    final AvroMetadata avroMetadata = AvroSource.readMetadataFromFile(matched.resourceId());
    final String schemaJson = avroMetadata.getSchemaString();
    // Schema.Parser validates by default, so a malformed schema string fails here.
    final Schema parsed = new Schema.Parser().parse(schemaJson);
    assertEquals(4, parsed.getFields().size());
}
Also used : Schema(org.apache.avro.Schema) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) AvroMetadata(org.apache.beam.sdk.io.AvroSource.AvroMetadata) Test(org.junit.Test)

Example 13 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class AvroSourceTest method testReadMetadataWithCodecs.

/**
 * For each listed Avro codec, generates a test file with that codec and checks that the codec
 * read back from the file's metadata is the one used to generate it.
 */
@Test
public void testReadMetadataWithCodecs() throws Exception {
    // Every codec the test generates a file with.
    final String[] allCodecs = {
        DataFileConstants.NULL_CODEC,
        DataFileConstants.BZIP2_CODEC,
        DataFileConstants.DEFLATE_CODEC,
        DataFileConstants.SNAPPY_CODEC,
        DataFileConstants.XZ_CODEC
    };
    // The same record list is reused for every generated file.
    final List<Bird> records = createRandomRecords(DEFAULT_RECORD_COUNT);
    for (int idx = 0; idx < allCodecs.length; idx++) {
        final String currentCodec = allCodecs[idx];
        final String path =
            generateTestFile(currentCodec, records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), currentCodec);
        final Metadata matched = FileSystems.matchSingleFileSpec(path);
        final AvroMetadata avroMetadata = AvroSource.readMetadataFromFile(matched.resourceId());
        assertEquals(currentCodec, avroMetadata.getCodec());
    }
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) AvroMetadata(org.apache.beam.sdk.io.AvroSource.AvroMetadata) Test(org.junit.Test)

Example 14 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class AvroSourceTest method testSchemaStringIsInterned.

/**
 * Reads the schema string of one file twice, confirms the two strings are distinct objects, and
 * then checks that sources built from either string - and a clone of one of them - all return
 * the very same schema instance.
 */
@Test
public void testSchemaStringIsInterned() throws Exception {
    final List<Bird> records = createRandomRecords(100);
    final String path =
        generateTestFile("tmp.avro", records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), DataFileConstants.NULL_CODEC);
    final Metadata matched = FileSystems.matchSingleFileSpec(path);

    // Two independent reads must yield two different String objects.
    final String firstSchema = AvroSource.readMetadataFromFile(matched.resourceId()).getSchemaString();
    final String secondSchema = AvroSource.readMetadataFromFile(matched.resourceId()).getSchemaString();
    assertNotSame(firstSchema, secondSchema);

    // Sources configured from the distinct strings still share one schema instance.
    final AvroSource<GenericRecord> fromFirst = AvroSource.from(path).withSchema(firstSchema);
    final AvroSource<GenericRecord> fromSecond = AvroSource.from(path).withSchema(secondSchema);
    assertSame(fromFirst.getSchema(), fromSecond.getSchema());

    // A clone made via SerializableUtils must end up with that same instance too.
    final AvroSource<GenericRecord> cloned = SerializableUtils.clone(fromSecond);
    assertSame(fromFirst.getSchema(), cloned.getSchema());
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) AvroMetadata(org.apache.beam.sdk.io.AvroSource.AvroMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 15 with Metadata

use of org.apache.beam.sdk.io.fs.MatchResult.Metadata in project beam by apache.

the class FileBasedSource method split.

/**
 * Splits this source into sources that each cover a single file or a sub-range of one.
 *
 * <p>In {@code Mode.FILEPATTERN} the pattern is expanded, one source is created per matched
 * file via {@code createForSubrangeOfFile(metadata, 0, metadata.sizeBytes())}, and each of those
 * is split again with the same arguments. In any other mode the split is delegated to
 * {@code super.split} when {@code isSplittable()} is true; otherwise this source is returned
 * unsplit as a single-element list.
 *
 * @param desiredBundleSizeBytes desired bundle size, passed through to the per-file splits and
 *     to {@code super.split}
 * @param options pipeline options, passed through to the per-file splits and to
 *     {@code super.split}
 * @return the resulting list of sources
 * @throws Exception if a precondition check fails (the file or pattern specification is not
 *     accessible, or the pattern matches no files) or a delegated call throws
 */
@Override
public final List<? extends FileBasedSource<T>> split(long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
    // This implementation of method split is provided to simplify subclasses. Here we
    // split a FileBasedSource based on a file pattern to FileBasedSources based on full single
    // files. For files that can be efficiently seeked, we further split FileBasedSources based on
    // those files to FileBasedSources based on sub ranges of single files.
    //
    // The precondition template must use %s (like the checkArgument call below); a "{}"
    // placeholder is a logger convention and would be emitted verbatim instead of substituted.
    checkState(fileOrPatternSpec.isAccessible(), "Cannot split a FileBasedSource without access to the file or pattern specification: %s.", fileOrPatternSpec);
    String fileOrPattern = fileOrPatternSpec.get();
    if (mode == Mode.FILEPATTERN) {
        // Timed only so the log line below can report how long the expansion and split took.
        long startTime = System.currentTimeMillis();
        List<Metadata> expandedFiles = FileBasedSource.expandFilePattern(fileOrPattern);
        checkArgument(!expandedFiles.isEmpty(), "Unable to find any files matching %s", fileOrPattern);
        // Sized for one entry per file; grows if any file is split further.
        List<FileBasedSource<T>> splitResults = new ArrayList<>(expandedFiles.size());
        for (Metadata metadata : expandedFiles) {
            FileBasedSource<T> split = createForSubrangeOfFile(metadata, 0, metadata.sizeBytes());
            verify(split.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE, "%s.createForSubrangeOfFile must return a source in mode %s", split, Mode.SINGLE_FILE_OR_SUBRANGE);
            // The split is NOT in FILEPATTERN mode, so we can call its split without fear
            // of recursion. This will break a single file into multiple splits when the file is
            // splittable and larger than the desired bundle size.
            splitResults.addAll(split.split(desiredBundleSizeBytes, options));
        }
        LOG.info("Splitting filepattern {} into bundles of size {} took {} ms " + "and produced {} files and {} bundles", fileOrPattern, desiredBundleSizeBytes, System.currentTimeMillis() - startTime, expandedFiles.size(), splitResults.size());
        return splitResults;
    } else {
        if (isSplittable()) {
            // The cast narrows whatever list type super.split declares to this class's element type.
            @SuppressWarnings("unchecked") List<FileBasedSource<T>> splits = (List<FileBasedSource<T>>) super.split(desiredBundleSizeBytes, options);
            return splits;
        } else {
            LOG.debug("The source for file {} is not split into sub-range based sources since " + "the file is not seekable", fileOrPattern);
            return ImmutableList.of(this);
        }
    }
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List)

Aggregations

Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata)29 Test (org.junit.Test)15 File (java.io.File)14 ArrayList (java.util.ArrayList)12 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)9 VisibleForTesting (com.google.common.annotations.VisibleForTesting)4 AvroMetadata (org.apache.beam.sdk.io.AvroSource.AvroMetadata)4 ImmutableList (com.google.common.collect.ImmutableList)3 FileReader (java.io.FileReader)3 Reader (java.io.Reader)3 MatchResult (org.apache.beam.sdk.io.fs.MatchResult)3 BufferedReader (java.io.BufferedReader)2 IOException (java.io.IOException)2 List (java.util.List)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 Objects (com.google.api.services.storage.model.Objects)1 StorageObject (com.google.api.services.storage.model.StorageObject)1 Predicate (com.google.common.base.Predicate)1 FileNotFoundException (java.io.FileNotFoundException)1 BigInteger (java.math.BigInteger)1