Example usage of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Source: class FileBasedSourceTest, method testReadRangeFromFileWithSplitsFromStart.
/**
 * Verifies that reading a file as two adjacent byte ranges ([0, 60) and [60, EOF)) yields
 * exactly the records of the whole file: every record is read once and only once, with header
 * lines filtered out by the source.
 *
 * @throws IOException if the temporary test file cannot be created or matched
 */
@Test
public void testReadRangeFromFileWithSplitsFromStart() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  String header = "<h>";
  List<String> data = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    data.add(header);
    data.addAll(createStringDataset(3, 9));
  }
  File file = createFileWithData("file", data);
  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  // Two ranges that together cover the entire file; the second extends to EOF via MAX_VALUE.
  TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 60, header);
  TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 60, Long.MAX_VALUE, header);
  // The sources skip header lines, so remove all occurrences of header from expected results.
  List<String> expectedResults = new ArrayList<>(data);
  expectedResults.removeAll(Arrays.asList(header));
  List<String> results = new ArrayList<>();
  results.addAll(readFromSource(source1, options));
  results.addAll(readFromSource(source2, options));
  // assertThat(actual, matcher(expected)): the original had the arguments swapped, which
  // produces misleading "expected/but was" text in failure messages.
  assertThat(results, containsInAnyOrder(expectedResults.toArray()));
}
Example usage of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Source: class AvroSourceTest, method testReadSchemaString.
/**
 * Reads the schema string out of a generated Avro file's metadata and checks that parsing it
 * yields the expected number of record fields.
 */
@Test
public void testReadSchemaString() throws Exception {
  final String codec = DataFileConstants.NULL_CODEC;
  final List<Bird> records = createRandomRecords(DEFAULT_RECORD_COUNT);
  final String path =
      generateTestFile(codec, records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  final Metadata singleFile = FileSystems.matchSingleFileSpec(path);
  final AvroMetadata avroMetadata = AvroSource.readMetadataFromFile(singleFile.resourceId());
  // Schema.Parser#parse validates the schema string as it parses, which is what we want here.
  final Schema parsed = new Schema.Parser().parse(avroMetadata.getSchemaString());
  assertEquals(4, parsed.getFields().size());
}
Example usage of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Source: class AvroSourceTest, method testReadMetadataWithCodecs.
/**
 * Verifies that the codec recorded in an Avro file's metadata round-trips for every codec the
 * test harness can generate files with.
 */
@Test
public void testReadMetadataWithCodecs() throws Exception {
  // One generated file per supported codec.
  String[] allCodecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC
  };
  List<Bird> records = createRandomRecords(DEFAULT_RECORD_COUNT);
  for (String currentCodec : allCodecs) {
    String path =
        generateTestFile(
            currentCodec, records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), currentCodec);
    Metadata singleFile = FileSystems.matchSingleFileSpec(path);
    AvroMetadata avroMetadata = AvroSource.readMetadataFromFile(singleFile.resourceId());
    assertEquals(currentCodec, avroMetadata.getCodec());
  }
}
Example usage of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Source: class AvroSourceTest, method testSchemaStringIsInterned.
/**
 * Two independent metadata reads of the same file yield distinct schema String objects, yet
 * sources built from either string share a single interned Schema instance — including a source
 * produced by a serialization round-trip.
 */
@Test
public void testSchemaStringIsInterned() throws Exception {
  List<Bird> records = createRandomRecords(100);
  String path =
      generateTestFile(
          "tmp.avro", records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata singleFile = FileSystems.matchSingleFileSpec(path);
  String firstSchemaString =
      AvroSource.readMetadataFromFile(singleFile.resourceId()).getSchemaString();
  String secondSchemaString =
      AvroSource.readMetadataFromFile(singleFile.resourceId()).getSchemaString();
  // Each read produces its own String object...
  assertNotSame(firstSchemaString, secondSchemaString);
  AvroSource<GenericRecord> sourceA = AvroSource.from(path).withSchema(firstSchemaString);
  AvroSource<GenericRecord> sourceB = AvroSource.from(path).withSchema(secondSchemaString);
  // ...but the parsed Schema is interned, so both sources share one instance.
  assertSame(sourceA.getSchema(), sourceB.getSchema());
  // Deserialization must route through the same interning, not create a fresh Schema.
  AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB);
  assertSame(sourceA.getSchema(), sourceC.getSchema());
}
Example usage of org.apache.beam.sdk.io.fs.MatchResult.Metadata in the Apache Beam project.
Source: class FileBasedSource, method split.
/**
 * Splits this source into bundles of approximately {@code desiredBundleSizeBytes}.
 *
 * <p>In {@code FILEPATTERN} mode the pattern is first expanded to individual files, each file
 * becomes a single-file source, and each of those is then split further (when the file format
 * supports efficient seeking). In single-file mode this delegates to the superclass when the
 * source is splittable, and otherwise returns itself as the only bundle.
 *
 * @param desiredBundleSizeBytes target size of each produced bundle, in bytes
 * @param options pipeline options in effect for this split operation
 * @return the list of sub-sources covering the same data as this source
 * @throws Exception if pattern expansion or sub-splitting fails
 */
@Override
public final List<? extends FileBasedSource<T>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  // This implementation of method split is provided to simplify subclasses. Here we
  // split a FileBasedSource based on a file pattern to FileBasedSources based on full single
  // files. For files that can be efficiently seeked, we further split FileBasedSources based on
  // those files to FileBasedSources based on sub ranges of single files.
  checkState(
      fileOrPatternSpec.isAccessible(),
      // Preconditions messages use %s placeholders (Guava style), not SLF4J's {}; with the
      // original "{}" the spec value was never substituted into the error message.
      "Cannot split a FileBasedSource without access to the file or pattern specification: %s.",
      fileOrPatternSpec);
  String fileOrPattern = fileOrPatternSpec.get();
  if (mode == Mode.FILEPATTERN) {
    long startTime = System.currentTimeMillis();
    List<Metadata> expandedFiles = FileBasedSource.expandFilePattern(fileOrPattern);
    checkArgument(!expandedFiles.isEmpty(), "Unable to find any files matching %s", fileOrPattern);
    List<FileBasedSource<T>> splitResults = new ArrayList<>(expandedFiles.size());
    for (Metadata metadata : expandedFiles) {
      FileBasedSource<T> split = createForSubrangeOfFile(metadata, 0, metadata.sizeBytes());
      verify(
          split.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE,
          "%s.createForSubrangeOfFile must return a source in mode %s",
          split,
          Mode.SINGLE_FILE_OR_SUBRANGE);
      // The split is NOT in FILEPATTERN mode, so we can call its split without fear
      // of recursion. This will break a single file into multiple splits when the file is
      // splittable and larger than the desired bundle size.
      splitResults.addAll(split.split(desiredBundleSizeBytes, options));
    }
    LOG.info(
        "Splitting filepattern {} into bundles of size {} took {} ms "
            + "and produced {} files and {} bundles",
        fileOrPattern,
        desiredBundleSizeBytes,
        System.currentTimeMillis() - startTime,
        expandedFiles.size(),
        splitResults.size());
    return splitResults;
  } else {
    if (isSplittable()) {
      // Safe: the superclass split of a FileBasedSource only produces FileBasedSources.
      @SuppressWarnings("unchecked")
      List<FileBasedSource<T>> splits =
          (List<FileBasedSource<T>>) super.split(desiredBundleSizeBytes, options);
      return splits;
    } else {
      LOG.debug(
          "The source for file {} is not split into sub-range based sources since "
              + "the file is not seekable",
          fileOrPattern);
      return ImmutableList.of(this);
    }
  }
}
Aggregations