Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
In the class TestWriteMarkersFactory, the method testWriteMarkersFactory:
private void testWriteMarkersFactory(MarkerType markerTypeConfig, String basePath,
                                     boolean isTimelineServerEnabled, Class<?> expectedWriteMarkersClass) {
  String instantTime = "001";
  Mockito.when(table.getConfig()).thenReturn(writeConfig);
  Mockito.when(writeConfig.isEmbeddedTimelineServerEnabled()).thenReturn(isTimelineServerEnabled);
  Mockito.when(table.getMetaClient()).thenReturn(metaClient);
  Mockito.when(metaClient.getFs()).thenReturn(fileSystem);
  Mockito.when(metaClient.getBasePath()).thenReturn(basePath);
  Mockito.when(metaClient.getMarkerFolderPath(any())).thenReturn(basePath + ".hoodie/.temp");
  Mockito.when(table.getContext()).thenReturn(context);
  Mockito.when(context.getHadoopConf()).thenReturn(new SerializableConfiguration(new Configuration()));
  Mockito.when(writeConfig.getViewStorageConfig()).thenReturn(FileSystemViewStorageConfig.newBuilder().build());
  assertEquals(expectedWriteMarkersClass, WriteMarkersFactory.get(markerTypeConfig, table, instantTime).getClass());
}
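In this test the SerializableConfiguration only satisfies the mocked context.getHadoopConf() call. As background, Hadoop's Configuration is not java.io.Serializable, which is the usual reason for the wrapper. The sketch below is not taken from the Hudi codebase; it assumes the wrapper is Java-serializable (as its name and usage imply) and shows it surviving a plain serialization round trip, handing back the Configuration via get().

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;

public class SerializableConfigurationRoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration hadoopConf = new Configuration();
    hadoopConf.set("fs.defaultFS", "file:///"); // illustrative setting

    SerializableConfiguration serConf = new SerializableConfiguration(hadoopConf);

    // Round-trip through plain Java serialization, as Spark would do when shipping a closure.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeObject(serConf);
    }
    SerializableConfiguration copy;
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      copy = (SerializableConfiguration) in.readObject();
    }

    // The wrapped Configuration is recovered on the other side.
    System.out.println(copy.get().get("fs.defaultFS")); // prints file:///
  }
}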
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
In the class TestFileSystemBackedTableMetadata, the method testOneLevelPartitionedTable:
@Test
public void testOneLevelPartitionedTable() throws Exception {
  String instant = "100";
  hoodieTestTable = hoodieTestTable.addCommit(instant);
  // Generate 10 files under each partition
  ONE_LEVEL_PARTITIONS.stream().forEach(p -> {
    try {
      hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p)
          .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  });
  HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
  FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(
      localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
  Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
  Assertions.assertEquals(10,
      fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + ONE_LEVEL_PARTITIONS.get(0))).length);
  List<String> fullPartitionPaths = ONE_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
  Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
  for (String p : fullPartitionPaths) {
    Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
  }
}
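The batched getAllFilesInPartitions call is the heart of this test. Below is a hypothetical helper (the class and method names are not part of the Hudi test) that condenses the same pattern into a partition-to-file-count map; it assumes, as the assertions above do, that the returned map is keyed by the full partition paths passed in.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;

public class PartitionFileCounts {
  /** Returns relative-partition-path -> number of files, using a single batched listing call. */
  public static Map<String, Integer> countFilesPerPartition(
      FileSystemBackedTableMetadata metadata, String basePath, List<String> relativePartitions) throws Exception {
    List<String> fullPaths = relativePartitions.stream()
        .map(p -> basePath + "/" + p)
        .collect(Collectors.toList());
    Map<String, FileStatus[]> filesByPartition = metadata.getAllFilesInPartitions(fullPaths);
    return filesByPartition.entrySet().stream()
        .collect(Collectors.toMap(
            e -> e.getKey().substring(basePath.length() + 1), // back to the relative partition path
            e -> e.getValue().length));
  }
}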
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
In the class TestFileSystemBackedTableMetadata, the method testMultiLevelPartitionedTable:
@Test
public void testMultiLevelPartitionedTable() throws Exception {
  String instant = "100";
  hoodieTestTable = hoodieTestTable.addCommit(instant);
  // Generate 10 files under each partition
  MULTI_LEVEL_PARTITIONS.stream().forEach(p -> {
    try {
      hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p)
          .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  });
  HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
  FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(
      localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
  Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
  Assertions.assertEquals(10,
      fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length);
  List<String> fullPartitionPaths = MULTI_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
  Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
  for (String p : fullPartitionPaths) {
    Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
  }
}
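The multi-level test mirrors the one-level test; only the shape of the partition paths changes, and the listing API handles both. The constants below are illustrative assumptions about what those fixtures might look like (three entries each, matching the assertEquals(3, ...) checks), not values copied from the test class.

import java.util.Arrays;
import java.util.List;

final class PartitionFixtures {
  // One directory level per partition, e.g. <basePath>/2019/... (assumed values)
  static final List<String> ONE_LEVEL_PARTITIONS = Arrays.asList("2019", "2020", "2021");
  // Nested directory levels per partition, e.g. <basePath>/2019/01/... (assumed values)
  static final List<String> MULTI_LEVEL_PARTITIONS = Arrays.asList("2019/01", "2020/01", "2021/01");
}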
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
In the class HoodieBackedTableMetadataWriter, the method listAllPartitions:
/**
 * Function to find hoodie partitions and list files in them in parallel.
 *
 * @param datasetMetaClient data set meta client instance.
 * @return List of DirectoryInfo objects, one per hoodie partition, each holding the files found in that partition
 */
private List<DirectoryInfo> listAllPartitions(HoodieTableMetaClient datasetMetaClient) {
  List<Path> pathsToList = new LinkedList<>();
  pathsToList.add(new Path(dataWriteConfig.getBasePath()));
  List<DirectoryInfo> partitionsToBootstrap = new LinkedList<>();
  final int fileListingParallelism = metadataWriteConfig.getFileListingParallelism();
  SerializableConfiguration conf = new SerializableConfiguration(datasetMetaClient.getHadoopConf());
  final String dirFilterRegex = dataWriteConfig.getMetadataConfig().getDirectoryFilterRegex();
  final String datasetBasePath = datasetMetaClient.getBasePath();
  while (!pathsToList.isEmpty()) {
    // In each round we will list a section of directories
    int numDirsToList = Math.min(fileListingParallelism, pathsToList.size());
    // List all directories in parallel
    List<DirectoryInfo> processedDirectories = engineContext.map(pathsToList.subList(0, numDirsToList), path -> {
      FileSystem fs = path.getFileSystem(conf.get());
      String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(datasetBasePath), path);
      return new DirectoryInfo(relativeDirPath, fs.listStatus(path));
    }, numDirsToList);
    pathsToList = new LinkedList<>(pathsToList.subList(numDirsToList, pathsToList.size()));
    // If the listing reveals a hoodie partition, add it to the results; if it reveals a plain
    // directory, add its sub-directories to the queue.
    for (DirectoryInfo dirInfo : processedDirectories) {
      if (!dirFilterRegex.isEmpty()) {
        final String relativePath = dirInfo.getRelativePath();
        if (!relativePath.isEmpty()) {
          Path partitionPath = new Path(datasetBasePath, relativePath);
          if (partitionPath.getName().matches(dirFilterRegex)) {
            LOG.info("Ignoring directory " + partitionPath + " which matches the filter regex " + dirFilterRegex);
            continue;
          }
        }
      }
      if (dirInfo.isHoodiePartition()) {
        // Add to result
        partitionsToBootstrap.add(dirInfo);
      } else {
        // Add sub-dirs to the queue
        pathsToList.addAll(dirInfo.getSubDirectories());
      }
    }
  }
  return partitionsToBootstrap;
}
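The method above is a breadth-first traversal whose listing step is fanned out across the engine context; the SerializableConfiguration lets each parallel task rebuild a FileSystem from the shipped Hadoop settings. The single-threaded sketch below illustrates the same traversal without the parallelism. The DirectoryResult holder and the ".hoodie_partition_metadata" marker-file name are assumptions for illustration; FSUtils.getRelativePartitionPath is the same call used above.

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;

public class SequentialPartitionLister {

  /** Relative partition path plus the files found directly under it (a stand-in for DirectoryInfo). */
  static final class DirectoryResult {
    final String relativePath;
    final List<FileStatus> files;

    DirectoryResult(String relativePath, List<FileStatus> files) {
      this.relativePath = relativePath;
      this.files = files;
    }
  }

  static List<DirectoryResult> listAllPartitions(String basePath, Configuration conf) throws Exception {
    Path base = new Path(basePath);
    FileSystem fs = base.getFileSystem(conf);
    LinkedList<Path> pathsToList = new LinkedList<>();
    pathsToList.add(base);
    List<DirectoryResult> partitions = new ArrayList<>();

    while (!pathsToList.isEmpty()) {
      Path dir = pathsToList.poll();
      List<FileStatus> files = new ArrayList<>();
      List<Path> subDirs = new ArrayList<>();
      boolean isPartition = false;

      for (FileStatus status : fs.listStatus(dir)) {
        if (status.isDirectory()) {
          // Skip metadata directories such as .hoodie; descend into everything else.
          if (!status.getPath().getName().startsWith(".")) {
            subDirs.add(status.getPath());
          }
        } else {
          files.add(status);
          // Treat the directory as a hoodie partition when it holds the partition metadata
          // marker file (assumed here to be named ".hoodie_partition_metadata").
          if (status.getPath().getName().startsWith(".hoodie_partition_metadata")) {
            isPartition = true;
          }
        }
      }

      if (isPartition) {
        partitions.add(new DirectoryResult(FSUtils.getRelativePartitionPath(base, dir), files));
      } else {
        pathsToList.addAll(subDirs);
      }
    }
    return partitions;
  }
}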
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
In the class TestDFSHoodieTestSuiteWriterAdapter, the method testDFSWorkloadSinkWithMultipleFilesFunctional:
@Test
public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException {
  DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
      new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath,
      schemaProvider.getSourceSchema().toString(), 10240L, jsc.defaultParallelism(), false);
  DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = DeltaWriterFactory.getDeltaWriterAdapter(dfsSinkConfig, 1);
  FlexibleSchemaRecordGenerationIterator itr =
      new FlexibleSchemaRecordGenerationIterator(1000, schemaProvider.getSourceSchema().toString());
  dfsDeltaWriterAdapter.write(itr);
  FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
  FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath));
  // Since maxFileSize was 10240L and we produced 1K records, each close to 1K in size, we should produce
  // more than 1 file
  assertTrue(fileStatuses.length > 0);
}
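DFSDeltaConfig receives a SerializableConfiguration so the Hadoop settings can travel with the Spark work that writes the delta files. The sketch below illustrates that pattern outside the test suite; the class name and the fileLengths helper are hypothetical, while FSUtils.getFs and SerializableConfiguration.get() are the same calls used above. The wrapper is captured by the lambda, serialized with the task, and unwrapped on the executor side.

import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.spark.api.java.JavaSparkContext;

public class ExecutorSideFileSystemAccess {

  /** Returns the length of each input file, resolving a FileSystem on the executors. */
  static List<Long> fileLengths(JavaSparkContext jsc, List<String> filePaths) {
    SerializableConfiguration conf = new SerializableConfiguration(jsc.hadoopConfiguration());
    return jsc.parallelize(filePaths)
        .map(pathStr -> {
          // Runs on an executor: unwrap the Hadoop Configuration and open the FileSystem there.
          FileSystem fs = FSUtils.getFs(pathStr, conf.get());
          return fs.getFileStatus(new Path(pathStr)).getLen();
        })
        .collect();
  }
}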