Example 16 with SerializableConfiguration

Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In class TestWriteMarkersFactory, method testWriteMarkersFactory.

private void testWriteMarkersFactory(MarkerType markerTypeConfig, String basePath, boolean isTimelineServerEnabled, Class<?> expectedWriteMarkersClass) {
    String instantTime = "001";
    Mockito.when(table.getConfig()).thenReturn(writeConfig);
    Mockito.when(writeConfig.isEmbeddedTimelineServerEnabled()).thenReturn(isTimelineServerEnabled);
    Mockito.when(table.getMetaClient()).thenReturn(metaClient);
    Mockito.when(metaClient.getFs()).thenReturn(fileSystem);
    Mockito.when(metaClient.getBasePath()).thenReturn(basePath);
    Mockito.when(metaClient.getMarkerFolderPath(any())).thenReturn(basePath + ".hoodie/.temp");
    Mockito.when(table.getContext()).thenReturn(context);
    Mockito.when(context.getHadoopConf()).thenReturn(new SerializableConfiguration(new Configuration()));
    Mockito.when(writeConfig.getViewStorageConfig()).thenReturn(FileSystemViewStorageConfig.newBuilder().build());
    assertEquals(expectedWriteMarkersClass, WriteMarkersFactory.get(markerTypeConfig, table, instantTime).getClass());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration)
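
Since the helper above takes the marker type and expected class as parameters, here is a minimal sketch of how it might be driven. The test names and base path are assumptions; MarkerType.DIRECT and MarkerType.TIMELINE_SERVER_BASED are the two marker types Hudi defines, and DirectWriteMarkers / TimelineServerBasedWriteMarkers are the corresponding marker classes in org.apache.hudi.table.marker.

@Test
public void testDirectWriteMarkers() {
    // Direct markers write marker files straight to the file system; no timeline server needed.
    testWriteMarkersFactory(MarkerType.DIRECT, "/tmp/hudi_table/", false, DirectWriteMarkers.class);
}

@Test
public void testTimelineServerBasedWriteMarkers() {
    // Timeline-server-based markers require the embedded timeline server to be enabled.
    testWriteMarkersFactory(MarkerType.TIMELINE_SERVER_BASED, "/tmp/hudi_table/", true, TimelineServerBasedWriteMarkers.class);
}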

Example 17 with SerializableConfiguration

Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In class TestFileSystemBackedTableMetadata, method testOneLevelPartitionedTable.

@Test
public void testOneLevelPartitionedTable() throws Exception {
    String instant = "100";
    hoodieTestTable = hoodieTestTable.addCommit(instant);
    // Generate 10 files under each partition
    ONE_LEVEL_PARTITIONS.stream().forEach(p -> {
        try {
            hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p).withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    });
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + ONE_LEVEL_PARTITIONS.get(0))).length);
    List<String> fullPartitionPaths = ONE_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
    Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
    for (String p : fullPartitionPaths) {
        Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) IntStream(java.util.stream.IntStream) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) IOException(java.io.IOException) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) Map(java.util.Map) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Assertions(org.junit.jupiter.api.Assertions) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) Collections(java.util.Collections)
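
The SerializableConfiguration wrapper exists because Hadoop's Configuration does not implement java.io.Serializable, so it cannot be shipped across serialization boundaries (for example, to Spark executors) directly. A minimal sketch of the wrap/unwrap round trip, standalone rather than taken from the test above:

// Wrap a Hadoop Configuration so it can be serialized alongside other state.
SerializableConfiguration serConf = new SerializableConfiguration(new Configuration());
// On the other side, get() returns the underlying Hadoop Configuration.
Configuration hadoopConf = serConf.get();
FileSystem fs = new Path("/tmp/hudi_table").getFileSystem(hadoopConf);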

Example 18 with SerializableConfiguration

Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In class TestFileSystemBackedTableMetadata, method testMultiLevelPartitionedTable.

@Test
public void testMultiLevelPartitionedTable() throws Exception {
    String instant = "100";
    hoodieTestTable = hoodieTestTable.addCommit(instant);
    // Generate 10 files under each partition
    MULTI_LEVEL_PARTITIONS.stream().forEach(p -> {
        try {
            hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p).withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    });
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length);
    List<String> fullPartitionPaths = MULTI_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
    Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
    for (String p : fullPartitionPaths) {
        Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) IntStream(java.util.stream.IntStream) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) IOException(java.io.IOException) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) Map(java.util.Map) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Assertions(org.junit.jupiter.api.Assertions) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) Collections(java.util.Collections)
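
Both tests expect exactly three partitions because the fixture lists each declare three entries. A hypothetical sketch of those fixtures (the actual values live in TestFileSystemBackedTableMetadata; these are illustrative only):

// Hypothetical fixtures: three single-level and three multi-level partition paths.
private static final List<String> ONE_LEVEL_PARTITIONS = Arrays.asList("2019", "2020", "2021");
private static final List<String> MULTI_LEVEL_PARTITIONS = Arrays.asList("2019/01", "2020/01", "2021/01");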

Example 19 with SerializableConfiguration

Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In class HoodieBackedTableMetadataWriter, method listAllPartitions.

/**
 * Finds hoodie partitions and lists the files in them in parallel.
 *
 * @param datasetMetaClient data set meta client instance.
 * @return List of DirectoryInfo, one per hoodie partition found, each carrying the FileStatus of every file in that partition
 */
private List<DirectoryInfo> listAllPartitions(HoodieTableMetaClient datasetMetaClient) {
    List<Path> pathsToList = new LinkedList<>();
    pathsToList.add(new Path(dataWriteConfig.getBasePath()));
    List<DirectoryInfo> partitionsToBootstrap = new LinkedList<>();
    final int fileListingParallelism = metadataWriteConfig.getFileListingParallelism();
    SerializableConfiguration conf = new SerializableConfiguration(datasetMetaClient.getHadoopConf());
    final String dirFilterRegex = dataWriteConfig.getMetadataConfig().getDirectoryFilterRegex();
    final String datasetBasePath = datasetMetaClient.getBasePath();
    while (!pathsToList.isEmpty()) {
        // In each round we will list a section of directories
        int numDirsToList = Math.min(fileListingParallelism, pathsToList.size());
        // List all directories in parallel
        List<DirectoryInfo> processedDirectories = engineContext.map(pathsToList.subList(0, numDirsToList), path -> {
            FileSystem fs = path.getFileSystem(conf.get());
            String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(datasetBasePath), path);
            return new DirectoryInfo(relativeDirPath, fs.listStatus(path));
        }, numDirsToList);
        pathsToList = new LinkedList<>(pathsToList.subList(numDirsToList, pathsToList.size()));
        // Process the results: skip regex-filtered directories, collect hoodie partitions, queue sub-directories.
        for (DirectoryInfo dirInfo : processedDirectories) {
            if (!dirFilterRegex.isEmpty()) {
                final String relativePath = dirInfo.getRelativePath();
                if (!relativePath.isEmpty()) {
                    Path partitionPath = new Path(datasetBasePath, relativePath);
                    if (partitionPath.getName().matches(dirFilterRegex)) {
                        LOG.info("Ignoring directory " + partitionPath + " which matches the filter regex " + dirFilterRegex);
                        continue;
                    }
                }
            }
            if (dirInfo.isHoodiePartition()) {
                // Add to result
                partitionsToBootstrap.add(dirInfo);
            } else {
                // Add sub-dirs to the queue
                pathsToList.addAll(dirInfo.getSubDirectories());
            }
        }
    }
    return partitionsToBootstrap;
}
Also used : Path(org.apache.hadoop.fs.Path) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystem(org.apache.hadoop.fs.FileSystem) LinkedList(java.util.LinkedList)
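
The loop above relies on DirectoryInfo.isHoodiePartition() to decide whether a listed directory is a partition to collect or a directory to descend into. A hedged sketch of what that check boils down to (Hudi marks initialized partitions with a ".hoodie_partition_metadata" metafile; the exact implementation lives in DirectoryInfo):

// A directory is treated as a hoodie partition iff its listing contains the partition metafile.
FileStatus[] fileStatuses = fs.listStatus(path);
boolean isHoodiePartition = Arrays.stream(fileStatuses)
    .anyMatch(status -> status.getPath().getName().equals(".hoodie_partition_metadata"));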

Example 20 with SerializableConfiguration

Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In class TestDFSHoodieTestSuiteWriterAdapter, method testDFSWorkloadSinkWithMultipleFilesFunctional.

@Test
public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException {
    DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO, new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath, schemaProvider.getSourceSchema().toString(), 10240L, jsc.defaultParallelism(), false);
    DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = DeltaWriterFactory.getDeltaWriterAdapter(dfsSinkConfig, 1);
    FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000, schemaProvider.getSourceSchema().toString());
    dfsDeltaWriterAdapter.write(itr);
    FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
    FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath));
    // Since maxFileSize was 10240L and we produced 1,000 records each close to 1K in size, we should produce more
    // than 1 file (the assertion conservatively only requires that at least one file exists)
    assertTrue(fileStatuses.length > 0);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystem(org.apache.hadoop.fs.FileSystem) FlexibleSchemaRecordGenerationIterator(org.apache.hudi.integ.testsuite.generator.FlexibleSchemaRecordGenerationIterator) DeltaConfig(org.apache.hudi.integ.testsuite.configuration.DeltaConfig) DFSDeltaConfig(org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.jupiter.api.Test)
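
A back-of-envelope check of the comment above (the record size is approximate, as the comment itself says):

long maxFileSizeBytes = 10240L;           // the configured max delta file size
long approxRecordBytes = 1024L;           // each generated record is close to 1K
long totalBytes = 1000 * approxRecordBytes;
long roughExpectedFiles = totalBytes / maxFileSizeBytes;  // ~100 files, so certainly more than one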

Aggregations

SerializableConfiguration (org.apache.hudi.common.config.SerializableConfiguration) 32
Path (org.apache.hadoop.fs.Path) 20
FileSystem (org.apache.hadoop.fs.FileSystem) 16
FileStatus (org.apache.hadoop.fs.FileStatus) 15
List (java.util.List) 14
IOException (java.io.IOException) 13
Collectors (java.util.stream.Collectors) 13
Map (java.util.Map) 12
Test (org.junit.jupiter.api.Test) 12
ArrayList (java.util.ArrayList) 11
LogManager (org.apache.log4j.LogManager) 10
Logger (org.apache.log4j.Logger) 10
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) 9
Option (org.apache.hudi.common.util.Option) 9
Arrays (java.util.Arrays) 8
HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext) 8
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 8
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 7
Collections (java.util.Collections) 6
Configuration (org.apache.hadoop.conf.Configuration) 6