Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
The class DatePartitionPathSelector, method getNextFilePathsAndMaxModificationTime.
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(JavaSparkContext sparkContext,
                                                                           Option<String> lastCheckpointStr,
                                                                           long sourceLimit) {
  // If not specified, the current date is assumed by default.
  LocalDate currentDate = LocalDate.parse(props.getString(Config.CURRENT_DATE, LocalDate.now().toString()));
  // Obtain all eligible files under root folder.
  LOG.info("Root path => " + props.getString(ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit
      + " depth of day partition => " + datePartitionDepth + " num prev days to list => " + numPrevDaysToList
      + " from current date => " + currentDate);
  long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
  HoodieSparkEngineContext context = new HoodieSparkEngineContext(sparkContext);
  SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
  List<String> prunedPartitionPaths = pruneDatePartitionPaths(context, fs, props.getString(ROOT_INPUT_PATH_PROP), currentDate);
  List<FileStatus> eligibleFiles = context.flatMap(prunedPartitionPaths, path -> {
    FileSystem fs = new Path(path).getFileSystem(serializedConf.get());
    return listEligibleFiles(fs, new Path(path), lastCheckpointTime).stream();
  }, partitionsListParallelism);
  // Sort them by modification time, ascending.
  List<FileStatus> sortedEligibleFiles = eligibleFiles.stream()
      .sorted(Comparator.comparingLong(FileStatus::getModificationTime))
      .collect(Collectors.toList());
  // Filter based on checkpoint & input size, if needed
  long currentBytes = 0;
  long newCheckpointTime = lastCheckpointTime;
  List<FileStatus> filteredFiles = new ArrayList<>();
  for (FileStatus f : sortedEligibleFiles) {
    if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
      // so that some files with the same modification time won't be skipped in next read
      break;
    }
    newCheckpointTime = f.getModificationTime();
    currentBytes += f.getLen();
    filteredFiles.add(f);
  }
  // no data to read
  if (filteredFiles.isEmpty()) {
    return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
  }
  // read the files out.
  String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
  return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
}
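The key move above is wrapping fs.getConf() in SerializableConfiguration before the lambda passed to context.flatMap captures it, because Hadoop's Configuration is not Serializable; each executor then rebuilds its own FileSystem from serializedConf.get(). Below is a minimal sketch of that pattern in isolation, assuming a Hudi build where HoodieSparkEngineContext.flatMap takes a list, a serializable function, and a parallelism hint (as in the snippet); the listFiles helper and its filtering are hypothetical, and import packages reflect an assumed ~0.10-era layout.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.spark.api.java.JavaSparkContext;

public class SerializableConfExample {

  // Hypothetical helper: list every file directly under each of the given partition paths.
  public static List<String> listFiles(JavaSparkContext jsc, List<String> partitionPaths) {
    HoodieSparkEngineContext context = new HoodieSparkEngineContext(jsc);
    // Hadoop's Configuration is not Serializable, so wrap it before the closure captures it.
    SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    return context.flatMap(partitionPaths, path -> {
      // Runs on the executors: rebuild a FileSystem handle from the shipped configuration.
      FileSystem fs = new Path(path).getFileSystem(serConf.get());
      return Arrays.stream(fs.listStatus(new Path(path)))
          .filter(FileStatus::isFile)
          .map(status -> status.getPath().toString());
    }, Math.max(1, partitionPaths.size()));
  }
}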
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
The class FileSystemViewManager, method createViewManager.
/**
 * Main Factory method for building file-system views.
 */
public static FileSystemViewManager createViewManager(final HoodieEngineContext context,
                                                      final HoodieMetadataConfig metadataConfig,
                                                      final FileSystemViewStorageConfig config,
                                                      final HoodieCommonConfig commonConfig,
                                                      final SerializableSupplier<HoodieTableMetadata> metadataSupplier) {
  LOG.info("Creating View Manager with storage type :" + config.getStorageType());
  final SerializableConfiguration conf = context.getHadoopConf();
  switch (config.getStorageType()) {
    case EMBEDDED_KV_STORE:
      LOG.info("Creating embedded rocks-db based Table View");
      return new FileSystemViewManager(context, config,
          (metaClient, viewConf) -> createRocksDBBasedFileSystemView(conf, viewConf, metaClient));
    case SPILLABLE_DISK:
      LOG.info("Creating Spillable Disk based Table View");
      return new FileSystemViewManager(context, config,
          (metaClient, viewConf) -> createSpillableMapBasedFileSystemView(conf, viewConf, metaClient, commonConfig));
    case MEMORY:
      LOG.info("Creating in-memory based Table View");
      return new FileSystemViewManager(context, config,
          (metaClient, viewConfig) -> createInMemoryFileSystemView(metadataConfig, viewConfig, metaClient, metadataSupplier));
    case REMOTE_ONLY:
      LOG.info("Creating remote only table view");
      return new FileSystemViewManager(context, config,
          (metaClient, viewConfig) -> createRemoteFileSystemView(conf, viewConfig, metaClient));
    case REMOTE_FIRST:
      LOG.info("Creating remote first table view");
      return new FileSystemViewManager(context, config, (metaClient, viewConfig) -> {
        RemoteHoodieTableFileSystemView remoteFileSystemView = createRemoteFileSystemView(conf, viewConfig, metaClient);
        SyncableFileSystemView secondaryView;
        switch (viewConfig.getSecondaryStorageType()) {
          case MEMORY:
            secondaryView = createInMemoryFileSystemView(metadataConfig, viewConfig, metaClient, metadataSupplier);
            break;
          case EMBEDDED_KV_STORE:
            secondaryView = createRocksDBBasedFileSystemView(conf, viewConfig, metaClient);
            break;
          case SPILLABLE_DISK:
            secondaryView = createSpillableMapBasedFileSystemView(conf, viewConfig, metaClient, commonConfig);
            break;
          default:
            throw new IllegalArgumentException("Secondary Storage type can only be in-memory or spillable. Was :" + viewConfig.getSecondaryStorageType());
        }
        return new PriorityBasedFileSystemView(remoteFileSystemView, secondaryView);
      });
    default:
      throw new IllegalArgumentException("Unknown file system view type :" + config.getStorageType());
  }
}
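Every branch of the factory closes over conf, the SerializableConfiguration pulled from the engine context, so the view-building lambdas stay serializable and can rebuild views wherever they are invoked. The following is a sketch of calling this factory for an in-memory view with the metadata table disabled, assuming ~0.10-era package names and config builders; the typed null supplier is an assumption for illustration, relying on the MEMORY branch only consulting the supplier when the metadata table is enabled.

import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.function.SerializableSupplier;
import org.apache.hudi.common.table.view.FileSystemViewManager;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
import org.apache.hudi.metadata.HoodieTableMetadata;

public class ViewManagerExample {

  // A sketch: build an in-memory file-system view manager with the metadata table turned off.
  public static FileSystemViewManager inMemoryViewManager(HoodieEngineContext context) {
    HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().enable(false).build();
    FileSystemViewStorageConfig viewConfig = FileSystemViewStorageConfig.newBuilder()
        .withStorageType(FileSystemViewStorageType.MEMORY)
        .build();
    HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().build();
    // Assumption: no metadata-table reader is needed when the metadata table is disabled.
    SerializableSupplier<HoodieTableMetadata> noMetadataSupplier = null;
    return FileSystemViewManager.createViewManager(context, metadataConfig, viewConfig, commonConfig, noMetadataSupplier);
  }
}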
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
The class HoodieRepairTool, method deleteFiles.
/**
 * Deletes files from table base path.
 *
 * @param context           {@link HoodieEngineContext} instance.
 * @param basePath          Base path of the table.
 * @param relativeFilePaths A {@link List} of relative file paths for deleting.
 */
static boolean deleteFiles(HoodieEngineContext context, String basePath, List<String> relativeFilePaths) {
  SerializableConfiguration conf = context.getHadoopConf();
  return context.parallelize(relativeFilePaths)
      .mapPartitions(iterator -> {
        FileSystem fs = FSUtils.getFs(basePath, conf.get());
        List<Boolean> results = new ArrayList<>();
        iterator.forEachRemaining(relativeFilePath -> {
          boolean success = false;
          try {
            success = fs.delete(new Path(basePath, relativeFilePath), false);
          } catch (IOException e) {
            LOG.warn("Failed to delete file " + relativeFilePath);
          } finally {
            results.add(success);
          }
        });
        return results.iterator();
      }, true)
      .collectAsList()
      .stream().reduce((a, b) -> a && b).orElse(true);
}
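Here the SerializableConfiguration comes from the engine context itself and is captured by the mapPartitions closure; the FileSystem handle, which is not serializable, is recreated on the executor with FSUtils.getFs(basePath, conf.get()). A short sketch of the same capture-then-rebuild pattern follows, using HoodieEngineContext.map rather than parallelize().mapPartitions() for brevity (an assumption about the available API); the filesExist helper is hypothetical.

import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;

public class ParallelFsExample {

  // Hypothetical helper: check in parallel whether each relative path still exists under basePath.
  public static List<Boolean> filesExist(HoodieEngineContext context, String basePath, List<String> relativePaths) {
    // The engine context already holds a SerializableConfiguration, so it is safe to capture.
    SerializableConfiguration conf = context.getHadoopConf();
    return context.map(relativePaths, relativePath -> {
      // Runs on the executors: the FileSystem handle is rebuilt there, never serialized.
      FileSystem fs = FSUtils.getFs(basePath, conf.get());
      return fs.exists(new Path(basePath, relativePath));
    }, Math.max(1, relativePaths.size()));
  }
}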
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
The class HoodieSnapshotCopier, method snapshot.
public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir,
                     final boolean shouldAssumeDatePartitioning,
                     final boolean useFileListingFromMetadata) throws IOException {
  FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
  final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
  final HoodieTableMetaClient tableMetadata = HoodieTableMetaClient.builder()
      .setConf(fs.getConf()).setBasePath(baseDir).build();
  final BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata,
      tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants());
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  // Get the latest commit
  Option<HoodieInstant> latestCommit =
      tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants().lastInstant();
  if (!latestCommit.isPresent()) {
    LOG.warn("No commits present. Nothing to snapshot");
    return;
  }
  final String latestCommitTimestamp = latestCommit.get().getTimestamp();
  LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.",
      latestCommitTimestamp));
  List<String> partitions = FSUtils.getAllPartitionPaths(context, baseDir, useFileListingFromMetadata, shouldAssumeDatePartitioning);
  if (partitions.size() > 0) {
    LOG.info(String.format("The job needs to copy %d partitions.", partitions.size()));
    // Make sure the output directory is empty
    Path outputPath = new Path(outputDir);
    if (fs.exists(outputPath)) {
      LOG.warn(String.format("The output path %s targetBasePath already exists, deleting", outputPath));
      fs.delete(new Path(outputDir), true);
    }
    context.setJobStatus(this.getClass().getSimpleName(), "Creating a snapshot");
    List<Tuple2<String, String>> filesToCopy = context.flatMap(partitions, partition -> {
      // Only take latest version files <= latestCommit.
      FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
      List<Tuple2<String, String>> filePaths = new ArrayList<>();
      Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
      dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
      // also need to copy over partition metadata
      Path partitionMetaFile = new Path(FSUtils.getPartitionPath(baseDir, partition),
          HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
      if (fs1.exists(partitionMetaFile)) {
        filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
      }
      return filePaths.stream();
    }, partitions.size());
    context.foreach(filesToCopy, tuple -> {
      String partition = tuple._1();
      Path sourceFilePath = new Path(tuple._2());
      Path toPartitionPath = FSUtils.getPartitionPath(outputDir, partition);
      FileSystem ifs = FSUtils.getFs(baseDir, serConf.newCopy());
      if (!ifs.exists(toPartitionPath)) {
        ifs.mkdirs(toPartitionPath);
      }
      FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), false, ifs.getConf());
    }, filesToCopy.size());
    // Also copy the .commit files
    LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
    FileStatus[] commitFilesToCopy =
        fs.listStatus(new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
          if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
            return true;
          } else {
            String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
            return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
          }
        });
    for (FileStatus commitStatus : commitFilesToCopy) {
      Path targetFilePath =
          new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
      if (!fs.exists(targetFilePath.getParent())) {
        fs.mkdirs(targetFilePath.getParent());
      }
      if (fs.exists(targetFilePath)) {
        LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
      }
      FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
    }
  } else {
    LOG.info("The job has 0 partition to copy.");
  }
  // Create the _SUCCESS tag
  Path successTagPath = new Path(outputDir + "/_SUCCESS");
  if (!fs.exists(successTagPath)) {
    LOG.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
    fs.createNewFile(successTagPath);
  }
}
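The copier serializes jsc.hadoopConfiguration() once on the driver and has each distributed task call serConf.newCopy(), so every task works against its own mutable Configuration instance rather than a shared one. Below is a stripped-down sketch of that distributed copy loop, assuming ~0.10-era package names; copyFiles and its one-task-per-file parallelism are illustrative only.

import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.spark.api.java.JavaSparkContext;

public class DistributedCopyExample {

  // Hypothetical helper: copy each source file into targetDir, one Spark task per file.
  public static void copyFiles(JavaSparkContext jsc, List<String> sourceFiles, String targetDir) {
    HoodieSparkEngineContext context = new HoodieSparkEngineContext(jsc);
    // Serialized once on the driver; newCopy() hands each task its own Configuration.
    SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    context.foreach(sourceFiles, file -> {
      Path source = new Path(file);
      FileSystem fs = FSUtils.getFs(targetDir, serConf.newCopy());
      FileUtil.copy(fs, source, fs, new Path(targetDir, source.getName()), false, fs.getConf());
    }, Math.max(1, sourceFiles.size()));
  }
}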
Use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
The class TestHoodieBackedMetadata, method validateMetadata.
private void validateMetadata(SparkRDDWriteClient testClient) throws IOException {
  HoodieWriteConfig config = testClient.getConfig();
  SparkRDDWriteClient client;
  if (config.isEmbeddedTimelineServerEnabled()) {
    testClient.close();
    client = new SparkRDDWriteClient(testClient.getEngineContext(), testClient.getConfig());
  } else {
    client = testClient;
  }
  HoodieTableMetadata tableMetadata = metadata(client);
  assertNotNull(tableMetadata, "MetadataReader should have been initialized");
  if (!config.isMetadataTableEnabled()) {
    return;
  }
  HoodieTimer timer = new HoodieTimer().startTimer();
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  // Partitions should match
  FileSystemBackedTableMetadata fsBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext,
      new SerializableConfiguration(hadoopConf), config.getBasePath(), config.shouldAssumeDatePartitioning());
  List<String> fsPartitions = fsBackedTableMetadata.getAllPartitionPaths();
  List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
  Collections.sort(fsPartitions);
  Collections.sort(metadataPartitions);
  assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
  assertTrue(fsPartitions.equals(metadataPartitions), "Partitions should match");
  // Files within each partition should match
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable table = HoodieSparkTable.create(config, engineContext);
  TableFileSystemView tableView = table.getHoodieView();
  List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
  Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
  assertEquals(fsPartitions.size(), partitionToFilesMap.size());
  fsPartitions.forEach(partition -> {
    try {
      Path partitionPath;
      if (partition.equals("")) {
        // Should be the non-partitioned case
        partitionPath = new Path(basePath);
      } else {
        partitionPath = new Path(basePath, partition);
      }
      FileStatus[] fsStatuses = FSUtils.getAllDataFilesInPartition(fs, partitionPath);
      FileStatus[] metaStatuses = tableMetadata.getAllFilesInPartition(partitionPath);
      List<String> fsFileNames = Arrays.stream(fsStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
      List<String> metadataFilenames = Arrays.stream(metaStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
      Collections.sort(fsFileNames);
      Collections.sort(metadataFilenames);
      assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length);
      // File sizes should be valid
      Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0));
      if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) {
        LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray()));
        LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray()));
        for (String fileName : fsFileNames) {
          if (!metadataFilenames.contains(fileName)) {
            LOG.error(partition + "FsFilename " + fileName + " not found in Meta data");
          }
        }
        for (String fileName : metadataFilenames) {
          if (!fsFileNames.contains(fileName)) {
            LOG.error(partition + "Metadata file " + fileName + " not found in original FS");
          }
        }
      }
      // Block sizes should be valid
      Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0));
      List<Long> fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
      Collections.sort(fsBlockSizes);
      List<Long> metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
      Collections.sort(metadataBlockSizes);
      assertEquals(fsBlockSizes, metadataBlockSizes);
      assertEquals(fsFileNames.size(), metadataFilenames.size(), "Files within partition " + partition + " should match");
      assertTrue(fsFileNames.equals(metadataFilenames), "Files within partition " + partition + " should match");
      // FileSystemView should expose the same data
      List<HoodieFileGroup> fileGroups = tableView.getAllFileGroups(partition).collect(Collectors.toList());
      fileGroups.addAll(tableView.getAllReplacedFileGroups(partition).collect(Collectors.toList()));
      fileGroups.forEach(g -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(g));
      fileGroups.forEach(g -> g.getAllBaseFiles().forEach(b -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(b)));
      fileGroups.forEach(g -> g.getAllFileSlices().forEach(s -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(s)));
      long numFiles = fileGroups.stream()
          .mapToLong(g -> g.getAllBaseFiles().count() + g.getAllFileSlices().mapToLong(s -> s.getLogFiles().count()).sum())
          .sum();
      assertEquals(metadataFilenames.size(), numFiles);
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false, "Exception should not be raised: " + e);
    }
  });
  HoodieBackedTableMetadataWriter metadataWriter = metadataWriter(client);
  assertNotNull(metadataWriter, "MetadataWriter should have been initialized");
  // Validate write config for metadata table
  HoodieWriteConfig metadataWriteConfig = metadataWriter.getWriteConfig();
  assertFalse(metadataWriteConfig.isMetadataTableEnabled(), "No metadata table for metadata table");
  // Metadata table should be in sync with the dataset
  HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
  // Metadata table is MOR
  assertEquals(metadataMetaClient.getTableType(), HoodieTableType.MERGE_ON_READ, "Metadata Table should be MOR");
  // Metadata table is HFile format
  assertEquals(metadataMetaClient.getTableConfig().getBaseFileFormat(), HoodieFileFormat.HFILE, "Metadata Table base file format should be HFile");
  // Metadata table has a fixed number of partitions
  // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory
  // in the .hoodie folder.
  List<String> metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext,
      HoodieTableMetadata.getMetadataTableBasePath(basePath), false, false);
  assertEquals(metadataWriter.getEnabledPartitionTypes().size(), metadataTablePartitions.size());
  final Map<String, MetadataPartitionType> metadataEnabledPartitionTypes = new HashMap<>();
  metadataWriter.getEnabledPartitionTypes().forEach(e -> metadataEnabledPartitionTypes.put(e.getPartitionPath(), e));
  // Metadata table should automatically compact and clean
  // versions are +1 as autoclean / compaction happens end of commits
  int numFileVersions = metadataWriteConfig.getCleanerFileVersionsRetained() + 1;
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline());
  metadataTablePartitions.forEach(partition -> {
    List<FileSlice> latestSlices = fsView.getLatestFileSlices(partition).collect(Collectors.toList());
    assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).count() <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(),
        "Should have a single latest base file per file group");
    assertTrue(latestSlices.size() <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(),
        "Should have a single latest file slice per file group");
    assertTrue(latestSlices.size() <= (numFileVersions * metadataEnabledPartitionTypes.get(partition).getFileGroupCount()),
        "Should limit file slice to " + numFileVersions + " per file group, but was " + latestSlices.size());
  });
  LOG.info("Validation time=" + timer.endTimer());
}
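The test wires a SerializableConfiguration into FileSystemBackedTableMetadata so that raw file-system listings can be compared against what the metadata table reports. Below is a minimal sketch of obtaining such a file-system-backed partition listing on its own, assuming HoodieLocalEngineContext is available (it is not used in the snippet above) and the same four-argument constructor shown above; listPartitions is a hypothetical helper.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;

public class FsBackedListingExample {

  // A sketch: list partitions by scanning the file system directly, the same
  // "source of truth" the test above compares the metadata table against.
  public static List<String> listPartitions(Configuration hadoopConf, String basePath) throws IOException {
    HoodieLocalEngineContext context = new HoodieLocalEngineContext(hadoopConf);
    FileSystemBackedTableMetadata fsMetadata = new FileSystemBackedTableMetadata(
        context, new SerializableConfiguration(hadoopConf), basePath, false);
    return fsMetadata.getAllPartitionPaths();
  }
}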