use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.
the class TestHoodieMergeOnReadTable method testUpsertPartitioner.
/**
 * Writes two delta commits through the upsert path and checks that the second one
 * (updates plus fresh inserts) grows at least one base file that the first commit produced,
 * and that all 40 records can be read back.
 *
 * @param populateMetaFields forwarded to {@code addConfigsForPopulateMetaFields} when building the write config
 */
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testUpsertPartitioner(boolean populateMetaFields) throws Exception {
  HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
  addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
  HoodieWriteConfig cfg = cfgBuilder.build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {

    /*
     * Write 1 (only inserts, written as base file)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);

    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);

    List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
    assertNoWriteErrors(statuses);

    HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);

    Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
    assertTrue(deltaCommit.isPresent());
    assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");

    Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
    assertFalse(commit.isPresent());

    FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
    BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles);
    Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
    // Remember the size of every base file after write 1 so growth can be detected after write 2.
    Map<String, Long> fileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));

    roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
    dataFilesToRead = roView.getLatestBaseFiles();
    List<HoodieBaseFile> dataFilesList = dataFilesToRead.collect(Collectors.toList());
    assertTrue(dataFilesList.size() > 0, "Should list the base files we wrote in the delta commit");

    /*
     * Write 2 (only updates + inserts, written to .log file + correction of existing base file size)
     */
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);

    List<HoodieRecord> newRecords = dataGen.generateUpdates(newCommitTime, records);
    newRecords.addAll(dataGen.generateInserts(newCommitTime, 20));

    statuses = client.upsert(jsc().parallelize(newRecords), newCommitTime).collect();
    // Verify there are no errors
    assertNoWriteErrors(statuses);

    metaClient = HoodieTableMetaClient.reload(metaClient);
    deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
    assertTrue(deltaCommit.isPresent());
    assertEquals("002", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 002");

    commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
    assertFalse(commit.isPresent());

    allFiles = listAllBaseFilesInPath(hoodieTable);
    roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
    dataFilesToRead = roView.getLatestBaseFiles();
    List<HoodieBaseFile> newDataFilesList = dataFilesToRead.collect(Collectors.toList());
    Map<String, Long> fileIdToNewSize = newDataFilesList.stream().collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));

    // A file id that first shows up in write 2 has no entry in fileIdToSize; comparing against a
    // missing entry would unbox null and throw, so such ids are skipped instead.
    assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> {
      Long previousSize = fileIdToSize.get(entry.getKey());
      return previousSize != null && previousSize < entry.getValue();
    }), "At least one base file from the first delta commit should have grown");

    List<String> inputPaths = roView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, false);
    // Wrote 20 records in 2 batches
    assertEquals(40, recordsRead.size(), "Must contain 40 records");
  }
}
use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.
the class HoodieClientTestUtils method getLatestBaseFiles.
/**
 * Gathers the base files reported by {@code getLatestBaseFiles()} for each of the given paths.
 *
 * <p>For every entry in {@code paths} the path is expanded with {@code fs.globStatus} and a
 * file-system view is built over the matching files, restricted to the completed instants of
 * the table's commits timeline.
 *
 * @param basePath base path of the Hudi table
 * @param fs       file system used both for the table configuration and for glob expansion
 * @param paths    paths (glob patterns are accepted) whose base files should be collected
 * @return the base files of all paths, in the order the paths were given
 * @throws HoodieException wrapping any exception raised while building the views
 */
public static List<HoodieBaseFile> getLatestBaseFiles(String basePath, FileSystem fs, String... paths) {
  List<HoodieBaseFile> collected = new ArrayList<>();
  try {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(fs.getConf())
        .setBasePath(basePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
    for (String globPattern : paths) {
      BaseFileOnlyView view = new HoodieTableFileSystemView(
          metaClient,
          metaClient.getCommitsTimeline().filterCompletedInstants(),
          fs.globStatus(new Path(globPattern)));
      view.getLatestBaseFiles().forEachOrdered(collected::add);
    }
  } catch (Exception e) {
    throw new HoodieException("Error reading hoodie table as a dataframe", e);
  }
  return collected;
}
use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.
the class HoodieSnapshotExporter method exportAsHudi.
/**
 * Copies the base files selected by {@code getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)},
 * the partition metadata files and the matching files of the table's meta folder from
 * {@code cfg.sourceBasePath} to {@code cfg.targetOutputPath}.
 *
 * <p>The source and the target file system are resolved independently from their own paths,
 * so the export also works when the two locations live on different file systems.
 *
 * @param jsc                   Spark context used to distribute the copy
 * @param cfg                   exporter configuration; {@code sourceBasePath} and {@code targetOutputPath} are read
 * @param partitions            partition paths to export
 * @param latestCommitTimestamp upper bound (inclusive) for the instants that are exported
 * @throws IOException if listing or copying the meta-folder files fails
 */
private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  final HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  final SerializableConfiguration serConf = context.getHadoopConf();
  context.setJobStatus(this.getClass().getSimpleName(), "Exporting as HUDI dataset");

  List<Tuple2<String, String>> files = context.flatMap(partitions, partition -> {
    // Only take latest version files <= latestCommit.
    List<Tuple2<String, String>> filePaths = new ArrayList<>();
    Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
    dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));

    // also need to copy over partition metadata
    Path partitionMetaFile = new Path(FSUtils.getPartitionPath(cfg.sourceBasePath, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
    FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
    if (fs.exists(partitionMetaFile)) {
      filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
    }
    return filePaths.stream();
  }, partitions.size());

  context.foreach(files, tuple -> {
    String partition = tuple._1();
    Path sourceFilePath = new Path(tuple._2());
    Path toPartitionPath = FSUtils.getPartitionPath(cfg.targetOutputPath, partition);
    // Read through the source file system and write through the target one; they may differ.
    FileSystem executorSourceFs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
    FileSystem executorOutputFs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());

    if (!executorOutputFs.exists(toPartitionPath)) {
      executorOutputFs.mkdirs(toPartitionPath);
    }
    FileUtil.copy(executorSourceFs, sourceFilePath, executorOutputFs, new Path(toPartitionPath, sourceFilePath.getName()), false, executorOutputFs.getConf());
  }, files.size());

  // Also copy the .commit files
  LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
  final FileSystem sourceFs = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
  final FileSystem outputFs = FSUtils.getFs(cfg.targetOutputPath, jsc.hadoopConfiguration());
  FileStatus[] commitFilesToCopy = sourceFs.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
    // The table properties file is always copied; every other file is filtered by its instant time.
    if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
      return true;
    } else {
      String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
      return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
    }
  });

  for (FileStatus commitStatus : commitFilesToCopy) {
    Path targetFilePath = new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
    if (!outputFs.exists(targetFilePath.getParent())) {
      outputFs.mkdirs(targetFilePath.getParent());
    }
    // An existing target is only reported; the copy below is still attempted.
    if (outputFs.exists(targetFilePath)) {
      LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
    }
    FileUtil.copy(sourceFs, commitStatus.getPath(), outputFs, targetFilePath, false, outputFs.getConf());
  }
}
use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.
the class HoodieSnapshotCopier method snapshot.
/**
 * Copies a snapshot of the table at {@code baseDir} into {@code outputDir}: the base files selected by
 * {@code getLatestBaseFilesBeforeOrOn} for the last completed write instant, the partition metadata
 * files, and the meta-folder files whose instant time is not later than that instant. A
 * {@code _SUCCESS} marker is created under {@code outputDir} at the end.
 *
 * <p>The output file system is resolved from {@code outputDir} rather than reused from
 * {@code baseDir}, so the snapshot can be written to a different file system than the source table.
 *
 * @param jsc                          Spark context used to distribute listing and copying
 * @param baseDir                      base path of the source table
 * @param outputDir                    destination directory; deleted recursively first if it already exists
 *                                     and there is at least one partition to copy
 * @param shouldAssumeDatePartitioning forwarded to {@code FSUtils.getAllPartitionPaths}
 * @param useFileListingFromMetadata   forwarded to {@code FSUtils.getAllPartitionPaths}
 * @throws IOException if a file-system operation on the driver fails
 */
public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning, final boolean useFileListingFromMetadata) throws IOException {
  FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
  // Resolved separately: the destination is not guaranteed to share the source's file system.
  final FileSystem outputFs = FSUtils.getFs(outputDir, jsc.hadoopConfiguration());
  final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
  final HoodieTableMetaClient tableMetadata = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir).build();
  final BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata, tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants());
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);

  // Get the latest commit
  Option<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants().lastInstant();
  if (!latestCommit.isPresent()) {
    LOG.warn("No commits present. Nothing to snapshot");
    return;
  }
  final String latestCommitTimestamp = latestCommit.get().getTimestamp();
  LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp));

  List<String> partitions = FSUtils.getAllPartitionPaths(context, baseDir, useFileListingFromMetadata, shouldAssumeDatePartitioning);
  if (partitions.size() > 0) {
    LOG.info(String.format("The job needs to copy %d partitions.", partitions.size()));

    // Make sure the output directory is empty
    Path outputPath = new Path(outputDir);
    if (outputFs.exists(outputPath)) {
      LOG.warn(String.format("The output path %s targetBasePath already exists, deleting", outputPath));
      outputFs.delete(outputPath, true);
    }

    context.setJobStatus(this.getClass().getSimpleName(), "Creating a snapshot");

    List<Tuple2<String, String>> filesToCopy = context.flatMap(partitions, partition -> {
      // Only take latest version files <= latestCommit.
      FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
      List<Tuple2<String, String>> filePaths = new ArrayList<>();
      Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
      dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));

      // also need to copy over partition metadata
      Path partitionMetaFile = new Path(FSUtils.getPartitionPath(baseDir, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
      if (fs1.exists(partitionMetaFile)) {
        filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
      }

      return filePaths.stream();
    }, partitions.size());

    context.foreach(filesToCopy, tuple -> {
      String partition = tuple._1();
      Path sourceFilePath = new Path(tuple._2());
      Path toPartitionPath = FSUtils.getPartitionPath(outputDir, partition);
      // Read through the source file system and write through the output one; they may differ.
      FileSystem executorSourceFs = FSUtils.getFs(baseDir, serConf.newCopy());
      FileSystem executorOutputFs = FSUtils.getFs(outputDir, serConf.newCopy());

      if (!executorOutputFs.exists(toPartitionPath)) {
        executorOutputFs.mkdirs(toPartitionPath);
      }
      FileUtil.copy(executorSourceFs, sourceFilePath, executorOutputFs, new Path(toPartitionPath, sourceFilePath.getName()), false, executorOutputFs.getConf());
    }, filesToCopy.size());

    // Also copy the .commit files
    LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
    FileStatus[] commitFilesToCopy = fs.listStatus(new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
      // The table properties file is always copied; every other file is filtered by its instant time.
      if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
        return true;
      } else {
        String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
        return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
      }
    });
    for (FileStatus commitStatus : commitFilesToCopy) {
      Path targetFilePath = new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
      if (!outputFs.exists(targetFilePath.getParent())) {
        outputFs.mkdirs(targetFilePath.getParent());
      }
      // An existing target is only reported; the copy below is still attempted.
      if (outputFs.exists(targetFilePath)) {
        LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
      }
      FileUtil.copy(fs, commitStatus.getPath(), outputFs, targetFilePath, false, outputFs.getConf());
    }
  } else {
    LOG.info("The job has 0 partition to copy.");
  }

  // Create the _SUCCESS tag
  Path successTagPath = new Path(outputDir + "/_SUCCESS");
  if (!outputFs.exists(successTagPath)) {
    LOG.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
    outputFs.createNewFile(successTagPath);
  }
}
use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.
the class HoodieSnapshotExporter method exportAsNonHudi.
/**
 * Reads the base files selected by {@code getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)}
 * as parquet and writes them to {@code cfg.targetOutputPath} in {@code cfg.outputFormat},
 * overwriting existing output.
 *
 * <p>The writer is produced by the class named in {@code cfg.outputPartitioner} when that is set;
 * otherwise a built-in partitioner drops the Hudi meta columns and, if {@code cfg.outputPartitionField}
 * is set, repartitions and partitions the output by that field.
 *
 * @param jsc                   Spark context used to list and read the files
 * @param cfg                   exporter configuration
 * @param partitions            partition paths to export
 * @param latestCommitTimestamp upper bound (inclusive) passed to the file-system view
 */
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  // Built-in behaviour used when no custom partitioner class is configured.
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> withoutMetaColumns = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    if (StringUtils.isNullOrEmpty(cfg.outputPartitionField)) {
      return withoutMetaColumns.write();
    }
    return withoutMetaColumns
        .repartition(new Column(cfg.outputPartitionField))
        .write()
        .partitionBy(cfg.outputPartitionField);
  };

  Partitioner partitioner;
  if (StringUtils.isNullOrEmpty(cfg.outputPartitioner)) {
    partitioner = defaultPartitioner;
  } else {
    partitioner = ReflectionUtils.loadClass(cfg.outputPartitioner);
  }

  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  context.setJobStatus(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");

  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  // One Spark partition per table partition; each yields the paths of its selected base files.
  Iterator<String> exportingFilePaths = jsc
      .parallelize(partitions, partitions.size())
      .flatMap(partitionPath -> fsView
          .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTimestamp)
          .map(baseFile -> baseFile.getPath())
          .iterator())
      .toLocalIterator();

  Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}
Aggregations