Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache: class DatePartitionPathSelector, method getNextFilePathsAndMaxModificationTime.
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(JavaSparkContext sparkContext, Option<String> lastCheckpointStr, long sourceLimit) {
  // If not specified the current date is assumed by default.
  LocalDate currentDate = LocalDate.parse(props.getString(Config.CURRENT_DATE, LocalDate.now().toString()));
  // obtain all eligible files under root folder.
  LOG.info("Root path => " + props.getString(ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit
      + " depth of day partition => " + datePartitionDepth + " num prev days to list => " + numPrevDaysToList
      + " from current date => " + currentDate);
  long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
  HoodieSparkEngineContext context = new HoodieSparkEngineContext(sparkContext);
  SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
  List<String> prunedPartitionPaths = pruneDatePartitionPaths(context, fs, props.getString(ROOT_INPUT_PATH_PROP), currentDate);
  List<FileStatus> eligibleFiles = context.flatMap(prunedPartitionPaths, path -> {
    FileSystem fs = new Path(path).getFileSystem(serializedConf.get());
    return listEligibleFiles(fs, new Path(path), lastCheckpointTime).stream();
  }, partitionsListParallelism);
  // sort them by modification time ascending.
  List<FileStatus> sortedEligibleFiles = eligibleFiles.stream()
      .sorted(Comparator.comparingLong(FileStatus::getModificationTime))
      .collect(Collectors.toList());
  // Filter based on checkpoint & input size, if needed
  long currentBytes = 0;
  long newCheckpointTime = lastCheckpointTime;
  List<FileStatus> filteredFiles = new ArrayList<>();
  for (FileStatus f : sortedEligibleFiles) {
    if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
      // so that some files with the same modification time won't be skipped in next read
      break;
    }
    newCheckpointTime = f.getModificationTime();
    currentBytes += f.getLen();
    filteredFiles.add(f);
  }
  // no data to read
  if (filteredFiles.isEmpty()) {
    return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
  }
  // read the files out.
  String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
  return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
}
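For reference, here is a minimal, self-contained sketch of the context.flatMap pattern used above to list partition directories in parallel on the executors. The class name, bucket paths, and parallelism are hypothetical placeholders; the import paths assume the Hudi 0.x layout used by these snippets.

// Sketch only: hypothetical paths and class name, not the actual DatePartitionPathSelector code.
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.spark.api.java.JavaSparkContext;

public class ParallelListingSketch {
  public static List<FileStatus> listPartitionsInParallel(JavaSparkContext jsc, SerializableConfiguration serConf) {
    HoodieSparkEngineContext context = new HoodieSparkEngineContext(jsc);
    // Hypothetical partition paths; DatePartitionPathSelector derives these via pruneDatePartitionPaths(...).
    List<String> partitionPaths = Arrays.asList(
        "s3a://bucket/table/2022/01/01",
        "s3a://bucket/table/2022/01/02");
    // Each executor lists one partition directory and streams the file statuses back to the driver.
    return context.flatMap(partitionPaths, path -> {
      FileSystem fs = new Path(path).getFileSystem(serConf.get());
      return Arrays.stream(fs.listStatus(new Path(path)));
    }, partitionPaths.size());
  }
}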
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache: class TestHoodieRepairTool, method initWithCleanState.
@BeforeEach
public void initWithCleanState() throws IOException {
  boolean initialized = spark != null;
  if (!initialized) {
    SparkConf sparkConf = conf();
    SparkRDDWriteClient.registerClasses(sparkConf);
    HoodieReadClient.addHoodieSupport(sparkConf);
    spark = SparkSession.builder().config(sparkConf).getOrCreate();
    sqlContext = spark.sqlContext();
    jsc = new JavaSparkContext(spark.sparkContext());
    context = new HoodieSparkEngineContext(jsc);
  }
  initPath();
  metaClient = HoodieTestUtils.init(basePath, getTableType());
  backupTempDir = tempDir.resolve("backup");
  cleanUpDanglingDataFilesInFS();
  cleanUpBackupTempDir();
  HoodieTestCommitGenerator.setupTimelineInFS(basePath, BASE_FILE_INFO, LOG_FILE_INFO, instantInfoMap);
  allFileAbsolutePathList.clear();
  allFileAbsolutePathList.addAll(instantInfoMap.entrySet().stream()
      .flatMap(e -> e.getValue().entrySet().stream()
          .flatMap(partition -> partition.getValue().stream()
              .map(fileInfo -> new Path(new Path(basePath, partition.getKey()), fileInfo.getValue()).toString())
              .collect(Collectors.toList()).stream())
          .collect(Collectors.toList()).stream())
      .collect(Collectors.toList()));
}
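The setup above creates the Spark context once and reuses it for every test. A minimal, hypothetical sketch of that bootstrap pattern, with illustrative names rather than the actual test harness, might look like this:

// Sketch only: creates a local SparkSession and wraps its JavaSparkContext in a HoodieSparkEngineContext.
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class LocalEngineContextBootstrap {
  public static HoodieSparkEngineContext createLocalContext() {
    SparkConf sparkConf = new SparkConf()
        .setMaster("local[2]")
        .setAppName("hoodie-engine-context-test");
    SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
    return new HoodieSparkEngineContext(jsc);
  }
}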
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache: class TimelineServerPerf, method run.
public void run() throws IOException {
  JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, cfg.basePath, cfg.useFileListingFromMetadata, true);
  Collections.shuffle(allPartitionPaths);
  List<String> selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions).collect(Collectors.toList());
  if (!useExternalTimelineServer) {
    this.timelineServer.startService();
    setHostAddrFromSparkConf(jsc.getConf());
  } else {
    this.hostAddr = cfg.serverHost;
  }
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(timelineServer.getConf())
      .setBasePath(cfg.basePath)
      .setLoadActiveTimelineOnLoad(true)
      .build();
  SyncableFileSystemView fsView = new RemoteHoodieTableFileSystemView(this.hostAddr, cfg.serverPort, metaClient);
  String reportDir = cfg.reportDir;
  metaClient.getFs().mkdirs(new Path(reportDir));
  String dumpPrefix = UUID.randomUUID().toString();
  System.out.println("First Iteration to load all partitions");
  Dumper d = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("1_%s.csv", dumpPrefix)));
  d.init();
  d.dump(runLookups(jsc, selected, fsView, 1, 0));
  d.close();
  System.out.println("\n\n\n First Iteration is done");
  Dumper d2 = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("2_%s.csv", dumpPrefix)));
  d2.init();
  d2.dump(runLookups(jsc, selected, fsView, cfg.numIterations, cfg.numCoresPerExecutor));
  d2.close();
  System.out.println("\n\n\nDumping all File Slices");
  selected.forEach(p -> fsView.getAllFileSlices(p).forEach(s -> System.out.println("\tMyFileSlice=" + s)));
  // Waiting for curl queries
  if (!useExternalTimelineServer && cfg.waitForManualQueries) {
    System.out.println("Timeline Server Host Address=" + hostAddr + ", port=" + timelineServer.getServerPort());
    while (true) {
      try {
        Thread.sleep(60000);
      } catch (InterruptedException e) {
        // skip it
      }
    }
  }
}
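The engine context is handed to FSUtils.getAllPartitionPaths for distributed partition listing. A minimal sketch of just that call, with a hypothetical base path and the boolean flags standing in for the cfg options above (use the metadata table for listing, assume date partitioning), under the assumption that FSUtils lives in org.apache.hudi.common.fs in this Hudi version:

// Sketch only: hypothetical class name and flag values.
import java.util.List;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.spark.api.java.JavaSparkContext;

public class PartitionListingSketch {
  public static List<String> listPartitions(JavaSparkContext jsc, String basePath) {
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    // getAllPartitionPaths(engineContext, basePath, useFileListingFromMetadata, assumeDatePartitioning)
    return FSUtils.getAllPartitionPaths(engineContext, basePath, false, false);
  }
}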
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache: class HoodieMetadataTableValidator, method doMetadataTableValidation.
public void doMetadataTableValidation() {
  boolean finalResult = true;
  metaClient.reloadActiveTimeline();
  String basePath = metaClient.getBasePath();
  Set<String> baseFilesForCleaning = Collections.emptySet();
  if (cfg.skipDataFilesForCleaning) {
    HoodieTimeline inflightCleaningTimeline = metaClient.getActiveTimeline().getCleanerTimeline().filterInflights();
    baseFilesForCleaning = inflightCleaningTimeline.getInstants().flatMap(instant -> {
      try {
        // convert inflight instant to requested and get clean plan
        instant = new HoodieInstant(HoodieInstant.State.REQUESTED, instant.getAction(), instant.getTimestamp());
        HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(metaClient, instant);
        return cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().flatMap(cleanerFileInfoList -> {
          return cleanerFileInfoList.stream().map(fileInfo -> {
            return new Path(fileInfo.getFilePath()).getName();
          });
        });
      } catch (IOException e) {
        throw new HoodieIOException("Error reading cleaner metadata for " + instant);
      }
      // only take care of base files here.
    }).filter(path -> {
      String fileExtension = FSUtils.getFileExtension(path);
      return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(fileExtension);
    }).collect(Collectors.toSet());
  }
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  List<String> allPartitions = validatePartitions(engineContext, basePath);
  HoodieMetadataValidationContext metadataTableBasedContext = new HoodieMetadataValidationContext(engineContext, cfg, metaClient, true);
  HoodieMetadataValidationContext fsBasedContext = new HoodieMetadataValidationContext(engineContext, cfg, metaClient, false);
  Set<String> finalBaseFilesForCleaning = baseFilesForCleaning;
  List<Boolean> result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> {
    try {
      validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning);
      LOG.info("Metadata table validation succeeded for " + partitionPath);
      return true;
    } catch (HoodieValidationException e) {
      LOG.error("Metadata table validation failed for " + partitionPath + " due to HoodieValidationException", e);
      if (!cfg.ignoreFailed) {
        throw e;
      }
      return false;
    }
  }).collectAsList();
  for (Boolean res : result) {
    finalResult &= res;
  }
  if (finalResult) {
    LOG.info("Metadata table validation succeeded.");
  } else {
    LOG.warn("Metadata table validation failed.");
  }
}
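The core of the validator is the engineContext.parallelize(...).map(...).collectAsList() fan-out over partitions. A minimal sketch of that pattern, assuming a non-empty partition list; the per-partition check body and class name are hypothetical stand-ins for validateFilesInPartition(...):

// Sketch only: the validation body here is a placeholder, not the validator's actual check.
import java.util.List;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.spark.api.java.JavaSparkContext;

public class ParallelValidationSketch {
  public static boolean validateAll(JavaSparkContext jsc, List<String> partitions) {
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    List<Boolean> results = engineContext.parallelize(partitions, partitions.size())
        .map(partitionPath -> {
          // Hypothetical per-partition check; the validator calls validateFilesInPartition(...) here.
          return !partitionPath.isEmpty();
        })
        .collectAsList();
    // The job succeeds only if every partition passed.
    return results.stream().allMatch(Boolean::booleanValue);
  }
}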
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache: class HoodieSnapshotCopier, method snapshot.
public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning, final boolean useFileListingFromMetadata) throws IOException {
  FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
  final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
  final HoodieTableMetaClient tableMetadata = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir).build();
  final BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata, tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants());
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  // Get the latest commit
  Option<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants().lastInstant();
  if (!latestCommit.isPresent()) {
    LOG.warn("No commits present. Nothing to snapshot");
    return;
  }
  final String latestCommitTimestamp = latestCommit.get().getTimestamp();
  LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp));
  List<String> partitions = FSUtils.getAllPartitionPaths(context, baseDir, useFileListingFromMetadata, shouldAssumeDatePartitioning);
  if (partitions.size() > 0) {
    LOG.info(String.format("The job needs to copy %d partitions.", partitions.size()));
    // Make sure the output directory is empty
    Path outputPath = new Path(outputDir);
    if (fs.exists(outputPath)) {
      LOG.warn(String.format("The output path %s targetBasePath already exists, deleting", outputPath));
      fs.delete(new Path(outputDir), true);
    }
    context.setJobStatus(this.getClass().getSimpleName(), "Creating a snapshot");
    List<Tuple2<String, String>> filesToCopy = context.flatMap(partitions, partition -> {
      // Only take latest version files <= latestCommit.
      FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
      List<Tuple2<String, String>> filePaths = new ArrayList<>();
      Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
      dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
      // also need to copy over partition metadata
      Path partitionMetaFile = new Path(FSUtils.getPartitionPath(baseDir, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
      if (fs1.exists(partitionMetaFile)) {
        filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
      }
      return filePaths.stream();
    }, partitions.size());
    context.foreach(filesToCopy, tuple -> {
      String partition = tuple._1();
      Path sourceFilePath = new Path(tuple._2());
      Path toPartitionPath = FSUtils.getPartitionPath(outputDir, partition);
      FileSystem ifs = FSUtils.getFs(baseDir, serConf.newCopy());
      if (!ifs.exists(toPartitionPath)) {
        ifs.mkdirs(toPartitionPath);
      }
      FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), false, ifs.getConf());
    }, filesToCopy.size());
    // Also copy the .commit files
    LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
    FileStatus[] commitFilesToCopy = fs.listStatus(new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
      if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
        return true;
      } else {
        String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
        return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
      }
    });
    for (FileStatus commitStatus : commitFilesToCopy) {
      Path targetFilePath = new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
      if (!fs.exists(targetFilePath.getParent())) {
        fs.mkdirs(targetFilePath.getParent());
      }
      if (fs.exists(targetFilePath)) {
        LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
      }
      FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
    }
  } else {
    LOG.info("The job has 0 partition to copy.");
  }
  // Create the _SUCCESS tag
  Path successTagPath = new Path(outputDir + "/_SUCCESS");
  if (!fs.exists(successTagPath)) {
    LOG.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
    fs.createNewFile(successTagPath);
  }
}
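The copy step combines context.setJobStatus with context.foreach so each file copy runs on an executor using its own FileSystem handle built from the serializable configuration. A minimal sketch of that pattern, with hypothetical class and method names and a caller-supplied list of source file paths:

// Sketch only: hypothetical names; mirrors the foreach-based parallel copy above.
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.spark.api.java.JavaSparkContext;

public class ParallelCopySketch {
  public static void copyFiles(JavaSparkContext jsc, List<String> sourceFiles, String targetDir) {
    HoodieSparkEngineContext context = new HoodieSparkEngineContext(jsc);
    SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    context.setJobStatus("ParallelCopySketch", "Copying files to " + targetDir);
    // Each executor resolves a FileSystem from the serializable configuration and copies one file.
    context.foreach(sourceFiles, filePath -> {
      Path source = new Path(filePath);
      FileSystem fs = FSUtils.getFs(filePath, serConf.newCopy());
      FileUtil.copy(fs, source, fs, new Path(targetDir, source.getName()), false, fs.getConf());
    }, sourceFiles.size());
  }
}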