Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
From the class FlinkDeleteHelper, method execute.
@Override
public HoodieWriteMetadata<List<WriteStatus>> execute(String instantTime, List<HoodieKey> keys, HoodieEngineContext context,
                                                      HoodieWriteConfig config,
                                                      HoodieTable<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>> table,
                                                      BaseCommitActionExecutor<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>, R> deleteExecutor) {
  try {
    HoodieWriteMetadata<List<WriteStatus>> result = null;
    List<HoodieKey> dedupedKeys = keys;
    final int parallelism = config.getDeleteShuffleParallelism();
    if (config.shouldCombineBeforeDelete()) {
      // De-dupe/merge if needed
      dedupedKeys = deduplicateKeys(keys, table, parallelism);
    }
    List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords =
        dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
    Instant beginTag = Instant.now();
    // perform index look up to get existing location of records
    List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords =
        HoodieList.getList(table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
    Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
    // filter out non existent keys/records
    List<HoodieRecord<EmptyHoodieRecordPayload>> taggedValidRecords =
        taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).collect(Collectors.toList());
    if (!taggedValidRecords.isEmpty()) {
      result = deleteExecutor.execute(taggedValidRecords);
      result.setIndexLookupDuration(tagLocationDuration);
    } else {
      // if entire set of keys are non existent
      deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
      result = new HoodieWriteMetadata<>();
      result.setWriteStatuses(Collections.EMPTY_LIST);
      deleteExecutor.commitOnAutoCommit(result);
    }
    return result;
  } catch (Throwable e) {
    if (e instanceof HoodieUpsertException) {
      throw (HoodieUpsertException) e;
    }
    throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
  }
}
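As a minimal sketch of how this helper's input is shaped (the record keys and partition paths below are made up for illustration, and the usual imports are assumed), a caller supplies plain HoodieKey instances and the helper wraps each one in an EmptyHoodieRecordPayload before the index lookup, exactly as the dedupedRecords line above does:

// Illustrative only: record keys and partition paths are hypothetical.
List<HoodieKey> keysToDelete = Arrays.asList(
    new HoodieKey("uuid-001", "2022/01/01"),
    new HoodieKey("uuid-002", "2022/01/02"));
// Each key becomes a delete-marker record with an empty payload.
List<HoodieRecord<EmptyHoodieRecordPayload>> deleteRecords = keysToDelete.stream()
    .map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload()))
    .collect(Collectors.toList());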
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
From the class HoodieRepairTool, method copyFiles.
/**
 * Copies the list of files from source base path to destination base path.
 * The destination file path (base + relative) should not already exist.
 *
 * @param context           {@link HoodieEngineContext} instance.
 * @param relativeFilePaths A {@link List} of relative file paths for copying.
 * @param sourceBasePath    Source base path.
 * @param destBasePath      Destination base path.
 * @return {@code true} if all successful; {@code false} otherwise.
 */
static boolean copyFiles(HoodieEngineContext context, List<String> relativeFilePaths, String sourceBasePath, String destBasePath) {
  SerializableConfiguration conf = context.getHadoopConf();
  List<Boolean> allResults = context.parallelize(relativeFilePaths)
      .mapPartitions(iterator -> {
        List<Boolean> results = new ArrayList<>();
        FileSystem fs = FSUtils.getFs(destBasePath, conf.get());
        iterator.forEachRemaining(filePath -> {
          boolean success = false;
          Path sourcePath = new Path(sourceBasePath, filePath);
          Path destPath = new Path(destBasePath, filePath);
          try {
            if (!fs.exists(destPath)) {
              FileIOUtils.copy(fs, sourcePath, destPath);
              success = true;
            }
          } catch (IOException e) {
            // Copy fail
            LOG.error(String.format("Copying file fails: source [%s], destination [%s]", sourcePath, destPath));
          } finally {
            results.add(success);
          }
        });
        return results.iterator();
      }, true)
      .collectAsList();
  return allResults.stream().reduce((r1, r2) -> r1 && r2).orElse(false);
}
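The core pattern here is context.parallelize(...).mapPartitions(...).collectAsList(), which fans the per-file work out across the engine and gathers one boolean per file. A minimal sketch of the same pattern, assuming a hypothetical helper that only checks which relative paths already exist under a base path (the name existsUnder is made up for illustration and is not part of HoodieRepairTool):

// Hypothetical helper following the same parallelize/mapPartitions shape as copyFiles.
static List<Boolean> existsUnder(HoodieEngineContext context, List<String> relativePaths, String basePath) {
  SerializableConfiguration conf = context.getHadoopConf();
  return context.parallelize(relativePaths).mapPartitions(iterator -> {
    List<Boolean> results = new ArrayList<>();
    FileSystem fs = FSUtils.getFs(basePath, conf.get());
    iterator.forEachRemaining(relativePath -> {
      try {
        // Record whether this file is already present under the base path.
        results.add(fs.exists(new Path(basePath, relativePath)));
      } catch (IOException e) {
        results.add(false);
      }
    });
    return results.iterator();
  }, true).collectAsList();
}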
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
From the class HoodieSnapshotExporter, method exportAsHudi.
private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  final HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  final SerializableConfiguration serConf = context.getHadoopConf();
  context.setJobStatus(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
  List<Tuple2<String, String>> files = context.flatMap(partitions, partition -> {
    // Only take latest version files <= latestCommit.
    List<Tuple2<String, String>> filePaths = new ArrayList<>();
    Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
    dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
    // also need to copy over partition metadata
    Path partitionMetaFile = new Path(FSUtils.getPartitionPath(cfg.sourceBasePath, partition),
        HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
    FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
    if (fs.exists(partitionMetaFile)) {
      filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
    }
    return filePaths.stream();
  }, partitions.size());
  context.foreach(files, tuple -> {
    String partition = tuple._1();
    Path sourceFilePath = new Path(tuple._2());
    Path toPartitionPath = FSUtils.getPartitionPath(cfg.targetOutputPath, partition);
    FileSystem fs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());
    if (!fs.exists(toPartitionPath)) {
      fs.mkdirs(toPartitionPath);
    }
    FileUtil.copy(fs, sourceFilePath, fs, new Path(toPartitionPath, sourceFilePath.getName()), false, fs.getConf());
  }, files.size());
  // Also copy the .commit files
  LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
  final FileSystem fileSystem = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
  FileStatus[] commitFilesToCopy =
      fileSystem.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
        if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
          return true;
        } else {
          String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
          return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
        }
      });
  for (FileStatus commitStatus : commitFilesToCopy) {
    Path targetFilePath =
        new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
    if (!fileSystem.exists(targetFilePath.getParent())) {
      fileSystem.mkdirs(targetFilePath.getParent());
    }
    if (fileSystem.exists(targetFilePath)) {
      LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
    }
    FileUtil.copy(fileSystem, commitStatus.getPath(), fileSystem, targetFilePath, false, fileSystem.getConf());
  }
}
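The flatMap/foreach pair above is the general fan-out pattern offered by HoodieEngineContext: flatMap expands each partition into a stream of work items, and foreach acts on every item in parallel. A minimal sketch of that pattern in isolation, assuming jsc, basePath, partitions, and LOG are already in scope (the job status strings are illustrative):

HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
SerializableConfiguration serConf = context.getHadoopConf();
context.setJobStatus("ExampleJob", "Listing files per partition");
// Expand each partition into the files it contains.
List<String> filePaths = context.flatMap(partitions, partition -> {
  FileSystem fs = FSUtils.getFs(basePath, serConf.newCopy());
  return Arrays.stream(fs.listStatus(FSUtils.getPartitionPath(basePath, partition)))
      .map(status -> status.getPath().toString());
}, partitions.size());
// Act on every file in parallel.
context.foreach(filePaths, path -> LOG.info("Found file: " + path), filePaths.size());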
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
From the class TimelineServerPerf, method runLookups.
public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView,
                                  int numIterations, int concurrency) {
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  context.setJobStatus(this.getClass().getSimpleName(), "Lookup all performance stats");
  return context.flatMap(partitionPaths, p -> {
    ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
    final List<PerfStats> result = new ArrayList<>();
    final List<ScheduledFuture<PerfStats>> futures = new ArrayList<>();
    List<FileSlice> slices = fsView.getLatestFileSlices(p).collect(Collectors.toList());
    String fileId = slices.isEmpty() ? "dummyId"
        : slices.get(new Random(Double.doubleToLongBits(Math.random())).nextInt(slices.size())).getFileId();
    IntStream.range(0, concurrency).forEach(i ->
        futures.add(executor.schedule(() -> runOneRound(fsView, p, fileId, i, numIterations), 0, TimeUnit.NANOSECONDS)));
    futures.forEach(x -> {
      try {
        result.add(x.get());
      } catch (InterruptedException | ExecutionException e) {
        throw new RuntimeException(e);
      }
    });
    System.out.println("SLICES are=");
    slices.forEach(s -> System.out.println("\t\tFileSlice=" + s));
    return result.stream();
  }, cfg.numExecutors);
}
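Inside each flatMap task, the method layers thread-level concurrency on top of the engine-level parallelism: it schedules concurrency lookups at once on a ScheduledThreadPoolExecutor and then blocks on the resulting futures. A minimal, self-contained sketch of just that schedule-and-gather step, with the scheduled work reduced to a placeholder timestamp call:

// The pool size and task count of 4 are arbitrary for illustration.
ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(4);
List<ScheduledFuture<Long>> futures = new ArrayList<>();
// Kick off all tasks immediately; the zero delay mirrors the usage above.
IntStream.range(0, 4).forEach(i ->
    futures.add(executor.schedule((Callable<Long>) System::nanoTime, 0, TimeUnit.NANOSECONDS)));
List<Long> results = new ArrayList<>();
futures.forEach(f -> {
  try {
    results.add(f.get());
  } catch (InterruptedException | ExecutionException e) {
    throw new RuntimeException(e);
  }
});
executor.shutdown();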
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
From the class MarkerUtils, method readTimelineServerBasedMarkersFromFileSystem.
/**
 * Reads files containing the markers written by the timeline-server-based marker mechanism.
 *
 * @param markerDir   marker directory.
 * @param fileSystem  file system to use.
 * @param context     instance of {@link HoodieEngineContext} to use.
 * @param parallelism parallelism to use.
 * @return A {@code Map} of file name to the set of markers stored in the file.
 */
public static Map<String, Set<String>> readTimelineServerBasedMarkersFromFileSystem(String markerDir, FileSystem fileSystem,
                                                                                    HoodieEngineContext context, int parallelism) {
  Path dirPath = new Path(markerDir);
  try {
    if (fileSystem.exists(dirPath)) {
      Predicate<FileStatus> prefixFilter = fileStatus -> fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX);
      Predicate<FileStatus> markerTypeFilter = fileStatus -> !fileStatus.getPath().getName().equals(MARKER_TYPE_FILENAME);
      return FSUtils.parallelizeSubPathProcess(context, fileSystem, dirPath, parallelism, prefixFilter.and(markerTypeFilter),
          pairOfSubPathAndConf -> {
            String markersFilePathStr = pairOfSubPathAndConf.getKey();
            SerializableConfiguration conf = pairOfSubPathAndConf.getValue();
            return readMarkersFromFile(new Path(markersFilePathStr), conf);
          });
    }
    return new HashMap<>();
  } catch (IOException ioe) {
    throw new HoodieIOException(ioe.getMessage(), ioe);
  }
}
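A hedged sketch of calling this utility from a Spark job; the base path, marker directory layout, and parallelism value below are illustrative assumptions, and jsc, basePath, and LOG are assumed to be in scope:

// Illustrative only: paths and parallelism are hypothetical.
HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
FileSystem fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
String markerDir = basePath + "/.hoodie/.temp/20220101000000";
Map<String, Set<String>> markersByFile =
    MarkerUtils.readTimelineServerBasedMarkersFromFileSystem(markerDir, fs, context, 20);
markersByFile.forEach((fileName, markers) ->
    LOG.info("Marker file " + fileName + " holds " + markers.size() + " markers"));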