Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache: the class UpsertPartitioner, method getSmallFilesForPartitions.
private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, HoodieEngineContext context) {
  JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
  Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
  if (config.getParquetSmallFileLimit() <= 0) {
    // Small-file handling is disabled; nothing to look up.
    return partitionSmallFilesMap;
  }
  if (partitionPaths != null && partitionPaths.size() > 0) {
    context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions");
    // One Spark task per partition path; each task resolves that partition's small files.
    JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
    partitionSmallFilesMap = partitionPathRdds
        .mapToPair((PairFunction<String, String, List<SmallFile>>) partitionPath ->
            new Tuple2<>(partitionPath, getSmallFiles(partitionPath)))
        .collectAsMap();
  }
  return partitionSmallFilesMap;
}
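For comparison, the same lookup can also be expressed through the engine-agnostic HoodieEngineContext#mapToPair API that the FSUtils examples below rely on, instead of reaching for a raw JavaSparkContext. This is a minimal sketch, not taken from the Hudi codebase; the method name getSmallFilesForPartitionsGeneric is hypothetical, and config and getSmallFiles are assumed to be in scope as in the snippet above.

private Map<String, List<SmallFile>> getSmallFilesForPartitionsGeneric(List<String> partitionPaths, HoodieEngineContext context) {
  // Illustrative sketch only: same lookup, but through HoodieEngineContext#mapToPair
  // rather than a Spark-specific JavaSparkContext.
  if (config.getParquetSmallFileLimit() <= 0 || partitionPaths == null || partitionPaths.isEmpty()) {
    return new HashMap<>();
  }
  context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions");
  // One pair per partition path, keyed by the partition path itself.
  return context.mapToPair(partitionPaths,
      partitionPath -> new ImmutablePair<>(partitionPath, getSmallFiles(partitionPath)),
      partitionPaths.size());
}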
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache: the class FSUtils, method parallelizeFilesProcess.
public static <T> Map<String, T> parallelizeFilesProcess(
    HoodieEngineContext hoodieEngineContext,
    FileSystem fs,
    int parallelism,
    SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction,
    List<String> subPaths) {
  Map<String, T> result = new HashMap<>();
  if (subPaths.size() > 0) {
    SerializableConfiguration conf = new SerializableConfiguration(fs.getConf());
    int actualParallelism = Math.min(subPaths.size(), parallelism);
    // Apply pairFunction to every sub-path in parallel through the engine context,
    // keying the result map by sub-path.
    result = hoodieEngineContext.mapToPair(subPaths,
        subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))),
        actualParallelism);
  }
  return result;
}
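A hedged usage sketch (not from the Hudi source): checking whether each sub-path in an explicit list exists, using only standard Hadoop FileSystem calls. The engineContext and fs variables and the example paths are placeholders, and the Pair accessors getLeft/getRight plus SerializableConfiguration#get exposing the wrapped Configuration are assumptions of this sketch.

List<String> subPaths = Arrays.asList(
    "hdfs://namenode/tables/trips/2022/01/01",
    "hdfs://namenode/tables/trips/2022/01/02");
Map<String, Boolean> existsMap = FSUtils.parallelizeFilesProcess(engineContext, fs, 2,
    pair -> {
      // pair carries the sub-path plus a serializable copy of the Hadoop configuration.
      try {
        Path subPath = new Path(pair.getLeft());
        return subPath.getFileSystem(pair.getRight().get()).exists(subPath);
      } catch (IOException e) {
        throw new HoodieIOException(e.getMessage(), e);
      }
    },
    subPaths);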
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache: the class FSUtils, method parallelizeSubPathProcess.
/**
* Processes sub-paths in parallel.
*
* @param hoodieEngineContext {@code HoodieEngineContext} instance
* @param fs file system
* @param dirPath directory path
* @param parallelism parallelism to use for sub-paths
* @param subPathPredicate predicate to use to filter sub-paths for processing
* @param pairFunction actual processing logic for each sub-path
* @param <T> type of result to return for each sub-path
* @return a map of sub-path to result of the processing
*/
public static <T> Map<String, T> parallelizeSubPathProcess(
    HoodieEngineContext hoodieEngineContext,
    FileSystem fs,
    Path dirPath,
    int parallelism,
    Predicate<FileStatus> subPathPredicate,
    SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction) {
  Map<String, T> result = new HashMap<>();
  try {
    // List the direct children of dirPath and keep only those matching the predicate.
    FileStatus[] fileStatuses = fs.listStatus(dirPath);
    List<String> subPaths = Arrays.stream(fileStatuses)
        .filter(subPathPredicate)
        .map(fileStatus -> fileStatus.getPath().toString())
        .collect(Collectors.toList());
    result = parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pairFunction, subPaths);
  } catch (IOException ioe) {
    throw new HoodieIOException(ioe.getMessage(), ioe);
  }
  return result;
}
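A similar illustrative sketch for parallelizeSubPathProcess, with placeholder names: summing the bytes stored directly under each sub-directory of a table's base path. It makes the same assumptions about the Pair and SerializableConfiguration accessors as the sketch above.

Map<String, Long> subDirSizes = FSUtils.parallelizeSubPathProcess(engineContext, fs,
    new Path(basePath),        // directory whose children are inspected
    10,                        // parallelism cap
    FileStatus::isDirectory,   // only process sub-directories
    pair -> {
      try {
        Path subPath = new Path(pair.getLeft());
        FileSystem subFs = subPath.getFileSystem(pair.getRight().get());
        long totalBytes = 0L;
        for (FileStatus status : subFs.listStatus(subPath)) {
          totalBytes += status.getLen();
        }
        return totalBytes;
      } catch (IOException e) {
        throw new HoodieIOException(e.getMessage(), e);
      }
    });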
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache: the class SparkMain, method deleteMarker.
protected static int deleteMarker(JavaSparkContext jsc, String instantTime, String basePath) {
  try {
    SparkRDDWriteClient client = createHoodieClient(jsc, basePath);
    HoodieWriteConfig config = client.getConfig();
    HoodieEngineContext context = client.getEngineContext();
    HoodieSparkTable table = HoodieSparkTable.create(config, context, true);
    // Delete the marker directory for the given instant in "quiet" mode,
    // using the configured marker-deletion parallelism.
    WriteMarkersFactory.get(config.getMarkersType(), table, instantTime)
        .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
    return 0;
  } catch (Exception e) {
    LOG.warn(String.format("Failed: Could not clean marker instantTime: \"%s\".", instantTime), e);
    return -1;
  }
}
Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache: the class HoodieSparkCopyOnWriteTable, method updateColumnsStatsIndex.
private void updateColumnsStatsIndex(@Nonnull HoodieEngineContext context,
                                     @Nonnull List<HoodieWriteStat> updatedFilesStats,
                                     @Nonnull String instantTime) throws Exception {
  String sortColsList = config.getClusteringSortColumns();
  String basePath = metaClient.getBasePath();
  String indexPath = metaClient.getColumnStatsIndexPath();
  List<String> touchedFiles = updatedFilesStats.stream()
      .map(s -> new Path(basePath, s.getPath()).toString())
      .collect(Collectors.toList());
  if (touchedFiles.isEmpty() || StringUtils.isNullOrEmpty(sortColsList) || StringUtils.isNullOrEmpty(indexPath)) {
    return;
  }
  LOG.info(String.format("Updating column-statistics index table (%s)", indexPath));
  List<String> sortCols = Arrays.stream(sortColsList.split(","))
      .map(String::trim)
      .collect(Collectors.toList());
  HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) context;
  // Fetch table schema to appropriately construct col-stats index schema
  Schema tableWriteSchema = HoodieAvroUtils.createHoodieWriteSchema(
      new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields());
  List<String> completedCommits = metaClient.getCommitsTimeline().filterCompletedInstants().getInstants()
      .map(HoodieInstant::getTimestamp)
      .collect(Collectors.toList());
  ColumnStatsIndexHelper.updateColumnStatsIndexFor(
      sparkEngineContext.getSqlContext().sparkSession(),
      AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema),
      touchedFiles, sortCols, indexPath, instantTime, completedCommits);
  LOG.info(String.format("Successfully updated column-statistics index at instant (%s)", instantTime));
}