Example 1 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From the class FlinkDeleteHelper, method execute:

@Override
public HoodieWriteMetadata<List<WriteStatus>> execute(String instantTime, List<HoodieKey> keys, HoodieEngineContext context, HoodieWriteConfig config, HoodieTable<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>> table, BaseCommitActionExecutor<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>, R> deleteExecutor) {
    try {
        HoodieWriteMetadata<List<WriteStatus>> result = null;
        List<HoodieKey> dedupedKeys = keys;
        final int parallelism = config.getDeleteShuffleParallelism();
        if (config.shouldCombineBeforeDelete()) {
            // De-dupe/merge if needed
            dedupedKeys = deduplicateKeys(keys, table, parallelism);
        }
        List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords = dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
        Instant beginTag = Instant.now();
        // perform index look up to get existing location of records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords = HoodieList.getList(table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
        // filter out non-existent keys/records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedValidRecords = taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).collect(Collectors.toList());
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // if the entire set of keys is non-existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
            result = new HoodieWriteMetadata<>();
            result.setWriteStatuses(Collections.EMPTY_LIST);
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieList(org.apache.hudi.common.data.HoodieList) HashSet(java.util.HashSet) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Duration(java.time.Duration) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieKey(org.apache.hudi.common.model.HoodieKey) WorkloadStat(org.apache.hudi.table.WorkloadStat) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair)
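
To see the shape of this delete flow without the Hudi machinery, here is a minimal, JDK-only sketch of the same pipeline: de-duplicate the incoming keys, "tag" each key against an index, keep only the keys whose location is known, and short-circuit to an empty result when nothing remains. The Tagged record, the in-memory index map, and deleteExisting are illustrative stand-ins, not Hudi APIs.

import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class DeleteFlowSketch {

    // Stand-in for a record tagged by the index lookup: a key plus whether an
    // existing location was found (mirrors HoodieRecord::isCurrentLocationKnown).
    record Tagged(String key, boolean locationKnown) {}

    static List<String> deleteExisting(List<String> keys, Map<String, Boolean> index) {
        // De-dupe first, mirroring the shouldCombineBeforeDelete() branch
        List<String> dedupedKeys = keys.stream().distinct().collect(Collectors.toList());
        // "Tag" each key against the index (stand-in for table.getIndex().tagLocation(...))
        List<Tagged> tagged = dedupedKeys.stream()
                .map(k -> new Tagged(k, index.getOrDefault(k, false)))
                .collect(Collectors.toList());
        // Only keys with a known location are handed to the delete executor
        List<String> taggedValid = tagged.stream()
                .filter(Tagged::locationKnown)
                .map(Tagged::key)
                .collect(Collectors.toList());
        if (taggedValid.isEmpty()) {
            // Mirrors the branch that records an empty workload profile and commits an empty result
            return Collections.emptyList();
        }
        return taggedValid; // stand-in for deleteExecutor.execute(taggedValidRecords)
    }

    public static void main(String[] args) {
        Map<String, Boolean> index = Map.of("key-1", true, "key-2", false);
        // Prints [key-1]: key-2 has no known location and key-3 is not indexed at all
        System.out.println(deleteExisting(List.of("key-1", "key-1", "key-2", "key-3"), index));
    }
}

In the real helper, the tagging step is the expensive index lookup, which is why its duration is measured separately and attached to the write metadata via setIndexLookupDuration.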

Example 2 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From the class HoodieRepairTool, method copyFiles:

/**
 * Copies the list of files from source base path to destination base path.
 * The destination file path (base + relative) should not already exist.
 *
 * @param context           {@link HoodieEngineContext} instance.
 * @param relativeFilePaths A {@link List} of relative file paths for copying.
 * @param sourceBasePath    Source base path.
 * @param destBasePath      Destination base path.
 * @return {@code true} if all successful; {@code false} otherwise.
 */
static boolean copyFiles(HoodieEngineContext context, List<String> relativeFilePaths, String sourceBasePath, String destBasePath) {
    SerializableConfiguration conf = context.getHadoopConf();
    List<Boolean> allResults = context.parallelize(relativeFilePaths).mapPartitions(iterator -> {
        List<Boolean> results = new ArrayList<>();
        FileSystem fs = FSUtils.getFs(destBasePath, conf.get());
        iterator.forEachRemaining(filePath -> {
            boolean success = false;
            Path sourcePath = new Path(sourceBasePath, filePath);
            Path destPath = new Path(destBasePath, filePath);
            try {
                if (!fs.exists(destPath)) {
                    FileIOUtils.copy(fs, sourcePath, destPath);
                    success = true;
                }
            } catch (IOException e) {
                // Copy failed: log the exception and record the failure below
                LOG.error(String.format("Copying file fails: source [%s], destination [%s]", sourcePath, destPath), e);
            } finally {
                results.add(success);
            }
        });
        return results.iterator();
    }, true).collectAsList();
    return allResults.stream().reduce((r1, r2) -> r1 && r2).orElse(false);
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) SecureRandom(java.security.SecureRandom) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) JCommander(com.beust.jcommander.JCommander) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) List(java.util.List) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RepairUtils(org.apache.hudi.table.repair.RepairUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)
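
The return value of copyFiles comes from a small aggregation idiom: each attempted copy contributes one boolean, and the method reports success only if every boolean is true, with an empty result list treated as failure via orElse(false). Below is a minimal JDK-only sketch of that reduction; allSucceeded is a hypothetical helper name, not part of Hudi.

import java.util.List;

public class CopyResultSketch {

    // Overall success is the AND of all per-file results; an empty list means
    // nothing was copied, which the reduce treats as failure via orElse(false).
    static boolean allSucceeded(List<Boolean> results) {
        return results.stream().reduce((r1, r2) -> r1 && r2).orElse(false);
    }

    public static void main(String[] args) {
        System.out.println(allSucceeded(List.of(true, true)));   // true
        System.out.println(allSucceeded(List.of(true, false)));  // false
        System.out.println(allSucceeded(List.of()));             // false
    }
}

A stream-based alternative would be results.stream().allMatch(Boolean::booleanValue), but allMatch returns true for an empty stream, whereas the reduce above deliberately falls back to false when no file was processed.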

Example 3 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From the class HoodieSnapshotExporter, method exportAsHudi:

private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    final HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    final SerializableConfiguration serConf = context.getHadoopConf();
    context.setJobStatus(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
    List<Tuple2<String, String>> files = context.flatMap(partitions, partition -> {
        // Only take latest version files <= latestCommit.
        List<Tuple2<String, String>> filePaths = new ArrayList<>();
        Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
        dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
        // also need to copy over partition metadata
        Path partitionMetaFile = new Path(FSUtils.getPartitionPath(cfg.sourceBasePath, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
        FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
        if (fs.exists(partitionMetaFile)) {
            filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
        }
        return filePaths.stream();
    }, partitions.size());
    context.foreach(files, tuple -> {
        String partition = tuple._1();
        Path sourceFilePath = new Path(tuple._2());
        Path toPartitionPath = FSUtils.getPartitionPath(cfg.targetOutputPath, partition);
        FileSystem fs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());
        if (!fs.exists(toPartitionPath)) {
            fs.mkdirs(toPartitionPath);
        }
        FileUtil.copy(fs, sourceFilePath, fs, new Path(toPartitionPath, sourceFilePath.getName()), false, fs.getConf());
    }, files.size());
    // Also copy the .commit files
    LOG.info(String.format("Copying .commit files which are no later than %s.", latestCommitTimestamp));
    final FileSystem fileSystem = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
    FileStatus[] commitFilesToCopy = fileSystem.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
        if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
            return true;
        } else {
            String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
            return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
        }
    });
    for (FileStatus commitStatus : commitFilesToCopy) {
        Path targetFilePath = new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
        if (!fileSystem.exists(targetFilePath.getParent())) {
            fileSystem.mkdirs(targetFilePath.getParent());
        }
        if (fileSystem.exists(targetFilePath)) {
            LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
        }
        FileUtil.copy(fileSystem, commitStatus.getPath(), fileSystem, targetFilePath, false, fileSystem.getConf());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) ArrayList(java.util.ArrayList) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem)
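
The export proceeds in two passes over the engine context: a flatMap that expands each partition into (partition, file) pairs, then a foreach that copies every pair into the matching partition directory under the target path. The JDK-only sketch below mirrors that two-phase shape with an in-memory map in place of the file-system view; the partition names and the printed "copy" lines are purely illustrative.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class ExportShapeSketch {

    public static void main(String[] args) {
        // Stand-in for the base-file view: partition -> files to export
        Map<String, List<String>> filesByPartition = Map.of(
                "2022/01/01", List.of("f1.parquet", ".hoodie_partition_metadata"),
                "2022/01/02", List.of("f2.parquet"));

        // Phase 1 (mirrors context.flatMap): flatten partitions into (partition, file) pairs
        List<Map.Entry<String, String>> pairs = filesByPartition.entrySet().stream()
                .flatMap(e -> e.getValue().stream().map(f -> Map.entry(e.getKey(), f)))
                .collect(Collectors.toList());

        // Phase 2 (mirrors context.foreach): act on each pair, preserving the partition layout
        pairs.forEach(p ->
                System.out.println("copy " + p.getValue() + " -> target/" + p.getKey() + "/" + p.getValue()));
    }
}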

Example 4 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From the class TimelineServerPerf, method runLookups:

public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView, int numIterations, int concurrency) {
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Lookup all performance stats");
    return context.flatMap(partitionPaths, p -> {
        ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
        final List<PerfStats> result = new ArrayList<>();
        final List<ScheduledFuture<PerfStats>> futures = new ArrayList<>();
        List<FileSlice> slices = fsView.getLatestFileSlices(p).collect(Collectors.toList());
        String fileId = slices.isEmpty() ? "dummyId" : slices.get(new Random(Double.doubleToLongBits(Math.random())).nextInt(slices.size())).getFileId();
        IntStream.range(0, concurrency).forEach(i -> futures.add(executor.schedule(() -> runOneRound(fsView, p, fileId, i, numIterations), 0, TimeUnit.NANOSECONDS)));
        futures.forEach(x -> {
            try {
                result.add(x.get());
            } catch (InterruptedException | ExecutionException e) {
                throw new RuntimeException(e);
            }
        });
        System.out.println("SLICES are=");
        slices.forEach(s -> System.out.println("\t\tFileSlice=" + s));
        return result.stream();
    }, cfg.numExecutors);
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) ScheduledFuture(java.util.concurrent.ScheduledFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Random(java.util.Random) ExecutionException(java.util.concurrent.ExecutionException)
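
For each partition, the benchmark fans out "concurrency" lookup rounds onto a ScheduledThreadPoolExecutor and then blocks on every future, rethrowing InterruptedException or ExecutionException as a RuntimeException. Below is a self-contained sketch of that fan-out/collect pattern using only JDK classes; runOneRound here is a hypothetical stand-in for the real runOneRound(fsView, p, fileId, i, numIterations).

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.stream.IntStream;

public class LookupFanOutSketch {

    // Hypothetical stand-in for runOneRound(fsView, partition, fileId, i, numIterations)
    static long runOneRound(int worker) {
        return System.nanoTime() % 1_000 + worker;
    }

    public static void main(String[] args) {
        int concurrency = 4;
        ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(concurrency);
        final List<ScheduledFuture<Long>> futures = new ArrayList<>();

        // Fan out: schedule one lookup round per worker with zero delay
        IntStream.range(0, concurrency).forEach(i ->
                futures.add(executor.schedule(() -> runOneRound(i), 0, TimeUnit.NANOSECONDS)));

        // Collect: block on each future, wrapping checked exceptions as the original does
        List<Long> results = new ArrayList<>();
        futures.forEach(f -> {
            try {
                results.add(f.get());
            } catch (InterruptedException | ExecutionException e) {
                throw new RuntimeException(e);
            }
        });

        executor.shutdown(); // the sketch shuts the pool down once results are collected
        System.out.println(results);
    }
}

Unlike the snippet above, which creates a 100-thread pool per partition and leaves it running, the sketch sizes the pool to the concurrency level and shuts it down when done.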

Example 5 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From the class MarkerUtils, method readTimelineServerBasedMarkersFromFileSystem:

/**
 * Reads files containing the markers written by timeline-server-based marker mechanism.
 *
 * @param markerDir   marker directory.
 * @param fileSystem  file system to use.
 * @param context     instance of {@link HoodieEngineContext} to use.
 * @param parallelism parallelism to use.
 * @return A {@code Map} of file name to the set of markers stored in the file.
 */
public static Map<String, Set<String>> readTimelineServerBasedMarkersFromFileSystem(String markerDir, FileSystem fileSystem, HoodieEngineContext context, int parallelism) {
    Path dirPath = new Path(markerDir);
    try {
        if (fileSystem.exists(dirPath)) {
            Predicate<FileStatus> prefixFilter = fileStatus -> fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX);
            Predicate<FileStatus> markerTypeFilter = fileStatus -> !fileStatus.getPath().getName().equals(MARKER_TYPE_FILENAME);
            return FSUtils.parallelizeSubPathProcess(context, fileSystem, dirPath, parallelism, prefixFilter.and(markerTypeFilter), pairOfSubPathAndConf -> {
                String markersFilePathStr = pairOfSubPathAndConf.getKey();
                SerializableConfiguration conf = pairOfSubPathAndConf.getValue();
                return readMarkersFromFile(new Path(markersFilePathStr), conf);
            });
        }
        return new HashMap<>();
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) BufferedWriter(java.io.BufferedWriter) Predicate(java.util.function.Predicate) HoodieException(org.apache.hudi.exception.HoodieException) Set(java.util.Set) IOException(java.io.IOException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) StandardCharsets(java.nio.charset.StandardCharsets) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Logger(org.apache.log4j.Logger) MarkerType(org.apache.hudi.common.table.marker.MarkerType) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) OutputStreamWriter(java.io.OutputStreamWriter) FileIOUtils.closeQuietly(org.apache.hudi.common.util.FileIOUtils.closeQuietly) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)
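
The directory scan relies on composing two java.util.function.Predicate filters with Predicate::and before handing them to FSUtils.parallelizeSubPathProcess. The JDK-only sketch below shows the same composition applied to plain file names; the MARKERS prefix and the MARKERS.type file name are assumptions standing in for MARKERS_FILENAME_PREFIX and MARKER_TYPE_FILENAME.

import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;

public class MarkerFilterSketch {

    public static void main(String[] args) {
        // Assumed values; the real constants live in MarkerUtils
        String markersPrefix = "MARKERS";
        String markerTypeFileName = "MARKERS.type";

        // Keep files that carry the marker prefix...
        Predicate<String> prefixFilter = name -> name.startsWith(markersPrefix);
        // ...but skip the file that only records the marker type
        Predicate<String> markerTypeFilter = name -> !name.equals(markerTypeFileName);

        List<String> names = List.of("MARKERS0", "MARKERS1", "MARKERS.type", "other_file");
        List<String> markerFiles = names.stream()
                .filter(prefixFilter.and(markerTypeFilter))
                .collect(Collectors.toList());

        System.out.println(markerFiles); // [MARKERS0, MARKERS1]
    }
}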

Aggregations

HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 36
List (java.util.List): 29
ArrayList (java.util.ArrayList): 27
IOException (java.io.IOException): 25
LogManager (org.apache.log4j.LogManager): 25
Logger (org.apache.log4j.Logger): 25
Map (java.util.Map): 23
Collectors (java.util.stream.Collectors): 23
Path (org.apache.hadoop.fs.Path): 23
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 23
Option (org.apache.hudi.common.util.Option): 23
FileSystem (org.apache.hadoop.fs.FileSystem): 21
Pair (org.apache.hudi.common.util.collection.Pair): 19
FSUtils (org.apache.hudi.common.fs.FSUtils): 18
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 18
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 18
HashMap (java.util.HashMap): 16
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16
HoodieTable (org.apache.hudi.table.HoodieTable): 15
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 14