Example 6 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class BootstrapExecutor, method execute().

/**
 * Executes Bootstrap.
 */
public void execute() throws IOException {
    initializeTable();
    SparkRDDWriteClient bootstrapClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jssc), bootstrapConfig);
    try {
        // Record the resume point in the bootstrap commit metadata so that a
        // later DeltaStreamer run can continue from this checkpoint.
        HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
        checkpointCommitMetadata.put(HoodieDeltaStreamer.CHECKPOINT_KEY, cfg.checkpoint);
        if (cfg.checkpoint != null) {
            checkpointCommitMetadata.put(HoodieDeltaStreamer.CHECKPOINT_RESET_KEY, cfg.checkpoint);
        }
        bootstrapClient.bootstrap(Option.of(checkpointCommitMetadata));
        syncHive();
    } finally {
        bootstrapClient.close();
    }
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), HashMap (java.util.HashMap)
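
To see the pattern in isolation, here is a minimal, hedged sketch: build the engine context from a JavaSparkContext, run the one-time bootstrap, and always close the client. The runBootstrap method and the idea that the write config arrives pre-built are hypothetical scaffolding; the Hudi calls themselves are the ones used above.

import java.util.HashMap;

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.spark.api.java.JavaSparkContext;

public class BootstrapSketch {

    // Hypothetical driver method: BootstrapExecutor derives the write config
    // from the DeltaStreamer config; here it is simply passed in pre-built.
    public static void runBootstrap(JavaSparkContext jssc, HoodieWriteConfig bootstrapConfig, String checkpoint) {
        SparkRDDWriteClient bootstrapClient =
            new SparkRDDWriteClient(new HoodieSparkEngineContext(jssc), bootstrapConfig);
        try {
            HashMap<String, String> commitMetadata = new HashMap<>();
            // Seed the bootstrap commit with a checkpoint so a later
            // DeltaStreamer run knows where to resume from.
            commitMetadata.put(HoodieDeltaStreamer.CHECKPOINT_KEY, checkpoint);
            bootstrapClient.bootstrap(Option.of(commitMetadata));
        } finally {
            // The client holds cluster-side resources; close it even on failure.
            bootstrapClient.close();
        }
    }
}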

Example 7 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class DeltaSync, method reInitWriteClient().

private void reInitWriteClient(Schema sourceSchema, Schema targetSchema) throws IOException {
    LOG.info("Setting up new Hoodie Write Client");
    registerAvroSchemas(sourceSchema, targetSchema);
    HoodieWriteConfig hoodieCfg = getHoodieClientConfig(targetSchema);
    if (hoodieCfg.isEmbeddedTimelineServerEnabled()) {
        if (!embeddedTimelineService.isPresent()) {
            embeddedTimelineService = EmbeddedTimelineServerHelper.createEmbeddedTimelineService(new HoodieSparkEngineContext(jssc), hoodieCfg);
        } else {
            EmbeddedTimelineServerHelper.updateWriteConfigWithTimelineServer(embeddedTimelineService.get(), hoodieCfg);
        }
    }
    if (null != writeClient) {
        // Close Write client.
        writeClient.close();
    }
    writeClient = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), hoodieCfg, embeddedTimelineService);
    onInitializingHoodieWriteClient.apply(writeClient);
}
Also used: HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)
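
The branching above exists so that the embedded timeline server is started once and then shared by every write client DeltaSync re-creates after a schema change. A condensed sketch of that lifecycle follows; the class and method names (WriteClientSwapSketch, swapClient) are invented, while the Hudi calls mirror the example:

import java.io.IOException;

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.embedded.EmbeddedTimelineServerHelper;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;

public class WriteClientSwapSketch {

    private final JavaSparkContext jssc;
    private Option<EmbeddedTimelineService> timelineService = Option.empty();
    private SparkRDDWriteClient writeClient;

    public WriteClientSwapSketch(JavaSparkContext jssc) {
        this.jssc = jssc;
    }

    // Replace the write client (e.g. after a schema change) while reusing
    // the already-running embedded timeline server.
    public void swapClient(HoodieWriteConfig cfg) throws IOException {
        if (cfg.isEmbeddedTimelineServerEnabled()) {
            if (!timelineService.isPresent()) {
                // First client: start the embedded timeline server.
                timelineService = EmbeddedTimelineServerHelper.createEmbeddedTimelineService(
                    new HoodieSparkEngineContext(jssc), cfg);
            } else {
                // Later clients: point the new config at the running server.
                EmbeddedTimelineServerHelper.updateWriteConfigWithTimelineServer(timelineService.get(), cfg);
            }
        }
        if (writeClient != null) {
            writeClient.close(); // release the old client before replacing it
        }
        writeClient = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), cfg, timelineService);
    }
}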

Example 8 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HoodieSnapshotExporter, method exportAsHudi().

private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    final HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    final SerializableConfiguration serConf = context.getHadoopConf();
    context.setJobStatus(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
    List<Tuple2<String, String>> files = context.flatMap(partitions, partition -> {
        // Only take latest version files <= latestCommit.
        List<Tuple2<String, String>> filePaths = new ArrayList<>();
        Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
        dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
        // also need to copy over partition metadata
        Path partitionMetaFile = new Path(FSUtils.getPartitionPath(cfg.sourceBasePath, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
        FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
        if (fs.exists(partitionMetaFile)) {
            filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
        }
        return filePaths.stream();
    }, partitions.size());
    context.foreach(files, tuple -> {
        String partition = tuple._1();
        Path sourceFilePath = new Path(tuple._2());
        Path toPartitionPath = FSUtils.getPartitionPath(cfg.targetOutputPath, partition);
        FileSystem fs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());
        if (!fs.exists(toPartitionPath)) {
            fs.mkdirs(toPartitionPath);
        }
        FileUtil.copy(fs, sourceFilePath, fs, new Path(toPartitionPath, sourceFilePath.getName()), false, fs.getConf());
    }, files.size());
    // Also copy the .commit files
    LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
    final FileSystem fileSystem = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
    FileStatus[] commitFilesToCopy = fileSystem.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
        if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
            return true;
        } else {
            String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
            return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
        }
    });
    for (FileStatus commitStatus : commitFilesToCopy) {
        Path targetFilePath = new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
        if (!fileSystem.exists(targetFilePath.getParent())) {
            fileSystem.mkdirs(targetFilePath.getParent());
        }
        if (fileSystem.exists(targetFilePath)) {
            LOG.error(String.format("The target output commit file (%s) already exists.", targetFilePath));
        }
        FileUtil.copy(fileSystem, commitStatus.getPath(), fileSystem, targetFilePath, false, fileSystem.getConf());
    }
}
Also used: Path (org.apache.hadoop.fs.Path), HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile), FileStatus (org.apache.hadoop.fs.FileStatus), SerializableConfiguration (org.apache.hudi.common.config.SerializableConfiguration), ArrayList (java.util.ArrayList), BaseFileOnlyView (org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView), HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext), Tuple2 (scala.Tuple2), FileSystem (org.apache.hadoop.fs.FileSystem)
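
Note how the exporter talks to Spark only through the HoodieEngineContext primitives (flatMap, foreach) plus a SerializableConfiguration for file-system access, which keeps the logic engine-agnostic. Below is a runnable toy showing the same two primitives against a local master; the partition names, file names, and app name are all invented:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Stream;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class EngineContextToy {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("engine-context-toy");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
        context.setJobStatus("EngineContextToy", "Demonstrating flatMap/foreach");

        List<String> partitions = Arrays.asList("2021/01/01", "2021/01/02", "2021/01/03");
        // flatMap runs the function on executors (a Spark job under the hood)
        // and collects the flattened results on the driver, like the file
        // listing in exportAsHudi.
        List<String> files = context.flatMap(partitions,
            partition -> Stream.of(partition + "/file1", partition + "/file2"),
            partitions.size());
        // foreach distributes a side-effecting action, like the copy step.
        context.foreach(files, file -> System.out.println("would copy " + file), files.size());
        jsc.stop();
    }
}

In local mode everything runs in one JVM, but on a cluster the same two calls fan the listing and the copies out across executors.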

Example 9 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HoodieCleaner, method run().

public void run() {
    HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
    SparkRDDWriteClient client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), hoodieCfg);
    try {
        client.clean();
    } finally {
        // Close the client so the embedded timeline server and other resources are released.
        client.close();
    }
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)
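
getHoodieClientConfig() above assembles the write config from externally supplied properties. For a self-contained picture, here is a hedged sketch that builds an explicit config with a typical cleaner policy; the retention count is arbitrary and the clean helper is hypothetical, while the builder calls are the standard HoodieWriteConfig/HoodieCompactionConfig API:

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;

public class CleanerSketch {

    // Hypothetical standalone clean: basePath points at an existing Hudi table.
    public static void clean(JavaSparkContext jssc, String basePath) {
        HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
                .retainCommits(10) // keep the last 10 commits, clean older file slices
                .build())
            .build();
        SparkRDDWriteClient client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), cfg);
        try {
            client.clean();
        } finally {
            client.close();
        }
    }
}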

Example 10 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HoodieCompactionAdminTool, method run().

/**
 * Executes one of the compaction admin operations.
 */
public void run(JavaSparkContext jsc) throws Exception {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(cfg.basePath).build();
    try (CompactionAdminClient admin = new CompactionAdminClient(new HoodieSparkEngineContext(jsc), cfg.basePath)) {
        final FileSystem fs = FSUtils.getFs(cfg.basePath, jsc.hadoopConfiguration());
        if (cfg.outputPath != null && fs.exists(new Path(cfg.outputPath))) {
            throw new IllegalStateException("Output File Path already exists");
        }
        switch(cfg.operation) {
            case VALIDATE:
                List<ValidationOpResult> res = admin.validateCompactionPlan(metaClient, cfg.compactionInstantTime, cfg.parallelism);
                if (cfg.printOutput) {
                    printOperationResult("Result of Validation Operation :", res);
                }
                serializeOperationResult(fs, res);
                break;
            case UNSCHEDULE_FILE:
                List<RenameOpResult> r = admin.unscheduleCompactionFileId(new HoodieFileGroupId(cfg.partitionPath, cfg.fileId), cfg.skipValidation, cfg.dryRun);
                if (cfg.printOutput) {
                    System.out.println(r);
                }
                serializeOperationResult(fs, r);
                break;
            case UNSCHEDULE_PLAN:
                List<RenameOpResult> r2 = admin.unscheduleCompactionPlan(cfg.compactionInstantTime, cfg.skipValidation, cfg.parallelism, cfg.dryRun);
                if (cfg.printOutput) {
                    printOperationResult("Result of Unscheduling Compaction Plan :", r2);
                }
                serializeOperationResult(fs, r2);
                break;
            case REPAIR:
                List<RenameOpResult> r3 = admin.repairCompaction(cfg.compactionInstantTime, cfg.parallelism, cfg.dryRun);
                if (cfg.printOutput) {
                    printOperationResult("Result of Repair Operation :", r3);
                }
                serializeOperationResult(fs, r3);
                break;
            default:
                throw new IllegalStateException("Not yet implemented !!");
        }
    }
}
Also used: HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), Path (org.apache.hadoop.fs.Path), HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), CompactionAdminClient (org.apache.hudi.client.CompactionAdminClient), ValidationOpResult (org.apache.hudi.client.CompactionAdminClient.ValidationOpResult), HoodieFileGroupId (org.apache.hudi.common.model.HoodieFileGroupId), FileSystem (org.apache.hadoop.fs.FileSystem), RenameOpResult (org.apache.hudi.client.CompactionAdminClient.RenameOpResult)
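
Of the four operations, VALIDATE is the only non-destructive one, so it is the natural first call against a live table. A trimmed sketch of just that path follows; validatePlan is a hypothetical wrapper, and the printing mirrors what the tool does for UNSCHEDULE_FILE:

import java.util.List;

import org.apache.hudi.client.CompactionAdminClient;
import org.apache.hudi.client.CompactionAdminClient.ValidationOpResult;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.spark.api.java.JavaSparkContext;

public class CompactionValidateSketch {

    // Hypothetical wrapper: validate a scheduled compaction plan before running it.
    public static void validatePlan(JavaSparkContext jsc, String basePath,
                                    String compactionInstant, int parallelism) throws Exception {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build();
        // CompactionAdminClient is AutoCloseable, as the try-with-resources above shows.
        try (CompactionAdminClient admin =
                 new CompactionAdminClient(new HoodieSparkEngineContext(jsc), basePath)) {
            List<ValidationOpResult> results =
                admin.validateCompactionPlan(metaClient, compactionInstant, parallelism);
            // One result per compaction operation in the plan.
            results.forEach(System.out::println);
        }
    }
}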

Aggregations

HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext): 58 usages
Path (org.apache.hadoop.fs.Path): 25 usages
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 24 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 23 usages
ArrayList (java.util.ArrayList): 19 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 19 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 17 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 17 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 15 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 15 usages
IOException (java.io.IOException): 14 usages
List (java.util.List): 14 usages
Option (org.apache.hudi.common.util.Option): 14 usages
LogManager (org.apache.log4j.LogManager): 14 usages
Logger (org.apache.log4j.Logger): 14 usages
Test (org.junit.jupiter.api.Test): 14 usages
Collectors (java.util.stream.Collectors): 12 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 12 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 12 usages
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 11 usages