
Example 1 with BaseFileOnlyView

Use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache. BaseFileOnlyView is the read-optimized file-system view of a Hudi table: it exposes only the latest base file of each file group (via getLatestBaseFiles, getLatestBaseFilesBeforeOrOn, and related methods), ignoring log files.

From the class TestHoodieMergeOnReadTable, method testUpsertPartitioner:

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testUpsertPartitioner(boolean populateMetaFields) throws Exception {
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        // Write 1: only inserts, written as base files.
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
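        // Build a read-optimized view over the completed commits timeline; it exposes only the latest base file per file group.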
        BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
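        // Record current base file sizes so write 2 can be checked against them.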
        Map<String, Long> fileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
        roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = roView.getLatestBaseFiles();
        List<HoodieBaseFile> dataFilesList = dataFilesToRead.collect(Collectors.toList());
        assertTrue(dataFilesList.size() > 0, "Should list the base files we wrote in the delta commit");
        // Write 2: updates + inserts, written to .log files, plus correction of existing base file sizes.
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> newRecords = dataGen.generateUpdates(newCommitTime, records);
        newRecords.addAll(dataGen.generateInserts(newCommitTime, 20));
        statuses = client.upsert(jsc().parallelize(newRecords), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("002", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 002");
        commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        allFiles = listAllBaseFilesInPath(hoodieTable);
        roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
        dataFilesToRead = roView.getLatestBaseFiles();
        List<HoodieBaseFile> newDataFilesList = dataFilesToRead.collect(Collectors.toList());
        Map<String, Long> fileIdToNewSize = newDataFilesList.stream().collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
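        // At least one existing base file must have grown: the upsert partitioner packs some of the new inserts into existing small files.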
        assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> fileIdToSize.get(entry.getKey()) < entry.getValue()));
        List<String> inputPaths = roView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
        List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, false);
        // Two batches of 20 unique records each: write 2's updates rewrite write 1's keys, and its 20 inserts add new ones.
        assertEquals(40, recordsRead.size(), "Must contain 40 records");
    }
}
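The core pattern the test exercises, stripped of the test harness: build a HoodieTableFileSystemView over the completed commits timeline and read the latest base files through its BaseFileOnlyView interface. A minimal sketch using only constructs shown on this page (the metaClient and allFiles arguments are assumed to be supplied by the caller):

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView;

public class LatestBaseFilesSketch {

    // Latest base file per file group, visible only through completed commits.
    static List<HoodieBaseFile> latestBaseFiles(HoodieTableMetaClient metaClient, FileStatus[] allFiles) {
        BaseFileOnlyView roView = new HoodieTableFileSystemView(
                metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles);
        return roView.getLatestBaseFiles().collect(Collectors.toList());
    }
}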

Example 2 with BaseFileOnlyView

Use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.

From the class HoodieClientTestUtils, method getLatestBaseFiles:

public static List<HoodieBaseFile> getLatestBaseFiles(String basePath, FileSystem fs, String... paths) {
    List<HoodieBaseFile> latestFiles = new ArrayList<>();
    try {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
        for (String path : paths) {
            BaseFileOnlyView fileSystemView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new Path(path)));
            latestFiles.addAll(fileSystemView.getLatestBaseFiles().collect(Collectors.toList()));
        }
    } catch (Exception e) {
        throw new HoodieException("Error fetching the latest base files", e);
    }
    return latestFiles;
}
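A hedged invocation of the helper above; the table path and partition glob below are placeholders (fs.globStatus resolves the glob before the view is built):

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.testutils.HoodieClientTestUtils;

public class GetLatestBaseFilesUsage {
    public static void main(String[] args) throws Exception {
        // Placeholder paths; point these at a real Hudi table to run.
        String basePath = "file:///tmp/hudi_table";
        FileSystem fs = FileSystem.get(new Configuration());
        List<HoodieBaseFile> latest =
                HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, basePath + "/2021/*/*");
        latest.forEach(f -> System.out.println(f.getPath() + " -> " + f.getFileSize()));
    }
}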

Example 3 with BaseFileOnlyView

Use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.

From the class HoodieSnapshotExporter, method exportAsHudi:

private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    final HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    final SerializableConfiguration serConf = context.getHadoopConf();
    context.setJobStatus(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
    List<Tuple2<String, String>> files = context.flatMap(partitions, partition -> {
        // Only take latest version files <= latestCommit.
        List<Tuple2<String, String>> filePaths = new ArrayList<>();
        Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
        dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
        // also need to copy over partition metadata
        Path partitionMetaFile = new Path(FSUtils.getPartitionPath(cfg.sourceBasePath, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
        FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
        if (fs.exists(partitionMetaFile)) {
            filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
        }
        return filePaths.stream();
    }, partitions.size());
    context.foreach(files, tuple -> {
        String partition = tuple._1();
        Path sourceFilePath = new Path(tuple._2());
        Path toPartitionPath = FSUtils.getPartitionPath(cfg.targetOutputPath, partition);
        FileSystem fs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());
        if (!fs.exists(toPartitionPath)) {
            fs.mkdirs(toPartitionPath);
        }
        FileUtil.copy(fs, sourceFilePath, fs, new Path(toPartitionPath, sourceFilePath.getName()), false, fs.getConf());
    }, files.size());
    // Also copy the .commit files
    LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
    final FileSystem fileSystem = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
    FileStatus[] commitFilesToCopy = fileSystem.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
        if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
            return true;
        } else {
            String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
            return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
        }
    });
    for (FileStatus commitStatus : commitFilesToCopy) {
        Path targetFilePath = new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
        if (!fileSystem.exists(targetFilePath.getParent())) {
            fileSystem.mkdirs(targetFilePath.getParent());
        }
        if (fileSystem.exists(targetFilePath)) {
            LOG.error(String.format("The target commit file %s already exists.", targetFilePath));
        }
        FileUtil.copy(fileSystem, commitStatus.getPath(), fileSystem, targetFilePath, false, fileSystem.getConf());
    }
}
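The getBaseFileOnlyView(jsc, cfg) helper is not shown on this page. A plausible sketch, modeled on the view construction in Example 4 below; treat it as an assumption rather than the exporter's actual code (only cfg.sourceBasePath is taken from the method above, and the imports mirror those already used by this class):

// A sketch modeled on HoodieSnapshotCopier.snapshot (Example 4), not HoodieSnapshotExporter's actual helper.
private BaseFileOnlyView getBaseFileOnlyView(JavaSparkContext jsc, Config cfg) {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(jsc.hadoopConfiguration())
            .setBasePath(cfg.sourceBasePath)
            .build();
    // Restrict the view to completed write instants so the export never sees in-flight data.
    return new HoodieTableFileSystemView(
            metaClient, metaClient.getActiveTimeline().getWriteTimeline().filterCompletedInstants());
}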

Example 4 with BaseFileOnlyView

Use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.

From the class HoodieSnapshotCopier, method snapshot:

public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning, final boolean useFileListingFromMetadata) throws IOException {
    FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
    final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    final HoodieTableMetaClient tableMetadata = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir).build();
    final BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata, tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants());
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    // Get the latest commit
    Option<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants().lastInstant();
    if (!latestCommit.isPresent()) {
        LOG.warn("No commits present. Nothing to snapshot");
        return;
    }
    final String latestCommitTimestamp = latestCommit.get().getTimestamp();
    LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp));
    List<String> partitions = FSUtils.getAllPartitionPaths(context, baseDir, useFileListingFromMetadata, shouldAssumeDatePartitioning);
    if (partitions.size() > 0) {
        LOG.info(String.format("The job needs to copy %d partitions.", partitions.size()));
        // Make sure the output directory is empty
        Path outputPath = new Path(outputDir);
        if (fs.exists(outputPath)) {
            LOG.warn(String.format("The output path %s already exists; deleting it.", outputPath));
            fs.delete(new Path(outputDir), true);
        }
        context.setJobStatus(this.getClass().getSimpleName(), "Creating a snapshot");
        List<Tuple2<String, String>> filesToCopy = context.flatMap(partitions, partition -> {
            // Only take latest version files <= latestCommit.
            FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
            List<Tuple2<String, String>> filePaths = new ArrayList<>();
            Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
            dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
            // also need to copy over partition metadata
            Path partitionMetaFile = new Path(FSUtils.getPartitionPath(baseDir, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
            if (fs1.exists(partitionMetaFile)) {
                filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
            }
            return filePaths.stream();
        }, partitions.size());
        context.foreach(filesToCopy, tuple -> {
            String partition = tuple._1();
            Path sourceFilePath = new Path(tuple._2());
            Path toPartitionPath = FSUtils.getPartitionPath(outputDir, partition);
            FileSystem ifs = FSUtils.getFs(baseDir, serConf.newCopy());
            if (!ifs.exists(toPartitionPath)) {
                ifs.mkdirs(toPartitionPath);
            }
            FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), false, ifs.getConf());
        }, filesToCopy.size());
        // Also copy the .commit files
        LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
        FileStatus[] commitFilesToCopy = fs.listStatus(new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
            if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
                return true;
            } else {
                String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
                return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
            }
        });
        for (FileStatus commitStatus : commitFilesToCopy) {
            Path targetFilePath = new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
            if (!fs.exists(targetFilePath.getParent())) {
                fs.mkdirs(targetFilePath.getParent());
            }
            if (fs.exists(targetFilePath)) {
                LOG.error(String.format("The target commit file %s already exists.", targetFilePath));
            }
            FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
        }
    } else {
        LOG.info("The job has 0 partition to copy.");
    }
    // Create the _SUCCESS tag
    Path successTagPath = new Path(outputDir + "/_SUCCESS");
    if (!fs.exists(successTagPath)) {
        LOG.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
        fs.createNewFile(successTagPath);
    }
}
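A hedged driver for the copier; the local Spark master and both paths are placeholders, and a public no-arg constructor is assumed:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SnapshotCopierDriver {
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("hudi-snapshot-copier").setMaster("local[2]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            new HoodieSnapshotCopier().snapshot(
                    jsc,
                    "file:///tmp/hudi_table",    // baseDir (placeholder)
                    "file:///tmp/hudi_snapshot", // outputDir (placeholder)
                    false,                       // shouldAssumeDatePartitioning
                    true);                       // useFileListingFromMetadata
        }
    }
}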

Example 5 with BaseFileOnlyView

Use of org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView in project hudi by apache.

From the class HoodieSnapshotExporter, method exportAsNonHudi:

private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
    Partitioner defaultPartitioner = dataset -> {
        Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
        return StringUtils.isNullOrEmpty(cfg.outputPartitionField) ? hoodieDroppedDataset.write() : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
    };
    Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner) ? defaultPartitioner : ReflectionUtils.loadClass(cfg.outputPartitioner);
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");
    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    Iterator<String> exportingFilePaths = jsc.parallelize(partitions, partitions.size()).flatMap(partition -> fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp).map(HoodieBaseFile::getPath).iterator()).toLocalIterator();
    Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
    partitioner.partition(sourceDataset).format(cfg.outputFormat).mode(SaveMode.Overwrite).save(cfg.targetOutputPath);
}
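The BaseFileOnlyView call at the heart of both exporters, isolated: per partition, only base files committed at or before the given instant are returned. A minimal sketch; fsView, partition, and latestCommitTimestamp are assumed inputs:

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView;

public class LatestBeforeOrOnSketch {

    // Paths of the latest base files in one partition as of latestCommitTimestamp.
    static List<String> latestPaths(BaseFileOnlyView fsView, String partition, String latestCommitTimestamp) {
        return fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
                .map(HoodieBaseFile::getPath)
                .collect(Collectors.toList());
    }
}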
