Example 1 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class TestHoodieMergeOnReadTable, the method testUpsertPartitioner:

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testUpsertPartitioner(boolean populateMetaFields) throws Exception {
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        /**
         * Write 1 (only inserts, written as base file)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
        Map<String, Long> fileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
        roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = roView.getLatestBaseFiles();
        List<HoodieBaseFile> dataFilesList = dataFilesToRead.collect(Collectors.toList());
        assertTrue(dataFilesList.size() > 0, "Should list the base files we wrote in the delta commit");
        /**
         * Write 2 (only updates + inserts, written to .log file + correction of existing base file size)
         */
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> newRecords = dataGen.generateUpdates(newCommitTime, records);
        newRecords.addAll(dataGen.generateInserts(newCommitTime, 20));
        statuses = client.upsert(jsc().parallelize(newRecords), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("002", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 002");
        commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        allFiles = listAllBaseFilesInPath(hoodieTable);
        roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
        dataFilesToRead = roView.getLatestBaseFiles();
        List<HoodieBaseFile> newDataFilesList = dataFilesToRead.collect(Collectors.toList());
        Map<String, Long> fileIdToNewSize = newDataFilesList.stream().collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
        assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> fileIdToSize.get(entry.getKey()) < entry.getValue()));
        List<String> inputPaths = roView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
        List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, false);
        // Wrote 20 records in each of the 2 batches (40 in total)
        assertEquals(40, recordsRead.size(), "Must contain 40 records");
    }
}
Also used : HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) JobConf(org.apache.hadoop.mapred.JobConf) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
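
The size-comparison assertion after write 2 (the anyMatch over fileIdToNewSize) is the crux of the test: at least one base file must report a larger size after the second write, because the upsert partitioner routes the new inserts into existing small base files. A minimal, self-contained sketch of that same check, using hypothetical file IDs and sizes rather than values from the test run above:

import java.util.HashMap;
import java.util.Map;

public class BaseFileGrowthCheck {
    public static void main(String[] args) {
        // Base file sizes recorded after the first write (file IDs are made up).
        Map<String, Long> fileIdToSize = new HashMap<>();
        fileIdToSize.put("file-1", 435_000L);
        fileIdToSize.put("file-2", 440_000L);

        // Sizes after the second write; file-1 grew because the new inserts
        // were assigned to the same (small) file group.
        Map<String, Long> fileIdToNewSize = new HashMap<>();
        fileIdToNewSize.put("file-1", 612_000L);
        fileIdToNewSize.put("file-2", 440_000L);

        // Same predicate as the test: at least one base file must be larger than before.
        boolean anyFileGrew = fileIdToNewSize.entrySet().stream()
                .anyMatch(e -> fileIdToSize.get(e.getKey()) < e.getValue());
        System.out.println("At least one base file grew: " + anyFileGrew);
    }
}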

Example 2 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class CompactionTestBase, the method runNextDeltaCommits:

protected List<HoodieRecord> runNextDeltaCommits(SparkRDDWriteClient client, final HoodieReadClient readClient, List<String> deltaInstants, List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst, List<String> expPendingCompactionInstants) throws Exception {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    List<Pair<String, HoodieCompactionPlan>> pendingCompactions = readClient.getPendingCompactions();
    List<String> gotPendingCompactionInstants = pendingCompactions.stream().map(pc -> pc.getKey()).sorted().collect(Collectors.toList());
    assertEquals(expPendingCompactionInstants, gotPendingCompactionInstants);
    Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToCompactionOperation = CompactionUtils.getAllPendingCompactionOperations(metaClient);
    if (insertFirst) {
        // Use first instant for inserting records
        String firstInstant = deltaInstants.get(0);
        deltaInstants = deltaInstants.subList(1, deltaInstants.size());
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        client.startCommitWithTime(firstInstant);
        JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, firstInstant);
        List<WriteStatus> statusList = statuses.collect();
        if (!cfg.shouldAutoCommit()) {
            client.commit(firstInstant, statuses);
        }
        assertNoWriteErrors(statusList);
        metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
        HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
        List<HoodieBaseFile> dataFilesToRead = getCurrentLatestBaseFiles(hoodieTable);
        assertTrue(dataFilesToRead.stream().findAny().isPresent(), "should list the base files we wrote in the delta commit");
        validateDeltaCommit(firstInstant, fgIdToCompactionOperation, cfg);
    }
    int numRecords = records.size();
    for (String instantTime : deltaInstants) {
        records = dataGen.generateUpdates(instantTime, numRecords);
        metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
        createNextDeltaCommit(instantTime, records, client, metaClient, cfg, false);
        validateDeltaCommit(instantTime, fgIdToCompactionOperation, cfg);
    }
    return records;
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTable(org.apache.hudi.table.HoodieTable) WriteStatus(org.apache.hudi.client.WriteStatus) Pair(org.apache.hudi.common.util.collection.Pair)
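
A sketch of how a compaction test might drive this helper. The harness fields (client, readClient, cfg, dataGen), the instant times, and the record count below are assumed for illustration, and java.util.Arrays and java.util.Collections are expected to be imported:

// Insert 100 records at instant "001", then apply updates at "002" and "003".
// No compaction is expected to be pending yet, so the expected list is empty.
List<HoodieRecord> records = dataGen.generateInserts("001", 100);
records = runNextDeltaCommits(client, readClient,
        Arrays.asList("001", "002", "003"), records, cfg,
        /* insertFirst */ true,
        /* expPendingCompactionInstants */ Collections.emptyList());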

Example 3 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class HoodieClientTestUtils, the method getLatestBaseFiles:

public static List<HoodieBaseFile> getLatestBaseFiles(String basePath, FileSystem fs, String... paths) {
    List<HoodieBaseFile> latestFiles = new ArrayList<>();
    try {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
        for (String path : paths) {
            BaseFileOnlyView fileSystemView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new Path(path)));
            latestFiles.addAll(fileSystemView.getLatestBaseFiles().collect(Collectors.toList()));
        }
    } catch (Exception e) {
        throw new HoodieException("Error reading hoodie table as a dataframe", e);
    }
    return latestFiles;
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException)
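
A usage sketch for this helper; the table location and partition globs are hypothetical, and the FileSystem is obtained from a default Hadoop configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.testutils.HoodieClientTestUtils;

import java.util.List;

public class LatestBaseFilesExample {
    public static void main(String[] args) throws Exception {
        String basePath = "/tmp/hoodie/sample-table"; // hypothetical table location
        FileSystem fs = FileSystem.get(new Configuration());

        // Resolve the latest base file per file group under two partition globs.
        List<HoodieBaseFile> latest = HoodieClientTestUtils.getLatestBaseFiles(
                basePath, fs, basePath + "/2016/03/15/*", basePath + "/2016/03/16/*");

        for (HoodieBaseFile baseFile : latest) {
            System.out.println(baseFile.getFileId() + " -> " + baseFile.getPath()
                    + " (" + baseFile.getFileSize() + " bytes)");
        }
    }
}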

Example 4 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class HoodieClientTestUtils, the method read:

/**
 * Reads the paths under the hoodie table out as a DataFrame.
 */
public static Dataset<Row> read(JavaSparkContext jsc, String basePath, SQLContext sqlContext, FileSystem fs, String... paths) {
    List<String> filteredPaths = new ArrayList<>();
    try {
        List<HoodieBaseFile> latestFiles = getLatestBaseFiles(basePath, fs, paths);
        for (HoodieBaseFile file : latestFiles) {
            filteredPaths.add(file.getPath());
        }
        if (filteredPaths.isEmpty()) {
            return sqlContext.emptyDataFrame();
        }
        String[] filteredPathsToRead = filteredPaths.toArray(new String[filteredPaths.size()]);
        if (filteredPathsToRead[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
            return sqlContext.read().parquet(filteredPathsToRead);
        } else if (filteredPathsToRead[0].endsWith(HoodieFileFormat.ORC.getFileExtension())) {
            return sqlContext.read().orc(filteredPathsToRead);
        }
        return sqlContext.emptyDataFrame();
    } catch (Exception e) {
        throw new HoodieException("Error reading hoodie table as a dataframe", e);
    }
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException)
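
A corresponding usage sketch for read(); the partition globs are again made up, and jsc, sqlContext, fs and basePath are assumed to come from an existing Spark test context:

// Read only the latest base files under the given partition globs into a DataFrame.
Dataset<Row> df = HoodieClientTestUtils.read(
        jsc, basePath, sqlContext, fs,
        basePath + "/2016/03/15/*", basePath + "/2016/03/16/*");
df.printSchema();
System.out.println("Latest-snapshot row count: " + df.count());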

Example 5 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class TestHoodieSparkMergeOnReadTableClustering, the method doClusteringAndValidate:

private void doClusteringAndValidate(SparkRDDWriteClient client, String clusteringCommitTime, HoodieTableMetaClient metaClient, HoodieWriteConfig cfg, HoodieTestDataGenerator dataGen) {
    client.cluster(clusteringCommitTime, true);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    final HoodieTable clusteredTable = HoodieSparkTable.create(cfg, context(), metaClient);
    clusteredTable.getHoodieView().sync();
    Stream<HoodieBaseFile> dataFilesToRead = Arrays.stream(dataGen.getPartitionPaths()).flatMap(p -> clusteredTable.getBaseFileOnlyView().getLatestBaseFiles(p));
    assertEquals(dataGen.getPartitionPaths().length, dataFilesToRead.count());
    HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants();
    assertEquals(1, timeline.findInstantsAfter("003", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
    assertEquals(clusteringCommitTime, timeline.lastInstant().get().getTimestamp());
    assertEquals(HoodieTimeline.REPLACE_COMMIT_ACTION, timeline.lastInstant().get().getAction());
    if (cfg.populateMetaFields()) {
        assertEquals(400, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.of("000")), "Must contain 400 records");
    } else {
        assertEquals(400, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.empty()));
    }
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTable(org.apache.hudi.table.HoodieTable)
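
A sketch of how the surrounding test might invoke this helper after scheduling a clustering plan. scheduleClustering(Option.empty()) is assumed to be available on the write client, and client, metaClient, cfg and dataGen are assumed to come from the test class:

// Schedule a clustering plan, then execute it and validate the result.
// Earlier writes are assumed to have produced file groups worth clustering.
Option<String> clusteringInstant = client.scheduleClustering(Option.empty());
assertTrue(clusteringInstant.isPresent(), "a clustering plan should have been scheduled");
doClusteringAndValidate(client, clusteringInstant.get(), metaClient, cfg, dataGen);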

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 71
Path (org.apache.hadoop.fs.Path): 40
ArrayList (java.util.ArrayList): 33
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 31
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 31
FileSlice (org.apache.hudi.common.model.FileSlice): 29
List (java.util.List): 27
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 27
IOException (java.io.IOException): 26
FileStatus (org.apache.hadoop.fs.FileStatus): 25
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 24
Pair (org.apache.hudi.common.util.collection.Pair): 24
Option (org.apache.hudi.common.util.Option): 23
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 23
Collectors (java.util.stream.Collectors): 21
Test (org.junit.jupiter.api.Test): 21
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 21
Map (java.util.Map): 20
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20
HoodieTable (org.apache.hudi.table.HoodieTable): 20