
Example 96 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class HoodieClientTestBase method deleteBatch.

/**
 * Helper to delete a batch of hoodie keys and run the regular assertions on the table state after successful completion.
 *
 * @param client                        Hoodie Write Client
 * @param newCommitTime                 New commit timestamp to be used
 * @param prevCommitTime                Commit timestamp used in the previous commit
 * @param initCommitTime                Begin timestamp (usually "000")
 * @param numRecordsInThisCommit        Number of records to delete in this commit
 * @param keyGenFunction                Key generation function
 * @param deleteFn                      Write function to be used for the delete
 * @param assertForCommit               Enable assertion of writes
 * @param expRecordsInThisCommit        Expected number of records in this commit
 * @param expTotalRecords               Expected number of records when scanned
 * @param filterForCommitTimeWithAssert Whether to run commit-time based assertions (requires meta columns)
 * @return RDD of write statuses from the delete
 * @throws Exception in case of error
 */
public JavaRDD<WriteStatus> deleteBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime, String initCommitTime, int numRecordsInThisCommit, Function<Integer, List<HoodieKey>> keyGenFunction, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn, boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filterForCommitTimeWithAssert) throws Exception {
    // Delete 1 (only deletes)
    client.startCommitWithTime(newCommitTime);
    List<HoodieKey> keysToDelete = keyGenFunction.apply(numRecordsInThisCommit);
    JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(keysToDelete, 1);
    JavaRDD<WriteStatus> result = deleteFn.apply(client, deleteRecords, newCommitTime);
    List<WriteStatus> statuses = result.collect();
    assertNoWriteErrors(statuses);
    // check the partition metadata is written out
    assertPartitionMetadataForKeys(keysToDelete, fs);
    // verify that there is a commit
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    if (assertForCommit) {
        assertEquals(3, timeline.findInstantsAfter(initCommitTime, Integer.MAX_VALUE).countInstants(), "Expecting 3 commits.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be " + newCommitTime);
        if (filterForCommitTimeWithAssert) {
            // when meta columns are disabled, commit-time based assertions can't be made
            assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), "Must contain " + expRecordsInThisCommit + " records");
        }
        // Check the entire dataset has all records still
        String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
        for (int i = 0; i < fullPartitionPaths.length; i++) {
            fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
        }
        assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + expTotalRecords + " records");
        if (filterForCommitTimeWithAssert) {
            // Check incremental consumption from prevCommitTime
            assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(prevCommitTime)), "Incremental consumption from " + prevCommitTime + " should give no records in latest commit," + " since it is a delete operation");
        }
    }
    return result;
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieKey(org.apache.hudi.common.model.HoodieKey) WriteStatus(org.apache.hudi.client.WriteStatus)
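
As a usage note, the timeline checks in deleteBatch can be exercised on their own. The snippet below is a minimal, hypothetical sketch (not part of the Hudi sources) that rebuilds the meta client and asserts on the latest completed commit; hadoopConf, basePath and the expected commit time are assumed to be supplied by the surrounding test harness.

// Hypothetical helper, assuming hadoopConf (org.apache.hadoop.conf.Configuration) and basePath come from the test base class.
private static void assertLatestCommit(Configuration hadoopConf, String basePath, String expectedCommitTime) {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
    HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
    // lastInstant() is empty on a fresh table, so guard the Option before dereferencing it
    assertTrue(commitTimeline.lastInstant().isPresent(), "Expected at least one completed commit");
    assertEquals(expectedCommitTime, commitTimeline.lastInstant().get().getTimestamp());
}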

Example 97 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class HoodieClientTestBase method writeBatchHelper.

private JavaRDD<WriteStatus> writeBatchHelper(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime, Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, List<HoodieRecord> records, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit, boolean filterForCommitTimeWithAssert) throws IOException {
    // Write 1 (only inserts)
    client.startCommitWithTime(newCommitTime);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
    List<WriteStatus> statuses = result.collect();
    assertNoWriteErrors(statuses);
    if (doCommit) {
        client.commit(newCommitTime, result);
    }
    // check the partition metadata is written out
    assertPartitionMetadataForRecords(records, fs);
    // verify that there is a commit
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    if (assertForCommit) {
        assertEquals(expTotalCommits, timeline.findInstantsAfter(initCommitTime, Integer.MAX_VALUE).countInstants(), "Expecting " + expTotalCommits + " commits.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be " + newCommitTime);
        if (filterForCommitTimeWithAssert) {
            // when meta columns are disabled, per-commit assertions can't really be made
            assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), "Must contain " + expRecordsInThisCommit + " records");
        }
        // Check the entire dataset has all records still
        String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
        for (int i = 0; i < fullPartitionPaths.length; i++) {
            fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
        }
        assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + expTotalRecords + " records");
        if (filterForCommitTimeWithAssert) {
            // Check that incremental consumption from prevCommitTime returns all records in the latest commit
            assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(prevCommitTime)), "Incremental consumption from " + prevCommitTime + " should give all records in latest commit");
            if (commitTimesBetweenPrevAndNew.isPresent()) {
                commitTimesBetweenPrevAndNew.get().forEach(ct -> {
                    assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(ct)), "Incremental consumption from " + ct + " should give all records in latest commit");
                });
            }
        }
    }
    return result;
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) WriteStatus(org.apache.hudi.client.WriteStatus)
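
The incremental-consumption assertion above compares two read paths from HoodieClientTestUtils. A condensed, hypothetical restatement of that comparison (variable names taken from the method above) looks like this:

// Hypothetical condensation of the incremental-consumption check in writeBatchHelper.
// readCommit(..) reads only the rows written by newCommitTime, while
// countRecordsOptionallySince(..) counts rows written by commits after prevCommitTime.
long latestCommitRows = HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count();
long rowsSincePrev = HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(prevCommitTime));
// With a single commit between prevCommitTime and newCommitTime, the two counts must match.
assertEquals(latestCommitRows, rowsSincePrev);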

Example 98 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class HoodieClientTestUtils method getCommitMetadataForInstant.

private static Option<HoodieCommitMetadata> getCommitMetadataForInstant(HoodieTableMetaClient metaClient, HoodieInstant instant) {
    try {
        HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
        byte[] data = timeline.getInstantDetails(instant).get();
        return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class));
    } catch (Exception e) {
        throw new HoodieException("Failed to read commit metadata", e);
    }
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException)
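
For reference, the helper above can be pointed at the last completed instant. The sketch below is hypothetical (the method name is assumed), but it reuses exactly the calls shown above: getInstantDetails returns the raw bytes of the commit file, and HoodieCommitMetadata.fromBytes deserializes them.

// Hypothetical usage: decode the metadata of the latest completed commit, if any.
private static Option<HoodieCommitMetadata> latestCommitMetadata(HoodieTableMetaClient metaClient) throws IOException {
    HoodieTimeline completedCommits = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    if (!completedCommits.lastInstant().isPresent()) {
        return Option.empty();
    }
    HoodieInstant latest = completedCommits.lastInstant().get();
    byte[] bytes = completedCommits.getInstantDetails(latest).get();
    return Option.of(HoodieCommitMetadata.fromBytes(bytes, HoodieCommitMetadata.class));
}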

Example 99 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete method testSimpleInsertAndUpdate.

@ParameterizedTest
@MethodSource
public void testSimpleInsertAndUpdate(HoodieFileFormat fileFormat, boolean populateMetaFields) throws Exception {
    Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (updates)
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, 100);
        updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString();
        client.compact(compactionCommitTime);
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent());
        // verify that there is a commit
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants();
        assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
        String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
        assertTrue(HoodieTimeline.compareTimestamps("000", HoodieTimeline.LESSER_THAN, latestCompactionCommitTime));
        if (cfg.populateMetaFields()) {
            assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.of("000")), "Must contain 200 records");
        } else {
            assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.empty()));
        }
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
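
The timestamp check above relies on HoodieTimeline.compareTimestamps, which compares instant times as strings, so zero-padded commit times order correctly. A short, hypothetical illustration, assuming the GREATER_THAN operator constant is available alongside LESSER_THAN:

// Hypothetical illustration of compareTimestamps ordering on zero-padded commit times.
assertTrue(HoodieTimeline.compareTimestamps("001", HoodieTimeline.LESSER_THAN, "004"));
assertTrue(HoodieTimeline.compareTimestamps("004", HoodieTimeline.GREATER_THAN, "001"));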

Example 100 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class BaseHoodieTableFileIndex method doRefresh.

private void doRefresh() {
    long startTime = System.currentTimeMillis();
    Map<PartitionPath, FileStatus[]> partitionFiles = loadPartitionPathFiles();
    FileStatus[] allFiles = partitionFiles.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new);
    metaClient.reloadActiveTimeline();
    HoodieTimeline activeTimeline = getActiveTimeline();
    Option<HoodieInstant> latestInstant = activeTimeline.lastInstant();
    // TODO we can optimize the flow by:
    // - First fetch list of files from instants of interest
    // - Load FileStatus's
    fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles);
    Option<String> queryInstant = specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp));
    validate(activeTimeline, queryInstant);
    if (tableType.equals(HoodieTableType.MERGE_ON_READ) && queryType.equals(HoodieTableQueryType.SNAPSHOT)) {
        cachedAllInputFileSlices = partitionFiles.keySet().stream().collect(Collectors.toMap(Function.identity(), partitionPath -> queryInstant.map(instant -> fileSystemView.getLatestMergedFileSlicesBeforeOrOn(partitionPath.path, queryInstant.get()).collect(Collectors.toList())).orElse(Collections.emptyList())));
    } else {
        cachedAllInputFileSlices = partitionFiles.keySet().stream().collect(Collectors.toMap(Function.identity(), partitionPath -> queryInstant.map(instant -> fileSystemView.getLatestFileSlicesBeforeOrOn(partitionPath.path, instant, true)).orElse(fileSystemView.getLatestFileSlices(partitionPath.path)).collect(Collectors.toList())));
    }
    cachedFileSize = cachedAllInputFileSlices.values().stream().flatMap(Collection::stream).mapToLong(BaseHoodieTableFileIndex::fileSliceSize).sum();
    // If the partition value contains InternalRow.empty, we query it as a non-partitioned table.
    queryAsNonePartitionedTable = partitionFiles.keySet().stream().anyMatch(p -> p.values.length == 0);
    long duration = System.currentTimeMillis() - startTime;
    LOG.info(String.format("Refresh table %s, spent: %d ms", metaClient.getTableConfig().getTableName(), duration));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Arrays(java.util.Arrays) FileSlice(org.apache.hudi.common.model.FileSlice) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) TypedProperties(org.apache.hudi.common.config.TypedProperties) Collection(java.util.Collection) HoodieTableQueryType(org.apache.hudi.common.model.HoodieTableQueryType) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Objects(java.util.Objects) List(java.util.List) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils)
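
The query-instant resolution in doRefresh falls back from an explicitly specified instant to the last instant on the (reloaded) active timeline. A compact, hypothetical restatement of that fallback, using the field names from the method above:

// Hypothetical condensation of the query-instant fallback in doRefresh.
// specifiedQueryInstant is an Option<String>; the supplier is only evaluated when it is empty.
Option<HoodieInstant> latestInstant = activeTimeline.lastInstant();
Option<String> queryInstant = specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp));
// For MERGE_ON_READ snapshot queries, file slices are then merged up to queryInstant;
// otherwise the latest file slices before or on that instant are used.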

Aggregations

HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)118 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)74 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)39 List (java.util.List)36 IOException (java.io.IOException)34 HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata)34 ArrayList (java.util.ArrayList)32 Option (org.apache.hudi.common.util.Option)30 Collectors (java.util.stream.Collectors)29 HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline)29 HoodieException (org.apache.hudi.exception.HoodieException)26 Map (java.util.Map)25 FileStatus (org.apache.hadoop.fs.FileStatus)24 Path (org.apache.hadoop.fs.Path)24 Set (java.util.Set)22 HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)22 FileSlice (org.apache.hudi.common.model.FileSlice)21 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)21 Pair (org.apache.hudi.common.util.collection.Pair)21 FSUtils (org.apache.hudi.common.fs.FSUtils)20