
Example 21 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class HoodieWriteHandle method write.

/**
 * Perform the actual writing of the given record into the backing file.
 */
public void write(HoodieRecord record, Option<IndexedRecord> avroRecord, Option<Exception> exception) {
    Option recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata();
    if (exception.isPresent() && exception.get() instanceof Throwable) {
        // Not throwing exception from here, since we don't want to fail the entire job for a single record
        writeStatus.markFailure(record, exception.get(), recordMetadata);
        LOG.error("Error writing record " + record, exception.get());
    } else {
        write(record, avroRecord);
    }
}
Also used : Option(org.apache.hudi.common.util.Option) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)
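The handle above expects its caller to have already converted any per-record failure into an Option<Exception> instead of letting it propagate. A minimal caller-side sketch, assuming a hypothetical toAvro conversion helper and a handle variable of type HoodieWriteHandle (neither appears in the snippet above):

// Hypothetical caller: capture the conversion failure as Option<Exception> so that
// HoodieWriteHandle.write marks only this record as failed instead of failing the job.
Option<IndexedRecord> avroRecord = Option.empty();
Option<Exception> failure = Option.empty();
try {
    // toAvro is a placeholder for whatever produces the Avro payload, not Hudi API
    avroRecord = Option.of(toAvro(record));
} catch (Exception e) {
    failure = Option.of(e);
}
handle.write(record, avroRecord, failure);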

Example 22 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class TestFlinkHoodieBloomIndex method testCheckExists.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have some records to be tagged (two different partitions)
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    // record key same as recordStr2
    String recordStr4 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
    HoodieRecord record1 = new HoodieAvroRecord(key1, rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
    HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
    HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4);
    List<HoodieKey> keys = asList(key1, key2, key3, key4);
    // Also create the metadata and config
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
    // Let's tag
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    List<HoodieRecord> toTagRecords = new ArrayList<>();
    toTagRecords.add(new HoodieAvroRecord(record4.getKey(), null));
    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable);
    Map<HoodieKey, Option<Pair<String, String>>> recordLocations = new HashMap<>();
    for (HoodieRecord taggedRecord : taggedRecords) {
        recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown()
                ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId()))
                : Option.empty());
    }
    // Should not find any files
    for (Option<Pair<String, String>> record : recordLocations.values()) {
        assertFalse(record.isPresent());
    }
    // We create three base files, each having one record (two different partitions)
    String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1);
    String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2);
    String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4);
    // We do the tag again
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
    List<HoodieRecord> toTagRecords1 = new ArrayList<>();
    for (HoodieKey key : keys) {
        toTagRecords1.add(new HoodieAvroRecord(key, null));
    }
    taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable);
    recordLocations.clear();
    for (HoodieRecord taggedRecord : taggedRecords) {
        recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown()
                ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId()))
                : Option.empty());
    }
    // Check results
    for (Map.Entry<HoodieKey, Option<Pair<String, String>>> record : recordLocations.entrySet()) {
        if (record.getKey().getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
            assertTrue(record.getValue().isPresent());
            assertEquals(fileId1, record.getValue().get().getRight());
        } else if (record.getKey().getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
            assertTrue(record.getValue().isPresent());
            if (record.getKey().getPartitionPath().equals("2015/01/31")) {
                assertEquals(fileId3, record.getValue().get().getRight());
            } else {
                assertEquals(fileId2, record.getValue().get().getRight());
            }
        } else if (record.getKey().getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
            assertFalse(record.getValue().isPresent());
        }
    }
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) Option(org.apache.hudi.common.util.Option) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) HashMap(java.util.HashMap) Map(java.util.Map) HoodieMapPair(org.apache.hudi.common.data.HoodieMapPair) Pair(org.apache.hudi.common.util.collection.Pair) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
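Both tagging loops in this test build the same Option value: a record with a known location becomes Option.of(Pair.of(partitionPath, fileId)), an untagged record becomes Option.empty(). As a sketch only (this helper is not part of the Hudi test), the repeated loop body can be factored out:

// Illustrative helper equivalent to the loop bodies above.
private static Option<Pair<String, String>> locationOf(HoodieRecord taggedRecord) {
    return taggedRecord.isCurrentLocationKnown()
            ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId()))
            : Option.empty();
}

With such a helper, each loop reduces to recordLocations.put(taggedRecord.getKey(), locationOf(taggedRecord)).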

Example 23 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class MultipleSparkJobExecutionStrategy method readRecordsForGroupWithLogs.

/**
 * Read records from base files, apply updates from the delta log files, and convert to an RDD.
 */
private HoodieData<HoodieRecord<T>> readRecordsForGroupWithLogs(JavaSparkContext jsc, List<ClusteringOperation> clusteringOps, String instantTime) {
    HoodieWriteConfig config = getWriteConfig();
    HoodieTable table = getHoodieTable();
    return HoodieJavaRDD.of(jsc.parallelize(clusteringOps, clusteringOps.size()).mapPartitions(clusteringOpsPartition -> {
        List<Iterator<HoodieRecord<T>>> recordIterators = new ArrayList<>();
        clusteringOpsPartition.forEachRemaining(clusteringOp -> {
            long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new SparkTaskContextSupplier(), config);
            LOG.info("MaxMemoryPerCompaction run as part of clustering => " + maxMemoryPerCompaction);
            try {
                Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
                HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
                        .withFileSystem(table.getMetaClient().getFs())
                        .withBasePath(table.getMetaClient().getBasePath())
                        .withLogFilePaths(clusteringOp.getDeltaFilePaths())
                        .withReaderSchema(readerSchema)
                        .withLatestInstantTime(instantTime)
                        .withMaxMemorySizeInBytes(maxMemoryPerCompaction)
                        .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled())
                        .withReverseReader(config.getCompactionReverseLogReadEnabled())
                        .withBufferSize(config.getMaxDFSStreamBufferSize())
                        .withSpillableMapBasePath(config.getSpillableMapBasePath())
                        .withPartition(clusteringOp.getPartitionPath())
                        .build();
                Option<HoodieFileReader> baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath())
                        ? Option.empty()
                        : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath())));
                HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig();
                recordIterators.add(getFileSliceReader(baseFileReader, scanner, readerSchema,
                        tableConfig.getPayloadClass(),
                        tableConfig.getPreCombineField(),
                        tableConfig.populateMetaFields()
                                ? Option.empty()
                                : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp()))));
            } catch (IOException e) {
                throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e);
            }
        });
        return new ConcatenatingIterator<>(recordIterators);
    }));
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) RewriteAvroPayload(org.apache.hudi.common.model.RewriteAvroPayload) ConcatenatingIterator(org.apache.hudi.client.utils.ConcatenatingIterator) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) SparkTaskContextSupplier(org.apache.hudi.client.SparkTaskContextSupplier) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Logger(org.apache.log4j.Logger) RDDSpatialCurveSortPartitioner(org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) PLAN_STRATEGY_SORT_COLUMNS(org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) ClusteringOperation(org.apache.hudi.common.model.ClusteringOperation) Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) KeyGenUtils(org.apache.hudi.keygen.KeyGenUtils) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) CompletableFuture(java.util.concurrent.CompletableFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieFileSliceReader.getFileSliceReader(org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) ClusteringExecutionStrategy(org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy) IndexedRecord(org.apache.avro.generic.IndexedRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) RDDCustomColumnsSortPartitioner(org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) FutureUtils(org.apache.hudi.common.util.FutureUtils) HoodieClusteringException(org.apache.hudi.exception.HoodieClusteringException) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) IOUtils(org.apache.hudi.io.IOUtils) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) 
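The baseFileReader ternary above is the same idiom on the read path: a log-only file slice carries no base file, so the reader is Option.empty(); otherwise the reader is wrapped in Option.of. A hedged sketch of that decision pulled into a helper (the method name is illustrative, not Hudi API; Configuration is org.apache.hadoop.conf.Configuration):

// Illustrative helper mirroring the baseFileReader construction above.
private static Option<HoodieFileReader> openBaseFileReader(Configuration hadoopConf, String dataFilePath) throws IOException {
    // Log-only clustering operations have no base file path, so there is nothing to open.
    return StringUtils.isNullOrEmpty(dataFilePath)
            ? Option.empty()
            : Option.of(HoodieFileReaderFactory.getFileReader(hadoopConf, new Path(dataFilePath)));
}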

Example 24 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class BaseJavaCommitActionExecutor method buildProfile.

protected Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(List<HoodieRecord<T>> inputRecords) {
    HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
    WorkloadStat globalStat = new WorkloadStat();
    Map<Pair<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecords.stream()
            .map(record -> Pair.of(Pair.of(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
            .collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting()));
    for (Map.Entry<Pair<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
        String partitionPath = e.getKey().getLeft();
        Long count = e.getValue();
        Option<HoodieRecordLocation> locOption = e.getKey().getRight();
        if (!partitionPathStatMap.containsKey(partitionPath)) {
            partitionPathStatMap.put(partitionPath, new WorkloadStat());
        }
        if (locOption.isPresent()) {
            // update
            partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
            globalStat.addUpdates(locOption.get(), count);
        } else {
            // insert
            partitionPathStatMap.get(partitionPath).addInserts(count);
            globalStat.addInserts(count);
        }
    }
    return Pair.of(partitionPathStatMap, globalStat);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) CommitUtils(org.apache.hudi.common.util.CommitUtils) HoodieList(org.apache.hudi.common.data.HoodieList) LinkedHashMap(java.util.LinkedHashMap) JavaLazyInsertIterable(org.apache.hudi.execution.JavaLazyInsertIterable) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Duration(java.time.Duration) Map(java.util.Map) HoodieSortedMergeHandle(org.apache.hudi.io.HoodieSortedMergeHandle) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) Path(org.apache.hadoop.fs.Path) HoodieConcatHandle(org.apache.hudi.io.HoodieConcatHandle) WorkloadStat(org.apache.hudi.table.WorkloadStat) LinkedList(java.util.LinkedList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieKey(org.apache.hudi.common.model.HoodieKey) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) WorkloadStat(org.apache.hudi.table.WorkloadStat) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) Pair(org.apache.hudi.common.util.collection.Pair)
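To make the grouping step concrete, here is a small standalone sketch, assuming hudi-common is on the classpath and that Option and Pair compare by value, as the groupingBy in buildProfile already relies on; the partition path and file id are illustrative data, not taken from the method above:

// Sketch: an empty Option groups a record as an insert, a present Option as an update.
List<Pair<String, Option<String>>> partitionAndLocation = Arrays.asList(
        Pair.of("2016/01/31", Option.<String>empty()),       // new record -> insert
        Pair.of("2016/01/31", Option.<String>empty()),       // new record -> insert
        Pair.of("2016/01/31", Option.of("fileId-1")));       // already located -> update
Map<Pair<String, Option<String>>, Long> counts = partitionAndLocation.stream()
        .collect(Collectors.groupingBy(p -> p, Collectors.counting()));
// counts: {("2016/01/31", Option.empty()) -> 2, ("2016/01/31", Option.of("fileId-1")) -> 1},
// which buildProfile would record as 2 inserts and 1 update for that partition.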

Example 25 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class TestHoodieTableFileSystemView method testHoodieTableFileSystemViewWithPendingClustering.

/**
 * create hoodie table like
 * .
 * ├── .hoodie
 * │   ├── .aux
 * │   │   └── .bootstrap
 * │   │       ├── .fileids
 * │   │       └── .partitions
 * │   ├── .temp
 * │   ├── 1.commit
 * │   ├── 1.commit.requested
 * │   ├── 1.inflight
 * │   ├── 2.replacecommit
 * │   ├── 2.replacecommit.inflight
 * │   ├── 2.replacecommit.requested
 * │   ├── 3.commit
 * │   ├── 3.commit.requested
 * │   ├── 3.inflight
 * │   ├── archived
 * │   └── hoodie.properties
 * └── 2020
 *     └── 06
 *         └── 27
 *             ├── 5fe477d2-0150-46d4-833c-1e9cc8da9948_1-0-1_3.parquet
 *             ├── 7e3208c8-fdec-4254-9682-8fff1e51ee8d_1-0-1_2.parquet
 *             ├── e04b0e2d-1467-46b2-8ea6-f4fe950965a5_1-0-1_1.parquet
 *             └── f3936b66-b3db-4fc8-a6d0-b1a7559016e6_1-0-1_1.parquet
 *
 * First, test the fsView APIs with the clustering committed:
 *  1. getLatestBaseFilesBeforeOrOn
 *  2. getBaseFileOn
 *  3. getLatestBaseFilesInRange
 *  4. getAllBaseFiles
 *  5. getLatestBaseFiles
 *
 * Then remove 2.replacecommit, 1.commit, 1.commit.requested, 1.inflight to simulate
 * pending clustering at the earliest position in the active timeline and test these APIs again.
 *
 * @throws IOException
 */
@Test
public void testHoodieTableFileSystemViewWithPendingClustering() throws IOException {
    List<String> latestBaseFilesBeforeOrOn;
    Option<HoodieBaseFile> baseFileOn;
    List<String> latestBaseFilesInRange;
    List<String> allBaseFiles;
    List<String> latestBaseFiles;
    List<String> latestBaseFilesPerPartition;
    String partitionPath = "2020/06/27";
    new File(basePath + "/" + partitionPath).mkdirs();
    HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
    // will create 5 fileIds in the partition.
    // fileId1 and fileId2 will be replaced by fileId3
    // fileId4 and fileId5 will be committed after clustering finishes.
    String fileId1 = UUID.randomUUID().toString();
    String fileId2 = UUID.randomUUID().toString();
    String fileId3 = UUID.randomUUID().toString();
    String fileId4 = UUID.randomUUID().toString();
    String fileId5 = UUID.randomUUID().toString();
    assertFalse(roView.getLatestBaseFiles(partitionPath).anyMatch(dfile -> dfile.getFileId().equals(fileId1) || dfile.getFileId().equals(fileId2) || dfile.getFileId().equals(fileId3) || dfile.getFileId().equals(fileId4) || dfile.getFileId().equals(fileId5)), "No commit, should not find any data file");
    // first insert commit
    String commitTime1 = "1";
    String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1);
    String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2);
    new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile();
    new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile();
    HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1);
    // build writeStats
    HashMap<String, List<String>> partitionToFile1 = new HashMap<>();
    ArrayList<String> files1 = new ArrayList<>();
    files1.add(fileId1);
    files1.add(fileId2);
    partitionToFile1.put(partitionPath, files1);
    List<HoodieWriteStat> writeStats1 = buildWriteStats(partitionToFile1, commitTime1);
    HoodieCommitMetadata commitMetadata1 = CommitUtils.buildMetadata(writeStats1, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION);
    saveAsComplete(commitTimeline, instant1, Option.of(commitMetadata1.toJsonString().getBytes(StandardCharsets.UTF_8)));
    commitTimeline.reload();
    // replace commit
    String commitTime2 = "2";
    String fileName3 = FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId3);
    new File(basePath + "/" + partitionPath + "/" + fileName3).createNewFile();
    HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, commitTime2);
    Map<String, List<String>> partitionToReplaceFileIds = new HashMap<>();
    List<String> replacedFileIds = new ArrayList<>();
    replacedFileIds.add(fileId1);
    replacedFileIds.add(fileId2);
    partitionToReplaceFileIds.put(partitionPath, replacedFileIds);
    HashMap<String, List<String>> partitionToFile2 = new HashMap<>();
    ArrayList<String> files2 = new ArrayList<>();
    files2.add(fileId3);
    partitionToFile2.put(partitionPath, files2);
    List<HoodieWriteStat> writeStats2 = buildWriteStats(partitionToFile2, commitTime2);
    HoodieCommitMetadata commitMetadata2 = CommitUtils.buildMetadata(writeStats2, partitionToReplaceFileIds, Option.empty(), WriteOperationType.INSERT_OVERWRITE, "", HoodieTimeline.REPLACE_COMMIT_ACTION);
    saveAsComplete(commitTimeline, instant2, Option.of(commitMetadata2.toJsonString().getBytes(StandardCharsets.UTF_8)));
    // another insert commit
    String commitTime3 = "3";
    String fileName4 = FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId4);
    new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile();
    HoodieInstant instant3 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime3);
    // build writeStats
    HashMap<String, List<String>> partitionToFile3 = new HashMap<>();
    ArrayList<String> files3 = new ArrayList<>();
    files3.add(fileId4);
    partitionToFile3.put(partitionPath, files3);
    List<HoodieWriteStat> writeStats3 = buildWriteStats(partitionToFile3, commitTime3);
    HoodieCommitMetadata commitMetadata3 = CommitUtils.buildMetadata(writeStats3, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION);
    saveAsComplete(commitTimeline, instant3, Option.of(commitMetadata3.toJsonString().getBytes(StandardCharsets.UTF_8)));
    metaClient.reloadActiveTimeline();
    refreshFsView();
    ArrayList<String> commits = new ArrayList<>();
    commits.add(commitTime1);
    commits.add(commitTime2);
    commits.add(commitTime3);
    // do check
    latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(2, latestBaseFilesBeforeOrOn.size());
    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId3));
    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4));
    // could see fileId3 because clustering is committed.
    baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3);
    assertTrue(baseFileOn.isPresent());
    assertEquals(baseFileOn.get().getFileId(), fileId3);
    latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(2, latestBaseFilesInRange.size());
    assertTrue(latestBaseFilesInRange.contains(fileId3));
    assertTrue(latestBaseFilesInRange.contains(fileId4));
    allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(2, allBaseFiles.size());
    assertTrue(allBaseFiles.contains(fileId3));
    assertTrue(allBaseFiles.contains(fileId4));
    // could see fileId3 because clustering is committed.
    latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(2, latestBaseFiles.size());
    assertTrue(latestBaseFiles.contains(fileId3));
    assertTrue(latestBaseFiles.contains(fileId4));
    // could see fileId3 because clustering is committed.
    latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(2, latestBaseFilesPerPartition.size());
    assertTrue(latestBaseFilesPerPartition.contains(fileId3));
    assertTrue(latestBaseFilesPerPartition.contains(fileId4));
    HoodieWrapperFileSystem fs = metaClient.getFs();
    fs.delete(new Path(basePath + "/.hoodie", "1.commit"), false);
    fs.delete(new Path(basePath + "/.hoodie", "1.inflight"), false);
    fs.delete(new Path(basePath + "/.hoodie", "1.commit.requested"), false);
    fs.delete(new Path(basePath + "/.hoodie", "2.replacecommit"), false);
    metaClient.reloadActiveTimeline();
    refreshFsView();
    // do check after delete some commit file
    latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(3, latestBaseFilesBeforeOrOn.size());
    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId1));
    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId2));
    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4));
    // couldn't see fileId3 because clustering is not committed.
    baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3);
    assertFalse(baseFileOn.isPresent());
    latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(3, latestBaseFilesInRange.size());
    assertTrue(latestBaseFilesInRange.contains(fileId1));
    assertTrue(latestBaseFilesInRange.contains(fileId2));
    assertTrue(latestBaseFilesInRange.contains(fileId4));
    allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(3, allBaseFiles.size());
    assertTrue(allBaseFiles.contains(fileId1));
    assertTrue(allBaseFiles.contains(fileId2));
    assertTrue(allBaseFiles.contains(fileId4));
    // couldn't see fileId3 because clustering is not committed.
    latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(3, latestBaseFiles.size());
    assertTrue(latestBaseFiles.contains(fileId1));
    assertTrue(latestBaseFiles.contains(fileId2));
    assertTrue(latestBaseFiles.contains(fileId4));
    // couldn't see fileId3 because clustering is not committed.
    latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
    assertEquals(3, latestBaseFilesPerPartition.size());
    assertTrue(latestBaseFilesPerPartition.contains(fileId1));
    assertTrue(latestBaseFilesPerPartition.contains(fileId2));
    assertTrue(latestBaseFilesPerPartition.contains(fileId4));
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) Date(java.util.Date) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) BaseFile(org.apache.hudi.common.model.BaseFile) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) MethodSource(org.junit.jupiter.params.provider.MethodSource) HoodiePath(org.apache.hudi.avro.model.HoodiePath) HoodieFSPermission(org.apache.hudi.avro.model.HoodieFSPermission) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) ClusteringUtils(org.apache.hudi.common.util.ClusteringUtils) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSlice(org.apache.hudi.common.model.FileSlice) IndexWriter(org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) CommitUtils(org.apache.hudi.common.util.CommitUtils) FsAction(org.apache.hadoop.fs.permission.FsAction) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HFileBootstrapIndex(org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) 
LogManager(org.apache.log4j.LogManager) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
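Every fsView assertion in this example ultimately reduces to whether an Option is present or empty. A short hedged sketch of how non-test code would typically unwrap the same getBaseFileOn result, using only the Option methods already shown above and assuming a log4j LOG field like the one implied by the imports:

// Sketch: unwrap Option<HoodieBaseFile> explicitly instead of calling get() unconditionally.
Option<HoodieBaseFile> maybeBaseFile = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3);
if (maybeBaseFile.isPresent()) {
    // The replacecommit completed, so the clustered base file is visible.
    LOG.info("Base file for " + fileId3 + ": " + maybeBaseFile.get().getFileId());
} else {
    // The clustering instant is still pending (or its completed commit was removed).
    LOG.info("No committed base file for " + fileId3 + " at instant " + commitTime2);
}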

Aggregations

Option (org.apache.hudi.common.util.Option)105 List (java.util.List)84 IOException (java.io.IOException)70 Collectors (java.util.stream.Collectors)69 Map (java.util.Map)67 ArrayList (java.util.ArrayList)61 Path (org.apache.hadoop.fs.Path)59 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)59 Pair (org.apache.hudi.common.util.collection.Pair)59 HashMap (java.util.HashMap)58 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)58 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)56 LogManager (org.apache.log4j.LogManager)54 Logger (org.apache.log4j.Logger)54 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)53 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)46 HoodieIOException (org.apache.hudi.exception.HoodieIOException)44 Arrays (java.util.Arrays)43 FSUtils (org.apache.hudi.common.fs.FSUtils)43 Collections (java.util.Collections)39