Example 56 with HoodieWriteStat

use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.

From the class HiveTestUtil, the method createPartition:

private static HoodieCommitMetadata createPartition(String partitionPath, boolean isParquetSchemaSimple, boolean useSchemaFromCommitMetadata, String instantTime) throws IOException, URISyntaxException {
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
    partPath = fileSystem.makeQualified(partPath); // makeQualified returns a new qualified Path; keep the result
    fileSystem.mkdirs(partPath);
    List<HoodieWriteStat> writeStats = createTestData(partPath, isParquetSchemaSimple, instantTime);
    writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
    addSchemaToCommitMetadata(commitMetadata, isParquetSchemaSimple, useSchemaFromCommitMetadata);
    return commitMetadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Path(org.apache.hadoop.fs.Path) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat)
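
For orientation, the pattern behind this helper is small: each HoodieWriteStat describes one file written during a commit, and HoodieCommitMetadata groups those stats by partition before being persisted to the timeline. A minimal, hedged sketch (the file id, path, and counts below are illustrative placeholders, not values from the project):

HoodieWriteStat stat = new HoodieWriteStat();
// hypothetical values, for illustration only
stat.setFileId("f1-0000");
stat.setPath("2016/03/15/f1-0000_1-0-1_100.parquet"); // path relative to the table base path
stat.setNumWrites(100);
stat.setTotalWriteBytes(1024);
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
// stats are keyed by partition path inside the metadata
metadata.addWriteStat("2016/03/15", stat);
// the helper above then serializes this (we believe via HoodieCommitMetadata#toJsonString) into a commit file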

Example 57 with HoodieWriteStat

use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.

From the class HiveTestUtil, the method createCOWTableWithSchema:

public static void createCOWTableWithSchema(String instantTime, String schemaFileName) throws IOException, URISyntaxException {
    Path path = new Path(hiveSyncConfig.basePath);
    FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
    HoodieTableMetaClient.withPropertyBuilder()
        .setTableType(HoodieTableType.COPY_ON_WRITE)
        .setTableName(hiveSyncConfig.tableName)
        .setPayloadClass(HoodieAvroPayload.class)
        .initTable(configuration, hiveSyncConfig.basePath);
    boolean result = fileSystem.mkdirs(path);
    checkResult(result);
    ZonedDateTime dateTime = ZonedDateTime.now().truncatedTo(ChronoUnit.DAYS);
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    String partitionPath = dateTime.format(dtfOut);
    Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
    partPath = fileSystem.makeQualified(partPath); // makeQualified returns a new qualified Path; keep the result
    fileSystem.mkdirs(partPath);
    List<HoodieWriteStat> writeStats = new ArrayList<>();
    String fileId = UUID.randomUUID().toString();
    Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeDataFileName(instantTime, "1-0-1", fileId));
    Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, schemaFileName);
    generateParquetDataWithSchema(filePath, schema);
    HoodieWriteStat writeStat = new HoodieWriteStat();
    writeStat.setFileId(fileId);
    writeStat.setPath(filePath.toString());
    writeStats.add(writeStat);
    writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
    commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schema.toString());
    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    createCommitFile(commitMetadata, instantTime, hiveSyncConfig.basePath);
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) ZonedDateTime(java.time.ZonedDateTime) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload)
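
A note on the file naming above: FSUtils.makeDataFileName(instantTime, writeToken, fileId) assembles the base-file name from those three parts. A hedged sketch of what it produces (the exact format string lives in FSUtils; this mirrors the layout that FSUtils.getCommitTime relies on in Example 58):

String fileId = UUID.randomUUID().toString();
String writeToken = "1-0-1"; // taskPartitionId-stageId-attemptId
String instantTime = "100";  // illustrative commit time
String name = FSUtils.makeDataFileName(instantTime, writeToken, fileId);
// expected shape (assumed): <fileId>_<writeToken>_<instantTime>.parquet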

Example 58 with HoodieWriteStat

use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.

From the class TestCleaner, the method testInsertAndCleanByVersions:

/**
 * Test helper for the clean-by-versions logic, exercised through the HoodieWriteClient API.
 *
 * @param insertFn Insert API to be tested
 * @param upsertFn Upsert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
 *        record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
 * @throws Exception in case of errors
 */
private void testInsertAndCleanByVersions(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI) throws Exception {
    // keep up to 2 versions of each file
    int maxVersions = 2;
    HoodieWriteConfig cfg = getConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
            .retainFileVersions(maxVersions)
            .build())
        .withParallelism(1, 1)
        .withBulkInsertParallelism(1)
        .withFinalizeWriteParallelism(1)
        .withDeleteParallelism(1)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
            .withConsistencyCheckEnabled(true)
            .build())
        .build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
        final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
        insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS);
        Map<HoodieFileGroupId, FileSlice> compactionFileIdToLatestFileSlice = new HashMap<>();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTable table = HoodieSparkTable.create(getConfig(), context, metaClient);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            TableFileSystemView fsView = table.getFileSystemView();
            Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> {
                fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
                return true;
            }));
            if (added.isPresent()) {
                // Select only one file-group for compaction
                break;
            }
        }
        // Create workload with selected file-slices
        List<Pair<String, FileSlice>> partitionFileSlicePairs = compactionFileIdToLatestFileSlice.entrySet().stream().map(e -> Pair.of(e.getKey().getPartitionPath(), e.getValue())).collect(Collectors.toList());
        HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicePairs, Option.empty(), Option.empty());
        List<String> instantTimes = makeIncrementalCommitTimes(9, 1, 10);
        String compactionTime = instantTimes.get(0);
        table.getActiveTimeline().saveToCompactionRequested(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime), TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
        instantTimes = instantTimes.subList(1, instantTimes.size());
        // Use the remaining instant times for the upsert rounds below.
        for (String newInstantTime : instantTimes) {
            try {
                client.startCommitWithTime(newInstantTime);
                List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
                List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
                // Verify there are no errors
                assertNoWriteErrors(statuses);
                metaClient = HoodieTableMetaClient.reload(metaClient);
                table = HoodieSparkTable.create(getConfig(), context, metaClient);
                HoodieTimeline timeline = table.getMetaClient().getCommitsTimeline();
                TableFileSystemView fsView = table.getFileSystemView();
                // For each partition, verify that cleaning retained only the expected file versions
                for (String partitionPath : dataGen.getPartitionPaths()) {
                    // compute all the versions of all files, from time 0
                    HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
                    for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
                        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
                        for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
                            if (!fileIdToVersions.containsKey(wstat.getFileId())) {
                                fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
                            }
                            fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
                        }
                    }
                    List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
                    for (HoodieFileGroup fileGroup : fileGroups) {
                        if (compactionFileIdToLatestFileSlice.containsKey(fileGroup.getFileGroupId())) {
                            // Ensure latest file-slice selected for compaction is retained
                            Option<HoodieBaseFile> dataFileForCompactionPresent = Option.fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> {
                                return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime().equals(df.getCommitTime());
                            }).findAny());
                            assertTrue(dataFileForCompactionPresent.isPresent(), "Data File selected for compaction is retained");
                        } else {
                            // this file group must retain no more than maxVersions base files
                            String fileId = fileGroup.getFileGroupId().getFileId();
                            List<HoodieBaseFile> dataFiles = fileGroup.getAllBaseFiles().collect(Collectors.toList());
                            assertTrue(dataFiles.size() <= maxVersions, "fileId " + fileId + " has more than " + maxVersions + " versions");
                            // Each file has the latest N versions (i.e., cleaning gets rid of older versions)
                            List<String> committedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
                            for (int i = 0; i < dataFiles.size(); i++) {
                                assertEquals((dataFiles.get(i)).getCommitTime(), committedVersions.get(committedVersions.size() - 1 - i), "File " + fileId + " does not have latest versions on commits " + committedVersions);
                            }
                            }
                        }
                    }
                }
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        }
    }
}
Also used : Arrays(java.util.Arrays) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanPlanner(org.apache.hudi.table.action.clean.CleanPlanner) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Awaitility.await(org.awaitility.Awaitility.await) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) StandardCharsets(java.nio.charset.StandardCharsets) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) CleanPlanMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) CleanPlanV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) StringUtils(org.apache.hudi.common.util.StringUtils) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTestCommitGenerator.getBaseFilename(org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Paths(java.nio.file.Paths) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieCleanPartitionMetadata(org.apache.hudi.avro.model.HoodieCleanPartitionMetadata) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) 
CollectionUtils(org.apache.hudi.common.util.CollectionUtils) HoodieTestTable.makeIncrementalCommitTimes(org.apache.hudi.common.testutils.HoodieTestTable.makeIncrementalCommitTimes) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) MethodSource(org.junit.jupiter.params.provider.MethodSource) IOType(org.apache.hudi.common.model.IOType) Predicate(java.util.function.Predicate) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Tuple3(scala.Tuple3) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) Test(org.junit.jupiter.api.Test) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) CleanMetadataMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) DEFAULT_PARTITION_PATHS(org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HashSet(java.util.HashSet) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) TestBootstrapIndex(org.apache.hudi.common.bootstrap.TestBootstrapIndex) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) TimeUnit(java.util.concurrent.TimeUnit) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieTestTable.makeNewCommitTime(org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkHoodieIndexFactory(org.apache.hudi.index.SparkHoodieIndexFactory) HoodieSliceInfo(org.apache.hudi.avro.model.HoodieSliceInfo) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
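
If you only need the cleaner setting that drives this test, it is the compaction-config fragment at the top of the method. Restated on its own (every call here appears in the example above; nothing new is introduced):

HoodieWriteConfig cfg = getConfigBuilder()
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) // clean by version count, not by commits
        .retainFileVersions(2) // keep at most the two latest versions of each file
        .build())
    .build();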

Example 59 with HoodieWriteStat

use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.

From the class TestUpsertPartitioner, the method generateCommitMetadataWith:

private static HoodieCommitMetadata generateCommitMetadataWith(int totalRecordsWritten, int totalBytesWritten) {
    List<HoodieWriteStat> fakeHoodieWriteStats = generateCommitStatWith(totalRecordsWritten, totalBytesWritten);
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    fakeHoodieWriteStats.forEach(stat -> commitMetadata.addWriteStat(stat.getPartitionPath(), stat));
    return commitMetadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieTestUtils.generateFakeHoodieWriteStat(org.apache.hudi.common.testutils.HoodieTestUtils.generateFakeHoodieWriteStat)
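
These fake metadata objects feed the partitioner's record-sizing logic. A hedged usage sketch (fetchTotalBytesWritten and fetchTotalRecordsWritten are the HoodieCommitMetadata accessors we believe apply here; the numbers are illustrative):

HoodieCommitMetadata metadata = generateCommitMetadataWith(1000, 100 * 1024);
long totalBytes = metadata.fetchTotalBytesWritten();     // 102400, summed across all write stats
long totalRecords = metadata.fetchTotalRecordsWritten(); // 1000
long avgRecordSize = totalBytes / Math.max(totalRecords, 1); // ~102 bytes per record; guard against divide-by-zero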

Example 60 with HoodieWriteStat

use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.

From the class TestUpsertPartitioner, the method generateCommitStatWith:

private static List<HoodieWriteStat> generateCommitStatWith(int totalRecordsWritten, int totalBytesWritten) {
    List<HoodieWriteStat> writeStatsList = generateFakeHoodieWriteStat(5);
    // zero out record and byte stats on every entry except the last.
    for (int i = 0; i < writeStatsList.size() - 1; i++) {
        HoodieWriteStat writeStat = writeStatsList.get(i);
        writeStat.setNumWrites(0);
        writeStat.setTotalWriteBytes(0);
    }
    HoodieWriteStat lastWriteStat = writeStatsList.get(writeStatsList.size() - 1);
    lastWriteStat.setTotalWriteBytes(totalBytesWritten);
    lastWriteStat.setNumWrites(totalRecordsWritten);
    return writeStatsList;
}
Also used : HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieTestUtils.generateFakeHoodieWriteStat(org.apache.hudi.common.testutils.HoodieTestUtils.generateFakeHoodieWriteStat)
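
A quick sanity check of this helper, as a hedged usage sketch (the getter names are assumed to mirror the setters used above):

List<HoodieWriteStat> stats = generateCommitStatWith(1000, 2048);
for (int i = 0; i < stats.size() - 1; i++) {
    // all but the last entry were zeroed out
    assertEquals(0, stats.get(i).getNumWrites());
    assertEquals(0, stats.get(i).getTotalWriteBytes());
}
// the last entry carries the full totals
assertEquals(1000, stats.get(stats.size() - 1).getNumWrites());
assertEquals(2048, stats.get(stats.size() - 1).getTotalWriteBytes());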

Aggregations

HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat): 74 usages
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 42 usages
List (java.util.List): 38 usages
ArrayList (java.util.ArrayList): 33 usages
HashMap (java.util.HashMap): 32 usages
Map (java.util.Map): 32 usages
Path (org.apache.hadoop.fs.Path): 28 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 24 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 23 usages
IOException (java.io.IOException): 22 usages
Option (org.apache.hudi.common.util.Option): 19 usages
Collectors (java.util.stream.Collectors): 18 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 18 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 17 usages
HoodieReplaceCommitMetadata (org.apache.hudi.common.model.HoodieReplaceCommitMetadata): 17 usages
LogManager (org.apache.log4j.LogManager): 16 usages
Logger (org.apache.log4j.Logger): 16 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 15 usages
FileSlice (org.apache.hudi.common.model.FileSlice): 14 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 14 usages