
Example 11 with SparkRDDWriteClient

Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache, in class TestUpgradeDowngrade, method testUpgradeOneToTwo.

@ParameterizedTest
@EnumSource(value = HoodieTableType.class)
public void testUpgradeOneToTwo(HoodieTableType tableType) throws IOException {
    // init config, table and client.
    Map<String, String> params = new HashMap<>();
    addNewTableParamsToProps(params);
    if (tableType == HoodieTableType.MERGE_ON_READ) {
        params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
    }
    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params).build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    // Write inserts
    doInsert(client);
    // downgrade table props
    downgradeTableConfigsFromTwoToOne(cfg);
    // perform upgrade
    new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.TWO, null);
    // verify hoodie.table.version got upgraded
    metaClient = HoodieTableMetaClient.builder()
            .setConf(context.getHadoopConf().get())
            .setBasePath(cfg.getBasePath())
            .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion())))
            .build();
    assertEquals(HoodieTableVersion.TWO.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode());
    assertTableVersionFromPropertyFile(HoodieTableVersion.TWO);
    // verify table props
    assertTableProps(cfg);
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HashMap (java.util.HashMap), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), TimelineLayoutVersion (org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion), EnumSource (org.junit.jupiter.params.provider.EnumSource), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)
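
For reference, the upgrade entry point this test exercises can also be invoked on its own. A minimal sketch, assuming metaClient, cfg, and context are initialized as in the test body; the second argument to run is the instant time, which this test passes as null:

// Illustrative sketch only; names mirror the test above.
new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance())
        .run(HoodieTableVersion.TWO, null);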

Example 12 with SparkRDDWriteClient

Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache, in class TestUpgradeDowngrade, method testDowngrade.

@ParameterizedTest(name = TEST_NAME_WITH_DOWNGRADE_PARAMS)
@MethodSource("downGradeConfigParams")
public void testDowngrade(boolean deletePartialMarkerFiles, HoodieTableType tableType, HoodieTableVersion fromVersion) throws IOException {
    MarkerType markerType = fromVersion == HoodieTableVersion.TWO ? MarkerType.TIMELINE_SERVER_BASED : MarkerType.DIRECT;
    // init config, table and client.
    Map<String, String> params = new HashMap<>();
    if (fromVersion == HoodieTableVersion.TWO) {
        addNewTableParamsToProps(params);
    }
    if (tableType == HoodieTableType.MERGE_ON_READ) {
        params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
    }
    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(true).withMarkersType(markerType.name()).withProps(params).build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    if (fromVersion == HoodieTableVersion.TWO) {
        // set table configs
        HoodieTableConfig tableConfig = metaClient.getTableConfig();
        tableConfig.setValue(HoodieTableConfig.NAME, cfg.getTableName());
        tableConfig.setValue(HoodieTableConfig.PARTITION_FIELDS, cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()));
        tableConfig.setValue(HoodieTableConfig.RECORDKEY_FIELDS, cfg.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()));
        tableConfig.setValue(BASE_FILE_FORMAT, cfg.getString(BASE_FILE_FORMAT));
    }
    // Prepare data: make two commits, leaving the second one uncommitted.
    List<FileSlice> firstPartitionCommit2FileSlices = new ArrayList<>();
    List<FileSlice> secondPartitionCommit2FileSlices = new ArrayList<>();
    Pair<List<HoodieRecord>, List<HoodieRecord>> inputRecords = twoUpsertCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, client, false);
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    HoodieInstant commitInstant = table.getPendingCommitTimeline().lastInstant().get();
    // Delete one of the marker files of the second commit, if requested.
    WriteMarkers writeMarkers = WriteMarkersFactory.get(markerType, table, commitInstant.getTimestamp());
    List<String> markerPaths = new ArrayList<>(writeMarkers.allMarkerFilePaths());
    if (deletePartialMarkerFiles) {
        String toDeleteMarkerFile = markerPaths.get(0);
        table.getMetaClient().getFs().delete(new Path(table.getMetaClient().getTempFolderPath() + "/" + commitInstant.getTimestamp() + "/" + toDeleteMarkerFile));
        markerPaths.remove(toDeleteMarkerFile);
    }
    // set hoodie.table.version to fromVersion in hoodie.properties file
    HoodieTableVersion toVersion = HoodieTableVersion.ZERO;
    if (fromVersion == HoodieTableVersion.TWO) {
        prepForDowngradeFromTwoToOne();
        toVersion = HoodieTableVersion.ONE;
    } else {
        prepForDowngradeFromOneToZero();
    }
    // downgrade should be performed. all marker files should be deleted
    new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(toVersion, null);
    if (fromVersion == HoodieTableVersion.TWO) {
        // assert marker files
        assertMarkerFilesForDowngrade(table, commitInstant, toVersion == HoodieTableVersion.ONE);
    }
    // verify hoodie.table.version got downgraded
    metaClient = HoodieTableMetaClient.builder()
            .setConf(context.getHadoopConf().get())
            .setBasePath(cfg.getBasePath())
            .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion())))
            .build();
    assertEquals(toVersion.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode());
    assertTableVersionFromPropertyFile(toVersion);
    // trigger 3rd commit with marker-based rollback disabled.
    /* HUDI-2310
    List<HoodieRecord> thirdBatch = triggerCommit("003", tableType, false);

    // Check that the dataset has records only from the 1st and 3rd commits, since the 2nd is expected to be rolled back.
    assertRows(inputRecords.getKey(), thirdBatch);
    */
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), Path (org.apache.hadoop.fs.Path), SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HashMap (java.util.HashMap), FileSlice (org.apache.hudi.common.model.FileSlice), ArrayList (java.util.ArrayList), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieTableConfig (org.apache.hudi.common.table.HoodieTableConfig), HoodieTable (org.apache.hudi.table.HoodieTable), WriteMarkers (org.apache.hudi.table.marker.WriteMarkers), MarkerType (org.apache.hudi.common.table.marker.MarkerType), TimelineLayoutVersion (org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion), List (java.util.List), HoodieTableVersion (org.apache.hudi.common.table.HoodieTableVersion), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), MethodSource (org.junit.jupiter.params.provider.MethodSource)
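
The version-dependent choices at the top of the test are easy to miss inside the branching. A compact restatement, illustrative only and using the same names as above: tables at version TWO use timeline-server-based markers and downgrade to ONE, while older tables use direct markers and downgrade to ZERO.

// Illustrative restatement of the test's version-dependent setup.
MarkerType markerType;
HoodieTableVersion toVersion;
if (fromVersion == HoodieTableVersion.TWO) {
    markerType = MarkerType.TIMELINE_SERVER_BASED;
    toVersion = HoodieTableVersion.ONE;
} else {
    markerType = MarkerType.DIRECT;
    toVersion = HoodieTableVersion.ZERO;
}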

Example 13 with SparkRDDWriteClient

Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache, in class HoodieClientTestHarness, method getHoodieWriteClient.

public SparkRDDWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) {
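    // Close and drop any client left over from a previous test before creating a fresh one.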
    if (writeClient != null) {
        writeClient.close();
        writeClient = null;
    }
    writeClient = new SparkRDDWriteClient(context, cfg);
    return writeClient;
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient)
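
Since SparkRDDWriteClient is AutoCloseable and holds resources such as the embedded timeline server, callers typically wrap it in try-with-resources, as Examples 14 and 15 below do. A minimal usage sketch, assuming cfg and the harness method above:

// Illustrative sketch only; startCommit and upsert are standard
// write-client calls, shown without the surrounding test setup.
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    String instantTime = client.startCommit();
    // ... build a JavaRDD<HoodieRecord> and call client.upsert(records, instantTime)
}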

Example 14 with SparkRDDWriteClient

Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache, in class TestHoodieSparkMergeOnReadTableClustering, method testClusteringWithNoBaseFiles.

@ParameterizedTest
@ValueSource(booleans = { true, false })
void testClusteringWithNoBaseFiles(boolean doUpdates) throws Exception {
    // Set a low compaction small-file size to generate more file groups.
    HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder()
            .forTable("test-trip-table")
            .withPath(basePath())
            .withSchema(TRIP_EXAMPLE_SCHEMA)
            .withParallelism(2, 2)
            .withDeleteParallelism(2)
            .withAutoCommit(true)
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                .compactionSmallFileSize(10L)
                .withInlineCompaction(false)
                .withMaxNumDeltaCommitsBeforeCompaction(1)
                .build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                .hfileMaxFileSize(1024 * 1024 * 1024)
                .parquetMaxFileSize(1024 * 1024 * 1024)
                .build())
            .withEmbeddedTimelineServerEnabled(true)
            .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
                .withEnableBackupForRemoteFileSystemView(false)
                .build())
            .withIndexConfig(HoodieIndexConfig.newBuilder()
                .withIndexType(HoodieIndex.IndexType.INMEMORY)
                .build())
            .withClusteringConfig(HoodieClusteringConfig.newBuilder()
                .withClusteringMaxNumGroups(10)
                .withClusteringTargetPartitions(0)
                .withInlineClustering(true)
                .withInlineClusteringNumCommits(1)
                .build())
            .withRollbackUsingMarkers(false);
    HoodieWriteConfig cfg = cfgBuilder.build();
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, cfg.getProps());
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        // test 2 inserts
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 400);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records.subList(0, 200), client, cfg, newCommitTime);
        assertFalse(dataFiles.findAny().isPresent(), "should not have any base files");
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        dataFiles = insertRecordsToMORTable(metaClient, records.subList(200, 400), client, cfg, newCommitTime);
        assertFalse(dataFiles.findAny().isPresent(), "should not have any base files");
        // run updates
        if (doUpdates) {
            newCommitTime = "003";
            client.startCommitWithTime(newCommitTime);
            records = dataGen.generateUpdates(newCommitTime, 100);
            updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        }
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allBaseFiles = listAllBaseFilesInPath(hoodieTable);
        // expect 0 base files for each partition
        assertEquals(0, allBaseFiles.length);
        String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        // verify log files are included in clustering plan for each partition.
        assertEquals(dataGen.getPartitionPaths().length, hoodieTable.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getLeft).count());
        // do the clustering and validate
        doClusteringAndValidate(client, clusteringCommitTime, metaClient, cfg, dataGen);
    }
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile), FileStatus (org.apache.hadoop.fs.FileStatus), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), Pair (org.apache.hudi.common.util.collection.Pair), ValueSource (org.junit.jupiter.params.provider.ValueSource), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)
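
The last two steps of the test are the core inline-clustering flow: schedule a plan, then execute it. doClusteringAndValidate is a test helper, so the sketch below only suggests what the underlying client calls look like; the execution call is an assumption based on how these APIs are used elsewhere in Hudi tests.

// Hedged sketch: schedule a clustering plan, then execute it.
Option<String> clusteringInstant = client.scheduleClustering(Option.empty());
if (clusteringInstant.isPresent()) {
    // The boolean asks the client to complete (commit) the replacecommit.
    client.cluster(clusteringInstant.get(), true);
}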

Example 15 with SparkRDDWriteClient

Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache, in class TestHoodieSparkMergeOnReadTableClustering, method testClustering.

@ParameterizedTest
@MethodSource
void testClustering(boolean doUpdates, boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception {
    // Set a low compaction small-file size to generate more file groups.
    HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder()
            .forTable("test-trip-table")
            .withPath(basePath())
            .withSchema(TRIP_EXAMPLE_SCHEMA)
            .withParallelism(2, 2)
            .withDeleteParallelism(2)
            .withAutoCommit(true)
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                .compactionSmallFileSize(10L)
                .withInlineCompaction(false)
                .withMaxNumDeltaCommitsBeforeCompaction(1)
                .build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                .hfileMaxFileSize(1024 * 1024 * 1024)
                .parquetMaxFileSize(1024 * 1024 * 1024)
                .build())
            .withEmbeddedTimelineServerEnabled(true)
            .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
                .withEnableBackupForRemoteFileSystemView(false)
                .build())
            .withIndexConfig(HoodieIndexConfig.newBuilder()
                .withIndexType(HoodieIndex.IndexType.BLOOM)
                .build())
            .withClusteringConfig(HoodieClusteringConfig.newBuilder()
                .withClusteringMaxNumGroups(10)
                .withClusteringTargetPartitions(0)
                .withInlineClustering(true)
                .withInlineClusteringNumCommits(1)
                .withPreserveHoodieCommitMetadata(preserveCommitMetadata)
                .build())
            .withRollbackUsingMarkers(false);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, cfg.getProps());
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 400);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records.subList(0, 200), client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (more inserts to create new files)
         */
        // The small-file size is already set low to force these inserts into new files.
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        dataFiles = insertRecordsToMORTable(metaClient, records.subList(200, 400), client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        if (doUpdates) {
            /*
             * Write 3 (updates)
             */
            newCommitTime = "003";
            client.startCommitWithTime(newCommitTime);
            records = dataGen.generateUpdates(newCommitTime, 100);
            updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        }
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        // expect 2 base files for each partition
        assertEquals(dataGen.getPartitionPaths().length * 2, allFiles.length);
        String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        // verify all files are included in clustering plan.
        assertEquals(allFiles.length, hoodieTable.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getLeft).count());
        // Do the clustering and validate
        doClusteringAndValidate(client, clusteringCommitTime, metaClient, cfg, dataGen);
    }
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile), FileStatus (org.apache.hadoop.fs.FileStatus), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), Pair (org.apache.hudi.common.util.collection.Pair), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), MethodSource (org.junit.jupiter.params.provider.MethodSource)

Aggregations

SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 143 uses
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 127 uses
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 113 uses
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 86 uses
Test (org.junit.jupiter.api.Test): 80 uses
WriteStatus (org.apache.hudi.client.WriteStatus): 76 uses
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 74 uses
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 61 uses
List (java.util.List): 59 uses
ArrayList (java.util.ArrayList): 51 uses
HoodieTable (org.apache.hudi.table.HoodieTable): 51 uses
Path (org.apache.hadoop.fs.Path): 47 uses
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 47 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 47 uses
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 44 uses
Collectors (java.util.stream.Collectors): 43 uses
Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals): 43 uses
HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig): 42 uses
HashMap (java.util.HashMap): 41 uses
Properties (java.util.Properties): 41 uses