Examples with SparkRDDWriteClient - org.apache.hudi.client.SparkRDDWriteClient

Example 6 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieMergeOnReadTable method testReleaseResource.

@Test
public void testReleaseResource() throws Exception {
    HoodieWriteConfig.Builder builder = getConfigBuilder(true);
    builder.withReleaseResourceEnabled(true);
    builder.withAutoCommit(false);
    /**
     * Write 1 (test when RELEASE_RESOURCE_ENABLE is true)
     */
    try (SparkRDDWriteClient client = getHoodieWriteClient(builder.build())) {
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        writeRecords.persist(StorageLevel.MEMORY_AND_DISK());
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        client.commitStats(newCommitTime, statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType());
        assertEquals(spark().sparkContext().persistentRdds().size(), 0);
    }
    builder.withReleaseResourceEnabled(false);
    /**
     * Write 2 (test when RELEASE_RESOURCE_ENABLE is false)
     */
    try (SparkRDDWriteClient client = getHoodieWriteClient(builder.build())) {
        String newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        writeRecords.persist(StorageLevel.MEMORY_AND_DISK());
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        client.commitStats(newCommitTime, statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType());
        assertTrue(spark().sparkContext().persistentRdds().size() > 0);
    }
}

Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 7 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestCopyOnWriteActionExecutor method testUpdateRecords.

// TODO (weiy): Add testcases for crossing file writing.
@ParameterizedTest
@MethodSource("indexType")
public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception {
    // Prepare the AvroParquetIO
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withProps(makeIndexConfig(indexType)).build();
    String firstCommitTime = makeNewCommitTime();
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    writeClient.startCommitWithTime(firstCommitTime);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    String partitionPath = "2016/01/31";
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    // Get some records belong to the same partition (2016/01/31)
    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
    List<HoodieRecord> records = new ArrayList<>();
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
    // Insert new records
    final HoodieSparkCopyOnWriteTable cowTable = table;
    writeClient.insert(jsc.parallelize(records, 1), firstCommitTime);
    FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
    assertEquals(1, allFiles.length);
    // Read out the bloom filter and make sure filter can answer record exist or not
    Path filePath = allFiles[0].getPath();
    BloomFilter filter = BaseFileUtils.getInstance(table.getBaseFileFormat()).readBloomFilterFromMetadata(hadoopConf, filePath);
    for (HoodieRecord record : records) {
        assertTrue(filter.mightContain(record.getRecordKey()));
    }
    // Read the base file, check the record content
    List<GenericRecord> fileRecords = BaseFileUtils.getInstance(table.getBaseFileFormat()).readAvroRecords(hadoopConf, filePath);
    GenericRecord newRecord;
    int index = 0;
    for (GenericRecord record : fileRecords) {
        assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
        index++;
    }
    // We update the 1st record & add a new record
    String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1);
    HoodieRecord updatedRecord1 = new HoodieAvroRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord insertedRecord1 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
    Thread.sleep(1000);
    String newCommitTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    writeClient.startCommitWithTime(newCommitTime);
    List<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(updatedRecords), newCommitTime).collect();
    allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
    assertEquals(1, allFiles.length);
    // verify new incremental file group is same as the previous one
    assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
    // Check whether the record has been updated
    Path updatedFilePath = allFiles[0].getPath();
    BloomFilter updatedFilter = BaseFileUtils.getInstance(metaClient).readBloomFilterFromMetadata(hadoopConf, updatedFilePath);
    for (HoodieRecord record : records) {
        // No change to the _row_key
        assertTrue(updatedFilter.mightContain(record.getRecordKey()));
    }
    assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
    // add this so it can further check below
    records.add(insertedRecord1);
    ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build();
    index = 0;
    while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
        assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey());
        if (index == 0) {
            assertEquals("15", newRecord.get("number").toString());
        }
        index++;
    }
    updatedReader.close();
    // Also check the numRecordsWritten
    WriteStatus writeStatus = statuses.get(0);
    assertEquals(1, statuses.size(), "Should be only one file generated");
    // 3 rewritten records + 1 new record
    assertEquals(4, writeStatus.getStat().getNumWrites());
}

Also used : Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParquetReader(org.apache.parquet.hadoop.ParquetReader) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) GenericRecord(org.apache.avro.generic.GenericRecord) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 8 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class CompactionTestBase method executeCompaction.

protected void executeCompaction(String compactionInstantTime, SparkRDDWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
    client.compact(compactionInstantTime);
    assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, compactionInstantTime).doesMarkerDirExist());
    List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table);
    assertTrue(fileSliceList.stream().findAny().isPresent(), "Ensure latest file-slices are not empty");
    assertFalse(fileSliceList.stream().anyMatch(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)), "Verify all file-slices have base-instant same as compaction instant");
    assertFalse(fileSliceList.stream().anyMatch(fs -> !fs.getBaseFile().isPresent()), "Verify all file-slices have data-files");
    if (hasDeltaCommitAfterPendingCompaction) {
        assertFalse(fileSliceList.stream().anyMatch(fs -> fs.getLogFiles().count() == 0), "Verify all file-slices have atleast one log-file");
    } else {
        assertFalse(fileSliceList.stream().anyMatch(fs -> fs.getLogFiles().count() > 0), "Verify all file-slices have no log-files");
    }
    // verify that there is a commit
    table = getHoodieTable(HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).setLoadActiveTimelineOnLoad(true).build(), cfg);
    HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
    String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
    assertEquals(latestCompactionCommitTime, compactionInstantTime, "Expect compaction instant time to be the latest commit time");
    assertEquals(expectedNumRecs, HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of("000")), "Must contain expected records");
}

Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) FileSystemViewStorageType(org.apache.hudi.common.table.view.FileSystemViewStorageType) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)

Example 9 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestUpgradeDowngrade method triggerCommit.

private List<HoodieRecord> triggerCommit(String newCommitTime, HoodieTableType tableType, boolean enableMarkedBasedRollback) {
    Map<String, String> params = new HashMap<>();
    if (tableType == HoodieTableType.MERGE_ON_READ) {
        params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
    }
    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(enableMarkedBasedRollback).withProps(params).build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    client.commit(newCommitTime, statuses);
    return records;
}

Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteStatus(org.apache.hudi.client.WriteStatus)

Example 10 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestUpgradeDowngrade method testUpgradeZeroToOneInternal.

public void testUpgradeZeroToOneInternal(boolean induceResiduesFromPrevUpgrade, boolean deletePartialMarkerFiles, HoodieTableType tableType) throws IOException {
    // init config, table and client.
    Map<String, String> params = new HashMap<>();
    if (tableType == HoodieTableType.MERGE_ON_READ) {
        params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
    }
    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params).build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    // prepare data. Make 2 commits, in which 2nd is not committed.
    List<FileSlice> firstPartitionCommit2FileSlices = new ArrayList<>();
    List<FileSlice> secondPartitionCommit2FileSlices = new ArrayList<>();
    Pair<List<HoodieRecord>, List<HoodieRecord>> inputRecords = twoUpsertCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, client, false);
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    HoodieInstant commitInstant = table.getPendingCommitTimeline().lastInstant().get();
    // delete one of the marker files in 2nd commit if need be.
    WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp());
    List<String> markerPaths = new ArrayList<>(writeMarkers.allMarkerFilePaths());
    if (deletePartialMarkerFiles) {
        String toDeleteMarkerFile = markerPaths.get(0);
        table.getMetaClient().getFs().delete(new Path(table.getMetaClient().getTempFolderPath() + "/" + commitInstant.getTimestamp() + "/" + toDeleteMarkerFile));
        markerPaths.remove(toDeleteMarkerFile);
    }
    // set hoodie.table.version to 0 in hoodie.properties file
    metaClient.getTableConfig().setTableVersion(HoodieTableVersion.ZERO);
    if (induceResiduesFromPrevUpgrade) {
        createResidualFile();
    }
    // should re-create marker files for 2nd commit since its pending.
    new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.ONE, null);
    // assert marker files
    assertMarkerFilesForUpgrade(table, commitInstant, firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices);
    // verify hoodie.table.version got upgraded
    metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()).setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build();
    assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.ONE.versionCode());
    assertTableVersionFromPropertyFile(HoodieTableVersion.ONE);
// trigger 3rd commit with marker based rollback enabled.
/* HUDI-2310
    List<HoodieRecord> thirdBatch = triggerCommit("003", tableType, true);

    // Check the entire dataset has all records only from 1st commit and 3rd commit since 2nd is expected to be rolledback.
    assertRows(inputRecords.getKey(), thirdBatch);
    if (induceResiduesFromPrevUpgrade) {
      assertFalse(dfs.exists(new Path(metaClient.getMetaPath(), SparkUpgradeDowngrade.HOODIE_UPDATED_PROPERTY_FILE)));
    }*/
}

Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HashMap(java.util.HashMap) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTable(org.apache.hudi.table.HoodieTable) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers) TimelineLayoutVersion(org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion) List(java.util.List) ArrayList(java.util.ArrayList)

Aggregations

SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient)143 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)127 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)113 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)86 Test (org.junit.jupiter.api.Test)80 WriteStatus (org.apache.hudi.client.WriteStatus)76 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)74 HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)61 List (java.util.List)59 ArrayList (java.util.ArrayList)51 HoodieTable (org.apache.hudi.table.HoodieTable)51 Path (org.apache.hadoop.fs.Path)47 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)47 JavaRDD (org.apache.spark.api.java.JavaRDD)47 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)44 Collectors (java.util.stream.Collectors)43 Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals)43 HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig)42 HashMap (java.util.HashMap)41 Properties (java.util.Properties)41