Search in sources :

Example 1 with HoodieIndex

use of org.apache.hudi.index.HoodieIndex in project hudi by apache.

From class TestHoodieCompactor, method testWriteStatusContentsAfterCompaction.

@Test
public void testWriteStatusContentsAfterCompaction() throws Exception {
    // insert 100 records
    HoodieWriteConfig config = getConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .build())
        .build();
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();
        // Update all the 100 records
        HoodieTable table = HoodieSparkTable.create(config, context);
        newCommitTime = "101";
        List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
        JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
        HoodieIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
        JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = tagLocation(index, updatedRecordsRDD, table);
        writeClient.startCommitWithTime(newCommitTime);
        writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
        metaClient.reloadActiveTimeline();
        // Verify that every data file has exactly one log file
        table = HoodieSparkTable.create(config, context);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice fileSlice : groupedLogFiles) {
                assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for every data file");
            }
        }
        // Do a compaction
        table = HoodieSparkTable.create(config, context);
        String compactionInstantTime = "102";
        table.scheduleCompaction(context, compactionInstantTime, Option.empty());
        table.getMetaClient().reloadActiveTimeline();
        HoodieData<WriteStatus> result = (HoodieData<WriteStatus>) table.compact(context, compactionInstantTime).getWriteStatuses();
        // Verify that all partition paths are present in the WriteStatus result
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<WriteStatus> writeStatuses = result.collectAsList();
            assertTrue(writeStatuses.stream().filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath)).count() > 0);
        }
    }
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) HoodieTable(org.apache.hudi.table.HoodieTable) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Option(org.apache.hudi.common.util.Option) SparkHoodieBloomIndexHelper(org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieBloomIndex(org.apache.hudi.index.bloom.HoodieBloomIndex) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Test(org.junit.jupiter.api.Test) WriteStatus(org.apache.hudi.client.WriteStatus) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) FSUtils(org.apache.hudi.common.fs.FSUtils)
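
One small readability note on the verification loop at the end of this test: result.collectAsList() is invoked once per partition path inside the loop. Collecting once up front is equivalent; a minimal sketch of that variant, assuming collecting the HoodieData result is side-effect free:

List<WriteStatus> writeStatuses = result.collectAsList();   // collect once, reuse for every partition
for (String partitionPath : dataGen.getPartitionPaths()) {
    assertTrue(writeStatuses.stream()
            .anyMatch(ws -> ws.getStat().getPartitionPath().contentEquals(partitionPath)),
        "Expected at least one WriteStatus for partition " + partitionPath);
}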

Example 2 with HoodieIndex

use of org.apache.hudi.index.HoodieIndex in project hudi by apache.

From class StreamWriteFunction, method flushRemaining.

@SuppressWarnings("unchecked, rawtypes")
private void flushRemaining(boolean endInput) {
    this.currentInstant = instantToWrite(hasData());
    if (this.currentInstant == null) {
        // in case there are empty checkpoints that have no input data
        throw new HoodieException("No inflight instant when flushing data!");
    }
    final List<WriteStatus> writeStatus;
    if (buckets.size() > 0) {
        writeStatus = new ArrayList<>();
        this.buckets.values().forEach(bucket -> {
            List<HoodieRecord> records = bucket.writeBuffer();
            if (records.size() > 0) {
                if (config.getBoolean(FlinkOptions.PRE_COMBINE)) {
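                    // The index argument appears to be unused on this Flink path (records are already
                    // buffered per bucket), hence the (HoodieIndex) null and the -1 parallelism
                    // placeholder; this reading is an assumption based on the call site, not the helper.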
                    records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1);
                }
                bucket.preWrite(records);
                writeStatus.addAll(writeFunction.apply(records, currentInstant));
                records.clear();
                bucket.reset();
            }
        });
    } else {
        LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, currentInstant);
        writeStatus = Collections.emptyList();
    }
    final WriteMetadataEvent event = WriteMetadataEvent.builder()
        .taskID(taskID)
        .instantTime(currentInstant)
        .writeStatus(writeStatus)
        .lastBatch(true)
        .endInput(endInput)
        .build();
    this.eventGateway.sendEventToCoordinator(event);
    this.buckets.clear();
    this.tracer.reset();
    this.writeClient.cleanHandles();
    this.writeStatuses.addAll(writeStatus);
    // blocks flushing until the coordinator starts a new instant
    this.confirming = true;
}
Also used : HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieException(org.apache.hudi.exception.HoodieException) WriteMetadataEvent(org.apache.hudi.sink.event.WriteMetadataEvent) WriteStatus(org.apache.hudi.client.WriteStatus)

Example 3 with HoodieIndex

use of org.apache.hudi.index.HoodieIndex in project hudi by apache.

From class StreamWriteFunction, method flushBucket.

@SuppressWarnings("unchecked, rawtypes")
private boolean flushBucket(DataBucket bucket) {
    String instant = instantToWrite(true);
    if (instant == null) {
        // in case there are empty checkpoints that have no input data
        LOG.info("No inflight instant when flushing data, skip.");
        return false;
    }
    List<HoodieRecord> records = bucket.writeBuffer();
    ValidationUtils.checkState(records.size() > 0, "Data bucket to flush has no buffering records");
    if (config.getBoolean(FlinkOptions.PRE_COMBINE)) {
        records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1);
    }
    bucket.preWrite(records);
    final List<WriteStatus> writeStatus = new ArrayList<>(writeFunction.apply(records, instant));
    records.clear();
    final WriteMetadataEvent event = WriteMetadataEvent.builder()
        .taskID(taskID)
        // the write instant may shift, but the event still uses the instant captured above
        .instantTime(instant)
        .writeStatus(writeStatus)
        .lastBatch(false)
        .endInput(false)
        .build();
    this.eventGateway.sendEventToCoordinator(event);
    writeStatuses.addAll(writeStatus);
    return true;
}
Also used : HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) WriteMetadataEvent(org.apache.hudi.sink.event.WriteMetadataEvent) WriteStatus(org.apache.hudi.client.WriteStatus)
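
Compared with flushRemaining in Example 2, the visible difference in how the two flush paths report to the coordinator is the event they build: a per-bucket flush sends an intermediate batch, while the end-of-checkpoint flush marks the last batch and forwards the endInput signal (and additionally clears the buckets and write handles). A compact side-by-side of the two builder calls already shown above:

// flushBucket: intermediate batch for one full data bucket within the current checkpoint
WriteMetadataEvent.builder().taskID(taskID).instantTime(instant)
    .writeStatus(writeStatus).lastBatch(false).endInput(false).build();

// flushRemaining: final batch at checkpoint time, forwarding the endInput flag from the pipeline
WriteMetadataEvent.builder().taskID(taskID).instantTime(currentInstant)
    .writeStatus(writeStatus).lastBatch(true).endInput(endInput).build();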

Example 4 with HoodieIndex

use of org.apache.hudi.index.HoodieIndex in project hudi by apache.

From class HoodieWriteHelper, method deduplicateRecords.

@Override
public HoodieData<HoodieRecord<T>> deduplicateRecords(HoodieData<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
    boolean isIndexingGlobal = index.isGlobal();
    return records.mapToPair(record -> {
        HoodieKey hoodieKey = record.getKey();
        // If the index is global, dedup on the record key alone: duplicate records may differ only in their partitionPath
        Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
        return Pair.of(key, record);
    }).reduceByKey((rec1, rec2) -> {
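        // preCombine decides which payload survives; the key is then taken from whichever
        // record owned the surviving payload, so that key and payload stay paired.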
        @SuppressWarnings("unchecked") T reducedData = (T) rec2.getData().preCombine(rec1.getData());
        HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
        return new HoodieAvroRecord<>(reducedKey, reducedData);
    }, parallelism).map(Pair::getRight);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) Pair(org.apache.hudi.common.util.collection.Pair)
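
To make the keying choice above concrete: with a global index the dedup key is the record key alone, so two records that share a record key but sit in different partitions collapse into one (merged via preCombine); with a non-global index the full HoodieKey (record key plus partition path) is the dedup key, and both records survive. A minimal sketch using the same HoodieKey shape as in Example 5 below; the key values are made up for illustration:

HoodieKey keyJan = new HoodieKey("uuid-1", "2018-01-01");
HoodieKey keyFeb = new HoodieKey("uuid-1", "2018-02-01");
// Global index: dedup key is the record key, so both map to "uuid-1" and get merged
Object globalKeyA = keyJan.getRecordKey();   // "uuid-1"
Object globalKeyB = keyFeb.getRecordKey();   // "uuid-1"
// Non-global index: dedup key is the whole HoodieKey, so the differing partition paths keep both records
Object localKeyA = keyJan;
Object localKeyB = keyFeb;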

Example 5 with HoodieIndex

use of org.apache.hudi.index.HoodieIndex in project hudi by apache.

From class TestHoodieClientOnCopyOnWriteStorage, method testDeduplication.

/**
 * Test deduplication logic for the write function.
 *
 * @param writeFn One of the HoodieWriteClient non-prepped write APIs
 * @param populateMetaFields Whether the write config should populate Hudi meta fields
 * @throws Exception in case of failure
 */
private void testDeduplication(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean populateMetaFields) throws Exception {
    String newCommitTime = "001";
    String recordKey = UUID.randomUUID().toString();
    HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01");
    HoodieRecord<RawTripTestPayload> recordOne = new HoodieAvroRecord(keyOne, dataGen.generateRandomValue(keyOne, newCommitTime));
    HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01");
    HoodieRecord recordTwo = new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime));
    // Same key and partition as keyTwo
    HoodieRecord recordThree = new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime));
    HoodieData<HoodieRecord<RawTripTestPayload>> records = HoodieJavaRDD.of(jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1));
    // Global dedup should be done based on recordKey only
    HoodieIndex index = mock(HoodieIndex.class);
    when(index.isGlobal()).thenReturn(true);
    List<HoodieRecord<RawTripTestPayload>> dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1).collectAsList();
    assertEquals(1, dedupedRecs.size());
    assertEquals(dedupedRecs.get(0).getPartitionPath(), recordThree.getPartitionPath());
    assertNodupesWithinPartition(dedupedRecs);
    // non-Global dedup should be done based on both recordKey and partitionPath
    index = mock(HoodieIndex.class);
    when(index.isGlobal()).thenReturn(false);
    dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1).collectAsList();
    assertEquals(2, dedupedRecs.size());
    assertNodupesWithinPartition(dedupedRecs);
    // Perform write-action and check
    JavaRDD<HoodieRecord> recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1);
    HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).combineInput(true, true);
    addConfigsForPopulateMetaFields(configBuilder, populateMetaFields);
    try (SparkRDDWriteClient client = getHoodieWriteClient(configBuilder.build())) {
        client.startCommitWithTime(newCommitTime);
        List<WriteStatus> statuses = writeFn.apply(client, recordList, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        assertEquals(2, statuses.size());
        assertNodupesInPartition(statuses.stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream).collect(Collectors.toList()));
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieIndex(org.apache.hudi.index.HoodieIndex) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) Collection(java.util.Collection) WriteStatus(org.apache.hudi.client.WriteStatus)
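
This helper is driven by parameterized tests elsewhere in the class; the invocations below are a hypothetical sketch only (the real parameter sources are not shown here), assuming Hudi's Function3 accepts a method reference with the (client, records, instantTime) -> JavaRDD<WriteStatus> shape:

// hypothetical calls for illustration; the actual test wires these through parameterized sources
testDeduplication(SparkRDDWriteClient::insert, true);   // a non-prepped insert API, meta fields populated
testDeduplication(SparkRDDWriteClient::upsert, false);  // a non-prepped upsert API, meta fields disabled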

Aggregations

WriteStatus (org.apache.hudi.client.WriteStatus) 7
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 7
HoodieIndex (org.apache.hudi.index.HoodieIndex) 7
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord) 3
HoodieKey (org.apache.hudi.common.model.HoodieKey) 3
HoodieTable (org.apache.hudi.table.HoodieTable) 3
List (java.util.List) 2
Collectors (java.util.stream.Collectors) 2
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) 2
HoodieData (org.apache.hudi.common.data.HoodieData) 2
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) 2
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload) 2
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline) 2
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 2
Pair (org.apache.hudi.common.util.collection.Pair) 2
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 2
WriteMetadataEvent (org.apache.hudi.sink.event.WriteMetadataEvent) 2
Duration (java.time.Duration) 1
Instant (java.time.Instant) 1
ArrayList (java.util.ArrayList) 1