Example 6 with HoodieIndex

Use of org.apache.hudi.index.HoodieIndex in project hudi by apache.

From the class TestCleaner, method insertFirstBigBatchForClientCleanerTest:

/**
 * Helper method to do first batch of insert for clean by versions/commits tests.
 *
 * @param cfg Hoodie Write Config
 * @param client Hoodie Client
 * @param recordGenFunction Function to generate records for insertion
 * @param insertFn Insertion API for testing
 * @param cleaningPolicy Cleaning policy used by the versions/commits clean tests
 * @throws Exception in case of error
 */
private Pair<String, JavaRDD<WriteStatus>> insertFirstBigBatchForClientCleanerTest(
        HoodieWriteConfig cfg,
        SparkRDDWriteClient client,
        Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
        Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
        HoodieCleaningPolicy cleaningPolicy) throws Exception {
    /*
     * Do a big insert (this is basically the same as the insert path of upsert; it is included
     * here so we can catch breakages in insert() if the implementations diverge).
     */
    String newCommitTime = client.startCommit();
    List<HoodieRecord> records = recordGenFunction.apply(newCommitTime, BIG_BATCH_INSERT_SIZE);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 5);
    JavaRDD<WriteStatus> statuses = insertFn.apply(client, writeRecords, newCommitTime);
    // Verify there are no errors
    assertNoWriteErrors(statuses.collect());
    // verify that there is a commit
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
    // Table should now hold BIG_BATCH_INSERT_SIZE records (verified via the index), all tagged with locations from this commit
    HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient);
    if (client.getConfig().shouldAutoCommit()) {
        assertFalse(table.getCompletedCommitsTimeline().empty());
    }
    // We no longer write empty cleaner plans when there is nothing to be cleaned.
    assertTrue(table.getCompletedCleanTimeline().empty());
    if (client.getConfig().shouldAutoCommit()) {
        HoodieIndex index = SparkHoodieIndexFactory.createIndex(cfg);
        List<HoodieRecord> taggedRecords = tagLocation(index, jsc.parallelize(records, 1), table).collect();
        checkTaggedRecords(taggedRecords, newCommitTime);
    }
    return Pair.of(newCommitTime, statuses);
}
Also used:
HoodieIndex (org.apache.hudi.index.HoodieIndex)
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline)
HoodieRecord (org.apache.hudi.common.model.HoodieRecord)
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)
WriteStatus (org.apache.hudi.client.WriteStatus)
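
For context, here is a minimal, hypothetical sketch of how a helper with this signature could be invoked from a test. It assumes the usual Hudi test fixtures (an engine context named context, a HoodieTestDataGenerator named dataGen, and a getConfigBuilder() helper from the test base class); these names are assumptions for illustration, not taken from the example above.

// Hypothetical invocation; the lambda and method reference match the
// Function2/Function3 shapes expected by the helper's signature.
HoodieWriteConfig cfg = getConfigBuilder().build();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, cfg)) {
    Pair<String, JavaRDD<WriteStatus>> result = insertFirstBigBatchForClientCleanerTest(
        cfg,
        client,
        // Function2: (commit time, record count) -> records to insert
        (commitTime, numRecords) -> dataGen.generateInserts(commitTime, numRecords),
        // Function3: (client, records, commit time) -> write statuses
        SparkRDDWriteClient::insert,
        HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
}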

Example 7 with HoodieIndex

Use of org.apache.hudi.index.HoodieIndex in project hudi by apache.

From the class FlinkWriteHelper, method deduplicateRecords:

@Override
public List<HoodieRecord<T>> deduplicateRecords(List<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
    Map<Object, List<Pair<Object, HoodieRecord<T>>>> keyedRecords = records.stream().map(record -> {
        // Group by record key only: with a global index, duplicates of the same key
        // may arrive under different partitionPaths and must still be combined.
        final Object key = record.getKey().getRecordKey();
        return Pair.of(key, record);
    }).collect(Collectors.groupingBy(Pair::getLeft));
    return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> {
        final T data1 = rec1.getData();
        final T data2 = rec2.getData();
        @SuppressWarnings("unchecked") final T reducedData = (T) data2.preCombine(data1);
        // The record key and partitionPath must not change during precombine, since
        // downstream steps rely on them, so take them from one of the two input records.
        boolean choosePrev = data1.equals(reducedData);
        HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey();
        HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation();
        HoodieRecord<T> hoodieRecord = new HoodieAvroRecord<>(reducedKey, reducedData, operation);
        // reuse the location from the first record.
        hoodieRecord.setCurrentLocation(rec1.getCurrentLocation());
        return hoodieRecord;
    }).orElse(null)).filter(Objects::nonNull).collect(Collectors.toList());
}
Also used:
HoodieTable (org.apache.hudi.table.HoodieTable)
HoodieRecord (org.apache.hudi.common.model.HoodieRecord)
HoodieUpsertException (org.apache.hudi.exception.HoodieUpsertException)
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext)
Instant (java.time.Instant)
Collectors (java.util.stream.Collectors)
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)
HoodieIndex (org.apache.hudi.index.HoodieIndex)
HoodieList (org.apache.hudi.common.data.HoodieList)
HoodieOperation (org.apache.hudi.common.model.HoodieOperation)
Objects (java.util.Objects)
WriteStatus (org.apache.hudi.client.WriteStatus)
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload)
List (java.util.List)
Duration (java.time.Duration)
Map (java.util.Map)
HoodieKey (org.apache.hudi.common.model.HoodieKey)
WriteOperationType (org.apache.hudi.common.model.WriteOperationType)
HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata)
Pair (org.apache.hudi.common.util.collection.Pair)
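
To make the reduce step above concrete, here is a small sketch of the preCombine contract, assuming OverwriteWithLatestAvroPayload as the payload type (any HoodieRecordPayload with a preCombine implementation behaves analogously); the class and method names below are illustrative, not part of FlinkWriteHelper.

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;

class PreCombineSketch {
    // Mirrors data2.preCombine(data1) in deduplicateRecords: the payload with the
    // higher ordering value survives, and the merged HoodieRecord then keeps the
    // key and operation of whichever input record equals the reduced payload.
    static OverwriteWithLatestAvroPayload pickLatest(GenericRecord olderRow, GenericRecord newerRow) {
        OverwriteWithLatestAvroPayload older = new OverwriteWithLatestAvroPayload(olderRow, 1L);
        OverwriteWithLatestAvroPayload newer = new OverwriteWithLatestAvroPayload(newerRow, 2L);
        return newer.preCombine(older); // returns `newer`, since 2L > 1L
    }
}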

Aggregations

WriteStatus (org.apache.hudi.client.WriteStatus): 7
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 7
HoodieIndex (org.apache.hudi.index.HoodieIndex): 7
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 3
HoodieKey (org.apache.hudi.common.model.HoodieKey): 3
HoodieTable (org.apache.hudi.table.HoodieTable): 3
List (java.util.List): 2
Collectors (java.util.stream.Collectors): 2
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 2
HoodieData (org.apache.hudi.common.data.HoodieData): 2
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 2
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 2
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 2
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 2
Pair (org.apache.hudi.common.util.collection.Pair): 2
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 2
WriteMetadataEvent (org.apache.hudi.sink.event.WriteMetadataEvent): 2
Duration (java.time.Duration): 1
Instant (java.time.Instant): 1
ArrayList (java.util.ArrayList): 1