
Example 21 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class HoodieWriteHelper, method deduplicateRecords.

@Override
public HoodieData<HoodieRecord<T>> deduplicateRecords(HoodieData<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
    boolean isIndexingGlobal = index.isGlobal();
    return records.mapToPair(record -> {
        HoodieKey hoodieKey = record.getKey();
        // With a global index, duplicates may differ in their partition path, so key
        // on the record key alone; otherwise key on the full HoodieKey.
        Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
        return Pair.of(key, record);
    }).reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec2.getData().preCombine(rec1.getData());
        // Keep the key of whichever record's payload survived the preCombine.
        HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
        return new HoodieAvroRecord<>(reducedKey, reducedData);
    }, parallelism).map(Pair::getRight);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) Pair(org.apache.hudi.common.util.collection.Pair)
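The key choice at the top is the crux of this example: with a global index, two copies of the same record may carry different partition paths, so only the record key can act as the dedup key; a non-global index dedups on the full HoodieKey (record key plus partition path). Below is a minimal, engine-free sketch of the same reduce-by-key semantics, using a plain LinkedHashMap and a hypothetical Payload type with a preCombine method. All names in this sketch are illustrative, not Hudi API.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical stand-in for HoodieRecordPayload, for illustration only.
interface Payload {
    Payload preCombine(Payload other); // returns the "winning" payload of the two
}

// Hypothetical stand-in for HoodieRecord.
record Rec(String recordKey, String partitionPath, Payload data) {}

class DedupSketch {
    // Mirrors deduplicateRecords: group by key, then pairwise preCombine per group.
    static List<Rec> deduplicate(List<Rec> records, boolean globalIndex) {
        Map<Object, Rec> byKey = new LinkedHashMap<>();
        for (Rec rec : records) {
            // Global index: dedup on record key alone; otherwise on (recordKey, partitionPath).
            Object key = globalIndex ? rec.recordKey() : rec.recordKey() + "|" + rec.partitionPath();
            byKey.merge(key, rec, (r1, r2) -> {
                Payload reduced = r2.data().preCombine(r1.data());
                // Keep the key of whichever record's payload won, as the Hudi code does.
                Rec winner = r1.data().equals(reduced) ? r1 : r2;
                return new Rec(winner.recordKey(), winner.partitionPath(), reduced);
            });
        }
        return new ArrayList<>(byKey.values());
    }
}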

Example 22 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class HoodieCompactor, method compact.

/**
 * Execute compaction operations and report back status.
 */
public HoodieData<WriteStatus> compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan, HoodieTable table, HoodieWriteConfig config, String compactionInstantTime, HoodieCompactionHandler compactionHandler) {
    if (compactionPlan == null || compactionPlan.getOperations() == null || compactionPlan.getOperations().isEmpty()) {
        return context.emptyHoodieData();
    }
    HoodieActiveTimeline timeline = table.getActiveTimeline();
    HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
    // Mark instant as compaction inflight
    timeline.transitionCompactionRequestedToInflight(instant);
    table.getMetaClient().reloadActiveTimeline();
    HoodieTableMetaClient metaClient = table.getMetaClient();
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    // Use the table schema as the reader schema here, because config.getSchema()
    // may not be the same as the table schema (e.g. for MERGE INTO writes).
    try {
        Schema readerSchema = schemaResolver.getTableAvroSchema(false);
        config.setSchema(readerSchema.toString());
    } catch (Exception e) {
    // If there is no commit in the table, just ignore the exception.
    }
    // Compacting is very similar to applying updates to existing file
    List<CompactionOperation> operations = compactionPlan.getOperations().stream().map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
    LOG.info("Compactor compacting " + operations + " files");
    context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices");
    TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier();
    return context.parallelize(operations).map(operation -> compact(compactionHandler, metaClient, config, operation, compactionInstantTime, taskContextSupplier)).flatMap(List::iterator);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieAccumulator(org.apache.hudi.common.data.HoodieAccumulator) RuntimeStats(org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) StreamSupport(java.util.stream.StreamSupport) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) CompactionStrategy(org.apache.hudi.table.action.compact.strategy.CompactionStrategy) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieCompactionHandler(org.apache.hudi.table.HoodieCompactionHandler) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) IOUtils(org.apache.hudi.io.IOUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
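One detail worth isolating is the try/catch around schema resolution: on a brand-new table with no commits, TableSchemaResolver cannot resolve a table schema, and the compactor deliberately falls back to whatever schema the write config already carries. A minimal sketch of that pattern on its own, assuming an existing metaClient and write config and using only the calls shown above:

// Prefer the live table schema as the reader schema, but tolerate an empty table.
static void resolveReaderSchema(HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    try {
        // false: resolve the table schema without the Hudi metadata fields.
        Schema readerSchema = schemaResolver.getTableAvroSchema(false);
        config.setSchema(readerSchema.toString());
    } catch (Exception e) {
        // No commit yet, hence no schema to resolve: keep the schema already in the config.
    }
}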

Example 23 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class BaseCommitActionExecutor, method executeClustering.

protected HoodieWriteMetadata<HoodieData<WriteStatus>> executeClustering(HoodieClusteringPlan clusteringPlan) {
    HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
    // Mark instant as clustering inflight
    table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
    table.getMetaClient().reloadActiveTimeline();
    final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata =
        ((ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>>) ReflectionUtils.loadClass(
            config.getClusteringExecutionStrategyClass(),
            new Class<?>[] { HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class },
            table, context, config))
        .performClustering(clusteringPlan, schema, instantTime);
    HoodieData<WriteStatus> writeStatusList = writeMetadata.getWriteStatuses();
    HoodieData<WriteStatus> statuses = updateIndex(writeStatusList, writeMetadata);
    writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collectAsList());
    writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata));
    validateWriteResult(clusteringPlan, writeMetadata);
    commitOnAutoCommit(writeMetadata);
    if (!writeMetadata.getCommitMetadata().isPresent()) {
        HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
        writeMetadata.setCommitMetadata(Option.of(commitMetadata));
    }
    return writeMetadata;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieData(org.apache.hudi.common.data.HoodieData) ClusteringExecutionStrategy(org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) WriteStatus(org.apache.hudi.client.WriteStatus)
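The cast plus ReflectionUtils.loadClass is what keeps the clustering strategy pluggable: the strategy is named as a string in the write config and only has to expose a (HoodieTable, HoodieEngineContext, HoodieWriteConfig) constructor. In plain JDK reflection the same pattern looks roughly like the sketch below; this is illustrative only, since ReflectionUtils.loadClass adds error wrapping on top.

// Illustrative sketch: reflective strategy loading with plain JDK reflection.
// strategyClassName would come from config.getClusteringExecutionStrategyClass().
static Object loadStrategy(String strategyClassName, HoodieTable<?, ?, ?, ?> table,
        HoodieEngineContext context, HoodieWriteConfig config) throws ReflectiveOperationException {
    Class<?> clazz = Class.forName(strategyClassName);
    // Match the three-argument constructor the executor expects every strategy to have.
    return clazz.getConstructor(HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class)
            .newInstance(table, context, config);
}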

Example 24 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class TestCopyOnWriteActionExecutor, method testBulkInsertRecords.

public void testBulkInsertRecords(String bulkInsertMode) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withBulkInsertParallelism(2)
        .withBulkInsertSortMode(bulkInsertMode)
        .build();
    String instantTime = makeNewCommitTime();
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    writeClient.startCommitWithTime(instantTime);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    // Insert new records
    final JavaRDD<HoodieRecord> inputRecords = generateTestRecordsForBulkInsert(jsc);
    SparkBulkInsertCommitActionExecutor bulkInsertExecutor = new SparkBulkInsertCommitActionExecutor(context, config, table, instantTime, HoodieJavaRDD.of(inputRecords), Option.empty());
    List<WriteStatus> returnedStatuses = ((HoodieData<WriteStatus>) bulkInsertExecutor.execute().getWriteStatuses()).collectAsList();
    verifyStatusResult(returnedStatuses, generateExpectedPartitionNumRecords(inputRecords));
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus)
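The test drives SparkBulkInsertCommitActionExecutor directly to get at the raw HoodieData<WriteStatus>; application code normally reaches the same bulk-insert path through the write client. A hedged sketch of that higher-level route (the client calls below match the Spark client as used in these examples, but treat the exact signatures as assumptions):

// Sketch: client-facing route to the same bulk-insert path the test exercises.
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withBulkInsertParallelism(2)
        .build();
SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, config);
String instantTime = writeClient.startCommit();
// inputRecords: a JavaRDD<HoodieRecord>, e.g. generateTestRecordsForBulkInsert(jsc).
JavaRDD<WriteStatus> statuses = writeClient.bulkInsert(inputRecords, instantTime);
statuses.collect(); // each WriteStatus carries per-file success/failure stats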

Example 25 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class TestHoodieBucketIndex, method testTagLocation.

@Test
public void testTagLocation() throws Exception {
    String rowKey1 = UUID.randomUUID().toString();
    String rowKey2 = UUID.randomUUID().toString();
    String rowKey3 = UUID.randomUUID().toString();
    String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    JavaRDD<HoodieRecord<HoodieAvroRecord>> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
    HoodieWriteConfig config = makeConfig();
    HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
    HoodieBucketIndex bucketIndex = new HoodieBucketIndex(config);
    HoodieData<HoodieRecord<HoodieAvroRecord>> taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, table);
    assertFalse(taggedRecordRDD.collectAsList().stream().anyMatch(r -> r.isCurrentLocationKnown()));
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(table, SCHEMA);
    testTable.addCommit("001").withInserts("2016/01/31", getRecordFileId(record1), record1);
    testTable.addCommit("002").withInserts("2016/01/31", getRecordFileId(record2), record2);
    testTable.addCommit("003").withInserts("2016/01/31", getRecordFileId(record3), record3);
    taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, HoodieSparkTable.create(config, context, metaClient));
    assertFalse(taggedRecordRDD.collectAsList().stream()
        .filter(r -> r.isCurrentLocationKnown())
        .filter(r -> BucketIdentifier.bucketIdFromFileId(r.getCurrentLocation().getFileId()) != getRecordBucketId(r))
        .findAny().isPresent());
    assertTrue(taggedRecordRDD.collectAsList().stream()
        .filter(r -> r.getPartitionPath().equals("2015/01/31") && !r.isCurrentLocationKnown())
        .count() == 1L);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) Logger(org.apache.log4j.Logger) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) JavaRDD(org.apache.spark.api.java.JavaRDD) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException) HoodieData(org.apache.hudi.common.data.HoodieData) Schema(org.apache.avro.Schema) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) KeyGeneratorOptions(org.apache.hudi.keygen.constant.KeyGeneratorOptions) UUID(java.util.UUID) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) AfterEach(org.junit.jupiter.api.AfterEach) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) LogManager(org.apache.log4j.LogManager)
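All three assertions rest on the bucket index invariant: a record key deterministically hashes to one bucket, and the bucket id is encoded in the file id, which is what BucketIdentifier.bucketIdFromFileId recovers. A simplified sketch of that key-to-bucket mapping is below; the real BucketIdentifier also handles composite keys, so the hash here is illustrative only.

// Illustrative sketch: deterministic key-to-bucket assignment for a bucket index.
// Because the mapping is pure, tagging a record's location needs no file lookup.
static int bucketIdForKey(String recordKey, int numBuckets) {
    // Mask the sign bit so the modulo result is always non-negative.
    return (recordKey.hashCode() & Integer.MAX_VALUE) % numBuckets;
}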

Aggregations

HoodieData (org.apache.hudi.common.data.HoodieData): 36 uses
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 24 uses
WriteStatus (org.apache.hudi.client.WriteStatus): 22 uses
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 22 uses
List (java.util.List): 21 uses
HoodieTable (org.apache.hudi.table.HoodieTable): 20 uses
HoodieKey (org.apache.hudi.common.model.HoodieKey): 18 uses
LogManager (org.apache.log4j.LogManager): 18 uses
Logger (org.apache.log4j.Logger): 18 uses
IOException (java.io.IOException): 17 uses
Collectors (java.util.stream.Collectors): 17 uses
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 17 uses
Option (org.apache.hudi.common.util.Option): 17 uses
Map (java.util.Map): 16 uses
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16 uses
HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata): 16 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 16 uses
Pair (org.apache.hudi.common.util.collection.Pair): 15 uses
HoodieJavaRDD (org.apache.hudi.data.HoodieJavaRDD): 15 uses
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 14 uses