
Example 16 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class SpillableMapTestUtils, method upsertRecords.

public static List<String> upsertRecords(List<IndexedRecord> iRecords, Map<String, HoodieRecord<? extends HoodieRecordPayload>> records) {
    List<String> recordKeys = new ArrayList<>();
    iRecords.forEach(r -> {
        String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
        recordKeys.add(key);
        // Wrap the Avro record in a HoodieAvroRecord keyed by record key and partition path
        HoodieRecord record = new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r)));
        // Records must be unsealed before the location can be set, then resealed
        record.unseal();
        record.setCurrentLocation(new HoodieRecordLocation("DUMMY_COMMIT_TIME", "DUMMY_FILE_ID"));
        record.seal();
        records.put(key, record);
    });
    return recordKeys;
}
Also used : HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload)
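A minimal usage sketch for the helper above, assuming SchemaTestUtil.generateHoodieTestRecords from Hudi's test utilities as the record source; any generator that produces Avro IndexedRecords carrying the Hoodie metadata fields would work:

// Hedged usage sketch, not part of the original snippet.
Map<String, HoodieRecord<? extends HoodieRecordPayload>> records = new HashMap<>();
// Assumed test helper; the real utility may declare checked exceptions.
List<IndexedRecord> inputs = SchemaTestUtil.generateHoodieTestRecords(0, 100);
List<String> recordKeys = SpillableMapTestUtils.upsertRecords(inputs, records);
// Every input record should now be keyed in the map with the dummy location set.
assertEquals(recordKeys.size(), records.size());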

Example 17 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class TestBucketAssigner, method testUpdateAndInsertWithPartialSmallFiles.

/**
 * Test that only partial small files are assigned to the task.
 */
@Test
public void testUpdateAndInsertWithPartialSmallFiles() {
    SmallFile f0 = new SmallFile();
    f0.location = new HoodieRecordLocation("t0", "f0");
    f0.sizeBytes = 12;
    SmallFile f1 = new SmallFile();
    f1.location = new HoodieRecordLocation("t0", "f1");
    // no left space to append new records to this bucket
    f1.sizeBytes = 122879;
    SmallFile f2 = new SmallFile();
    f2.location = new HoodieRecordLocation("t0", "f2");
    f2.sizeBytes = 56;
    Map<String, List<SmallFile>> smallFilesMap = new HashMap<>();
    smallFilesMap.put("par1", Arrays.asList(f0, f1, f2));
    // Assigner for the first of two write tasks; its inserts land in small file f2
    MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(0, 2, context, writeConfig, smallFilesMap);
    mockBucketAssigner.addUpdate("par1", "f0");
    BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2");
    mockBucketAssigner.addInsert("par1");
    bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2");
    mockBucketAssigner.addUpdate("par1", "f2");
    mockBucketAssigner.addInsert("par1");
    bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2");
    // Assigner for the second of two write tasks; its inserts land in small file f0
    MockBucketAssigner mockBucketAssigner2 = new MockBucketAssigner(1, 2, context, writeConfig, smallFilesMap);
    mockBucketAssigner2.addUpdate("par1", "f0");
    BucketInfo bucketInfo2 = mockBucketAssigner2.addInsert("par1");
    assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner2.addInsert("par1");
    bucketInfo2 = mockBucketAssigner2.addInsert("par1");
    assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner2.addUpdate("par1", "f2");
    mockBucketAssigner2.addInsert("par1");
    bucketInfo2 = mockBucketAssigner2.addInsert("par1");
    assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0");
}
Also used : HashMap(java.util.HashMap) SmallFile(org.apache.hudi.table.action.commit.SmallFile) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) List(java.util.List) BucketInfo(org.apache.hudi.table.action.commit.BucketInfo) Test(org.junit.jupiter.api.Test)
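The two MockBucketAssigner instances above stand for the first and second of two write tasks, and the assertions show each task only ever buckets into the small file assigned to it (f2 for the first, f0 for the second). A rough sketch of such partial assignment, assuming a simple hash-modulo scheme; the real BucketAssigner may partition the files differently (for example via Flink key-group assignment):

// Hedged sketch: give each small file to exactly one of the parallel write tasks.
// The modulo hash below is an assumption for illustration only.
private static List<SmallFile> smallFilesOfThisTask(List<SmallFile> smallFiles, int taskId, int numTasks) {
    List<SmallFile> assigned = new ArrayList<>();
    for (SmallFile smallFile : smallFiles) {
        int owner = (smallFile.location.getFileId().hashCode() & Integer.MAX_VALUE) % numTasks;
        if (owner == taskId) {
            assigned.add(smallFile);
        }
    }
    return assigned;
}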

Example 18 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class TestBucketAssigner, method testInsertWithSmallFiles.

@Test
public void testInsertWithSmallFiles() {
    SmallFile f0 = new SmallFile();
    f0.location = new HoodieRecordLocation("t0", "f0");
    f0.sizeBytes = 12;
    SmallFile f1 = new SmallFile();
    f1.location = new HoodieRecordLocation("t0", "f1");
    // no left space to append new records to this bucket
    f1.sizeBytes = 122879;
    SmallFile f2 = new SmallFile();
    f2.location = new HoodieRecordLocation("t0", "f2");
    f2.sizeBytes = 56;
    Map<String, List<SmallFile>> smallFilesMap = new HashMap<>();
    smallFilesMap.put("par1", Arrays.asList(f0, f1));
    smallFilesMap.put("par2", Collections.singletonList(f2));
    MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(context, writeConfig, smallFilesMap);
    BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner.addInsert("par1");
    bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner.addInsert("par2");
    bucketInfo = mockBucketAssigner.addInsert("par2");
    assertBucketEquals(bucketInfo, "par2", BucketType.UPDATE, "f2");
    bucketInfo = mockBucketAssigner.addInsert("par3");
    assertBucketEquals(bucketInfo, "par3", BucketType.INSERT);
    bucketInfo = mockBucketAssigner.addInsert("par3");
    assertBucketEquals(bucketInfo, "par3", BucketType.INSERT);
}
Also used : HashMap(java.util.HashMap) SmallFile(org.apache.hudi.table.action.commit.SmallFile) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) List(java.util.List) BucketInfo(org.apache.hudi.table.action.commit.BucketInfo) Test(org.junit.jupiter.api.Test)
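The "no left space" comment on f1 comes down to a capacity check: a small file keeps absorbing inserts only while its projected size stays below the configured maximum file size. A hedged sketch of that arithmetic, assuming this test configures a 122,880-byte cap (helper and parameter names are illustrative):

// Hedged sketch of the small-file capacity check; names are assumptions.
private static long recordsThatFit(SmallFile smallFile, long maxFileSizeBytes, long avgRecordSizeBytes) {
    long freeBytes = maxFileSizeBytes - smallFile.sizeBytes;
    // With a 122,880-byte cap, f1 at 122,879 bytes has room for zero records,
    // so inserts go to f0/f2 (or to a fresh INSERT bucket) instead.
    return Math.max(0, freeBytes / avgRecordSizeBytes);
}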

Example 19 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class SparkHoodieHBaseIndex, method updateLocationFunction.

private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {
    return (partition, statusIterator) -> {
        List<WriteStatus> writeStatusList = new ArrayList<>();
        // Grab the global HBase connection
        synchronized (SparkHoodieHBaseIndex.class) {
            if (hbaseConnection == null || hbaseConnection.isClosed()) {
                hbaseConnection = getHBaseConnection();
            }
        }
        final long startTimeForPutsTask = DateTime.now().getMillis();
        LOG.info("startTimeForPutsTask for this task: " + startTimeForPutsTask);
        try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
            final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
            while (statusIterator.hasNext()) {
                WriteStatus writeStatus = statusIterator.next();
                List<Mutation> mutations = new ArrayList<>();
                try {
                    long numOfInserts = writeStatus.getStat().getNumInserts();
                    LOG.info("Num of inserts in this WriteStatus: " + numOfInserts);
                    LOG.info("Total inserts in this job: " + this.totalNumInserts);
                    LOG.info("multiPutBatchSize for this job: " + this.multiPutBatchSize);
                    // Any calls beyond `multiPutBatchSize` within a second will be rate limited
                    for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
                        if (!writeStatus.isErrored(rec.getKey())) {
                            Option<HoodieRecordLocation> loc = rec.getNewLocation();
                            if (loc.isPresent()) {
                                if (rec.getCurrentLocation() != null) {
                                    // This is an update, no need to update index
                                    continue;
                                }
                                Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
                                mutations.add(put);
                            } else {
                                // Delete existing index for a deleted record
                                Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
                                mutations.add(delete);
                            }
                        }
                        if (mutations.size() < multiPutBatchSize) {
                            continue;
                        }
                        // Batch is full: flush it through the rate-limited mutator
                        doMutations(mutator, mutations, limiter);
                    }
                    // process remaining puts and deletes, if any
                    doMutations(mutator, mutations, limiter);
                } catch (Exception e) {
                    Exception we = new Exception("Error updating index for " + writeStatus, e);
                    LOG.error(we);
                    writeStatus.setGlobalError(we);
                }
                writeStatusList.add(writeStatus);
            }
            final long endPutsTime = DateTime.now().getMillis();
            LOG.info("hbase puts task time for this task: " + (endPutsTime - startTimeForPutsTask));
        } catch (IOException e) {
            throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
        }
        return writeStatusList.iterator();
    };
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) Mutation(org.apache.hadoop.hbase.client.Mutation) Function2(org.apache.spark.api.java.function.Function2) Result(org.apache.hadoop.hbase.client.Result) Date(java.util.Date) RateLimiter(org.apache.hudi.common.util.RateLimiter) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Logger(org.apache.log4j.Logger) Delete(org.apache.hadoop.hbase.client.Delete) Partitioner(org.apache.spark.Partitioner) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieDependentSystemUnavailableException(org.apache.hudi.exception.HoodieDependentSystemUnavailableException) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) BufferedMutator(org.apache.hadoop.hbase.client.BufferedMutator) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException) Get(org.apache.hadoop.hbase.client.Get) Tuple2(scala.Tuple2) HoodieIndex(org.apache.hudi.index.HoodieIndex) Serializable(java.io.Serializable) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration) ResultScanner(org.apache.hadoop.hbase.client.ResultScanner) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) SparkMemoryUtils(org.apache.hudi.client.utils.SparkMemoryUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) HTable(org.apache.hadoop.hbase.client.HTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) Bytes(org.apache.hadoop.hbase.util.Bytes) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) TableName(org.apache.hadoop.hbase.TableName) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Put(org.apache.hadoop.hbase.client.Put) SparkConf(org.apache.spark.SparkConf) DateTime(org.joda.time.DateTime) HoodieHBaseIndexConfig(org.apache.hudi.config.HoodieHBaseIndexConfig) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) ConnectionFactory(org.apache.hadoop.hbase.client.ConnectionFactory) Scan(org.apache.hadoop.hbase.client.Scan) TimeUnit(java.util.concurrent.TimeUnit) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) Connection(org.apache.hadoop.hbase.client.Connection) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager)
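For context, a hedged sketch of what the doMutations call used above could look like: acquire permits for the batch from the rate limiter, push the batch through the BufferedMutator, flush, and clear the list so the caller keeps accumulating. The actual helper in SparkHoodieHBaseIndex may differ in details:

// Hedged sketch of a rate-limited batched flush; not necessarily the exact Hudi code.
private void doMutations(BufferedMutator mutator, List<Mutation> mutations, RateLimiter limiter) throws IOException {
    if (mutations.isEmpty()) {
        return;
    }
    // Block until the limiter grants enough permits for this batch size.
    limiter.tryAcquire(mutations.size());
    mutator.mutate(mutations);
    mutator.flush();
    mutations.clear();
}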

Example 20 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class BaseSparkCommitActionExecutor, method mapPartitionsAsRDD.

private HoodieData<WriteStatus> mapPartitionsAsRDD(HoodieData<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
    JavaPairRDD<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> mappedRDD = HoodieJavaPairRDD.getJavaPairRDD(dedupedRecords.mapToPair(record -> Pair.of(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)));
    JavaPairRDD<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> partitionedRDD;
    if (table.requireSortedRecords()) {
        // Partition and sort within each partition as a single step. This is faster than partitioning first and then
        // applying a sort.
        Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> comparator = (Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> & Serializable) (t1, t2) -> {
            HoodieKey key1 = t1._1;
            HoodieKey key2 = t2._1;
            return key1.getRecordKey().compareTo(key2.getRecordKey());
        };
        partitionedRDD = mappedRDD.repartitionAndSortWithinPartitions(partitioner, comparator);
    } else {
        // Partition only
        partitionedRDD = mappedRDD.partitionBy(partitioner);
    }
    return HoodieJavaRDD.of(partitionedRDD.map(Tuple2::_2).mapPartitionsWithIndex((partition, recordItr) -> {
        if (WriteOperationType.isChangingRecords(operationType)) {
            return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
        } else {
            return handleInsertPartition(instantTime, partition, recordItr, partitioner);
        }
    }, true).flatMap(List::iterator));
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) Partitioner(org.apache.spark.Partitioner) StorageLevel(org.apache.spark.storage.StorageLevel) Duration(java.time.Duration) Map(java.util.Map) HoodieSortedMergeHandle(org.apache.hudi.io.HoodieSortedMergeHandle) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieConcatHandle(org.apache.hudi.io.HoodieConcatHandle) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) Set(java.util.Set) Instant(java.time.Instant) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) Serializable(java.io.Serializable) List(java.util.List) WRITE_STATUS_STORAGE_LEVEL_VALUE(org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) ClusteringUtils.getAllFileGroupsInPendingClusteringPlans(org.apache.hudi.common.util.ClusteringUtils.getAllFileGroupsInPendingClusteringPlans) UpdateStrategy(org.apache.hudi.table.action.cluster.strategy.UpdateStrategy) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) CommitUtils(org.apache.hudi.common.util.CommitUtils) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkValidatorUtils(org.apache.hudi.client.utils.SparkValidatorUtils) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) SparkLazyInsertIterable(org.apache.hudi.execution.SparkLazyInsertIterable) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair) Tuple2(scala.Tuple2) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
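A side note on the (Comparator<...> & Serializable) intersection cast above: Spark has to ship the comparator to executors for repartitionAndSortWithinPartitions, so the lambda must also implement java.io.Serializable, and casting to an intersection type is the usual way to get that without declaring a dedicated comparator class. A trivial illustration (names are illustrative only):

// A serializable comparator lambda via an intersection-type cast.
Comparator<String> byLength = (Comparator<String> & Serializable) (a, b) -> Integer.compare(a.length(), b.length());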

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation) 43
ArrayList (java.util.ArrayList) 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 17
HashMap (java.util.HashMap) 16
List (java.util.List) 16
HoodieKey (org.apache.hudi.common.model.HoodieKey) 16
Map (java.util.Map) 13
Pair (org.apache.hudi.common.util.collection.Pair) 12
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 9
Option (org.apache.hudi.common.util.Option) 9
IOException (java.io.IOException) 8
WorkloadStat (org.apache.hudi.table.WorkloadStat) 8
SmallFile (org.apache.hudi.table.action.commit.SmallFile) 8
Tuple2 (scala.Tuple2) 8
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload) 7
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 7
HoodieTable (org.apache.hudi.table.HoodieTable) 7
LogManager (org.apache.log4j.LogManager) 7
Logger (org.apache.log4j.Logger) 7
Collectors (java.util.stream.Collectors) 6