
Example 41 with Option

Use of org.apache.hudi.common.util.Option in project hudi (by apache).

From the class TestHoodieIndex, method testTagLocationAndFetchRecordLocations:

@ParameterizedTest
@MethodSource("regularIndexTypeParams")
public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
    setUp(indexType, populateMetaFields, enableMetadataIndex);
    String p1 = "2016/01/31";
    String p2 = "2015/01/31";
    String rowKey1 = UUID.randomUUID().toString();
    String rowKey2 = UUID.randomUUID().toString();
    String rowKey3 = UUID.randomUUID().toString();
    String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    // Place the same row key under a different partition.
    String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
    String newCommitTime = writeClient.startCommit();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    writeClient.upsert(recordRDD, newCommitTime);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    // Should not find any files
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        assertFalse(record.isCurrentLocationKnown());
    }
    // We create three parquet files, each having one record (two different partitions)
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
    final String fileId1 = "fileID1";
    final String fileId2 = "fileID2";
    final String fileId3 = "fileID3";
    Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    Path baseFilePath = testTable.forCommit("0000001").withInserts(p1, fileId1, Collections.singletonList(record1));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation("0000001", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
    partitionToFilesNameLengthMap.clear();
    baseFilePath = testTable.forCommit("0000002").withInserts(p1, fileId2, Collections.singletonList(record2));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation("0000002", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
    partitionToFilesNameLengthMap.clear();
    baseFilePath = testTable.forCommit("0000003").withInserts(p2, fileId3, Collections.singletonList(record4));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p2, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation("0000003", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
    // Tag the records again, now that base files have been written
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    List<HoodieRecord> records = taggedRecordRDD.collect();
    // Check results
    for (HoodieRecord record : records) {
        if (record.getRecordKey().equals(rowKey1)) {
            if (record.getPartitionPath().equals(p2)) {
                assertEquals(fileId3, record.getCurrentLocation().getFileId());
            } else {
                assertEquals(fileId1, record.getCurrentLocation().getFileId());
            }
        } else if (record.getRecordKey().equals(rowKey2)) {
            assertEquals(fileId2, record.getCurrentLocation().getFileId());
        } else if (record.getRecordKey().equals(rowKey3)) {
            assertFalse(record.isCurrentLocationKnown());
        }
    }
    JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(recordRDD.map(HoodieRecord::getKey), hoodieTable);
    for (Tuple2<HoodieKey, Option<Pair<String, String>>> entry : recordLocations.collect()) {
        if (entry._1.getRecordKey().equals(rowKey1)) {
            assertTrue(entry._2.isPresent(), "Row1 should have been present");
            if (entry._1.getPartitionPath().equals(p2)) {
                assertEquals(fileId3, entry._2.get().getRight());
            } else {
                assertEquals(fileId1, entry._2.get().getRight());
            }
        } else if (entry._1.getRecordKey().equals(rowKey2)) {
            assertTrue(entry._2.isPresent(), "Row2 should have been present");
            assertEquals(fileId2, entry._2.get().getRight());
        } else if (entry._1.getRecordKey().equals(rowKey3)) {
            assertFalse(entry._2.isPresent(), "Row3 should have been absent");
        }
    }
}
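
The Option usage that matters in this test is the Option<Pair<String, String>> returned per key by getRecordLocations: present means the key is indexed, absent means it is not, and the test only inspects the right element of the pair, which is the file id. Below is a minimal, self-contained sketch of that unwrap pattern; the class and helper names are invented for illustration and are not part of Hudi.

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

public class RecordLocationSketch {

    // Returns the file id if the key was found in the index, otherwise a placeholder.
    static String fileIdOrUnknown(Option<Pair<String, String>> location) {
        if (location.isPresent()) {
            // Mirrors the test above: the right element of the pair is the file id.
            return location.get().getRight();
        }
        return "<not indexed>";
    }

    public static void main(String[] args) {
        Option<Pair<String, String>> hit = Option.of(Pair.of("0000001", "fileID1"));
        Option<Pair<String, String>> miss = Option.empty();
        System.out.println(fileIdOrUnknown(hit));   // prints fileID1
        System.out.println(fileIdOrUnknown(miss));  // prints <not indexed>
    }
}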

Example 42 with Option

Use of org.apache.hudi.common.util.Option in project hudi (by apache).

From the class SparkHoodieHBaseIndex, method updateLocationFunction:

private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {
    return (partition, statusIterator) -> {
        List<WriteStatus> writeStatusList = new ArrayList<>();
        // Grab the global HBase connection
        synchronized (SparkHoodieHBaseIndex.class) {
            if (hbaseConnection == null || hbaseConnection.isClosed()) {
                hbaseConnection = getHBaseConnection();
            }
        }
        final long startTimeForPutsTask = DateTime.now().getMillis();
        LOG.info("startTimeForPutsTask for this task: " + startTimeForPutsTask);
        try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
            final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
            while (statusIterator.hasNext()) {
                WriteStatus writeStatus = statusIterator.next();
                List<Mutation> mutations = new ArrayList<>();
                try {
                    long numOfInserts = writeStatus.getStat().getNumInserts();
                    LOG.info("Num of inserts in this WriteStatus: " + numOfInserts);
                    LOG.info("Total inserts in this job: " + this.totalNumInserts);
                    LOG.info("multiPutBatchSize for this job: " + this.multiPutBatchSize);
                    // Any calls beyond `multiPutBatchSize` within a second will be rate limited
                    for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
                        if (!writeStatus.isErrored(rec.getKey())) {
                            Option<HoodieRecordLocation> loc = rec.getNewLocation();
                            if (loc.isPresent()) {
                                if (rec.getCurrentLocation() != null) {
                                    // This is an update of an already-indexed record; the index entry does not change
                                    continue;
                                }
                                Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
                                mutations.add(put);
                            } else {
                                // Delete the existing index entry for a deleted record
                                Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
                                mutations.add(delete);
                            }
                        }
                        if (mutations.size() < multiPutBatchSize) {
                            continue;
                        }
                        doMutations(mutator, mutations, limiter);
                    }
                    // process remaining puts and deletes, if any
                    doMutations(mutator, mutations, limiter);
                } catch (Exception e) {
                    Exception we = new Exception("Error updating index for " + writeStatus, e);
                    LOG.error(we);
                    writeStatus.setGlobalError(we);
                }
                writeStatusList.add(writeStatus);
            }
            final long endPutsTime = DateTime.now().getMillis();
            LOG.info("hbase puts task time for this task: " + (endPutsTime - startTimeForPutsTask));
        } catch (IOException e) {
            throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
        }
        return writeStatusList.iterator();
    };
}
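
The Option that drives this function is rec.getNewLocation(): a present location on a record with no current location produces an HBase Put for the index, while an absent location produces a Delete. Below is a minimal sketch of that branch pulled out into a hypothetical helper; the method name and the column-family/qualifier parameters are invented stand-ins for the constants used above, and only Hudi and HBase calls already shown in this example are used.

import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.Option;

public class IndexMutationSketch {

    // Hypothetical helper, not part of SparkHoodieHBaseIndex: builds the index mutation for one written record.
    static Option<Mutation> toMutation(HoodieRecord rec, byte[] family, byte[] commitTsCol, byte[] fileNameCol, byte[] partitionPathCol) {
        Option<HoodieRecordLocation> loc = rec.getNewLocation();
        if (loc.isPresent()) {
            if (rec.getCurrentLocation() != null) {
                // Update of an already-indexed record: the index row stays as it is.
                return Option.empty();
            }
            Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
            put.addColumn(family, commitTsCol, Bytes.toBytes(loc.get().getInstantTime()));
            put.addColumn(family, fileNameCol, Bytes.toBytes(loc.get().getFileId()));
            put.addColumn(family, partitionPathCol, Bytes.toBytes(rec.getPartitionPath()));
            return Option.of(put);
        }
        // Absent new location: the record was deleted, so drop its index entry.
        return Option.of(new Delete(Bytes.toBytes(rec.getRecordKey())));
    }
}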

Example 43 with Option

Use of org.apache.hudi.common.util.Option in project hudi (by apache).

From the class BaseSparkCommitActionExecutor, method mapPartitionsAsRDD:

private HoodieData<WriteStatus> mapPartitionsAsRDD(HoodieData<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
    JavaPairRDD<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> mappedRDD = HoodieJavaPairRDD.getJavaPairRDD(dedupedRecords.mapToPair(record -> Pair.of(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)));
    JavaPairRDD<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> partitionedRDD;
    if (table.requireSortedRecords()) {
        // Partition and sort within each partition as a single step. This is faster than partitioning first and then
        // applying a sort.
        Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> comparator = (Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> & Serializable) (t1, t2) -> {
            HoodieKey key1 = t1._1;
            HoodieKey key2 = t2._1;
            return key1.getRecordKey().compareTo(key2.getRecordKey());
        };
        partitionedRDD = mappedRDD.repartitionAndSortWithinPartitions(partitioner, comparator);
    } else {
        // Partition only
        partitionedRDD = mappedRDD.partitionBy(partitioner);
    }
    return HoodieJavaRDD.of(partitionedRDD.map(Tuple2::_2).mapPartitionsWithIndex((partition, recordItr) -> {
        if (WriteOperationType.isChangingRecords(operationType)) {
            return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
        } else {
            return handleInsertPartition(instantTime, partition, recordItr, partitioner);
        }
    }, true).flatMap(List::iterator));
}
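
Here Option.ofNullable wraps the possibly-null current location so every record can be keyed by a non-null Tuple2<HoodieKey, Option<HoodieRecordLocation>> before partitioning. Below is a minimal sketch of building such a key plus the serializable-comparator cast used above; the class and method names are invented for illustration and are not part of Hudi.

import java.io.Serializable;
import java.util.Comparator;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.Option;
import scala.Tuple2;

public class TaggedKeySketch {

    // Wraps a possibly-null location into an Option so the composite key never carries a raw null.
    static Tuple2<HoodieKey, Option<HoodieRecordLocation>> taggedKey(HoodieKey key, HoodieRecordLocation currentLocation) {
        return new Tuple2<>(key, Option.ofNullable(currentLocation));
    }

    // Same intersection-type cast as above: it keeps the lambda serializable so Spark can ship it to executors.
    static Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> byRecordKey() {
        return (Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> & Serializable)
                (t1, t2) -> t1._1.getRecordKey().compareTo(t2._1.getRecordKey());
    }
}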

Example 44 with Option

Use of org.apache.hudi.common.util.Option in project hudi (by apache).

From the class SparkBulkInsertHelper, method bulkInsert:

@Override
public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords, String instantTime, HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table, HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner, boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
    // De-dupe/merge if needed
    HoodieData<HoodieRecord<T>> dedupedRecords = inputRecords;
    if (performDedupe) {
        dedupedRecords = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
    }
    final HoodieData<HoodieRecord<T>> repartitionedRecords;
    BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent() ? userDefinedBulkInsertPartitioner.get() : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
    // Only JavaRDD is supported by the Spark partitioner here, but the BulkInsertPartitioner API does not enforce that; see TODO HUDI-3463.
    repartitionedRecords = HoodieJavaRDD.of((JavaRDD<HoodieRecord<T>>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism));
    // generate new file ID prefixes for each output partition
    final List<String> fileIDPrefixes = IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());
    JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords).mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, partitioner.arePartitionRecordsSorted(), config, table, fileIDPrefixes, useWriterSchema, writeHandleFactory), true).flatMap(List::iterator);
    return HoodieJavaRDD.of(writeStatusRDD);
}
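
The Option here is userDefinedBulkInsertPartitioner: the helper falls back to the factory default only when the caller has not supplied a partitioner, using the plain isPresent()/get() pattern. Below is a minimal sketch of that fallback with strings standing in for BulkInsertPartitioner so the snippet stays self-contained; all names are invented for illustration.

import org.apache.hudi.common.util.Option;

public class PartitionerFallbackSketch {

    // Returns the user-supplied value when present, otherwise the configured default.
    static String chooseSortMode(Option<String> userDefined, String configuredDefault) {
        return userDefined.isPresent() ? userDefined.get() : configuredDefault;
    }

    public static void main(String[] args) {
        System.out.println(chooseSortMode(Option.of("GLOBAL_SORT"), "NONE"));  // prints GLOBAL_SORT
        System.out.println(chooseSortMode(Option.empty(), "NONE"));            // prints NONE
    }
}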

Example 45 with Option

Use of org.apache.hudi.common.util.Option in project hudi (by apache).

From the class RestorePlanActionExecutor, method execute:

@Override
public Option<HoodieRestorePlan> execute() {
    final HoodieInstant restoreInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.RESTORE_ACTION, instantTime);
    try {
        // Roll back pending clustering instants first, before other instants (see HUDI-3362)
        List<HoodieInstant> pendingClusteringInstantsToRollback = table.getActiveTimeline().filterPendingReplaceTimeline().filter(instant -> ClusteringUtils.isPendingClusteringInstant(table.getMetaClient(), instant)).getReverseOrderedInstants().filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)).collect(Collectors.toList());
        // Get all the commits on the timeline after the provided commit time
        List<HoodieInstant> commitInstantsToRollback = table.getActiveTimeline().getWriteTimeline().getReverseOrderedInstants().filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)).filter(instant -> !pendingClusteringInstantsToRollback.contains(instant)).collect(Collectors.toList());
        // Combine both lists - first rollback pending clustering and then rollback all other commits
        List<HoodieInstantInfo> instantsToRollback = Stream.concat(pendingClusteringInstantsToRollback.stream(), commitInstantsToRollback.stream()).map(entry -> new HoodieInstantInfo(entry.getTimestamp(), entry.getAction())).collect(Collectors.toList());
        HoodieRestorePlan restorePlan = new HoodieRestorePlan(instantsToRollback, LATEST_RESTORE_PLAN_VERSION);
        table.getActiveTimeline().saveToRestoreRequested(restoreInstant, TimelineMetadataUtils.serializeRestorePlan(restorePlan));
        table.getMetaClient().reloadActiveTimeline();
        LOG.info("Requesting Restore with instant time " + restoreInstant);
        return Option.of(restorePlan);
    } catch (IOException e) {
        LOG.error("Got exception when saving restore requested file", e);
        throw new HoodieIOException(e.getMessage(), e);
    }
}
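
The executor's contract is to return an Option<HoodieRestorePlan>: a present plan has been serialized into a restore-requested instant, while an empty Option (not produced by the snippet above, but allowed by the signature) would mean no plan was scheduled. Below is a minimal sketch of how a caller might consume such a result; the class and method names are invented for illustration.

import org.apache.hudi.common.util.Option;

public class PlanConsumerSketch {

    // Generic consumer of an Option-returning planner: acts only when a plan is actually present.
    static <P> void scheduleIfPresent(Option<P> maybePlan) {
        if (maybePlan.isPresent()) {
            System.out.println("Scheduling plan: " + maybePlan.get());
        } else {
            System.out.println("No plan produced; nothing to do.");
        }
    }

    public static void main(String[] args) {
        scheduleIfPresent(Option.of("restore-plan@0000003"));  // prints Scheduling plan: restore-plan@0000003
        scheduleIfPresent(Option.empty());                     // prints No plan produced; nothing to do.
    }
}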

Aggregations

Option (org.apache.hudi.common.util.Option): 105
List (java.util.List): 84
IOException (java.io.IOException): 70
Collectors (java.util.stream.Collectors): 69
Map (java.util.Map): 67
ArrayList (java.util.ArrayList): 61
Path (org.apache.hadoop.fs.Path): 59
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 59
Pair (org.apache.hudi.common.util.collection.Pair): 59
HashMap (java.util.HashMap): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 58
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 56
LogManager (org.apache.log4j.LogManager): 54
Logger (org.apache.log4j.Logger): 54
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 53
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 46
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 44
Arrays (java.util.Arrays): 43
FSUtils (org.apache.hudi.common.fs.FSUtils): 43
Collections (java.util.Collections): 39