Example 1 with HoodieSparkCopyOnWriteTable

Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.

From the class TestCopyOnWriteActionExecutor, method testInsertUpsertWithHoodieAvroPayload.

@Test
public void testInsertUpsertWithHoodieAvroPayload() throws Exception {
    Schema schema = getSchemaFromResource(TestCopyOnWriteActionExecutor.class, "/testDataGeneratorSchema.txt");
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withSchema(schema.toString())
            .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
                    .withRemoteServerPort(timelineServicePort).build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                    .parquetMaxFileSize(1000 * 1024)
                    .hfileMaxFileSize(1000 * 1024)
                    .build())
            .build();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    String instantTime = "000";
    // Perform inserts of 100 records to test CreateHandle and BufferedExecutor
    final List<HoodieRecord> inserts = dataGen.generateInsertsWithHoodieAvroPayload(instantTime, 100);
    BaseSparkCommitActionExecutor actionExecutor = new SparkInsertCommitActionExecutor(context, config, table, instantTime, context.parallelize(inserts));
    final List<List<WriteStatus>> ws = jsc.parallelize(Arrays.asList(1)).map(x -> {
        return actionExecutor.handleInsert(UUID.randomUUID().toString(), inserts.iterator());
    }).map(Transformations::flatten).collect();
    WriteStatus writeStatus = ws.get(0).get(0);
    String fileId = writeStatus.getFileId();
    metaClient.getFs().create(new Path(Paths.get(basePath, ".hoodie", "000.commit").toString())).close();
    final List<HoodieRecord> updates = dataGen.generateUpdatesWithHoodieAvroPayload(instantTime, inserts);
    String partitionPath = writeStatus.getPartitionPath();
    long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
    table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, HoodieTableMetaClient.reload(metaClient));
    BaseSparkCommitActionExecutor newActionExecutor = new SparkUpsertCommitActionExecutor(context, config, table, instantTime, context.parallelize(updates));
    final List<List<WriteStatus>> updateStatus = jsc.parallelize(Arrays.asList(1)).map(x -> {
        return newActionExecutor.handleUpdate(partitionPath, fileId, updates.iterator());
    }).map(Transformations::flatten).collect();
    assertEquals(updates.size() - numRecordsInPartition, updateStatus.get(0).get(0).getTotalErrorRecords());
}
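All five examples share the same table-construction boilerplate before exercising a specific action. The fragment below is a minimal sketch of that pattern, not code taken verbatim from the hudi tests; it assumes basePath, context, and metaClient are fields supplied by the surrounding test base class, as in the method above.

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
import org.apache.hudi.table.HoodieSparkTable;

// Sketch only: build a copy-on-write table handle the way the tests above do.
// basePath, context and metaClient are assumed to come from the test harness.
private HoodieSparkCopyOnWriteTable createCowTable(HoodieWriteConfig config) {
    // Reload so the table handle sees the latest timeline state.
    metaClient = HoodieTableMetaClient.reload(metaClient);
    return (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
}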

Example 2 with HoodieSparkCopyOnWriteTable

Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.

From the class TestCopyOnWriteActionExecutor, method testUpdateRecords.

// TODO (weiy): Add testcases for crossing file writing.
@ParameterizedTest
@MethodSource("indexType")
public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception {
    // Prepare the AvroParquetIO
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withProps(makeIndexConfig(indexType)).build();
    String firstCommitTime = makeNewCommitTime();
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    writeClient.startCommitWithTime(firstCommitTime);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    String partitionPath = "2016/01/31";
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    // Get some records belonging to the same partition (2016/01/31)
    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
    List<HoodieRecord> records = new ArrayList<>();
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
    // Insert new records
    final HoodieSparkCopyOnWriteTable cowTable = table;
    writeClient.insert(jsc.parallelize(records, 1), firstCommitTime);
    FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
    assertEquals(1, allFiles.length);
    // Read out the bloom filter and make sure it can answer whether a record exists
    Path filePath = allFiles[0].getPath();
    BloomFilter filter = BaseFileUtils.getInstance(table.getBaseFileFormat()).readBloomFilterFromMetadata(hadoopConf, filePath);
    for (HoodieRecord record : records) {
        assertTrue(filter.mightContain(record.getRecordKey()));
    }
    // Read the base file, check the record content
    List<GenericRecord> fileRecords = BaseFileUtils.getInstance(table.getBaseFileFormat()).readAvroRecords(hadoopConf, filePath);
    GenericRecord newRecord;
    int index = 0;
    for (GenericRecord record : fileRecords) {
        assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
        index++;
    }
    // We update the 1st record & add a new record
    String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1);
    HoodieRecord updatedRecord1 = new HoodieAvroRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord insertedRecord1 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
    Thread.sleep(1000);
    String newCommitTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    writeClient.startCommitWithTime(newCommitTime);
    List<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(updatedRecords), newCommitTime).collect();
    allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
    assertEquals(1, allFiles.length);
    // Verify that the new file group is the same as the previous one
    assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
    // Check whether the record has been updated
    Path updatedFilePath = allFiles[0].getPath();
    BloomFilter updatedFilter = BaseFileUtils.getInstance(metaClient).readBloomFilterFromMetadata(hadoopConf, updatedFilePath);
    for (HoodieRecord record : records) {
        // No change to the _row_key
        assertTrue(updatedFilter.mightContain(record.getRecordKey()));
    }
    assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
    // Add the new record so the checks below cover it as well
    records.add(insertedRecord1);
    ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build();
    index = 0;
    while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
        assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey());
        if (index == 0) {
            assertEquals("15", newRecord.get("number").toString());
        }
        index++;
    }
    updatedReader.close();
    // Also check the numRecordsWritten
    WriteStatus writeStatus = statuses.get(0);
    assertEquals(1, statuses.size(), "Should be only one file generated");
    // 3 rewritten records + 1 new record
    assertEquals(4, writeStatus.getStat().getNumWrites());
}
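The JSON-string-to-record conversion above is repeated for each of the four trip payloads. A small helper like the one below (hypothetical, not part of the test) captures that pattern: RawTripTestPayload parses the JSON, and the record key and partition path come from the payload itself. It declares throws Exception to match the style of the test methods above.

import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.RawTripTestPayload;

// Hypothetical helper: wrap a raw JSON trip string into a HoodieRecord,
// mirroring what the test above does inline for recordStr1..recordStr4.
private static HoodieRecord toHoodieRecord(String json) throws Exception {
    RawTripTestPayload payload = new RawTripTestPayload(json);
    return new HoodieAvroRecord(new HoodieKey(payload.getRowKey(), payload.getPartitionPath()), payload);
}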

Example 3 with HoodieSparkCopyOnWriteTable

Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.

From the class TestUpsertPartitioner, method testPartitionWeight.

@Test
public void testPartitionWeight() throws Exception {
    final String testPartitionPath = "2016/09/26";
    int totalInsertNum = 2000;
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                    .compactionSmallFileSize(0)
                    .insertSplitSize(totalInsertNum / 2)
                    .autoTuneInsertSplits(false)
                    .build())
            .build();
    FileCreateUtils.createCommit(basePath, "001");
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", totalInsertNum);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
    UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
    List<InsertBucketCumulativeWeightPair> insertBuckets = partitioner.getInsertBuckets(testPartitionPath);
    float bucket0Weight = 0.2f;
    InsertBucketCumulativeWeightPair pair = insertBuckets.remove(0);
    pair.getKey().weight = bucket0Weight;
    pair.setValue(new Double(bucket0Weight));
    insertBuckets.add(0, pair);
    InsertBucketCumulativeWeightPair pair1 = insertBuckets.remove(1);
    pair1.getKey().weight = 1 - bucket0Weight;
    pair1.setValue(new Double(1));
    insertBuckets.add(1, pair1);
    Map<Integer, Integer> partition2numRecords = new HashMap<Integer, Integer>();
    for (HoodieRecord hoodieRecord : insertRecords) {
        int partition = partitioner.getPartition(new Tuple2<>(hoodieRecord.getKey(), Option.ofNullable(hoodieRecord.getCurrentLocation())));
        if (!partition2numRecords.containsKey(partition)) {
            partition2numRecords.put(partition, 0);
        }
        partition2numRecords.put(partition, partition2numRecords.get(partition) + 1);
    }
    assertTrue(partition2numRecords.get(0) < partition2numRecords.get(1), "The insert num of bucket1 should be more than that of bucket0");
    assertTrue(partition2numRecords.get(0) + partition2numRecords.get(1) == totalInsertNum, "The total insert records should be " + totalInsertNum);
    assertEquals(String.valueOf(bucket0Weight), String.format("%.1f", (partition2numRecords.get(0) * 1.0f / totalInsertNum)), "The weight of bucket0 should be " + bucket0Weight);
    assertEquals(String.valueOf(1 - bucket0Weight), String.format("%.1f", (partition2numRecords.get(1) * 1.0f / totalInsertNum)), "The weight of bucket1 should be " + (1 - bucket0Weight));
}
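For reference, with totalInsertNum = 2000 and bucket0Weight = 0.2f, the expected split is roughly 2000 × 0.2 = 400 records in bucket 0 and 2000 × 0.8 = 1600 in bucket 1; the final two assertions verify exactly this by comparing the observed ratios, formatted to one decimal place, against the configured weights.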

Example 4 with HoodieSparkCopyOnWriteTable

Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.

From the class TestUpsertPartitioner, method getUpsertPartitioner.

private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates, int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                    .compactionSmallFileSize(smallFileSize)
                    .insertSplitSize(100)
                    .autoTuneInsertSplits(autoSplitInserts)
                    .build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                    .hfileMaxFileSize(1000 * 1024)
                    .parquetMaxFileSize(1000 * 1024)
                    .orcMaxFileSize(1000 * 1024)
                    .build())
            .build();
    FileCreateUtils.createCommit(basePath, "001");
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "001", "file1", fileSize);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
    List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
    for (HoodieRecord updateRec : updateRecords) {
        updateRec.unseal();
        updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
        updateRec.seal();
    }
    List<HoodieRecord> records = new ArrayList<>();
    records.addAll(insertRecords);
    records.addAll(updateRecords);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(records)));
    UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
    assertEquals(0, partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.ofNullable(updateRecords.get(0).getCurrentLocation()))), "Update record should have gone to the 1 update partition");
    return partitioner;
}
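A hypothetical call site for this helper is sketched below; the argument values are illustrative only and are not taken from the surrounding tests.

// Illustrative only: 400 inserts and 100 updates against an existing ~800 KB
// base file ("file1"), with small-file handling enabled via a 1000 KB threshold.
// Because the existing file is under the threshold, inserts may be routed into
// that small file before the partitioner opens new file groups.
UpsertPartitioner partitioner =
        getUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, "2016/09/26", false);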

Example 5 with HoodieSparkCopyOnWriteTable

Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testUpdateRejectForClustering.

@Test
public void testUpdateRejectForClustering() throws IOException {
    final String testPartitionPath = "2016/09/26";
    dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    Properties props = new Properties();
    props.setProperty(ASYNC_CLUSTERING_ENABLE.key(), "true");
    HoodieWriteConfig config = getSmallInsertWriteConfig(100, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
    SparkRDDWriteClient client = getHoodieWriteClient(config);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    // 1. Insert to generate 2 file groups
    String commitTime1 = "001";
    Pair<List<WriteStatus>, List<HoodieRecord>> upsertResult = insertBatchRecords(client, commitTime1, 600, 2);
    List<HoodieRecord> inserts1 = upsertResult.getValue();
    List<String> fileGroupIds1 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
    assertEquals(2, fileGroupIds1.size());
    // 2. Generate a clustering plan covering the fileGroupIds1 file groups
    String commitTime2 = "002";
    List<List<FileSlice>> firstInsertFileSlicesList = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getAllFileSlices().collect(Collectors.toList())).collect(Collectors.toList());
    List<FileSlice>[] fileSlices = (List<FileSlice>[]) firstInsertFileSlicesList.toArray(new List[firstInsertFileSlicesList.size()]);
    createRequestedReplaceInstant(this.metaClient, commitTime2, fileSlices);
    // 3. Insert one record; no update-reject exception is thrown, and the record is not merged into the pending-clustering files but lands in a new file group
    String commitTime3 = "003";
    insertBatchRecords(client, commitTime3, 1, 1).getKey();
    List<String> fileGroupIds2 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
    assertEquals(3, fileGroupIds2.size());
    // 4. Update one record belonging to the two file groups under clustering; an update-reject exception is expected
    String commitTime4 = "004";
    client.startCommitWithTime(commitTime4);
    List<HoodieRecord> insertsAndUpdates3 = new ArrayList<>();
    insertsAndUpdates3.addAll(dataGen.generateUpdates(commitTime4, inserts1));
    String assertMsg = String.format("Not allowed to update the clustering files in partition: %s " + "For pending clustering operations, we are not going to support update for now.", testPartitionPath);
    assertThrows(HoodieUpsertException.class, () -> {
        writeClient.upsert(jsc.parallelize(insertsAndUpdates3, 1), commitTime3).collect();
    }, assertMsg);
    // 5. Insert one record; no update-reject exception, and the record merges into the small file
    String commitTime5 = "005";
    List<WriteStatus> statuses = insertBatchRecords(client, commitTime5, 1, 1).getKey();
    fileGroupIds2.removeAll(fileGroupIds1);
    assertEquals(fileGroupIds2.get(0), statuses.get(0).getFileId());
    List<String> firstInsertFileGroupIds4 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
    assertEquals(3, firstInsertFileGroupIds4.size());
}
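To trace the file-group counts: step 1 creates two file groups, step 2 places both under a pending clustering plan, and step 3's single-record insert cannot be merged into files pending clustering, so a third file group appears (hence assertEquals(3, fileGroupIds2.size())). Step 4's update targets the clustered file groups and is rejected with HoodieUpsertException, while step 5's insert merges into the small third file group, so the final count stays at 3.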
