Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
From the class TestUpsertPartitioner, method testAverageBytesPerRecordForEmptyCommitTimeLine:
@Test
public void testAverageBytesPerRecordForEmptyCommitTimeLine() throws Exception {
  HoodieTimeline commitTimeLine = mock(HoodieTimeline.class);
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().build();
  when(commitTimeLine.empty()).thenReturn(true);
  long expectAvgSize = config.getCopyOnWriteRecordSizeEstimate();
  long actualAvgSize = averageBytesPerRecord(commitTimeLine, config);
  assertEquals(expectAvgSize, actualAvgSize);
}
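The shared makeHoodieClientConfigBuilder() helper is not shown in these excerpts. A minimal sketch of what it plausibly looks like, assuming the test class exposes a basePath field and reuses the test data generator's schema (both assumptions, not confirmed by the excerpt):

private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() {
  // Assumed helper: points the write config at the test table path and reuses the generator schema.
  return HoodieWriteConfig.newBuilder()
      .withPath(basePath)
      .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA);
}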
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
From the class TestUpsertPartitioner, helper method getUpsertPartitioner:
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates,
    int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize)
          .insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts)
          .build())
      .withStorageConfig(HoodieStorageConfig.newBuilder()
          .hfileMaxFileSize(1000 * 1024)
          .parquetMaxFileSize(1000 * 1024)
          .orcMaxFileSize(1000 * 1024)
          .build())
      .build();
  FileCreateUtils.createCommit(basePath, "001");
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "001", "file1", fileSize);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.unseal();
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
    updateRec.seal();
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(records)));
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  assertEquals(0, partitioner.getPartition(new Tuple2<>(
      updateRecords.get(0).getKey(), Option.ofNullable(updateRecords.get(0).getCurrentLocation()))),
      "Update record should have gone to the 1 update partition");
  return partitioner;
}
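A hypothetical caller of this helper, illustrative only; the concrete threshold, record counts, and assertion are assumptions, not taken from the project:

@Test
public void exampleSmallFileHandling() throws Exception {
  // Hypothetical scenario: 1000 KB small-file threshold, 200 inserts and 100 updates against a 1 KB base file.
  UpsertPartitioner partitioner = getUpsertPartitioner(1000 * 1024, 200, 100, 1024, DEFAULT_PARTITION_PATHS[0], false);
  // The 1 KB base file sits well below the threshold, so it should be picked up as a small file
  // and inserts should be bin-packed into its file group instead of opening a new one.
  assertEquals(1, partitioner.smallFiles.size(), "The existing base file should be tracked as a small file");
}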
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
From the class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles:
@Test
public void testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles() throws Exception {
  // Note: this partition path is used because it is the same one used in CompactionTestUtils.createCompactionPlan().
  final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(1024)
          .build())
      .withIndexConfig(HoodieIndexConfig.newBuilder()
          .withIndexType(HoodieIndex.IndexType.HBASE)
          .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build())
          .build())
      .build();
  // Generate the initial commits and create a compaction plan that includes the file groups created as part of them.
  HoodieCompactionPlan plan = CompactionTestUtils.createCompactionPlan(metaClient, "001", "002", 1, true, false);
  FileCreateUtils.createRequestedCompactionCommit(basePath, "002", plan);
  // Simulate one more commit so that the inflight compaction is considered when building file groups in the file system view.
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "2", 1);
  FileCreateUtils.createCommit(basePath, "003");
  // The partitioner will attempt to assign inserts to file groups, including the base file created by the inflight compaction.
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
  HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
  SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config);
  assertEquals(1, partitioner.numPartitions(), "Should have 1 partition");
  assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, "Bucket 0 is UPDATE");
  assertEquals("2", partitioner.getBucketInfo(0).fileIdPrefix, "Should be assigned to the only file id not pending compaction, which is 2");
}
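The test hinges on the compaction requested at instant 002 still being pending when the file system view is built. A small, optional sanity check that could be added before constructing the partitioner is sketched below; it is an assumption of this write-up, not part of the original test, and relies only on the standard timeline API:

// Optional sanity check: the compaction requested at "002" should still be pending on the reloaded timeline.
HoodieTimeline pendingCompactions = metaClient.getActiveTimeline().filterPendingCompactionTimeline();
assertEquals(1, pendingCompactions.countInstants(), "Compaction at 002 should still be pending");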
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
From the class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan:
@Test
public void testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan() throws Exception {
  final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
  // Create a HoodieWriteConfig with both inline and async clustering disabled.
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().build())
      .withClusteringConfig(HoodieClusteringConfig.newBuilder()
          .withInlineClustering(false)
          .withAsyncClustering(false)
          .build())
      .withStorageConfig(HoodieStorageConfig.newBuilder()
          .hfileMaxFileSize(1000 * 1024)
          .parquetMaxFileSize(1000 * 1024)
          .build())
      .build();
  // Create a file slice at instant time 001 and build a clustering plan that includes it.
  HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, "001", "1");
  // Create the requested replace commit for the clustering plan.
  HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder()
      .setClusteringPlan(clusteringPlan)
      .setOperationType(WriteOperationType.CLUSTER.name())
      .build();
  FileCreateUtils.createRequestedReplaceCommit(basePath, "002", Option.of(requestedReplaceMetadata));
  // Create a second file slice at instant time 003.
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "3", 1);
  FileCreateUtils.createCommit(basePath, "003");
  metaClient = HoodieTableMetaClient.reload(metaClient);
  // Generate new data to be ingested.
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
  HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
  // Create the UpsertPartitioner.
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  // At this point there are the 001 and 003 file slices, and the 001 slice is part of the pending
  // clustering plan, so only the 003 slice can be used for ingestion.
  assertEquals(1, partitioner.smallFiles.size(), "Should have 1 small file to be ingested.");
}
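If one wanted to tighten the final assertion, the surviving small file could also be checked against the 003 base file. This follow-up is an assumption of this write-up (it presumes the SmallFile entries exposed via partitioner.smallFiles carry a HoodieRecordLocation, as they do in the Hudi versions these tests appear to target):

// Assumed follow-up check: the only small file left for ingestion should be the "3" base file written at 003,
// since the 001 slice is referenced by the pending clustering plan.
SmallFile smallFile = partitioner.smallFiles.get(0);
assertEquals("003", smallFile.location.getInstantTime());
assertEquals("3", smallFile.location.getFileId());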
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
From the class CompactionTestBase, method executeCompaction:
protected void executeCompaction(String compactionInstantTime, SparkRDDWriteClient client, HoodieTable table,
    HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
  client.compact(compactionInstantTime);
  assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, compactionInstantTime).doesMarkerDirExist());
  List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table);
  assertTrue(fileSliceList.stream().findAny().isPresent(), "Ensure latest file-slices are not empty");
  assertFalse(fileSliceList.stream().anyMatch(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)),
      "Verify all file-slices have base-instant same as compaction instant");
  assertFalse(fileSliceList.stream().anyMatch(fs -> !fs.getBaseFile().isPresent()),
      "Verify all file-slices have data-files");
  if (hasDeltaCommitAfterPendingCompaction) {
    assertFalse(fileSliceList.stream().anyMatch(fs -> fs.getLogFiles().count() == 0),
        "Verify all file-slices have at least one log-file");
  } else {
    assertFalse(fileSliceList.stream().anyMatch(fs -> fs.getLogFiles().count() > 0),
        "Verify all file-slices have no log-files");
  }
  // Verify that the compaction produced a completed commit on the timeline.
  table = getHoodieTable(HoodieTableMetaClient.builder()
      .setConf(hadoopConf)
      .setBasePath(cfg.getBasePath())
      .setLoadActiveTimelineOnLoad(true)
      .build(), cfg);
  HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
  String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
  assertEquals(latestCompactionCommitTime, compactionInstantTime,
      "Expect compaction instant time to be the latest commit time");
  assertEquals(expectedNumRecs,
      HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of("000")),
      "Must contain expected records");
}
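A hypothetical call site for this helper inside a MOR compaction test; the instant time, the expected record count, and the metaClient field name are illustrative assumptions, while scheduleCompactionAtInstant is the standard write-client API for requesting a compaction at a given instant:

// Hypothetical usage: schedule a compaction at a new instant, then run it and verify the resulting file slices.
String compactionInstant = "005";
client.scheduleCompactionAtInstant(compactionInstant, Option.empty());
HoodieTable hoodieTable = getHoodieTable(HoodieTableMetaClient.reload(metaClient), cfg);
executeCompaction(compactionInstant, client, hoodieTable, cfg, /* expectedNumRecs */ 200, /* hasDeltaCommitAfterPendingCompaction */ false);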