Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
From the class TestHoodieBackedMetadata, method testVirtualKeysInBaseFiles.
/**
* Tests that virtual key configs are honored in base files after compaction in the metadata table.
*
* @throws Exception if the write, clean, or metadata table operations fail
*/
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testVirtualKeysInBaseFiles(boolean populateMetaFields) throws Exception {
HoodieTableType tableType = MERGE_ON_READ;
init(tableType, false);
writeConfig = getWriteConfigBuilder(true, true, false)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder()
        .enable(true)
        .enableFullScan(true)
        .enableMetrics(false)
        .withPopulateMetaFields(populateMetaFields)
        .withMaxNumDeltaCommitsBeforeCompaction(2)
        .build())
    .build();
initWriteConfigAndMetatableWriter(writeConfig, true);
doWriteOperation(testTable, "0000001", INSERT);
doClean(testTable, "0000003", Arrays.asList("0000001"));
// this should have triggered compaction in metadata table
doWriteOperation(testTable, "0000004", UPSERT);
HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
assertTrue(tableMetadata.getLatestCompactionTime().isPresent());
assertEquals(tableMetadata.getLatestCompactionTime().get(), "0000003001");
HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig);
metadataMetaClient.reloadActiveTimeline();
HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient);
table.getHoodieView().sync();
List<FileSlice> fileSlices = table.getSliceView().getLatestFileSlices("files").collect(Collectors.toList());
HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get();
HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), new Path(baseFile.getPath()), new CacheConfig(context.getHadoopConf().get()));
List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
records.forEach(entry -> {
  if (populateMetaFields) {
    assertNotNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
  } else {
    assertNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
  }
});
}
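The read-side check at the core of this test can be distilled into a small standalone helper. The sketch below is only illustrative: it reuses the exact calls shown above (the HoodieHFileReader constructor, readAllRecords, and the RECORD_KEY_METADATA_FIELD lookup), while the helper name and its Configuration/path parameters are hypothetical and would come from the surrounding test harness and its imports.
// Illustrative sketch (hypothetical helper): returns true when every record in the given
// metadata-table base file (an HFile) carries the _hoodie_record_key meta field, which is
// only the case when populateMetaFields is enabled for the metadata table.
private static boolean baseFileHasRecordKeyMetaField(Configuration conf, String baseFilePath) throws Exception {
  HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(conf, new Path(baseFilePath), new CacheConfig(conf));
  List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
  return records.stream()
      .allMatch(entry -> ((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD) != null);
}
In the parameterized test above, such a predicate would be expected to hold when populateMetaFields is true and to fail otherwise.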
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
From the class TestHoodieBackedTableMetadata, method testMetadataRecordKeyExcludeFromPayload.
/**
* 1. Verify the metadata table record key deduplication feature. When record key
* deduplication is enabled, verify that the metadata record payload on disk has an empty key.
* Otherwise, verify that the key is valid.
* 2. Verify that populating meta fields works irrespective of the record key deduplication config.
* 3. Verify that table services like compaction benefit from the record key deduplication feature.
*/
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testMetadataRecordKeyExcludeFromPayload(final HoodieTableType tableType) throws Exception {
initPath();
writeConfig = getWriteConfigBuilder(true, true, false)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder()
        .enable(true)
        .withPopulateMetaFields(false)
        .withMaxNumDeltaCommitsBeforeCompaction(3)
        .build())
    .build();
init(tableType, writeConfig);
// 2nd commit
doWriteOperation(testTable, "0000001", INSERT);
final HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig);
metadataMetaClient.reloadActiveTimeline();
final HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient);
// Compaction has not yet kicked in. Verify all the log files
// for the metadata records persisted on disk as per the config.
assertDoesNotThrow(() -> {
  verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000001");
}, "Metadata table should have valid log files!");
// Verify no base file created yet.
assertThrows(IllegalStateException.class, () -> {
  verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table);
}, "Metadata table should not have a base file yet!");
// 2 more commits
doWriteOperation(testTable, "0000002", UPSERT);
doWriteOperation(testTable, "0000004", UPSERT);
// Compaction should be triggered by now. Let's verify the log files
// if any for the metadata records persisted on disk as per the config.
assertDoesNotThrow(() -> {
  verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000002");
}, "Metadata table should have valid log files!");
// Verify the base file created by the just completed compaction.
assertDoesNotThrow(() -> {
  verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table);
}, "Metadata table should have a valid base file!");
// 2 more commits to trigger one more compaction, along with a clean
doWriteOperation(testTable, "0000005", UPSERT);
doClean(testTable, "0000006", Arrays.asList("0000004"));
doWriteOperation(testTable, "0000007", UPSERT);
assertDoesNotThrow(() -> {
  verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "7");
}, "Metadata table should have valid log files!");
assertDoesNotThrow(() -> {
  verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table);
}, "Metadata table should have a valid base file!");
validateMetadata(testTable);
}
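As a quick cross-check of the compaction behaviour this test depends on (withMaxNumDeltaCommitsBeforeCompaction(3)), one could reuse the metadata(writeConfig, context) helper and the getLatestCompactionTime() call shown in the first example above. This is only a minimal sketch under the assumption that the same test harness and helpers are available:
// Illustrative sketch: once enough delta commits have accumulated on the metadata table,
// it should report a completed compaction instant.
HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
assertTrue(tableMetadata.getLatestCompactionTime().isPresent(),
    "Metadata table compaction should have been triggered after the configured number of delta commits");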
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
From the class TestHoodieBackedTableMetadata, method verifyBaseMetadataTable.
private void verifyBaseMetadataTable() throws IOException {
HoodieBackedTableMetadata tableMetadata = new HoodieBackedTableMetadata(context, writeConfig.getMetadataConfig(), writeConfig.getBasePath(), writeConfig.getSpillableMapBasePath(), false);
assertTrue(tableMetadata.enabled());
List<java.nio.file.Path> fsPartitionPaths = testTable.getAllPartitionPaths();
List<String> fsPartitions = new ArrayList<>();
fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString()));
List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
Collections.sort(fsPartitions);
Collections.sort(metadataPartitions);
assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
assertEquals(fsPartitions, metadataPartitions, "Partitions should match");
// Files within each partition should match
HoodieTable table = HoodieSparkTable.create(writeConfig, context, true);
TableFileSystemView tableView = table.getHoodieView();
List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
assertEquals(fsPartitions.size(), partitionToFilesMap.size());
fsPartitions.forEach(partition -> {
  try {
    validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition);
  } catch (IOException e) {
    fail("Exception should not be raised: " + e);
  }
});
}
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
From the class TestHoodieClientOnCopyOnWriteStorage, method testSmallInsertHandlingForUpserts.
/**
* Tests the scenario where a new file group gets added during upsert().
*/
@Test
public void testSmallInsertHandlingForUpserts() throws Exception {
final String testPartitionPath = "2016/09/26";
final int insertSplitLimit = 100;
// set up the small file handling params
// hold up to 200 records max
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
SparkRDDWriteClient client = getHoodieWriteClient(config);
BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
// Inserts => will write file1
String commitTime1 = "001";
client.startCommitWithTime(commitTime1);
// this writes ~500kb
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit);
Set<String> keys1 = recordsToRecordKeySet(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses);
assertEquals(1, statuses.size(), "Just 1 file needs to be added.");
String file1 = statuses.get(0).getFileId();
assertEquals(100, fileUtils.readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())).size(), "file should contain 100 records");
// Update + Inserts such that they just expand file1
String commitTime2 = "002";
client.startCommitWithTime(commitTime2);
List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, 40);
Set<String> keys2 = recordsToRecordKeySet(inserts2);
List<HoodieRecord> insertsAndUpdates2 = new ArrayList<>();
insertsAndUpdates2.addAll(inserts2);
insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1));
JavaRDD<HoodieRecord> insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1);
statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect();
assertNoWriteErrors(statuses);
assertEquals(1, statuses.size(), "Just 1 file needs to be updated.");
assertEquals(file1, statuses.get(0).getFileId(), "Existing file should be expanded");
assertEquals(commitTime1, statuses.get(0).getStat().getPrevCommit(), "Existing file should be expanded");
Path newFile = new Path(basePath, statuses.get(0).getStat().getPath());
assertEquals(140, fileUtils.readRowKeys(hadoopConf, newFile).size(), "file should contain 140 records");
List<GenericRecord> records = fileUtils.readAvroRecords(hadoopConf, newFile);
for (GenericRecord record : records) {
  String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
  assertEquals(commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit2");
  assertTrue(keys2.contains(recordKey) || keys1.contains(recordKey), "key expected to be part of commit2");
}
// update + inserts such that file1 is updated and expanded, a new file2 is created.
String commitTime3 = "003";
client.startCommitWithTime(commitTime3);
List<HoodieRecord> insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200);
Set<String> keys3 = recordsToRecordKeySet(insertsAndUpdates3);
List<HoodieRecord> updates3 = dataGen.generateUpdates(commitTime3, inserts2);
insertsAndUpdates3.addAll(updates3);
JavaRDD<HoodieRecord> insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1);
statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect();
assertNoWriteErrors(statuses);
assertEquals(2, statuses.size(), "2 files needs to be committed.");
HoodieTableMetaClient metadata = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
HoodieTable table = getHoodieTable(metadata, config);
BaseFileOnlyView fileSystemView = table.getBaseFileOnlyView();
List<HoodieBaseFile> files = fileSystemView.getLatestBaseFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList());
int numTotalInsertsInCommit3 = 0;
int numTotalUpdatesInCommit3 = 0;
for (HoodieBaseFile file : files) {
  if (file.getFileName().contains(file1)) {
    assertEquals(commitTime3, file.getCommitTime(), "Existing file should be expanded");
    records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath()));
    for (GenericRecord record : records) {
      String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
      String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
      if (recordCommitTime.equals(commitTime3)) {
        if (keys2.contains(recordKey)) {
          keys2.remove(recordKey);
          numTotalUpdatesInCommit3++;
        } else {
          numTotalInsertsInCommit3++;
        }
      }
    }
    assertEquals(0, keys2.size(), "All keys added in commit 2 must be updated in commit3 correctly");
  } else {
    assertEquals(commitTime3, file.getCommitTime(), "New file must be written for commit 3");
    records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath()));
    for (GenericRecord record : records) {
      String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
      assertEquals(commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit3");
      assertTrue(keys3.contains(recordKey), "key expected to be part of commit3");
    }
    numTotalInsertsInCommit3 += records.size();
  }
}
assertEquals(numTotalUpdatesInCommit3, inserts2.size(), "Total updates in commit3 must add up");
assertEquals(numTotalInsertsInCommit3, keys3.size(), "Total inserts in commit3 must add up");
}
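The record-count checks in this test all follow the same pattern: resolve the base file path from a WriteStatus and count its row keys. The sketch below isolates that pattern using the BaseFileUtils.getInstance and readRowKeys calls shown above; the helper name is hypothetical, and the metaClient, hadoopConf, and basePath fields are assumed to come from the test class.
// Illustrative sketch (hypothetical helper): count the records written to the base file
// reported by a WriteStatus.
private int countRecordsInBaseFile(WriteStatus status) {
  BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
  Path baseFilePath = new Path(basePath, status.getStat().getPath());
  return fileUtils.readRowKeys(hadoopConf, baseFilePath).size();
}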
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
From the class TestHoodieIndex, method testTagLocationAndDuplicateUpdate.
@ParameterizedTest
@MethodSource("indexTypeParams")
public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
setUp(indexType, populateMetaFields, enableMetadataIndex);
String newCommitTime = "001";
int totalRecords = 10 + random.nextInt(20);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
writeClient.startCommitWithTime(newCommitTime);
JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords, hoodieTable);
// Duplicate upsert and ensure correctness is maintained.
// We are trying to approximately imitate the case where the RDD is recomputed. When an RDD is recreated, the
// driver code (including the state transitions) is not re-executed. We need to delete the inflight instant so
// that the subsequent upsert does not run into conflicts.
metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight"));
writeClient.upsert(writeRecords, newCommitTime);
Assertions.assertNoWriteErrors(writeStatues.collect());
// Now commit this & update location of records inserted and validate no errors
writeClient.commit(newCommitTime, writeStatues);
// Now tagLocation for these records; the index should tag them correctly
metaClient = HoodieTableMetaClient.reload(metaClient);
hoodieTable = HoodieSparkTable.create(config, context, metaClient);
JavaRDD<HoodieRecord> javaRDD = tagLocation(index, writeRecords, hoodieTable);
Map<String, String> recordKeyToPartitionPathMap = new HashMap();
List<HoodieRecord> hoodieRecords = writeRecords.collect();
hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));
assertEquals(totalRecords, javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size());
assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));
JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable);
List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
assertEquals(totalRecords, recordLocations.collect().size());
assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
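The assertions at the end of this test reduce to one idea: after the commit completes, re-running tagLocation over the same records should mark every record with a known location pointing at that commit. A minimal, illustrative sketch of that check, assuming the tagLocation test helper and the fields/locals shown above (index, writeRecords, hoodieTable, totalRecords, newCommitTime):
// Illustrative sketch: every record should be tagged, and every tagged location should
// point at the instant time of the commit that wrote it.
JavaRDD<HoodieRecord> tagged = tagLocation(index, writeRecords, hoodieTable);
assertEquals(totalRecords, tagged.filter(HoodieRecord::isCurrentLocationKnown).count());
assertEquals(totalRecords, tagged.filter(record -> record.getCurrentLocation() != null
    && newCommitTime.equals(record.getCurrentLocation().getInstantTime())).count());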