Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testClustering.
private void testClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields, boolean completeClustering,
    boolean assertSameFileIds, String validatorClasses, String sqlQueryForEqualityValidation, String sqlQueryForSingleResultValidation,
    Pair<Pair<List<HoodieRecord>, List<String>>, Set<HoodieFileGroupId>> allRecords) throws IOException {
  HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(false)
      .withClusteringConfig(clusteringConfig).withProps(getPropertiesForKeyGen()).build();
  HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = performClustering(clusteringConfig, populateMetaFields,
      completeClustering, validatorClasses, sqlQueryForEqualityValidation, sqlQueryForSingleResultValidation, allRecords.getLeft());
  if (assertSameFileIds) {
    Set<HoodieFileGroupId> replacedFileIds = clusterMetadata.getWriteStats().get().stream()
        .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
    Set<HoodieFileGroupId> insertedFileIds = allRecords.getRight();
    assertEquals(insertedFileIds, replacedFileIds);
  }
  if (completeClustering) {
    String clusteringCommitTime = metaClient.reloadActiveTimeline().getCompletedReplaceTimeline()
        .getReverseOrderedInstants().findFirst().get().getTimestamp();
    verifyRecordsWritten(clusteringCommitTime, populateMetaFields, allRecords.getLeft().getLeft(), clusterMetadata.getWriteStatuses().collect(), config);
  }
}
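The allRecords parameter packs everything the assertions need into one nested Pair: the inserted records and a list of strings on the left, and the file group ids produced by the inserts on the right. Below is a minimal sketch of building and unpacking such a value with Hudi's Pair; the partition path, file group id, and string values are placeholders, not taken from the test, and the assumption that the string list holds commit times is noted in the comments.

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;

public class NestedPairSketch {
  public static void main(String[] args) {
    // Left side: the records written by the setup batches plus a list of strings
    // (in the calling test this list presumably holds the commit times of those batches).
    List<HoodieRecord> insertedRecords = Collections.emptyList();             // placeholder
    List<String> commitTimes = Collections.singletonList("20230101000000");   // placeholder
    // Right side: the file groups the inserts were written to.
    Set<HoodieFileGroupId> insertedFileIds = new HashSet<>();
    insertedFileIds.add(new HoodieFileGroupId("2016/09/26", "file-group-1")); // placeholder values

    Pair<Pair<List<HoodieRecord>, List<String>>, Set<HoodieFileGroupId>> allRecords =
        Pair.of(Pair.of(insertedRecords, commitTimes), insertedFileIds);

    // Unpacking mirrors what testClustering does with allRecords.
    List<HoodieRecord> recordsAgain = allRecords.getLeft().getLeft();
    Set<HoodieFileGroupId> fileIdsAgain = allRecords.getRight();
    System.out.println(recordsAgain.size() + " records, " + fileIdsAgain.size() + " file groups");
  }
}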
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testUpdateRejectForClustering.
@Test
public void testUpdateRejectForClustering() throws IOException {
  final String testPartitionPath = "2016/09/26";
  dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
  Properties props = new Properties();
  props.setProperty(ASYNC_CLUSTERING_ENABLE.key(), "true");
  HoodieWriteConfig config = getSmallInsertWriteConfig(100, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
  SparkRDDWriteClient client = getHoodieWriteClient(config);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  // 1. Insert a batch to generate two file groups.
  String commitTime1 = "001";
  Pair<List<WriteStatus>, List<HoodieRecord>> upsertResult = insertBatchRecords(client, commitTime1, 600, 2);
  List<HoodieRecord> inserts1 = upsertResult.getValue();
  List<String> fileGroupIds1 = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(2, fileGroupIds1.size());
  // 2. Schedule a clustering plan covering the fileGroupIds1 file groups.
  String commitTime2 = "002";
  List<List<FileSlice>> firstInsertFileSlicesList = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getAllFileSlices().collect(Collectors.toList())).collect(Collectors.toList());
  List<FileSlice>[] fileSlices = (List<FileSlice>[]) firstInsertFileSlicesList.toArray(new List[firstInsertFileSlicesList.size()]);
  createRequestedReplaceInstant(this.metaClient, commitTime2, fileSlices);
  // 3. Insert one record: no update-reject exception is thrown, and the record is not merged into the existing small files; a new file group is created instead.
  String commitTime3 = "003";
  insertBatchRecords(client, commitTime3, 1, 1).getKey();
  List<String> fileGroupIds2 = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(3, fileGroupIds2.size());
  // 4. Update records belonging to the two file groups pending clustering; the update is rejected with an exception.
  String commitTime4 = "004";
  client.startCommitWithTime(commitTime4);
  List<HoodieRecord> insertsAndUpdates3 = new ArrayList<>();
  insertsAndUpdates3.addAll(dataGen.generateUpdates(commitTime4, inserts1));
  String assertMsg = String.format("Not allowed to update the clustering files in partition: %s "
      + "For pending clustering operations, we are not going to support update for now.", testPartitionPath);
  assertThrows(HoodieUpsertException.class, () -> {
    writeClient.upsert(jsc.parallelize(insertsAndUpdates3, 1), commitTime3).collect();
  }, assertMsg);
  // 5. Insert one record: no update-reject exception is thrown, and the record is merged into the small file created in step 3.
  String commitTime5 = "005";
  List<WriteStatus> statuses = insertBatchRecords(client, commitTime5, 1, 1).getKey();
  fileGroupIds2.removeAll(fileGroupIds1);
  assertEquals(fileGroupIds2.get(0), statuses.get(0).getFileId());
  List<String> firstInsertFileGroupIds4 = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(3, firstInsertFileGroupIds4.size());
}
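insertBatchRecords is a helper defined elsewhere in this test class; the snippet only shows that it returns a Pair of the write statuses and the records that were written. A hypothetical sketch of a helper with that shape is below, reusing the client, dataGen, and jsc fields seen in the surrounding tests. The body, the helper name, and the meaning of the last int parameter (treated here as RDD parallelism) are assumptions for illustration, not the test's actual implementation.

// Hypothetical helper: write a batch and hand back both the write statuses and the records written,
// so callers can read them with getKey()/getValue() (or getLeft()/getRight()) as the test above does.
private Pair<List<WriteStatus>, List<HoodieRecord>> insertBatch(SparkRDDWriteClient client, String commitTime,
    int recordCount, int parallelism) throws IOException {
  client.startCommitWithTime(commitTime);
  List<HoodieRecord> inserts = dataGen.generateInserts(commitTime, recordCount);
  JavaRDD<WriteStatus> statuses = client.upsert(jsc.parallelize(inserts, parallelism), commitTime);
  List<WriteStatus> statusList = statuses.collect();
  assertNoWriteErrors(statusList);
  client.commit(commitTime, statuses);
  // Pack both results into a single Pair for the caller.
  return Pair.of(statusList, inserts);
}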
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testInlineScheduleClustering.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testInlineScheduleClustering(boolean scheduleInlineClustering) throws IOException {
  testInsertTwoBatches(true);
  // setup clustering config.
  HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
      .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(false)
      .withScheduleInlineClustering(scheduleInlineClustering).withPreserveHoodieCommitMetadata(true).build();
  HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(false)
      .withClusteringConfig(clusteringConfig).withProps(getPropertiesForKeyGen()).build();
  SparkRDDWriteClient client = getHoodieWriteClient(config);
  dataGen = new HoodieTestDataGenerator(new String[] { "2015/03/16" });
  String commitTime1 = HoodieActiveTimeline.createNewInstantTime();
  List<HoodieRecord> records1 = dataGen.generateInserts(commitTime1, 200);
  client.startCommitWithTime(commitTime1);
  JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(records1, 2);
  JavaRDD<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1);
  List<WriteStatus> statusList = statuses.collect();
  assertNoWriteErrors(statusList);
  client.commit(commitTime1, statuses);
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
  List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans =
      ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
  if (scheduleInlineClustering) {
    assertEquals(1, pendingClusteringPlans.size());
  } else {
    assertEquals(0, pendingClusteringPlans.size());
  }
}
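Each pending plan comes back from ClusteringUtils as a Pair of the requested replace instant and the clustering plan itself. A short sketch of walking those pairs follows, assuming the same metaClient as above; the getInputGroups() accessor on the Avro-generated HoodieClusteringPlan is an assumption here.

// Sketch: inspect the pending clustering plans scheduled above.
List<Pair<HoodieInstant, HoodieClusteringPlan>> plans =
    ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
for (Pair<HoodieInstant, HoodieClusteringPlan> plan : plans) {
  String requestedInstant = plan.getLeft().getTimestamp();        // instant of the requested replacecommit
  int inputGroups = plan.getRight().getInputGroups().size();      // assumed accessor on the plan
  System.out.println("Pending clustering at " + requestedInstant + " covering " + inputGroups + " file groups");
}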
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestHoodieIndex, method testSimpleTagLocationAndUpdate.
@ParameterizedTest
@MethodSource("indexTypeParams")
public void testSimpleTagLocationAndUpdate(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
  setUp(indexType, populateMetaFields, enableMetadataIndex);
  String newCommitTime = "001";
  int totalRecords = 10 + random.nextInt(20);
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  // Test tagLocation without any entries in index
  JavaRDD<HoodieRecord> javaRDD = tagLocation(index, writeRecords, hoodieTable);
  assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
  // Insert totalRecords records
  writeClient.startCommitWithTime(newCommitTime);
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  Assertions.assertNoWriteErrors(writeStatues.collect());
  // Now tagLocation for these records, index should not tag them since it was a failed commit
  javaRDD = tagLocation(index, writeRecords, hoodieTable);
  assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
  // Now commit this & update location of records inserted and validate no errors
  writeClient.commit(newCommitTime, writeStatues);
  // Now tagLocation for these records, index should tag them correctly
  metaClient = HoodieTableMetaClient.reload(metaClient);
  hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  javaRDD = tagLocation(index, writeRecords, hoodieTable);
  Map<String, String> recordKeyToPartitionPathMap = new HashMap<>();
  List<HoodieRecord> hoodieRecords = writeRecords.collect();
  hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));
  assertEquals(totalRecords, javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size());
  assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
  assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null
      && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
  javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));
  JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable);
  List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
  assertEquals(totalRecords, recordLocations.collect().size());
  assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
  recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
  recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()),
      entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
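getRecordLocations is a test helper that pairs each HoodieKey with an optional location; the assertions above only rely on the right element of the inner Pair being the file id. A small consuming sketch, assuming the recordLocations RDD built above (the left element is not exercised in the snippet and is presumed to be the instant time):

// Sketch: collect the tagged locations into a simple key -> fileId map for inspection.
Map<HoodieKey, String> keyToFileId = new HashMap<>();
for (Tuple2<HoodieKey, Option<Pair<String, String>>> entry : recordLocations.collect()) {
  if (entry._2.isPresent()) {
    keyToFileId.put(entry._1, entry._2.get().getRight());   // right side of the Pair is the file id
  }
}
System.out.println("Located " + keyToFileId.size() + " of " + totalRecords + " records");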
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestHoodieIndex, method testTagLocationAndFetchRecordLocations.
@ParameterizedTest
@MethodSource("regularIndexTypeParams")
public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
  setUp(indexType, populateMetaFields, enableMetadataIndex);
  String p1 = "2016/01/31";
  String p2 = "2015/01/31";
  String rowKey1 = UUID.randomUUID().toString();
  String rowKey2 = UUID.randomUUID().toString();
  String rowKey3 = UUID.randomUUID().toString();
  String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  // place same row key under a different partition.
  String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
  RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
  HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
  JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
  String newCommitTime = writeClient.startCommit();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  writeClient.upsert(recordRDD, newCommitTime);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
  // Should not find any files
  for (HoodieRecord record : taggedRecordRDD.collect()) {
    assertFalse(record.isCurrentLocationKnown());
  }
  // We create three parquet files, each having one record (two different partitions)
  HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
  final String fileId1 = "fileID1";
  final String fileId2 = "fileID2";
  final String fileId3 = "fileID3";
  Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
  Path baseFilePath = testTable.forCommit("0000001").withInserts(p1, fileId1, Collections.singletonList(record1));
  long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
  testTable.doWriteOperation("0000001", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
  partitionToFilesNameLengthMap.clear();
  baseFilePath = testTable.forCommit("0000002").withInserts(p1, fileId2, Collections.singletonList(record2));
  baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
  testTable.doWriteOperation("0000002", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
  partitionToFilesNameLengthMap.clear();
  baseFilePath = testTable.forCommit("0000003").withInserts(p2, fileId3, Collections.singletonList(record4));
  baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  partitionToFilesNameLengthMap.computeIfAbsent(p2, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
  testTable.doWriteOperation("0000003", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
  // We do the tag again
  metaClient = HoodieTableMetaClient.reload(metaClient);
  hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
  List<HoodieRecord> records = taggedRecordRDD.collect();
  // Check results
  for (HoodieRecord record : records) {
    if (record.getRecordKey().equals(rowKey1)) {
      if (record.getPartitionPath().equals(p2)) {
        assertEquals(record.getCurrentLocation().getFileId(), fileId3);
      } else {
        assertEquals(record.getCurrentLocation().getFileId(), fileId1);
      }
    } else if (record.getRecordKey().equals(rowKey2)) {
      assertEquals(record.getCurrentLocation().getFileId(), fileId2);
    } else if (record.getRecordKey().equals(rowKey3)) {
      assertFalse(record.isCurrentLocationKnown());
    }
  }
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(recordRDD.map(HoodieRecord::getKey), hoodieTable);
  for (Tuple2<HoodieKey, Option<Pair<String, String>>> entry : recordLocations.collect()) {
    if (entry._1.getRecordKey().equals(rowKey1)) {
      assertTrue(entry._2.isPresent(), "Row1 should have been present ");
      if (entry._1.getPartitionPath().equals(p2)) {
        assertTrue(entry._2.isPresent(), "Row1 should have been present ");
        assertEquals(entry._2.get().getRight(), fileId3);
      } else {
        assertEquals(entry._2.get().getRight(), fileId1);
      }
    } else if (entry._1.getRecordKey().equals(rowKey2)) {
      assertTrue(entry._2.isPresent(), "Row2 should have been present ");
      assertEquals(entry._2.get().getRight(), fileId2);
    } else if (entry._1.getRecordKey().equals(rowKey3)) {
      assertFalse(entry._2.isPresent(), "Row3 should have been absent ");
    }
  }
}
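The partitionToFilesNameLengthMap above maps each partition path to (file id, base file length) Pairs, which is the shape doWriteOperation consumes. A minimal standalone sketch of building and reading such a map with Hudi's Pair; the partition paths, file ids, and lengths are placeholder values.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hudi.common.util.collection.Pair;

public class PartitionFileLengthSketch {
  public static void main(String[] args) {
    // Partition path -> list of (fileId, length-in-bytes) pairs.
    Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    partitionToFilesNameLengthMap.computeIfAbsent("2016/01/31", k -> new ArrayList<>())
        .add(Pair.of("fileID1", 435000));   // placeholder file id and length
    partitionToFilesNameLengthMap.computeIfAbsent("2015/01/31", k -> new ArrayList<>())
        .add(Pair.of("fileID3", 434500));   // placeholder file id and length

    // getLeft() is the file id, getRight() the base file length in bytes.
    partitionToFilesNameLengthMap.forEach((partition, files) ->
        files.forEach(f -> System.out.println(partition + "/" + f.getLeft() + " -> " + f.getRight() + " bytes")));
  }
}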