Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
Class TestSparkHoodieHBaseIndex, method testTagLocationAndPartitionPathUpdate.
@Test
public void testTagLocationAndPartitionPathUpdate() throws Exception {
  final String newCommitTime = "001";
  final int numRecords = 10;
  final String oldPartitionPath = "1970/01/01";
  final String emptyHoodieRecordPayloadClassName = EmptyHoodieRecordPayload.class.getName();
  List<HoodieRecord> newRecords = dataGen.generateInserts(newCommitTime, numRecords);
  List<HoodieRecord> oldRecords = new LinkedList<>();
  for (HoodieRecord newRecord : newRecords) {
    HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath);
    HoodieRecord hoodieRecord = new HoodieAvroRecord(key, (HoodieRecordPayload) newRecord.getData());
    oldRecords.add(hoodieRecord);
  }
  JavaRDD<HoodieRecord> newWriteRecords = jsc().parallelize(newRecords, 1);
  JavaRDD<HoodieRecord> oldWriteRecords = jsc().parallelize(oldRecords, 1);
  HoodieWriteConfig config = getConfig(true, false);
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(getConfig(true, false));
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    // allowed path change test
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> oldHoodieRecord = tagLocation(index, oldWriteRecords, hoodieTable);
    assertEquals(0, oldHoodieRecord.filter(record -> record.isCurrentLocationKnown()).count());
    writeClient.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(oldWriteRecords, newCommitTime);
    writeClient.commit(newCommitTime, writeStatuses);
    assertNoWriteErrors(writeStatuses.collect());
    updateLocation(index, writeStatuses, hoodieTable);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    List<HoodieRecord> taggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect();
    assertEquals(numRecords * 2L, taggedRecords.stream().count());
    // Verify the number of deleted records
    assertEquals(numRecords, taggedRecords.stream()
        .filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath)
            && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClassName))
        .count());
    // Verify the number of inserted records
    assertEquals(numRecords, taggedRecords.stream()
        .filter(record -> !record.getKey().getPartitionPath().equals(oldPartitionPath))
        .count());
    // not allowed path change test
    index = new SparkHoodieHBaseIndex(getConfig(false, false));
    List<HoodieRecord> notAllowPathChangeRecords = tagLocation(index, newWriteRecords, hoodieTable).collect();
    assertEquals(numRecords, notAllowPathChangeRecords.stream().count());
    assertEquals(numRecords, taggedRecords.stream()
        .filter(hoodieRecord -> hoodieRecord.isCurrentLocationKnown()
            && hoodieRecord.getKey().getPartitionPath().equals(oldPartitionPath))
        .count());
  }
}
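As an aside, here is a minimal sketch (not taken from the test) of the record shapes the assertions above rely on when partition-path update is allowed: for a key that moved partitions, tagging is expected to yield a delete record (EmptyHoodieRecordPayload) in the old partition plus the incoming record in the new partition. The surrounding class and method names are illustrative; only the Hudi model classes shown in the snippet and EmptyHoodieRecordPayload's no-arg constructor are assumed.

import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;

import java.util.Arrays;
import java.util.List;

class PartitionPathUpdateSketch {
  // For a record key that used to live under oldPartition and now arrives under newPartition,
  // an index configured to allow partition-path updates effectively produces a delete in the
  // old partition plus the incoming record in the new partition.
  static List<HoodieRecord> recordsForMovedKey(String recordKey, String oldPartition,
                                               String newPartition, HoodieRecordPayload incomingPayload) {
    HoodieRecord delete = new HoodieAvroRecord<>(new HoodieKey(recordKey, oldPartition), new EmptyHoodieRecordPayload());
    HoodieRecord insert = new HoodieAvroRecord<>(new HoodieKey(recordKey, newPartition), incomingPayload);
    return Arrays.asList(delete, insert);
  }
}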
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
Class TestBootstrap, method generateInputBatch.
private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
  // Expand each (partition, file statuses) pair into (partition, full file path) pairs.
  List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream()
      .flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath()))))
      .collect(Collectors.toList());
  return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> {
    try {
      Configuration conf = jsc.hadoopConfiguration();
      AvroReadSupport.setAvroReadSchema(conf, writerSchema);
      // Read every Avro record from the parquet file and wrap it into a HoodieAvroRecord.
      Iterator<GenericRecord> recIterator = new ParquetReaderIterator(
          AvroParquetReader.<GenericRecord>builder(p.getValue()).withConf(conf).build());
      return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).map(gr -> {
        try {
          String key = gr.get("_row_key").toString();
          String pPath = p.getKey();
          return new HoodieAvroRecord<>(new HoodieKey(key, pPath),
              new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
        } catch (IOException e) {
          throw new HoodieIOException(e.getMessage(), e);
        }
      });
    } catch (IOException ioe) {
      throw new HoodieIOException(ioe.getMessage(), ioe);
    }
  }).collect(Collectors.toList()));
}
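A note on the design: the helper materializes every record on the driver (the outer stream is collected into a list) before calling jsc.parallelize, so the whole batch must fit in driver memory. That is fine for a bootstrap test fixture, but it is not how a production ingestion path would read source parquet files.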
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
Class TestFlinkHoodieBloomIndex, method testBloomFilterFalseError.
@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testBloomFilterFalseError(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
  // We have two hoodie records
  String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
      + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
      + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  // We write record1 to a base file, using a bloom filter having both records
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
  filter.add(record2.getRecordKey());
  HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter);
  String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1);
  assertTrue(filter.mightContain(record1.getRecordKey()));
  assertTrue(filter.mightContain(record2.getRecordKey()));
  // We do the tag
  List<HoodieRecord> records = asList(record1, record2);
  HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable table = HoodieFlinkTable.create(config, context, metaClient);
  HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
  List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, records, table);
  // Check results
  for (HoodieRecord record : taggedRecords) {
    if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
      assertEquals(record.getCurrentLocation().getFileId(), fileId);
    } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
      assertFalse(record.isCurrentLocationKnown());
    }
  }
}
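A short sketch of the "false error" the test name refers to, using only the bloom-filter API that appears above (the package names are assumed from recent Hudi releases; the key strings are made up): mightContain can return true for a key that was never added, which is why a bloom-index hit still has to be verified against the actual base file.

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;

public class BloomFilterFalsePositiveSketch {
  public static void main(String[] args) {
    // Same construction parameters as the test above.
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add("key-that-was-added");
    // A key that was added is always reported as possibly present.
    System.out.println(filter.mightContain("key-that-was-added"));        // true
    // A key that was never added is usually reported absent, but can come back true
    // with small probability: the "false error" the index must tolerate.
    System.out.println(filter.mightContain("key-that-was-never-added"));  // almost always false
  }
}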
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
Class TestFlinkHoodieBloomIndex, method testTagLocation.
@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
  // We have some records to be tagged (two different partitions)
  String rowKey1 = randomUUID().toString();
  String rowKey2 = randomUUID().toString();
  String rowKey3 = randomUUID().toString();
  String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  // place same row key under a different partition.
  String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
  RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
  HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
  List<HoodieRecord> records = asList(record1, record2, record3, record4);
  // Also create the metadata and config
  HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
  HoodieFlinkTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
  HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
  // Let's tag
  HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
  List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, records, hoodieTable);
  // Should not find any files
  for (HoodieRecord record : taggedRecords) {
    assertFalse(record.isCurrentLocationKnown());
  }
  // We create three base files, each having one record (two different partitions)
  String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1);
  String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2);
  String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4);
  metaClient.reloadActiveTimeline();
  // We do the tag again
  taggedRecords = tagLocation(bloomIndex, records, HoodieFlinkTable.create(config, context, metaClient));
  // Check results
  for (HoodieRecord record : taggedRecords) {
    if (record.getRecordKey().equals(rowKey1)) {
      if (record.getPartitionPath().equals("2015/01/31")) {
        assertEquals(record.getCurrentLocation().getFileId(), fileId3);
      } else {
        assertEquals(record.getCurrentLocation().getFileId(), fileId1);
      }
    } else if (record.getRecordKey().equals(rowKey2)) {
      assertEquals(record.getCurrentLocation().getFileId(), fileId2);
    } else if (record.getRecordKey().equals(rowKey3)) {
      assertFalse(record.isCurrentLocationKnown());
    }
  }
}
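For readability, the repeated location check in the loop above could be pulled into a helper. The sketch below is hypothetical (assertTaggedTo is not part of the test class) and uses only accessors that already appear in the snippet.

// Hypothetical helper: asserts that a tagged record is known to the index
// and points at the expected base file.
private static void assertTaggedTo(HoodieRecord record, String expectedFileId) {
  assertTrue(record.isCurrentLocationKnown());
  assertEquals(expectedFileId, record.getCurrentLocation().getFileId());
}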
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
Class TestJavaCopyOnWriteActionExecutor, method newHoodieRecords.
// Builds n RawTripTestPayload-backed records that share the given time value.
private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
  List<HoodieRecord> records = new ArrayList<>();
  for (int i = 0; i < n; i++) {
    String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i);
    RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
  }
  return records;
}
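A hedged usage sketch: a call such as the one below (the count and timestamp are illustrative) yields records whose partition path RawTripTestPayload derives from the date portion of the time field, as the 2016/01/31 partitions in the snippets above suggest.

// Illustrative call, not taken from the original test class.
List<HoodieRecord> batch = newHoodieRecords(3, "2016-01-31T03:16:41.415Z");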