use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.
the class TestCopyOnWriteActionExecutor method testUpdateRecords.
// TODO (weiy): Add testcases for crossing file writing.
@ParameterizedTest
@MethodSource("indexType")
public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception {
// Prepare the AvroParquetIO
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withProps(makeIndexConfig(indexType)).build();
String firstCommitTime = makeNewCommitTime();
SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
writeClient.startCommitWithTime(firstCommitTime);
metaClient = HoodieTableMetaClient.reload(metaClient);
String partitionPath = "2016/01/31";
HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
// Get some records belong to the same partition (2016/01/31)
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
List<HoodieRecord> records = new ArrayList<>();
RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
// Insert new records
final HoodieSparkCopyOnWriteTable cowTable = table;
writeClient.insert(jsc.parallelize(records, 1), firstCommitTime);
FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
assertEquals(1, allFiles.length);
// Read out the bloom filter and make sure filter can answer record exist or not
Path filePath = allFiles[0].getPath();
BloomFilter filter = BaseFileUtils.getInstance(table.getBaseFileFormat()).readBloomFilterFromMetadata(hadoopConf, filePath);
for (HoodieRecord record : records) {
assertTrue(filter.mightContain(record.getRecordKey()));
}
// Read the base file, check the record content
List<GenericRecord> fileRecords = BaseFileUtils.getInstance(table.getBaseFileFormat()).readAvroRecords(hadoopConf, filePath);
GenericRecord newRecord;
int index = 0;
for (GenericRecord record : fileRecords) {
assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
index++;
}
// We update the 1st record & add a new record
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1);
HoodieRecord updatedRecord1 = new HoodieAvroRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
HoodieRecord insertedRecord1 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
Thread.sleep(1000);
String newCommitTime = makeNewCommitTime();
metaClient = HoodieTableMetaClient.reload(metaClient);
writeClient.startCommitWithTime(newCommitTime);
List<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(updatedRecords), newCommitTime).collect();
allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
assertEquals(1, allFiles.length);
// verify new incremental file group is same as the previous one
assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
// Check whether the record has been updated
Path updatedFilePath = allFiles[0].getPath();
BloomFilter updatedFilter = BaseFileUtils.getInstance(metaClient).readBloomFilterFromMetadata(hadoopConf, updatedFilePath);
for (HoodieRecord record : records) {
// No change to the _row_key
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
}
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
// add this so it can further check below
records.add(insertedRecord1);
ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build();
index = 0;
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey());
if (index == 0) {
assertEquals("15", newRecord.get("number").toString());
}
index++;
}
updatedReader.close();
// Also check the numRecordsWritten
WriteStatus writeStatus = statuses.get(0);
assertEquals(1, statuses.size(), "Should be only one file generated");
// 3 rewritten records + 1 new record
assertEquals(4, writeStatus.getStat().getNumWrites());
}
use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.
the class TestOrcBootstrap method generateInputBatch.
private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream().flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath())))).collect(Collectors.toList());
return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> {
try {
Configuration conf = jsc.hadoopConfiguration();
AvroReadSupport.setAvroReadSchema(conf, writerSchema);
Reader orcReader = OrcFile.createReader(p.getValue(), new OrcFile.ReaderOptions(jsc.hadoopConfiguration()));
RecordReader recordReader = orcReader.rows();
TypeDescription orcSchema = orcReader.getSchema();
Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "test_orc_record", null, true);
Iterator<GenericRecord> recIterator = new OrcReaderIterator(recordReader, avroSchema, orcSchema);
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).map(gr -> {
try {
String key = gr.get("_row_key").toString();
String pPath = p.getKey();
return new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
});
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}).collect(Collectors.toList()));
}
use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.
the class TestHoodieConcatHandle method testInsert.
@Test
public void testInsert() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withMergeAllowDuplicateOnInserts(true).build();
HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
metaClient = HoodieTableMetaClient.reload(metaClient);
BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
// Get some records belong to the same partition (2021/09/11)
String insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":1}";
String insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":2}";
List<HoodieRecord> records1 = new ArrayList<>();
RawTripTestPayload insertRow1 = new RawTripTestPayload(insertRecordStr1);
RawTripTestPayload insertRow2 = new RawTripTestPayload(insertRecordStr2);
records1.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
records1.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
int startInstant = 1;
String firstCommitTime = makeNewCommitTime(startInstant++);
// First insert
writeClient.startCommitWithTime(firstCommitTime);
writeClient.insert(records1, firstCommitTime);
String partitionPath = "2021/09/11";
FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
assertEquals(1, allFiles.length);
// Read out the bloom filter and make sure filter can answer record exist or not
Path filePath = allFiles[0].getPath();
BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
for (HoodieRecord record : records1) {
assertTrue(filter.mightContain(record.getRecordKey()));
}
insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":3}";
insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":4}";
List<HoodieRecord> records2 = new ArrayList<>();
insertRow1 = new RawTripTestPayload(insertRecordStr1);
insertRow2 = new RawTripTestPayload(insertRecordStr2);
// The recordKey of records2 and records1 are the same, but the values of other fields are different
records2.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
records2.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
String newCommitTime = makeNewCommitTime(startInstant++);
writeClient.startCommitWithTime(newCommitTime);
// Second insert is the same as the _row_key of the first one,test allowDuplicateInserts
writeClient.insert(records2, newCommitTime);
allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
assertEquals(1, allFiles.length);
// verify new incremental file group is same as the previous one
assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
filePath = allFiles[0].getPath();
// The final result should be a collection of records1 and records2
records1.addAll(records2);
// Read the base file, check the record content
List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
int index = 0;
for (GenericRecord record : fileRecords) {
assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString());
assertEquals(index + 1, record.get("number"));
index++;
}
}
use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.
the class TestJavaCopyOnWriteActionExecutor method testFileSizeUpsertRecords.
@Test
public void testFileSizeUpsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(64 * 1024).hfileMaxFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024).build()).build();
String instantTime = makeNewCommitTime();
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
List<HoodieRecord> records = new ArrayList<>();
// Approx 1150 records are written for block size of 64KB
for (int i = 0; i < 2000; i++) {
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
}
// Insert new records
BaseJavaCommitActionExecutor actionExecutor = new JavaUpsertCommitActionExecutor(context, config, table, instantTime, records);
Arrays.asList(1).stream().map(i -> actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator())).map(Transformations::flatten).collect(Collectors.toList());
// Check the updated file
int counts = 0;
for (File file : Paths.get(basePath, "2016/01/31").toFile().listFiles()) {
if (file.getName().endsWith(table.getBaseFileExtension()) && FSUtils.getCommitTime(file.getName()).equals(instantTime)) {
LOG.info(file.getName() + "-" + file.length());
counts++;
}
}
assertEquals(3, counts, "If the number of records are more than 1150, then there should be a new file");
}
use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.
the class TestFlinkHoodieBloomIndex method testCheckUUIDsAgainstOneFile.
@Test
public void testCheckUUIDsAgainstOneFile() throws Exception {
final String partition = "2016/01/31";
// Create some records to use
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
// We write record1, record2 to a base file, but the bloom filter contains (record1,
// record2, record3).
BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
filter.add(record3.getRecordKey());
HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter);
String fileId = testTable.addCommit("000").getFileIdWithInserts(partition, record1, record2);
String filename = testTable.getBaseFileNameById(fileId);
// The bloom filter contains 3 records
assertTrue(filter.mightContain(record1.getRecordKey()));
assertTrue(filter.mightContain(record2.getRecordKey()));
assertTrue(filter.mightContain(record3.getRecordKey()));
assertFalse(filter.mightContain(record4.getRecordKey()));
// Compare with file
List<String> uuids = asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient);
List<String> results = HoodieIndexUtils.filterKeysFromFile(new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
assertEquals(results.size(), 2);
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
// TODO(vc): Need more coverage on actual filenames
// assertTrue(results.get(0)._2().equals(filename));
// assertTrue(results.get(1)._2().equals(filename));
}
Aggregations