Use of org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema in project hudi by apache.
From the class TestHoodieLogFormat, method testBasicAppendAndReadInReverse: the test appends three 100-record data blocks to a log file, then reads them back in reverse order with HoodieLogFileReader.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException {
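// Append the first block of 100 records, then close the writer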
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
Schema schema = getSimpleSchema();
List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords1 = records1.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
writer.appendBlock(dataBlock);
writer.close();
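// Reopen the log file and append a second block of 100 records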
writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords2 = records2.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
writer.appendBlock(dataBlock);
writer.close();
// Close and Open again and append 100 more records
writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords3 = records3.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
writer.appendBlock(dataBlock);
writer.close();
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
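// Read the blocks back in reverse order: the most recently written block comes first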
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true);
assertTrue(reader.hasPrev(), "Last block should be available");
HoodieLogBlock prevBlock = reader.prev();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead1 = getRecords(dataBlockRead);
assertEquals(copyOfRecords3.size(), recordsRead1.size(), "Third records size should be equal to the written records size");
assertEquals(copyOfRecords3, recordsRead1, "Both records lists should be the same. (ordering guaranteed)");
assertTrue(reader.hasPrev(), "Second block should be available");
prevBlock = reader.prev();
dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead2 = getRecords(dataBlockRead);
assertEquals(copyOfRecords2.size(), recordsRead2.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords2, recordsRead2, "Both records lists should be the same. (ordering guaranteed)");
assertTrue(reader.hasPrev(), "First block should be available");
prevBlock = reader.prev();
dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead3 = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead3.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead3, "Both records lists should be the same. (ordering guaranteed)");
assertFalse(reader.hasPrev());
reader.close();
}
Use of org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema in project hudi by apache.
From the class TestHoodieLogFormat, method testBasicAppendAndRead: the test appends three 100-record data blocks (parameterized over Avro, HFile, and Parquet block types) and reads them back in write order.
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK" })
public void testBasicAppendAndRead(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException {
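// Append the first block of 100 records, then close the writer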
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
Schema schema = getSimpleSchema();
List<IndexedRecord> copyOfRecords1 = records1.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records1, header);
writer.appendBlock(dataBlock);
writer.close();
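// Reopen the log file and append a second block of 100 records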
writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords2 = records2.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
dataBlock = getDataBlock(dataBlockType, records2, header);
writer.appendBlock(dataBlock);
writer.close();
// Close and Open again and append 100 more records
writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords3 = records3.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
dataBlock = getDataBlock(dataBlockType, records3, header);
writer.appendBlock(dataBlock);
writer.close();
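// Read the three blocks back in write order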
Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
assertTrue(reader.hasNext(), "First block should be available");
HoodieLogBlock nextBlock = reader.next();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
List<IndexedRecord> recordsRead1 = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead1.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead1, "Both records lists should be the same. (ordering guaranteed)");
assertEquals(getSimpleSchema(), dataBlockRead.getSchema());
assertTrue(reader.hasNext(), "Second block should be available");
nextBlock = reader.next();
dataBlockRead = (HoodieDataBlock) nextBlock;
List<IndexedRecord> recordsRead2 = getRecords(dataBlockRead);
assertEquals(copyOfRecords2.size(), recordsRead2.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords2, recordsRead2, "Both records lists should be the same. (ordering guaranteed)");
assertTrue(reader.hasNext(), "Third block should be available");
nextBlock = reader.next();
dataBlockRead = (HoodieDataBlock) nextBlock;
List<IndexedRecord> recordsRead3 = getRecords(dataBlockRead);
assertEquals(copyOfRecords3.size(), recordsRead3.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords3, recordsRead3, "Both records lists should be the same. (ordering guaranteed)");
reader.close();
}
Use of org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema in project hudi by apache.
From the class TestBitCaskDiskMap, method testSizeEstimatorPerformance: a disabled micro-benchmark that times a single payload size estimation.
/**
* Note: leaving this test here for a quick performance test.
*/
@Disabled
@Test
public void testSizeEstimatorPerformance() throws IOException, URISyntaxException {
// Test sizeEstimatorPerformance with simpleSchema
Schema schema = SchemaTestUtil.getSimpleSchema();
List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema);
HoodieRecordSizeEstimator sizeEstimator = new HoodieRecordSizeEstimator<>(schema);
HoodieRecord record = hoodieRecords.remove(0);
long startTime = System.currentTimeMillis();
SpillableMapUtils.computePayloadSize(record, sizeEstimator);
long timeTaken = System.currentTimeMillis() - startTime;
System.out.println("Time taken :" + timeTaken);
assertTrue(timeTaken < 100);
}
Use of org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema in project hudi by apache.
From the class TestBitCaskDiskMap, method testSizeEstimator: verifies that payload size estimation returns a positive size with and without Hudi metadata fields, and with and without a schema object embedded in the payload.
@Test
public void testSizeEstimator() throws IOException, URISyntaxException {
Schema schema = SchemaTestUtil.getSimpleSchema();
// Test sizeEstimator without hoodie metadata fields
List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema);
long payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
assertTrue(payloadSize > 0);
// Test sizeEstimator with hoodie metadata fields
schema = HoodieAvroUtils.addMetadataFields(schema);
hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema);
payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
assertTrue(payloadSize > 0);
// Following tests payloads without an Avro Schema in the Record
// Test sizeEstimator without hoodie metadata fields and without schema object in the payload
schema = SchemaTestUtil.getSimpleSchema();
List<IndexedRecord> indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
hoodieRecords = indexedRecords.stream()
    .map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
        new AvroBinaryTestPayload(Option.of((GenericRecord) r))))
    .collect(Collectors.toList());
payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
assertTrue(payloadSize > 0);
// Test sizeEstimator with hoodie metadata fields and without schema object in the payload
final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
hoodieRecords = indexedRecords.stream()
    .map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
        new AvroBinaryTestPayload(Option.of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata)))))
    .collect(Collectors.toList());
payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
assertTrue(payloadSize > 0);
}
Use of org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema in project hudi by apache.
From the class TestHoodieLogFormat, method testHugeLogFileWrite: the test repeatedly appends the same data block until more than 2 GB has been written, reads everything back, and verifies that an oversized multi-block append is rejected.
@Test
public void testHugeLogFileWrite() throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(3L * 1024 * 1024 * 1024).build();
Schema schema = getSimpleSchema();
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 1000);
List<IndexedRecord> copyOfRecords = records.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
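// Serialize one data block and wrap its raw bytes in a reusable block that can be appended repeatedly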
byte[] dataBlockContentBytes = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header).getContentBytes();
HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = new HoodieLogBlock.HoodieLogBlockContentLocation(new Configuration(), null, 0, dataBlockContentBytes.length, 0);
HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, Option.ofNullable(dataBlockContentBytes), false, logBlockContentLoc, Option.ofNullable(getSimpleSchema()), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
long writtenSize = 0;
int logBlockWrittenNum = 0;
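// Keep appending the same block until more than Integer.MAX_VALUE bytes (~2 GB) have been written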
while (writtenSize < Integer.MAX_VALUE) {
  AppendResult appendResult = writer.appendBlock(reusableDataBlock);
  assertTrue(appendResult.size() > 0);
  writtenSize += appendResult.size();
  logBlockWrittenNum++;
}
writer.close();
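// Read everything back and verify that every written block is found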
Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true, true);
assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
HoodieLogBlock nextBlock = reader.next();
assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block");
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
int logBlockReadNum = 1;
while (reader.hasNext()) {
  reader.next();
  logBlockReadNum++;
}
assertEquals(logBlockWrittenNum, logBlockReadNum, "All written log blocks should be found");
reader.close();
// test writing oversize data block which should be rejected
Writer oversizeWriter = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withSizeThreshold(3L * 1024 * 1024 * 1024).withFs(fs).build();
List<HoodieLogBlock> dataBlocks = new ArrayList<>(logBlockWrittenNum + 1);
for (int i = 0; i < logBlockWrittenNum + 1; i++) {
  dataBlocks.add(reusableDataBlock);
}
assertThrows(HoodieIOException.class, () -> {
  oversizeWriter.appendBlocks(dataBlocks);
}, "Blocks appended may overflow. Please decrease log block size or log block amount");
oversizeWriter.close();
}