Use of org.apache.hudi.common.table.log.block.HoodieDataBlock in project hudi by apache.
In the class TestHoodieLogFormat, the method testBasicAppendAndTraverseInReverse:
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
Schema schema = getSimpleSchema();
List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords1 = records1.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
writer.appendBlock(dataBlock);
writer.close();
writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
writer.appendBlock(dataBlock);
writer.close();
// Close and Open again and append 100 more records
writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
writer.appendBlock(dataBlock);
writer.close();
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true);
assertTrue(reader.hasPrev(), "Third block should be available");
reader.moveToPrev();
assertTrue(reader.hasPrev(), "Second block should be available");
reader.moveToPrev();
// After moving twice, this last reader.prev() should read the First block written
assertTrue(reader.hasPrev(), "First block should be available");
HoodieLogBlock prevBlock = reader.prev();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
assertFalse(reader.hasPrev());
reader.close();
}
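For reference, a minimal sketch (not part of the test) of how the same reader could drain every block from the tail of the log back to the head. It reuses the hasPrev()/prev() calls and the getRecords(...) helper exercised above, along with the test's fs, writer, bufferSize and readBlocksLazily values; the loop structure itself is an illustration, not something this test asserts.

HoodieLogFileReader backwardReader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true);
int dataBlocksSeen = 0;
while (backwardReader.hasPrev()) {
  // prev() hands back the not-yet-visited block closest to the end of the file
  HoodieLogBlock block = backwardReader.prev();
  if (block instanceof HoodieDataBlock) {
    List<IndexedRecord> blockRecords = getRecords((HoodieDataBlock) block);
    dataBlocksSeen += 1;
  }
}
// With the three appends above, dataBlocksSeen is expected to be 3.
backwardReader.close();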
Use of org.apache.hudi.common.table.log.block.HoodieDataBlock in project hudi by apache.
In the class TestHoodieLogFormat, the method testDataBlockFormatAppendAndReadWithProjectedSchema:
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK" })
public void testDataBlockFormatAppendAndReadWithProjectedSchema(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<GenericRecord> records = SchemaTestUtil.generateTestGenericRecords(0, 1000);
Schema schema = getSimpleSchema();
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<HoodieLogBlock.HeaderMetadataType, String>() {
{
put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
}
};
// Init Benchmark to report number of bytes actually read from the Block
BenchmarkCounter.initCounterFromReporter(HadoopMapRedUtils.createTestReporter(), fs.getConf());
// NOTE: Have to use this ugly hack since List generic is not covariant in its type param
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, (List<IndexedRecord>) (List) records, header);
writer.appendBlock(dataBlock);
writer.close();
Schema projectedSchema = HoodieAvroUtils.generateProjectionSchema(schema, Collections.singletonList("name"));
List<GenericRecord> projectedRecords = HoodieAvroUtils.rewriteRecords(records, projectedSchema);
try (Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), projectedSchema, true, false)) {
assertTrue(reader.hasNext(), "First block should be available");
HoodieLogBlock nextBlock = reader.next();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
Map<HoodieLogBlockType, Integer> expectedReadBytes = new HashMap<HoodieLogBlockType, Integer>() {
{
// not supported
put(HoodieLogBlockType.AVRO_DATA_BLOCK, 0);
// not supported
put(HoodieLogBlockType.HFILE_DATA_BLOCK, 0);
put(HoodieLogBlockType.PARQUET_DATA_BLOCK, 2605);
}
};
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(projectedRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(projectedRecords, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
assertEquals(dataBlockRead.getSchema(), projectedSchema);
int bytesRead = (int) BenchmarkCounter.getBytesRead();
assertEquals(expectedReadBytes.get(dataBlockType), bytesRead, "Read bytes have to match");
}
}
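As a hedged illustration of the same projection path outside the byte-count assertion, the snippet below reads the block back once more with the single-column schema used above. The Hudi calls are the ones already used in the test; the loop and variable names are illustrative only.

Schema nameOnlySchema = HoodieAvroUtils.generateProjectionSchema(schema, Collections.singletonList("name"));
try (Reader projectedReader = HoodieLogFormat.newReader(fs, writer.getLogFile(), nameOnlySchema, true, false)) {
  while (projectedReader.hasNext()) {
    HoodieLogBlock block = projectedReader.next();
    if (block instanceof HoodieDataBlock) {
      // Records come back carrying only the projected field(s), and the block reports the reader schema.
      List<IndexedRecord> projected = getRecords((HoodieDataBlock) block);
      Schema readerSchema = ((HoodieDataBlock) block).getSchema();
    }
  }
}

Note that the non-zero byte expectation applies only to the Parquet data block; for the Avro and HFile block types the counter is marked "not supported" in the map above, so their expected values are recorded as 0.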
Use of org.apache.hudi.common.table.log.block.HoodieDataBlock in project hudi by apache.
In the class TestHoodieLogFormat, the method testBasicWriteAndScan:
@Test
public void testBasicWriteAndScan() throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
Schema schema = getSimpleSchema();
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords = records.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
writer.appendBlock(dataBlock);
writer.close();
Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
HoodieLogBlock nextBlock = reader.next();
assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block");
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
reader.close();
}
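A small companion sketch (my own, under the same setup) of how the read side generalizes once more than one block has been appended: instead of a single next(), the reader is drained with a hasNext() loop, using only the calls already shown in this test.

try (Reader scanReader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema())) {
  while (scanReader.hasNext()) {
    HoodieLogBlock block = scanReader.next();
    if (block.getBlockType() == DEFAULT_DATA_BLOCK_TYPE) {
      // blockRecords holds the records of this data block, in write order.
      List<IndexedRecord> blockRecords = getRecords((HoodieDataBlock) block);
    }
  }
}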
Use of org.apache.hudi.common.table.log.block.HoodieDataBlock in project hudi by apache.
In the class TestHoodieLogFormat, the method testBasicAppend:
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK" })
public void testBasicAppend(HoodieLogBlockType dataBlockType) throws IOException, InterruptedException, URISyntaxException {
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
Map<HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
long pos = writer.getCurrentSize();
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records, header);
AppendResult result = writer.appendBlock(dataBlock);
long size = writer.getCurrentSize();
assertTrue(size > 0, "We just wrote a block - size should be > 0");
assertEquals(size, fs.getFileStatus(writer.getLogFile().getPath()).getLen(), "Write should be auto-flushed. The size reported by FileStatus and the writer should match");
assertEquals(size, result.size());
assertEquals(writer.getLogFile(), result.logFile());
assertEquals(0, result.offset());
writer.close();
}
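A hedged follow-on sketch, not asserted by this test: if a second block were appended through the same writer before writer.close(), the new AppendResult should report an offset equal to the file size before that append, because every append is auto-flushed as verified above. The expectations in the comments are assumptions of mine, not assertions from the test.

long sizeBeforeSecondAppend = writer.getCurrentSize();
AppendResult second = writer.appendBlock(getDataBlock(dataBlockType, records, header));
// Expected under the auto-flush behaviour checked above (assumption, not asserted here):
//   second.offset() == sizeBeforeSecondAppend
//   writer.getCurrentSize() == fs.getFileStatus(writer.getLogFile().getPath()).getLen()
writer.close();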
Use of org.apache.hudi.common.table.log.block.HoodieDataBlock in project hudi by apache.
In the class TestHoodieLogFormat, the method testAppendNotSupported:
/*
 * This is actually a test on concurrent append and not lease recovery. Commenting this out.
* https://issues.apache.org/jira/browse/HUDI-117
*/
/**
* @Test public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException { Writer writer
* = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
* .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
* .overBaseCommit("100").withFs(fs).build(); List<IndexedRecord> records =
* SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header =
* Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
* header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock
* dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 =
 * writer.getCurrentSize(); // do not close this writer - this simulates a data node appending to a log dying
* without closing the file // writer.close();
* <p>
* writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
* .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100")
* .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100);
* header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new
* HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 =
* writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1);
* assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match",
* size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); }
*/
@Test
public void testAppendNotSupported() throws IOException, URISyntaxException, InterruptedException {
// Use a FileSystem like LocalFileSystem, which does not support appends
Path localPartitionPath = new Path("file://" + partitionPath);
FileSystem localFs = FSUtils.getFs(localPartitionPath.toString(), HoodieTestUtils.getDefaultHadoopConf());
Path testPath = new Path(localPartitionPath, "append_test");
localFs.mkdirs(testPath);
// Some data & append two times.
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
for (int i = 0; i < 2; i++) {
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits.archive").overBaseCommit("").withFs(localFs).build();
writer.appendBlock(dataBlock);
writer.close();
}
// Ensure there are two log file versions, with the same data.
FileStatus[] statuses = localFs.listStatus(testPath);
assertEquals(2, statuses.length);
}
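Because LocalFileSystem cannot append, the second writer rolls over to a new log file version instead of appending to the first one, which is what the two-file assertion above checks. Below is a short sketch of how those versions could be inspected; HoodieLogFile parses the version from the file name, though the exact accessor used here should be treated as an assumption.

for (FileStatus status : statuses) {
  HoodieLogFile logFile = new HoodieLogFile(status.getPath());
  // Each rolled-over file carries its own log version parsed from the file name.
  System.out.println("version=" + logFile.getLogVersion() + ", length=" + status.getLen());
}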