Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.
Class HoodieTimelineArchiver, method mergeArchiveFiles.
public void mergeArchiveFiles(List<FileStatus> compactCandidate) throws IOException {
  LOG.info("Starting to merge small archive files.");
  Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
  try {
    List<IndexedRecord> records = new ArrayList<>();
    for (FileStatus fs : compactCandidate) {
      // Read the archived file
      try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(),
          new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) {
        // Read the avro blocks
        while (reader.hasNext()) {
          HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
          blk.getRecordItr().forEachRemaining(records::add);
          if (records.size() >= this.config.getCommitArchivalBatchSize()) {
            writeToFile(wrapperSchema, records);
          }
        }
      }
    }
    writeToFile(wrapperSchema, records);
  } catch (Exception e) {
    throw new HoodieCommitException("Failed to merge small archive files", e);
  } finally {
    writer.close();
  }
  LOG.info("Success to merge small archive files.");
}
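The writeToFile(Schema, List<IndexedRecord>) helper called above is not part of this excerpt. Below is a minimal sketch of what such a helper could look like, assuming the archiver holds the HoodieLogFormat.Writer field named writer that the finally block closes, and that records are keyed by HoodieRecord.RECORD_KEY_METADATA_FIELD as in the other examples on this page; the real HoodieTimelineArchiver implementation may differ.

// Hedged sketch only; not the verbatim HoodieTimelineArchiver helper.
private void writeToFile(Schema wrapperSchema, List<IndexedRecord> records) throws Exception {
  if (records.isEmpty()) {
    return;
  }
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString());
  // Wrap the buffered records into a single avro data block and append it to the archive log.
  HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
  writer.appendBlock(block); // 'writer' is assumed to be the archive log writer closed in the finally block above
  // Clear the buffer so the next batch starts empty.
  records.clear();
}

Clearing the list after each flush is what makes the records.size() >= getCommitArchivalBatchSize() check in the loop act as a batching condition instead of flushing on every block.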
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.
Class TestHoodieLogFormat, method testHugeLogFileWrite.
@Test
public void testHugeLogFileWrite() throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
    .overBaseCommit("100").withFs(fs).withSizeThreshold(3L * 1024 * 1024 * 1024).build();
Schema schema = getSimpleSchema();
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 1000);
List<IndexedRecord> copyOfRecords = records.stream()
    .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema))
    .collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
byte[] dataBlockContentBytes = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header).getContentBytes();
HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc =
    new HoodieLogBlock.HoodieLogBlockContentLocation(new Configuration(), null, 0, dataBlockContentBytes.length, 0);
HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, Option.ofNullable(dataBlockContentBytes), false,
    logBlockContentLoc, Option.ofNullable(getSimpleSchema()), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
long writtenSize = 0;
int logBlockWrittenNum = 0;
while (writtenSize < Integer.MAX_VALUE) {
  AppendResult appendResult = writer.appendBlock(reusableDataBlock);
  assertTrue(appendResult.size() > 0);
  writtenSize += appendResult.size();
  logBlockWrittenNum++;
}
writer.close();
Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true, true);
assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
HoodieLogBlock nextBlock = reader.next();
assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block");
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
int logBlockReadNum = 1;
while (reader.hasNext()) {
  reader.next();
  logBlockReadNum++;
}
assertEquals(logBlockWrittenNum, logBlockReadNum, "All written log should be correctly found");
reader.close();
// test writing oversize data block which should be rejected
Writer oversizeWriter = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100")
    .withSizeThreshold(3L * 1024 * 1024 * 1024).withFs(fs).build();
List<HoodieLogBlock> dataBlocks = new ArrayList<>(logBlockWrittenNum + 1);
for (int i = 0; i < logBlockWrittenNum + 1; i++) {
  dataBlocks.add(reusableDataBlock);
}
assertThrows(HoodieIOException.class, () -> {
  oversizeWriter.appendBlocks(dataBlocks);
}, "Blocks appended may overflow. Please decrease log block size or log block amount");
oversizeWriter.close();
}
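The test above builds the block from raw content bytes plus a HoodieLogBlockContentLocation; the other examples on this page use the simpler in-memory constructor. As a reference, here is a minimal write-then-read round trip sketched from the APIs shown on this page, assuming fs, partitionPath (a Hadoop Path), and a simple Avro schema are already set up as in the test above; the file id is illustrative only.

// Minimal sketch of the in-memory HoodieAvroDataBlock round trip.
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 10);

// Write: wrap the records in a data block and append it as one log block.
Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("sketch-fileid")
    .overBaseCommit("100").withFs(fs).build();
logWriter.appendBlock(new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
HoodieLogFile logFile = logWriter.getLogFile();
logWriter.close();

// Read: iterate the log blocks back and pull the records out of each avro data block.
Reader logReader = HoodieLogFormat.newReader(fs, logFile, schema);
List<IndexedRecord> readBack = new ArrayList<>();
while (logReader.hasNext()) {
  HoodieAvroDataBlock block = (HoodieAvroDataBlock) logReader.next();
  block.getRecordItr().forEachRemaining(readBack::add);
}
logReader.close();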
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.
Class ExportCommand, method copyArchivedInstants.
private int copyArchivedInstants(List<FileStatus> statuses, Set<String> actionSet, int limit, String localFolder) throws Exception {
  int copyCount = 0;
  for (FileStatus fs : statuses) {
    // read the archived file
    Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(HoodieCLI.getTableMetaClient().getBasePath(), HoodieCLI.conf),
        new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
    // read the avro blocks
    while (reader.hasNext() && copyCount < limit) {
      HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
      try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) {
        while (recordItr.hasNext()) {
          IndexedRecord ir = recordItr.next();
          // Archived instants are saved as avro encoded HoodieArchivedMetaEntry records. We need to get the
          // metadata record from the entry and convert it to json.
          HoodieArchivedMetaEntry archiveEntryRecord = (HoodieArchivedMetaEntry) SpecificData.get()
              .deepCopy(HoodieArchivedMetaEntry.SCHEMA$, ir);
          final String action = archiveEntryRecord.get("actionType").toString();
          if (!actionSet.contains(action)) {
            continue;
          }
          GenericRecord metadata = null;
          switch (action) {
            case HoodieTimeline.CLEAN_ACTION:
              metadata = archiveEntryRecord.getHoodieCleanMetadata();
              break;
            case HoodieTimeline.COMMIT_ACTION:
            case HoodieTimeline.DELTA_COMMIT_ACTION:
              metadata = archiveEntryRecord.getHoodieCommitMetadata();
              break;
            case HoodieTimeline.ROLLBACK_ACTION:
              metadata = archiveEntryRecord.getHoodieRollbackMetadata();
              break;
            case HoodieTimeline.SAVEPOINT_ACTION:
              metadata = archiveEntryRecord.getHoodieSavePointMetadata();
              break;
            case HoodieTimeline.COMPACTION_ACTION:
              metadata = archiveEntryRecord.getHoodieCompactionMetadata();
              break;
            default:
              throw new HoodieException("Unknown type of action " + action);
          }
          final String instantTime = archiveEntryRecord.get("commitTime").toString();
          final String outPath = localFolder + Path.SEPARATOR + instantTime + "." + action;
          writeToFile(outPath, HoodieAvroUtils.avroToJson(metadata, true));
          if (++copyCount == limit) {
            break;
          }
        }
      }
    }
    reader.close();
  }
  return copyCount;
}
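The writeToFile(String, byte[]) helper used above is also not shown in this excerpt. A minimal sketch, assuming it simply writes the JSON bytes to the given local path (the actual ExportCommand helper may differ):

// Hedged sketch only: dump the serialized metadata to the local export path.
private void writeToFile(String path, byte[] data) throws IOException {
  java.nio.file.Files.write(java.nio.file.Paths.get(path), data);
}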
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.
Class TestHoodieLogFileCommand, method init.
@BeforeEach
public void init() throws IOException, InterruptedException, URISyntaxException {
  HoodieCLI.conf = hadoopConf();
  // Create table and connect
  String tableName = tableName();
  tablePath = tablePath(tableName);
  partitionPath = Paths.get(tablePath, HoodieTestCommitMetadataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString();
  new TableCommand().createTable(tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(), "",
      TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload");
  Files.createDirectories(Paths.get(partitionPath));
  fs = FSUtils.getFs(tablePath, hadoopConf());
  try (HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionPath))
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-log-fileid1")
      .overBaseCommit("100").withFs(fs).build()) {
    // write data to file
    List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
    dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
    writer.appendBlock(dataBlock);
  }
}
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.
Class TestHoodieLogFileCommand, method testShowLogFileRecordsWithMerge.
/**
* Test case for 'show logfile records' with merge.
*/
@Test
public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedException, URISyntaxException {
  // create commit instant
  HoodieTestCommitMetadataGenerator.createCommitFile(tablePath, INSTANT_TIME, HoodieCLI.conf);
  // write to path '2015/03/16'.
  Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
  partitionPath = tablePath + Path.SEPARATOR + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH;
  Files.createDirectories(Paths.get(partitionPath));
  HoodieLogFormat.Writer writer = null;
  try {
    // set a small size threshold to force the log file to roll over.
    writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionPath))
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-log-fileid1")
        .overBaseCommit(INSTANT_TIME).withFs(fs).withSizeThreshold(500).build();
    List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
    writer.appendBlock(dataBlock);
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
  CommandResult cr = shell().executeCommand("show logfile records --logFilePathPattern " + partitionPath + "/* --mergeRecords true");
  assertTrue(cr.isSuccess());
  // get expected result of 10 records.
  List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(partitionPath + "/*")))
      .map(status -> status.getPath().toString()).collect(Collectors.toList());
  HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
      .withFileSystem(fs)
      .withBasePath(tablePath)
      .withLogFilePaths(logFilePaths)
      .withReaderSchema(schema)
      .withLatestInstantTime(INSTANT_TIME)
      .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
      .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
      .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
      .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
      .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
      .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
      .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
      .build();
  Iterator<HoodieRecord<? extends HoodieRecordPayload>> records = scanner.iterator();
  int num = 0;
  int maxSize = 10;
  List<IndexedRecord> indexRecords = new ArrayList<>();
  while (records.hasNext() && num < maxSize) {
    Option<IndexedRecord> hoodieRecord = records.next().getData().getInsertValue(schema);
    indexRecords.add(hoodieRecord.get());
    num++;
  }
  String[][] rows = indexRecords.stream().map(r -> new String[] {r.toString()}).toArray(String[][]::new);
  assertNotNull(rows);
  String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_RECORDS}, rows);
  expected = removeNonWordAndStripSpace(expected);
  String got = removeNonWordAndStripSpace(cr.getResult().toString());
  assertEquals(expected, got);
}