Use of org.apache.hudi.io.storage.HoodieHFileReader in project hudi by apache.
From class HoodieHFileDataBlock, method lookupRecords.
// TODO abstract this w/in HoodieDataBlock
@Override
protected ClosableIterator<IndexedRecord> lookupRecords(List<String> keys) throws IOException {
  HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get();
  // NOTE: It's important to extend the Hadoop configuration here to make sure the configuration
  //       is appropriately carried over
  Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf());
  inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());
  Path inlinePath = InLineFSUtils.getInlineFilePath(
      blockContentLoc.getLogFile().getPath(),
      blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(),
      blockContentLoc.getContentPositionInLogFile(),
      blockContentLoc.getBlockSize());
  // HFile reads are efficient when keys are sorted, since records are sorted by key on storage;
  // sorting up front avoids unnecessary seeks.
  Collections.sort(keys);
  final HoodieHFileReader<IndexedRecord> reader =
      new HoodieHFileReader<>(inlineConf, inlinePath, new CacheConfig(inlineConf), inlinePath.getFileSystem(inlineConf));
  // Iterate only the requested keys, projecting records onto the reader's schema
  final ClosableIterator<IndexedRecord> recordIterator = reader.getRecordIterator(keys, readerSchema);
  return new ClosableIterator<IndexedRecord>() {
    @Override
    public boolean hasNext() {
      return recordIterator.hasNext();
    }

    @Override
    public IndexedRecord next() {
      return recordIterator.next();
    }

    @Override
    public void close() {
      recordIterator.close();
      reader.close();
    }
  };
}
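The method above mixes two concerns: mapping the log-block region to an InLineFileSystem path, and performing a sorted point lookup through HoodieHFileReader. The sketch below isolates just the lookup half; the helper name, HFile path, reader schema, and key list are hypothetical placeholders supplied by the caller, not values taken from the block above.
// Hypothetical helper: sorted point lookup against a standalone HFile.
static List<IndexedRecord> lookupByKeys(Configuration conf, Path hfilePath,
                                        Schema readerSchema, List<String> keys) throws IOException {
  // Records inside an HFile are laid out by key, so probing with sorted keys avoids extra seeks.
  Collections.sort(keys);
  HoodieHFileReader<IndexedRecord> reader =
      new HoodieHFileReader<>(conf, hfilePath, new CacheConfig(conf), hfilePath.getFileSystem(conf));
  List<IndexedRecord> results = new ArrayList<>();
  ClosableIterator<IndexedRecord> iterator = reader.getRecordIterator(keys, readerSchema);
  try {
    while (iterator.hasNext()) {
      results.add(iterator.next());
    }
  } finally {
    iterator.close();
    reader.close();
  }
  return results;
}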
Use of org.apache.hudi.io.storage.HoodieHFileReader in project hudi by apache.
From class TestHoodieBackedMetadata, method verifyMetadataRecordKeyExcludeFromPayloadBaseFiles.
/**
 * Verify metadata table base files for the records persisted based on the config. When
 * key deduplication is enabled, the records persisted in the base file on disk
 * should have the key field in the payload set to an empty string.
 *
 * @param table            - Metadata table
 * @param enableMetaFields - Enable meta fields
 */
private void verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(HoodieTable table, boolean enableMetaFields) throws IOException {
  table.getHoodieView().sync();
  List<FileSlice> fileSlices = table.getSliceView()
      .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList());
  if (!fileSlices.get(0).getBaseFile().isPresent()) {
    throw new IllegalStateException("Base file not available!");
  }
  final HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get();
  HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(),
      new Path(baseFile.getPath()), new CacheConfig(context.getHadoopConf().get()));
  List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
  records.forEach(entry -> {
    if (enableMetaFields) {
      assertNotNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
    } else {
      assertNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
    }
    final String keyInPayload = (String) ((GenericRecord) entry.getSecond())
        .get(HoodieMetadataPayload.KEY_FIELD_NAME);
    assertFalse(keyInPayload.isEmpty());
  });
}
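In contrast to the iterator-based lookup, readAllRecords() returns the full list of (key, record) pairs in the file, which is what the test inspects. As a minimal sketch of that read path in isolation, the hypothetical helper below collects just the payload key field checked by the assertions; its name and parameters are illustrative, not part of the test.
// Hypothetical helper: collect the payload key field from every record in an HFile base file.
static List<String> readPayloadKeys(Configuration conf, Path baseFilePath) throws IOException {
  HoodieHFileReader<IndexedRecord> reader =
      new HoodieHFileReader<>(conf, baseFilePath, new CacheConfig(conf));
  try {
    List<String> keys = new ArrayList<>();
    for (Pair<String, IndexedRecord> entry : reader.readAllRecords()) {
      GenericRecord payload = (GenericRecord) entry.getSecond();
      keys.add((String) payload.get(HoodieMetadataPayload.KEY_FIELD_NAME));
    }
    return keys;
  } finally {
    reader.close();
  }
}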
Use of org.apache.hudi.io.storage.HoodieHFileReader in project hudi by apache.
From class HoodieHFileDataBlock, method deserializeRecords.
@Override
protected ClosableIterator<IndexedRecord> deserializeRecords(byte[] content) throws IOException {
  checkState(readerSchema != null, "Reader's schema has to be non-null");
  // Get schema from the header
  Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
  // Read the content
  HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(content);
  // Sets up the writer schema
  reader.withSchema(writerSchema);
  Iterator<IndexedRecord> recordIterator = reader.getRecordIterator(readerSchema);
  return new ClosableIterator<IndexedRecord>() {
    @Override
    public void close() {
      reader.close();
    }

    @Override
    public boolean hasNext() {
      return recordIterator.hasNext();
    }

    @Override
    public IndexedRecord next() {
      return recordIterator.next();
    }
  };
}
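Unlike the file-based constructor used in lookupRecords, this block reads HFile-formatted bytes that are already in memory. A minimal sketch of that in-memory pattern follows; the content bytes and both schemas are placeholders supplied by the caller, and the helper itself is hypothetical.
// Hypothetical sketch: decode in-memory HFile bytes, declaring the writer schema before
// iterating with the (possibly different) reader schema.
static List<IndexedRecord> decodeHFileContent(byte[] content, Schema writerSchema,
                                              Schema readerSchema) throws IOException {
  HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(content);
  reader.withSchema(writerSchema);            // schema the bytes were written with
  Iterator<IndexedRecord> iterator = reader.getRecordIterator(readerSchema);
  List<IndexedRecord> records = new ArrayList<>();
  try {
    while (iterator.hasNext()) {
      records.add(iterator.next());
    }
  } finally {
    reader.close();
  }
  return records;
}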
Use of org.apache.hudi.io.storage.HoodieHFileReader in project hudi by apache.
From class TableSchemaResolver, method readSchemaFromHFileBaseFile.
/**
 * Read the Parquet schema from an HFile.
 */
public MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException {
  LOG.info("Reading schema from " + hFilePath);
  FileSystem fs = metaClient.getRawFs();
  CacheConfig cacheConfig = new CacheConfig(fs.getConf());
  HoodieHFileReader<IndexedRecord> hFileReader = new HoodieHFileReader<>(fs.getConf(), hFilePath, cacheConfig);
  return convertAvroSchemaToParquet(hFileReader.getSchema());
}
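Here getSchema() returns the Avro schema recorded in the HFile, which readSchemaFromHFileBaseFile then converts to a Parquet MessageType. A minimal sketch of just the schema read, assuming the caller supplies the filesystem and path (the helper itself is hypothetical):
// Hypothetical helper: read only the Avro schema stored with an HFile, without scanning records.
static Schema readAvroSchemaFromHFile(FileSystem fs, Path hFilePath) throws IOException {
  HoodieHFileReader<IndexedRecord> reader =
      new HoodieHFileReader<>(fs.getConf(), hFilePath, new CacheConfig(fs.getConf()));
  try {
    return reader.getSchema();
  } finally {
    reader.close();
  }
}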
Use of org.apache.hudi.io.storage.HoodieHFileReader in project hudi by apache.
From class TestHoodieBackedMetadata, method testVirtualKeysInBaseFiles.
/**
 * Tests that virtual key configs are honored in base files after compaction in the metadata table.
 *
 * @throws Exception
 */
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testVirtualKeysInBaseFiles(boolean populateMetaFields) throws Exception {
  HoodieTableType tableType = MERGE_ON_READ;
  init(tableType, false);
  writeConfig = getWriteConfigBuilder(true, true, false)
      .withMetadataConfig(HoodieMetadataConfig.newBuilder()
          .enable(true)
          .enableFullScan(true)
          .enableMetrics(false)
          .withPopulateMetaFields(populateMetaFields)
          .withMaxNumDeltaCommitsBeforeCompaction(2)
          .build())
      .build();
  initWriteConfigAndMetatableWriter(writeConfig, true);
  doWriteOperation(testTable, "0000001", INSERT);
  doClean(testTable, "0000003", Arrays.asList("0000001"));
  // this should have triggered compaction in metadata table
  doWriteOperation(testTable, "0000004", UPSERT);
  HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
  assertTrue(tableMetadata.getLatestCompactionTime().isPresent());
  assertEquals(tableMetadata.getLatestCompactionTime().get(), "0000003001");
  HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder()
      .setConf(hadoopConf)
      .setBasePath(metadataTableBasePath)
      .build();
  HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig);
  metadataMetaClient.reloadActiveTimeline();
  HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient);
  table.getHoodieView().sync();
  List<FileSlice> fileSlices = table.getSliceView().getLatestFileSlices("files").collect(Collectors.toList());
  HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get();
  HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(),
      new Path(baseFile.getPath()), new CacheConfig(context.getHadoopConf().get()));
  List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
  records.forEach(entry -> {
    if (populateMetaFields) {
      assertNotNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
    } else {
      assertNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
    }
  });
}
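The per-record check in the loop above is the crux of the virtual-keys behavior: with populateMetaFields disabled, the compacted base file must not carry the meta record-key column. A small hypothetical helper capturing that assertion (its name is illustrative, not part of the test):
// Hypothetical helper mirroring the per-record assertion above.
static void assertMetaRecordKeyField(GenericRecord record, boolean populateMetaFields) {
  Object metaKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD);
  if (populateMetaFields) {
    assertNotNull(metaKey);   // meta fields persisted: record key column must be present
  } else {
    assertNull(metaKey);      // virtual keys: record key column must be absent from the payload
  }
}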