Search in sources:

Example 1 with HeaderMetadataType

Use of org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType in project hudi by apache.

From class HoodieTimelineArchiver, method writeToFile.

private void writeToFile(Schema wrapperSchema, List<IndexedRecord> records) throws Exception {
    if (records.size() > 0) {
        Map<HeaderMetadataType, String> header = new HashMap<>();
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString());
        final String keyField = table.getMetaClient().getTableConfig().getRecordKeyFieldProp();
        HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header, keyField);
        writer.appendBlock(block);
        records.clear();
    }
}
Also used : HashMap(java.util.HashMap) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock)
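
Example 1 assumes a pre-built writer field on the archiver. As a minimal sketch (not the archiver's actual initialization), such a HoodieLogFormat.Writer can be obtained with the same builder used in Examples 2 to 4 below; the parent path, file id, base commit, and fs here are illustrative placeholders:

// Sketch only: the builder calls mirror Examples 2-4. The archiver writes to its own
// location and extension; the values below are placeholders, not its real configuration.
HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(new Path(basePath, "archived"))        // placeholder parent path
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)    // extension as used in the later examples
    .withFileId("commits")                                // placeholder file id
    .overBaseCommit("20220101000000")                     // placeholder instant time
    .withFs(fs)                                           // an org.apache.hadoop.fs.FileSystem
    .build();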

Example 2 with HeaderMetadataType

Use of org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType in project hudi by apache.

From class HiveTestUtil, method generateLogData.

private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple) throws IOException, InterruptedException, URISyntaxException {
    Schema schema = getTestDataSchema(isLogSchemaSimple);
    HoodieBaseFile dataFile = new HoodieBaseFile(fileSystem.getFileStatus(parquetFilePath));
    // Write a log file for this parquet file
    Writer logWriter = HoodieLogFormat.newWriterBuilder()
        .onParentPath(parquetFilePath.getParent())
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId(dataFile.getFileId())
        .overBaseCommit(dataFile.getCommitTime())
        .withFs(fileSystem)
        .build();
    List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
    Map<HeaderMetadataType, String> header = new HashMap<>(2);
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime());
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
    logWriter.appendBlock(dataBlock);
    logWriter.close();
    return logWriter.getLogFile();
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer)
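
To sanity-check what generateLogData wrote, the headers can be read back from the returned log file. This is a hedged sketch: HoodieLogFormat.newReader and HoodieLogBlock.getLogBlockHeader are assumed to be available as in other parts of the codebase, logFile stands for the HoodieLogFile returned by generateLogData, fileSystem and schema are the objects from the example, and the fragment is assumed to run inside a method that declares IOException:

// Sketch only: iterate the blocks of the log file just written and inspect the
// INSTANT_TIME and SCHEMA headers that Example 2 set on the data block.
try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fileSystem, logFile, schema)) {
    while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        Map<HeaderMetadataType, String> hdr = block.getLogBlockHeader();
        String instantTime = hdr.get(HeaderMetadataType.INSTANT_TIME); // expected to match dataFile.getCommitTime()
        String avroSchema = hdr.get(HeaderMetadataType.SCHEMA);        // the schema string written above
    }
}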

Example 3 with HeaderMetadataType

Use of org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType in project hudi by apache.

From class HoodieFlinkWriteableTestTable, method appendRecordsToLogFile.

private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
    String partitionPath = groupedRecords.get(0).getPartitionPath();
    HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
    try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder()
            .onParentPath(new Path(basePath, partitionPath))
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId(location.getFileId())
            .overBaseCommit(location.getInstantTime())
            .withFs(fs)
            .build()) {
        Map<HeaderMetadataType, String> header = new HashMap<>();
        header.put(HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
        header.put(HeaderMetadataType.SCHEMA, schema.toString());
        logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
            try {
                GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
                HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
                return (IndexedRecord) val;
            } catch (IOException e) {
                LOG.warn("Failed to convert record " + r.toString(), e);
                return null;
            }
        }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
        return Pair.of(partitionPath, logWriter.getLogFile());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) IOException(java.io.IOException) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) GenericRecord(org.apache.avro.generic.GenericRecord)
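
appendRecordsToLogFile expects all of its input records to share one partition path and file location, taken from the first record. As an illustrative sketch only (the grouping key, the records variable, and the loop are assumptions, not part of the class), callers could bucket records before invoking it:

// Sketch only: group incoming records by partitionPath + fileId so that each group
// goes to a single log file; getCurrentLocation() must already be populated.
Map<String, List<HoodieRecord>> grouped = records.stream()
    .collect(Collectors.groupingBy(
        r -> r.getPartitionPath() + "/" + r.getCurrentLocation().getFileId()));
for (List<HoodieRecord> group : grouped.values()) {
    appendRecordsToLogFile(group);  // the private method shown above
}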

Example 4 with HeaderMetadataType

Use of org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType in project hudi by apache.

From class HoodieBackedTableMetadataWriter, method initializeFileGroups.

/**
 * Initialize file groups for a partition. For file listing, we just have one file group.
 *
 * All FileGroups for a given metadata partition have a fixed prefix as per the {@link MetadataPartitionType#getFileIdPrefix()}.
 * Each file group id is suffixed with a 4-digit index, starting at 0000 and incrementing by 1.
 *
 * Let's say we configure 10 file groups for the record level index partition with the prefix "record-index-bucket-".
 * The file groups will then be named:
 *    record-index-bucket-0000, record-index-bucket-0001, ..., record-index-bucket-0009
 */
private void initializeFileGroups(HoodieTableMetaClient dataMetaClient, MetadataPartitionType metadataPartition, String instantTime, int fileGroupCount) throws IOException {
    final HashMap<HeaderMetadataType, String> blockHeader = new HashMap<>();
    blockHeader.put(HeaderMetadataType.INSTANT_TIME, instantTime);
    // Archival of the data table depends on compaction (base files) in the metadata table.
    // It is assumed that, as of the time Tx of the base instant (i.e. the compaction time) in the metadata table,
    // all commits in the data table are in sync with the metadata table. So we always start with a log file for any fileGroup.
    final HoodieDeleteBlock block = new HoodieDeleteBlock(new HoodieKey[0], blockHeader);
    LOG.info(String.format("Creating %d file groups for partition %s with base fileId %s at instant time %s", fileGroupCount, metadataPartition.getPartitionPath(), metadataPartition.getFileIdPrefix(), instantTime));
    for (int i = 0; i < fileGroupCount; ++i) {
        final String fileGroupFileId = String.format("%s%04d", metadataPartition.getFileIdPrefix(), i);
        try {
            HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
                .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), metadataPartition.getPartitionPath()))
                .withFileId(fileGroupFileId)
                .overBaseCommit(instantTime)
                .withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION)
                .withFileSize(0L)
                .withSizeThreshold(metadataWriteConfig.getLogFileMaxSize())
                .withFs(dataMetaClient.getFs())
                .withRolloverLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN)
                .withLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN)
                .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
                .build();
            writer.appendBlock(block);
            writer.close();
        } catch (InterruptedException e) {
            throw new HoodieException("Failed to create fileGroup " + fileGroupFileId + " for partition " + metadataPartition.getPartitionPath(), e);
        }
    }
}
Also used : HashMap(java.util.HashMap) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieException(org.apache.hudi.exception.HoodieException)
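
To make the file group naming from the Javadoc concrete, here is a small stand-alone sketch; the prefix and count are the hypothetical values from the comment, not configuration read from anywhere:

// Sketch only: with 10 file groups and prefix "record-index-bucket-", the ids produced by
// String.format("%s%04d", prefix, i) are record-index-bucket-0000 through record-index-bucket-0009.
String prefix = "record-index-bucket-";
for (int i = 0; i < 10; i++) {
    System.out.println(String.format("%s%04d", prefix, i));
}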

Example 5 with HeaderMetadataType

Use of org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType in project hudi by apache.

From class HoodieLogFileReader, method readBlock.

// TODO: convert content and block length to long using ByteBuffer; a raw byte[]
// allows at most Integer.MAX_VALUE bytes
private HoodieLogBlock readBlock() throws IOException {
    int blockSize;
    try {
        // 1. Read the total size of the block
        blockSize = (int) inputStream.readLong();
    } catch (EOFException | CorruptedLogFileException e) {
        // Create a corrupt block by finding the next MAGIC marker or EOF
        return createCorruptBlock();
    }
    // We may have had a crash that wrote this block only partially.
    // Skip blockSize bytes in the stream; we should then find either a sync marker (the start of the
    // next block) or EOF. If we find neither, this block is corrupted.
    boolean isCorrupted = isBlockCorrupted(blockSize);
    if (isCorrupted) {
        return createCorruptBlock();
    }
    // 2. Read the version for this log format
    HoodieLogFormat.LogFormatVersion nextBlockVersion = readVersion();
    // 3. Read the block type for a log block
    HoodieLogBlockType blockType = tryReadBlockType(nextBlockVersion);
    // 4. Read the header for a log block, if present
    Map<HeaderMetadataType, String> header = nextBlockVersion.hasHeader() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
    // 5. Read the content length for the content
    // Fall back to the full block size if there is no content length
    // TODO replace w/ hasContentLength
    int contentLength = nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION ? (int) inputStream.readLong() : blockSize;
    // 6. Read the content or skip content based on IO vs Memory trade-off by client
    long contentPosition = inputStream.getPos();
    boolean shouldReadLazily = readBlockLazily && nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION;
    Option<byte[]> content = HoodieLogBlock.tryReadContent(inputStream, contentLength, shouldReadLazily);
    // 7. Read footer if any
    Map<HeaderMetadataType, String> footer = nextBlockVersion.hasFooter() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
    // 8. Read the log block length written at the end of the block, if present; it acts as a
    // reverse pointer when traversing the log file in reverse
    if (nextBlockVersion.hasLogBlockLength()) {
        inputStream.readLong();
    }
    // 9. Read the log block end position in the log file
    long blockEndPos = inputStream.getPos();
    HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, contentLength, blockEndPos);
    switch(Objects.requireNonNull(blockType)) {
        case AVRO_DATA_BLOCK:
            if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
                return HoodieAvroDataBlock.getBlock(content.get(), readerSchema);
            } else {
                return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, keyField);
            }
        case HFILE_DATA_BLOCK:
            checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, String.format("HFile block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
            return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, enableRecordLookups);
        case PARQUET_DATA_BLOCK:
            checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, String.format("Parquet block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
            return new HoodieParquetDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, keyField);
        case DELETE_BLOCK:
            return new HoodieDeleteBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
        case COMMAND_BLOCK:
            return new HoodieCommandBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
        default:
            throw new HoodieNotSupportedException("Unsupported Block " + blockType);
    }
}
Also used : HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) CorruptedLogFileException(org.apache.hudi.exception.CorruptedLogFileException) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieLogBlockType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) EOFException(java.io.EOFException)
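
readBlock backs each next() call on the log file reader. Below is a hedged sketch of how a caller typically consumes and dispatches the blocks it produces; getBlockType() and the reader API are assumed from the surrounding codebase, and fs, logFile, and readerSchema stand in for the reader's own fields:

// Sketch only: read blocks from a log file and branch on the type resolved by readBlock().
try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, logFile, readerSchema)) {
    while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        switch (block.getBlockType()) {
            case AVRO_DATA_BLOCK:
            case HFILE_DATA_BLOCK:
            case PARQUET_DATA_BLOCK:
                // a data block: its records are what the log record reader merges with the base file
                break;
            case DELETE_BLOCK:
                // a delete block: carries the keys to remove
                break;
            default:
                // command blocks (e.g. rollback) and corrupt blocks are skipped or handled specially
                break;
        }
    }
}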

Aggregations

HeaderMetadataType (org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType): 7
HashMap (java.util.HashMap): 6
IndexedRecord (org.apache.avro.generic.IndexedRecord): 4
HoodieAvroDataBlock (org.apache.hudi.common.table.log.block.HoodieAvroDataBlock): 4
Schema (org.apache.avro.Schema): 3
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat): 3
IOException (java.io.IOException): 2
ArrayList (java.util.ArrayList): 2
List (java.util.List): 2
Map (java.util.Map): 2
Collectors (java.util.stream.Collectors): 2
Path (org.apache.hadoop.fs.Path): 2
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 2
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 2
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 2
Writer (org.apache.hudi.common.table.log.HoodieLogFormat.Writer): 2
HoodieDataBlock (org.apache.hudi.common.table.log.block.HoodieDataBlock): 2
HoodieDeleteBlock (org.apache.hudi.common.table.log.block.HoodieDeleteBlock): 2
HoodieLogBlock (org.apache.hudi.common.table.log.block.HoodieLogBlock): 2