Example 6 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class TestHoodieLogFormat method testV0Format.

@Test
public void testV0Format() throws IOException, URISyntaxException {
    // HoodieLogFormatVersion.DEFAULT_VERSION has been deprecated, so we cannot
    // create a writer for it. These tests therefore only cover the older-version
    // HoodieAvroDataBlock.
    Schema schema = getSimpleSchema();
    List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
    List<IndexedRecord> recordsCopy = new ArrayList<>(records);
    assertEquals(100, records.size());
    assertEquals(100, recordsCopy.size());
    HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, schema);
    byte[] content = dataBlock.getBytes(schema);
    assertTrue(content.length > 0);
    HoodieLogBlock logBlock = HoodieAvroDataBlock.getBlock(content, schema);
    assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType());
    List<IndexedRecord> readRecords = getRecords((HoodieAvroDataBlock) logBlock);
    assertEquals(recordsCopy.size(), readRecords.size());
    for (int i = 0; i < recordsCopy.size(); ++i) {
        assertEquals(recordsCopy.get(i), readRecords.get(i));
    }
    // Reader schema is optional if it is the same as the write schema
    logBlock = HoodieAvroDataBlock.getBlock(content, null);
    assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType());
    readRecords = getRecords((HoodieAvroDataBlock) logBlock);
    assertEquals(recordsCopy.size(), readRecords.size());
    for (int i = 0; i < recordsCopy.size(); ++i) {
        assertEquals(recordsCopy.get(i), readRecords.get(i));
    }
}
Also used : HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) Schema(org.apache.avro.Schema) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) ArrayList(java.util.ArrayList) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
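
For quick reuse, the round trip exercised by this test can be distilled into a small helper. This is a minimal sketch built only from the calls shown above (the legacy HoodieAvroDataBlock constructor, getBytes, and the static getBlock); the helper class and method names are hypothetical, and the signatures may differ in newer Hudi versions.

import java.io.IOException;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;

// Hypothetical helper, not part of Hudi.
final class AvroBlockRoundTrip {

    // Serialize records into the legacy block format and decode them back.
    static HoodieLogBlock roundTrip(List<IndexedRecord> records, Schema schema) throws IOException {
        HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, schema);
        byte[] content = dataBlock.getBytes(schema);
        // Passing null as the reader schema reuses the write schema, as the test shows.
        return HoodieAvroDataBlock.getBlock(content, schema);
    }
}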

Example 7 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class TestHoodieLogFormat method testAppendAndReadOnCorruptedLogInReverse.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testAppendAndReadOnCorruptedLogInReverse(boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException {
    Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    Schema schema = getSimpleSchema();
    List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
    writer.appendBlock(dataBlock);
    writer.close();
    FileCreateUtils.createDeltaCommit(basePath, "100", fs);
    // Append some arbitrary byte[] to the end of the log (mimics a partially written commit)
    fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
    FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
    // Create a corrupt block: valid magic and block type, but a bogus length and garbage content
    outputStream.write(HoodieLogFormat.MAGIC);
    outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
    // Write out a length that does not conform to the content
    outputStream.writeInt(1000);
    // Write out footer length
    outputStream.writeInt(1);
    // Write out some metadata
    // TODO : test for failure to write metadata - NA ?
    outputStream.write(HoodieLogBlock.getLogMetadataBytes(header));
    outputStream.write("something-random".getBytes());
    outputStream.flush();
    outputStream.close();
    // Should be able to append a new block
    writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    records = SchemaTestUtil.generateTestRecords(0, 100);
    dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
    writer.appendBlock(dataBlock);
    writer.close();
    // First round of reads - we should be able to read the first block and then EOF
    HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()), schema, bufferSize, readBlocksLazily, true);
    assertTrue(reader.hasPrev(), "Last block should be available");
    HoodieLogBlock block = reader.prev();
    assertTrue(block instanceof HoodieDataBlock, "Last block should be datablock");
    assertTrue(reader.hasPrev(), "Last block should be available");
    assertThrows(CorruptedLogFileException.class, reader::prev);
    reader.close();
}
Also used : HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) HoodieLogFileReader(org.apache.hudi.common.table.log.HoodieLogFileReader) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
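
The reverse scan in this test generalizes to a small utility: walk blocks from the tail of the file until the reader hits an unreadable one. A minimal sketch, assuming the same six-argument HoodieLogFileReader constructor used above; the class and method names are hypothetical.

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFileReader;

// Hypothetical helper, not part of Hudi.
final class ReverseScan {

    // Count blocks readable from the end of the file. prev() throws
    // CorruptedLogFileException on the first bad block, as asserted in the test.
    static int countReadableBlocksFromEnd(FileSystem fs, HoodieLogFile logFile, Schema schema, int bufferSize) throws IOException {
        HoodieLogFileReader reader = new HoodieLogFileReader(fs, logFile, schema, bufferSize, true, true);
        int count = 0;
        try {
            while (reader.hasPrev()) {
                reader.prev();
                count++;
            }
        } finally {
            reader.close();
        }
        return count;
    }
}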

Example 8 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class TestHoodieBackedMetadata method verifyMetadataRawRecords.

/**
 * Verify the metadata table on-disk raw records. When populate meta fields is enabled,
 * these records should have the additional meta fields in the payload. When key
 * deduplication is enabled, the key in the on-disk payload should be an empty string.
 *
 * @param table            - Hoodie table whose metadata is verified
 * @param logFiles         - Metadata table log files to be verified
 * @param enableMetaFields - Enable meta fields for records
 * @throws IOException
 */
private void verifyMetadataRawRecords(HoodieTable table, List<HoodieLogFile> logFiles, boolean enableMetaFields) throws IOException {
    for (HoodieLogFile logFile : logFiles) {
        FileStatus[] fsStatus = fs.listStatus(logFile.getPath());
        MessageType writerSchemaMsg = TableSchemaResolver.readSchemaFromLogFile(fs, logFile.getPath());
        if (writerSchemaMsg == null) {
            // not a data block
            continue;
        }
        Schema writerSchema = new AvroSchemaConverter().convert(writerSchemaMsg);
        HoodieLogFormat.Reader logFileReader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
        while (logFileReader.hasNext()) {
            HoodieLogBlock logBlock = logFileReader.next();
            if (logBlock instanceof HoodieDataBlock) {
                try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) logBlock).getRecordItr()) {
                    recordItr.forEachRemaining(indexRecord -> {
                        final GenericRecord record = (GenericRecord) indexRecord;
                        if (enableMetaFields) {
                            // Metadata table records should have meta fields!
                            assertNotNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
                            assertNotNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
                        } else {
                            // Metadata table records should not have meta fields!
                            assertNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
                            assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
                        }
                        final String key = String.valueOf(record.get(HoodieMetadataPayload.KEY_FIELD_NAME));
                        assertFalse(key.isEmpty());
                        if (enableMetaFields) {
                            assertTrue(key.equals(String.valueOf(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD))));
                        }
                    });
                }
            }
        }
    }
}
Also used : AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) Schema(org.apache.avro.Schema) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) GenericRecord(org.apache.avro.generic.GenericRecord) MessageType(org.apache.parquet.schema.MessageType)
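
The scan loop inside this method is a reusable pattern: open a reader over one log file and visit every record in every data block. A minimal sketch using only the APIs exercised above; the ClosableIterator import path (org.apache.hudi.common.util) is an assumption, and the helper itself is hypothetical.

import java.io.IOException;
import java.util.function.Consumer;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.util.ClosableIterator;

// Hypothetical helper, not part of Hudi.
final class LogRecordScanner {

    // Apply the action to every record found in the file's data blocks.
    static void forEachRecord(FileSystem fs, HoodieLogFile logFile, Schema writerSchema, Consumer<IndexedRecord> action) throws IOException {
        HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, logFile, writerSchema);
        try {
            while (reader.hasNext()) {
                HoodieLogBlock logBlock = reader.next();
                if (logBlock instanceof HoodieDataBlock) {
                    try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) logBlock).getRecordItr()) {
                        recordItr.forEachRemaining(action);
                    }
                }
            }
        } finally {
            reader.close();
        }
    }
}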

Example 9 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class TestHoodieBackedTableMetadata method verifyMetadataRawRecords.

/**
 * Verify the metadata table on-disk raw records. When populate meta fields is enabled,
 * these records should have the additional meta fields in the payload. When key
 * deduplication is enabled, the key in the on-disk payload should be an empty string.
 *
 * @param table    - Hoodie table whose metadata is verified
 * @param logFiles - Metadata table log files to be verified
 * @throws IOException
 */
private void verifyMetadataRawRecords(HoodieTable table, List<HoodieLogFile> logFiles) throws IOException {
    for (HoodieLogFile logFile : logFiles) {
        FileStatus[] fsStatus = fs.listStatus(logFile.getPath());
        MessageType writerSchemaMsg = TableSchemaResolver.readSchemaFromLogFile(fs, logFile.getPath());
        if (writerSchemaMsg == null) {
            // not a data block
            continue;
        }
        Schema writerSchema = new AvroSchemaConverter().convert(writerSchemaMsg);
        HoodieLogFormat.Reader logFileReader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
        while (logFileReader.hasNext()) {
            HoodieLogBlock logBlock = logFileReader.next();
            if (logBlock instanceof HoodieDataBlock) {
                try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) logBlock).getRecordItr()) {
                    recordItr.forEachRemaining(indexRecord -> {
                        final GenericRecord record = (GenericRecord) indexRecord;
                        assertNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
                        assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
                        final String key = String.valueOf(record.get(HoodieMetadataPayload.KEY_FIELD_NAME));
                        assertFalse(key.isEmpty());
                    });
                }
            }
        }
    }
}
Also used : AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) Schema(org.apache.avro.Schema) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) GenericRecord(org.apache.avro.generic.GenericRecord) MessageType(org.apache.parquet.schema.MessageType)

Example 10 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class LogReaderUtils method readSchemaFromLogFileInReverse.

private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, HoodieLogFile hoodieLogFile) throws IOException {
    // Set the length on the HoodieLogFile, since HoodieLogFormat.Reader relies on it when reverse reading is enabled
    Reader reader = HoodieLogFormat.newReader(fs, hoodieLogFile, null, true, true);
    Schema writerSchema = null;
    HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
    while (reader.hasPrev()) {
        HoodieLogBlock block = reader.prev();
        if (block instanceof HoodieDataBlock) {
            HoodieDataBlock lastBlock = (HoodieDataBlock) block;
            if (completedTimeline.containsOrBeforeTimelineStarts(lastBlock.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME))) {
                writerSchema = new Schema.Parser().parse(lastBlock.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
                break;
            }
        }
    }
    reader.close();
    return writerSchema;
}
Also used : HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Schema(org.apache.avro.Schema) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader)
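
The core move in this method is reading header metadata off a block: the SCHEMA entry holds the Avro writer schema as a JSON string, keyed by HeaderMetadataType. A minimal sketch of just that step, built from the accessors shown above; the helper class is hypothetical.

import org.apache.avro.Schema;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;

// Hypothetical helper, not part of Hudi.
final class BlockHeaders {

    // Parse the writer schema recorded in a data block's header.
    static Schema writerSchemaOf(HoodieDataBlock block) {
        String schemaJson = block.getLogBlockHeader().get(HeaderMetadataType.SCHEMA);
        return new Schema.Parser().parse(schemaJson);
    }
}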

Aggregations

HoodieLogBlock (org.apache.hudi.common.table.log.block.HoodieLogBlock): 21 uses
HoodieDataBlock (org.apache.hudi.common.table.log.block.HoodieDataBlock): 16 uses
IndexedRecord (org.apache.avro.generic.IndexedRecord): 15 uses
Schema (org.apache.avro.Schema): 14 uses
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 12 uses
Reader (org.apache.hudi.common.table.log.HoodieLogFormat.Reader): 12 uses
HeaderMetadataType (org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType): 11 uses
ArrayList (java.util.ArrayList): 10 uses
HashMap (java.util.HashMap): 10 uses
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 10 uses
GenericRecord (org.apache.avro.generic.GenericRecord): 9 uses
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 9 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 9 uses
HoodieLogFileReader (org.apache.hudi.common.table.log.HoodieLogFileReader): 9 uses
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat): 9 uses
Writer (org.apache.hudi.common.table.log.HoodieLogFormat.Writer): 9 uses
HoodieDeleteBlock (org.apache.hudi.common.table.log.block.HoodieDeleteBlock): 9 uses
IOException (java.io.IOException): 8 uses
List (java.util.List): 8 uses
FileStatus (org.apache.hadoop.fs.FileStatus): 8 uses