
Example 31 with Writer

use of org.apache.hudi.common.table.log.HoodieLogFormat.Writer in project hudi by apache.

the class TestHoodieRealtimeRecordReader method testUnMergedReader.

@Test
public void testUnMergedReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
    String instantTime = "100";
    final int numRecords = 1000;
    final int firstBatchLastRecordKey = numRecords - 1;
    final int secondBatchLastRecordKey = 2 * numRecords - 1;
    File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numRecords, instantTime, HoodieTableType.MERGE_ON_READ);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
    // Add the paths
    FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
    // insert new records to log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime, numRecords, numRecords, 0);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue(size > 0, "block - size should be > 0");
    FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime, commitMetadata);
    // create a split with baseFile (parquet file written earlier) and new log file(s)
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf), basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, false, Option.empty());
    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null);
    JobConf jobConf = new JobConf(baseJobConf);
    List<Schema.Field> fields = schema.getFields();
    setHiveColumnNameProps(fields, jobConf, true);
    // Enable merge skipping.
    jobConf.set(REALTIME_SKIP_MERGE_PROP, "true");
    // validate unmerged record reader
    RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file
    // here all records should be present. Also ensure log records are in order.
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsAtCommit1 = 0;
    int numRecordsAtCommit2 = 0;
    Set<Integer> seenKeys = new HashSet<>();
    int lastSeenKeyFromLog = firstBatchLastRecordKey;
    while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        String gotCommit = values[0].toString();
        String keyStr = values[2].toString();
        int gotKey = Integer.parseInt(keyStr.substring("key".length()));
        if (gotCommit.equals(newCommitTime)) {
            numRecordsAtCommit2++;
            assertTrue(gotKey > firstBatchLastRecordKey);
            assertTrue(gotKey <= secondBatchLastRecordKey);
            assertEquals(gotKey, lastSeenKeyFromLog + 1);
            lastSeenKeyFromLog++;
        } else {
            numRecordsAtCommit1++;
            assertTrue(gotKey >= 0);
            assertTrue(gotKey <= firstBatchLastRecordKey);
        }
        // Ensure unique key
        assertFalse(seenKeys.contains(gotKey));
        seenKeys.add(gotKey);
        key = recordReader.createKey();
        value = recordReader.createValue();
    }
    assertEquals(numRecords, numRecordsAtCommit1);
    assertEquals(numRecords, numRecordsAtCommit2);
    assertEquals(2 * numRecords, seenKeys.size());
    assertEquals(1.0, recordReader.getProgress(), 0.05);
    recordReader.close();
}
Also used : Path(org.apache.hadoop.fs.Path) Schema(org.apache.avro.Schema) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) ArrayWritable(org.apache.hadoop.io.ArrayWritable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) FileSplit(org.apache.hadoop.mapred.FileSplit) NullWritable(org.apache.hadoop.io.NullWritable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Field(org.apache.avro.Schema.Field) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ArrayWritable(org.apache.hadoop.io.ArrayWritable) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File) JobConf(org.apache.hadoop.mapred.JobConf) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
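
The example above delegates the actual log writing to InputFormatTestUtil.writeDataBlockToLogFile. Below is a minimal, hedged sketch of what appending an Avro data block with HoodieLogFormat.Writer looks like, built on the newWriterBuilder API used throughout the Hudi tests; the exact HoodieAvroDataBlock constructor arguments vary between Hudi releases, so treat this as an illustration rather than the helper's implementation.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;

public class LogWriterSketch {
    // Appends one Avro data block to a ".log" file of the given file group.
    public static void appendDataBlock(FileSystem fs, Path partitionPath, Schema schema,
                                       String fileId, String baseCommit, String newCommitTime,
                                       List<IndexedRecord> records) throws Exception {
        Writer writer = HoodieLogFormat.newWriterBuilder()
            .onParentPath(partitionPath)
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId(fileId)
            .overBaseCommit(baseCommit)
            .withFs(fs)
            .build();
        // Block header: the commit the block belongs to, plus the writer schema.
        Map<HeaderMetadataType, String> header = new HashMap<>();
        header.put(HeaderMetadataType.INSTANT_TIME, newCommitTime);
        header.put(HeaderMetadataType.SCHEMA, schema.toString());
        // NOTE: the HoodieAvroDataBlock constructor takes extra arguments (e.g. a key field)
        // in newer Hudi releases; the two-argument form here is an assumption.
        writer.appendBlock(new HoodieAvroDataBlock(records, header));
        writer.close();
    }
}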

Example 32 with Writer

use of org.apache.hudi.common.table.log.HoodieLogFormat.Writer in project hudi by apache.

the class TestHoodieRealtimeRecordReader method testSchemaEvolutionAndRollbackBlockInLastLogFile.

@ParameterizedTest
@MethodSource("testArguments")
public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
    // initial commit
    List<HoodieLogFile> logFiles = new ArrayList<>();
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
    String instantTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareSimpleParquetTable(basePath, schema, 1, numberOfRecords, instantTime, HoodieTableType.MERGE_ON_READ);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createCommit(basePath.toString(), instantTime, Option.of(commitMetadata));
    // Add the paths
    FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
    List<Field> firstSchemaFields = schema.getFields();
    // update files and generate new log file but don't commit
    schema = SchemaTestUtil.getComplexEvolvedSchema();
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime, numberOfLogRecords, 0, 1);
    long size = writer.getCurrentSize();
    logFiles.add(writer.getLogFile());
    writer.close();
    assertTrue(size > 0, "block - size should be > 0");
    // write rollback for the previous block in new log file version
    newCommitTime = "102";
    writer = InputFormatTestUtil.writeRollbackBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime, "101", 1);
    logFiles.add(writer.getLogFile());
    writer.close();
    commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
    // create a split with baseFile (parquet file written earlier) and new log file(s)
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf), basePath.toUri().toString(), logFiles, newCommitTime, false, Option.empty());
    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null);
    JobConf jobConf = new JobConf(baseJobConf);
    List<Schema.Field> fields = schema.getFields();
    assertFalse(firstSchemaFields.containsAll(fields));
    // Try to read all the fields passed by the new schema
    setHiveColumnNameProps(fields, jobConf, true);
    jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType);
    jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled);
    HoodieRealtimeRecordReader recordReader;
    try {
        // constructing the record reader against the evolved schema should fail
        recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
        throw new RuntimeException("should've failed the previous line");
    } catch (HoodieException e) {
    // expected, field not found since the data written with the evolved schema was rolled back
    }
    // Now project only the fields from the first (pre-evolution) schema, which are the ones present in the parquet file
    setHiveColumnNameProps(firstSchemaFields, jobConf, true);
    // This time the record reader can be created and the split read end to end
    recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    while (recordReader.next(key, value)) {
    // keep reading
    }
    reader.close();
}
Also used : Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Field(org.apache.avro.Schema.Field) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ArrayWritable(org.apache.hadoop.io.ArrayWritable) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) NullWritable(org.apache.hadoop.io.NullWritable) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
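
Example 32 also appends a rollback block via InputFormatTestUtil.writeRollbackBlockToLogFile. A hedged sketch of appending such a command block is shown below; the header keys come from HoodieLogBlock.HeaderMetadataType, and the command-type enum constant name (ROLLBACK_PREVIOUS_BLOCK here) has changed across Hudi versions, so adjust it to the release you are on.

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;

public class RollbackBlockSketch {
    // Appends a command block telling the log scanner to ignore blocks written by targetInstant.
    public static void appendRollbackBlock(FileSystem fs, Path partitionPath, String fileId,
                                           String baseCommit, String rollbackInstant,
                                           String targetInstant) throws Exception {
        Writer writer = HoodieLogFormat.newWriterBuilder()
            .onParentPath(partitionPath)
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId(fileId)
            .overBaseCommit(baseCommit)
            .withFs(fs)
            .build();
        Map<HeaderMetadataType, String> header = new HashMap<>();
        header.put(HeaderMetadataType.INSTANT_TIME, rollbackInstant);
        header.put(HeaderMetadataType.TARGET_INSTANT_TIME, targetInstant);
        // The enum constant name (ROLLBACK_PREVIOUS_BLOCK vs ROLLBACK_BLOCK) differs across Hudi versions.
        header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE,
            String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
        writer.appendBlock(new HoodieCommandBlock(header));
        writer.close();
    }
}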

Example 33 with Writer

use of org.apache.hudi.common.table.log.HoodieLogFormat.Writer in project hudi by apache.

the class TestHoodieRealtimeRecordReader method testReaderInternal.

private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled, boolean partitioned, HoodieLogBlock.HoodieLogBlockType logBlockType) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = partitioned ? InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, 100, baseInstant, HoodieTableType.MERGE_ON_READ) : InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant, HoodieTableType.MERGE_ON_READ);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant, commitMetadata);
    // Add the paths
    FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
    FileSlice fileSlice = new FileSlice(partitioned ? FSUtils.getRelativePartitionPath(new Path(basePath.toString()), new Path(partitionDir.getAbsolutePath())) : "default", baseInstant, "fileid0");
    logVersionsWithAction.forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION) ? String.valueOf(baseInstantTs + logVersion - 2) : instantTime;
            HoodieLogFormat.Writer writer;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = InputFormatTestUtil.writeRollback(partitionDir, fs, "fileid0", baseInstant, instantTime, String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", baseInstant, instantTime, 120, 0, logVersion, logBlockType);
            }
            long size = writer.getCurrentSize();
            writer.close();
            assertTrue(size > 0, "block - size should be > 0");
            FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
            // create a split with baseFile (parquet file written earlier) and new log file(s)
            fileSlice.addLogFile(writer.getLogFile());
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf), basePath.toUri().toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()), instantTime, false, Option.empty());
            // create a RecordReader to be used by HoodieRealtimeRecordReader
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null);
            JobConf jobConf = new JobConf(baseJobConf);
            List<Schema.Field> fields = schema.getFields();
            setHiveColumnNameProps(fields, jobConf, partitioned);
            jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType);
            jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled);
            // validate the merging (realtime) record reader
            long logTmpFileStartTime = System.currentTimeMillis();
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
            // use the reader to read the base parquet file and the log file, merging in flight and returning the latest commit;
            // all 100 base records should come back updated (see above), and the 20 newly
            // inserted records should also be returned with the new commit time.
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            int recordCnt = 0;
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                // every record should carry the latest instant time (e.g. "101" for the first log version)
                assertEquals(latestInstant, values[0].toString());
                key = recordReader.createKey();
                value = recordReader.createValue();
                recordCnt++;
            }
            recordReader.getPos();
            assertEquals(1.0, recordReader.getProgress(), 0.05);
            assertEquals(120, recordCnt);
            recordReader.close();
            // the temp file produced by logScanner should be deleted
            assertTrue(!getLogTempFile(logTmpFileStartTime, System.currentTimeMillis(), diskMapType.toString()).exists());
        } catch (Exception ioe) {
            throw new HoodieException(ioe.getMessage(), ioe);
        }
    });
// Add Rollback last version to next log-file
}
Also used : FileSlice(org.apache.hudi.common.model.FileSlice) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) ArrayWritable(org.apache.hadoop.io.ArrayWritable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) HoodieException(org.apache.hudi.exception.HoodieException) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Field(org.apache.avro.Schema.Field) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ArrayWritable(org.apache.hadoop.io.ArrayWritable) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) JobConf(org.apache.hadoop.mapred.JobConf) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) NullWritable(org.apache.hadoop.io.NullWritable) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File)
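
The setHiveColumnNameProps helper used above is not shown on this page. As a rough, hedged sketch, such a helper typically registers the projected column names and ids on the JobConf using standard Hive properties; the snippet below uses generic Hive keys and a hypothetical class name, and Hudi's actual test helper sets additional properties (partition columns, column types) that are omitted here.

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.avro.Schema;
import org.apache.hadoop.mapred.JobConf;

public class HiveColumnPropsSketch {
    // Registers the projected column names/ids the way Hive's readers expect them on the JobConf.
    public static void setColumnProps(JobConf jobConf, List<Schema.Field> fields) {
        String names = fields.stream().map(Schema.Field::name).collect(Collectors.joining(","));
        String ids = IntStream.range(0, fields.size())
            .mapToObj(String::valueOf)
            .collect(Collectors.joining(","));
        // Standard Hive serde/projection keys; Hudi's test helper may also set
        // "columns.types" and partition column properties, which are omitted here.
        jobConf.set("columns", names);
        jobConf.set("hive.io.file.readcolumn.names", names);
        jobConf.set("hive.io.file.readcolumn.ids", ids);
    }
}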

Example 34 with Writer

use of org.apache.hudi.common.table.log.HoodieLogFormat.Writer in project hudi by apache.

the class TestHoodieRealtimeRecordReader method testIncrementalWithOnlylog.

@Test
public void testIncrementalWithOnlylog() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
    String instantTime = "100";
    final int numRecords = 1000;
    File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numRecords, instantTime, HoodieTableType.MERGE_ON_READ);
    createDeltaCommitFile(basePath, instantTime, "2016/05/01", "2016/05/01/fileid0_1-0-1_100.parquet", "fileid0", schema.toString());
    // Add the paths
    FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
    // insert new records to log file
    try {
        String newCommitTime = "102";
        HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime, numRecords, numRecords, 0);
        writer.close();
        createDeltaCommitFile(basePath, newCommitTime, "2016/05/01", "2016/05/01/.fileid0_100.log.1_1-0-1", "fileid0", schema.toString());
        InputFormatTestUtil.setupIncremental(baseJobConf, "101", 1);
        HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat();
        inputFormat.setConf(baseJobConf);
        InputSplit[] splits = inputFormat.getSplits(baseJobConf, 1);
        assertEquals(1, splits.length);
        JobConf newJobConf = new JobConf(baseJobConf);
        List<Schema.Field> fields = schema.getFields();
        setHiveColumnNameProps(fields, newJobConf, false);
        RecordReader<NullWritable, ArrayWritable> reader = inputFormat.getRecordReader(splits[0], newJobConf, Reporter.NULL);
        // use reader to read log file.
        NullWritable key = reader.createKey();
        ArrayWritable value = reader.createValue();
        while (reader.next(key, value)) {
            Writable[] values = value.get();
            // since the incremental start commit is 101 and the max commit count is 1,
            // only the data belonging to commit 102 should be read out.
            assertEquals(newCommitTime, values[0].toString());
            key = reader.createKey();
            value = reader.createValue();
        }
        reader.close();
    } catch (IOException e) {
        throw new HoodieException(e.getMessage(), e);
    }
}
Also used : Schema(org.apache.avro.Schema) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) ArrayWritable(org.apache.hadoop.io.ArrayWritable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) Field(org.apache.avro.Schema.Field) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ArrayWritable(org.apache.hadoop.io.ArrayWritable) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
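
The incremental mode in this example is enabled through InputFormatTestUtil.setupIncremental(baseJobConf, "101", 1). As a hedged sketch, the equivalent JobConf settings rely on Hudi's hoodie.<table>.consume.* Hive properties; the tableName parameter below is a placeholder introduced for illustration (the test helper derives the table name itself), so this is not the helper's actual code.

import org.apache.hadoop.mapred.JobConf;

public class IncrementalQueryConfSketch {
    // Switches a Hive/MapReduce job over the given table into incremental query mode.
    public static void setupIncremental(JobConf jobConf, String tableName,
                                        String startCommit, int maxCommits) {
        // Property name patterns follow Hudi's Hive integration ("hoodie.<table>.consume.*");
        // tableName is a placeholder for whatever the queried table is registered as.
        jobConf.set(String.format("hoodie.%s.consume.mode", tableName), "INCREMENTAL");
        jobConf.set(String.format("hoodie.%s.consume.start.timestamp", tableName), startCommit);
        jobConf.setInt(String.format("hoodie.%s.consume.max.commits", tableName), maxCommits);
    }
}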

Aggregations

Writer (org.apache.hudi.common.table.log.HoodieLogFormat.Writer) — 34
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) — 30
HashMap (java.util.HashMap) — 27
IndexedRecord (org.apache.avro.generic.IndexedRecord) — 27
HeaderMetadataType (org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) — 26
Schema (org.apache.avro.Schema) — 25
HoodieDataBlock (org.apache.hudi.common.table.log.block.HoodieDataBlock) — 25
Test (org.junit.jupiter.api.Test) — 25
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) — 24
Path (org.apache.hadoop.fs.Path) — 23
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat) — 23
HoodieLogFileReader (org.apache.hudi.common.table.log.HoodieLogFileReader) — 20
HoodieLogBlock (org.apache.hudi.common.table.log.block.HoodieLogBlock) — 20
IOException (java.io.IOException) — 19
ArrayList (java.util.ArrayList) — 19
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream) — 19
FileStatus (org.apache.hadoop.fs.FileStatus) — 18
FileSystem (org.apache.hadoop.fs.FileSystem) — 18
SchemaTestUtil.getSimpleSchema (org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) — 18
MethodSource (org.junit.jupiter.params.provider.MethodSource) — 18