
Example 86 with FileSlice

Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

From the class SparkSizeBasedClusteringPlanStrategy, method buildClusteringGroupsForPartition.

@Override
protected Stream<HoodieClusteringGroup> buildClusteringGroupsForPartition(String partitionPath, List<FileSlice> fileSlices) {
    HoodieWriteConfig writeConfig = getWriteConfig();
    List<Pair<List<FileSlice>, Integer>> fileSliceGroups = new ArrayList<>();
    List<FileSlice> currentGroup = new ArrayList<>();
    long totalSizeSoFar = 0;
    for (FileSlice currentSlice : fileSlices) {
        // check if max size is reached and create new group, if needed.
        if (totalSizeSoFar >= writeConfig.getClusteringMaxBytesInGroup() && !currentGroup.isEmpty()) {
            int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, writeConfig.getClusteringTargetFileMaxBytes());
            LOG.info("Adding one clustering group " + totalSizeSoFar + " max bytes: " + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups);
            fileSliceGroups.add(Pair.of(currentGroup, numOutputGroups));
            currentGroup = new ArrayList<>();
            totalSizeSoFar = 0;
        }
        // Add to the current file-group
        currentGroup.add(currentSlice);
        // assume each file group size is ~= parquet.max.file.size
        totalSizeSoFar += currentSlice.getBaseFile().isPresent()
            ? currentSlice.getBaseFile().get().getFileSize()
            : writeConfig.getParquetMaxFileSize();
    }
    if (!currentGroup.isEmpty()) {
        int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, writeConfig.getClusteringTargetFileMaxBytes());
        LOG.info("Adding final clustering group " + totalSizeSoFar + " max bytes: " + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups);
        fileSliceGroups.add(Pair.of(currentGroup, numOutputGroups));
    }
    return fileSliceGroups.stream().map(fileSliceGroup -> HoodieClusteringGroup.newBuilder()
        .setSlices(getFileSliceInfo(fileSliceGroup.getLeft()))
        .setNumOutputFileGroups(fileSliceGroup.getRight())
        .setMetrics(buildMetrics(fileSliceGroup.getLeft()))
        .build());
}
Also used : FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Pair(org.apache.hudi.common.util.collection.Pair)
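
To make the grouping behaviour above easier to follow in isolation, here is a small standalone sketch of the same size-based bin-packing, written against plain Java types rather than the Hudi API. The class name, helper methods, and example sizes are hypothetical; the cap and target stand in for what the strategy reads via writeConfig.getClusteringMaxBytesInGroup() and writeConfig.getClusteringTargetFileMaxBytes(), and numOutputGroups is only a rough stand-in for getNumberOfOutputFileGroups.

import java.util.ArrayList;
import java.util.List;

// Standalone sketch of the size-based grouping logic above: pack slice sizes into
// groups capped at maxBytesPerGroup, then size each group's output by targetFileBytes.
// All names and the example sizes below are hypothetical; this is not the Hudi API.
public class SizeBasedGroupingSketch {

    static List<List<Long>> buildGroups(List<Long> fileSizes, long maxBytesPerGroup) {
        List<List<Long>> groups = new ArrayList<>();
        List<Long> currentGroup = new ArrayList<>();
        long totalSizeSoFar = 0;
        for (long size : fileSizes) {
            // Close the current group once the cap is reached, mirroring the strategy above.
            if (totalSizeSoFar >= maxBytesPerGroup && !currentGroup.isEmpty()) {
                groups.add(currentGroup);
                currentGroup = new ArrayList<>();
                totalSizeSoFar = 0;
            }
            currentGroup.add(size);
            totalSizeSoFar += size;
        }
        if (!currentGroup.isEmpty()) {
            groups.add(currentGroup);
        }
        return groups;
    }

    // Rough stand-in for getNumberOfOutputFileGroups: one output file group per
    // targetFileBytes of input, rounded up, never less than one.
    static int numOutputGroups(long groupBytes, long targetFileBytes) {
        return (int) Math.max(1, (groupBytes + targetFileBytes - 1) / targetFileBytes);
    }

    public static void main(String[] args) {
        List<Long> sizes = List.of(120L << 20, 120L << 20, 120L << 20, 40L << 20); // hypothetical base-file sizes
        long maxBytesPerGroup = 256L << 20; // plays the role of getClusteringMaxBytesInGroup()
        long targetFileBytes = 128L << 20;  // plays the role of getClusteringTargetFileMaxBytes()
        for (List<Long> group : buildGroups(sizes, maxBytesPerGroup)) {
            long total = group.stream().mapToLong(Long::longValue).sum();
            System.out.println("group of " + group.size() + " slices, " + total + " bytes -> "
                + numOutputGroups(total, targetFileBytes) + " output file group(s)");
        }
    }
}

With these hypothetical inputs (three ~120 MB slices, one 40 MB slice, a 256 MB group cap, a 128 MB target file size), the first three slices close out one group planned for three output file groups and the trailing slice forms a final single-file group, mirroring the two fileSliceGroups.add calls in the strategy.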

Example 87 with FileSlice

Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

From the class TestHoodieRealtimeRecordReader, method testReaderInternal.

private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled, boolean partitioned, HoodieLogBlock.HoodieLogBlockType logBlockType) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = partitioned
        ? InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, 100, baseInstant, HoodieTableType.MERGE_ON_READ)
        : InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant, HoodieTableType.MERGE_ON_READ);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(),
        Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant, commitMetadata);
    // Add the paths
    FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
    FileSlice fileSlice = new FileSlice(
        partitioned
            ? FSUtils.getRelativePartitionPath(new Path(basePath.toString()), new Path(partitionDir.getAbsolutePath()))
            : "default",
        baseInstant, "fileid0");
    logVersionsWithAction.forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION) ? String.valueOf(baseInstantTs + logVersion - 2) : instantTime;
            HoodieLogFormat.Writer writer;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = InputFormatTestUtil.writeRollback(partitionDir, fs, "fileid0", baseInstant, instantTime,
                    String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", baseInstant,
                    instantTime, 120, 0, logVersion, logBlockType);
            }
            long size = writer.getCurrentSize();
            writer.close();
            assertTrue(size > 0, "block - size should be > 0");
            FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
            // create a split with baseFile (parquet file written earlier) and new log file(s)
            fileSlice.addLogFile(writer.getLogFile());
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
                new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf),
                basePath.toUri().toString(),
                fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()),
                instantTime, false, Option.empty());
            // create a RecordReader to be used by HoodieRealtimeRecordReader
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
                new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null);
            JobConf jobConf = new JobConf(baseJobConf);
            List<Schema.Field> fields = schema.getFields();
            setHiveColumnNameProps(fields, jobConf, partitioned);
            jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType);
            jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled);
            // validate record reader compaction
            long logTmpFileStartTime = System.currentTimeMillis();
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
            // use reader to read base Parquet File and log file, merge in flight and return latest commit
            // here all 100 records should be updated, see above
            // another 20 new insert records should also output with new commit time.
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            int recordCnt = 0;
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                // check if the record written is with latest commit, here "101"
                assertEquals(latestInstant, values[0].toString());
                key = recordReader.createKey();
                value = recordReader.createValue();
                recordCnt++;
            }
            recordReader.getPos();
            assertEquals(1.0, recordReader.getProgress(), 0.05);
            assertEquals(120, recordCnt);
            recordReader.close();
            // the temp file produced by logScanner should be deleted
            assertTrue(!getLogTempFile(logTmpFileStartTime, System.currentTimeMillis(), diskMapType.toString()).exists());
        } catch (Exception ioe) {
            throw new HoodieException(ioe.getMessage(), ioe);
        }
    });
// Add Rollback last version to next log-file
}
Also used : FileSlice(org.apache.hudi.common.model.FileSlice) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) ArrayWritable(org.apache.hadoop.io.ArrayWritable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) HoodieException(org.apache.hudi.exception.HoodieException) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Field(org.apache.avro.Schema.Field) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) JobConf(org.apache.hadoop.mapred.JobConf) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File)
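
For readers who only need the FileSlice plumbing from the test above, the sketch below isolates how a slice is assembled from a base instant plus per-delta-commit log files and then read back in log order, the same list the test passes to HoodieRealtimeFileSplit. It is a minimal sketch, not part of the test: the partition path, the temp directory, and the hand-written log-file names are hypothetical, and it assumes HoodieLogFile offers a Path-based constructor and getPath() and that the names follow Hudi's standard log-file naming.

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;

import java.util.List;
import java.util.stream.Collectors;

// Minimal sketch: build a FileSlice for file group "fileid0" on base instant "100",
// attach one log file per delta commit, and collect them in comparator order.
// Paths and log-file names are hypothetical examples, not files the test creates.
public class FileSliceAssemblySketch {

    public static void main(String[] args) {
        String partitionPath = "2016/05/01"; // hypothetical partition
        String baseInstant = "100";          // instant of the base parquet file
        String fileId = "fileid0";

        FileSlice fileSlice = new FileSlice(partitionPath, baseInstant, fileId);

        // One log file per delta commit, mirroring the two log versions the test writes.
        fileSlice.addLogFile(new HoodieLogFile(new Path("/tmp/hudi/" + partitionPath + "/.fileid0_100.log.1_1-0-1")));
        fileSlice.addLogFile(new HoodieLogFile(new Path("/tmp/hudi/" + partitionPath + "/.fileid0_100.log.2_1-0-1")));

        // Same ordering the test hands to HoodieRealtimeFileSplit.
        List<HoodieLogFile> orderedLogs = fileSlice.getLogFiles()
            .sorted(HoodieLogFile.getLogFileComparator())
            .collect(Collectors.toList());

        orderedLogs.forEach(log -> System.out.println("log file: " + log.getPath()));
    }
}

The comparator orders the log files by version on top of the base instant, which is what lets the realtime reader replay delta commit "101" and then "102" over the base file written at "100", matching the assertions in the test.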

Aggregations

FileSlice (org.apache.hudi.common.model.FileSlice): 87
List (java.util.List): 51
ArrayList (java.util.ArrayList): 45
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 45
Map (java.util.Map): 44
Collectors (java.util.stream.Collectors): 43
IOException (java.io.IOException): 39
Path (org.apache.hadoop.fs.Path): 39
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 39
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 38
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 38
Option (org.apache.hudi.common.util.Option): 37
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 36
Pair (org.apache.hudi.common.util.collection.Pair): 35
HashMap (java.util.HashMap): 32
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 32
FSUtils (org.apache.hudi.common.fs.FSUtils): 29
Stream (java.util.stream.Stream): 28
Test (org.junit.jupiter.api.Test): 27
HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup): 26