
Example 26 with FileSlice

Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

From the class TestCompactionUtils, method testFileSliceCompactionOpEquality.

/**
 * Validates that the generated compaction operation matches the input file slice and partition path.
 *
 * @param slice            File Slice
 * @param op               HoodieCompactionOperation
 * @param expPartitionPath Partition path
 * @param version          Compaction metadata version
 */
private void testFileSliceCompactionOpEquality(FileSlice slice, HoodieCompactionOperation op, String expPartitionPath, int version) {
    assertEquals(expPartitionPath, op.getPartitionPath(), "Partition path is correct");
    assertEquals(slice.getBaseInstantTime(), op.getBaseInstantTime(), "Same base-instant");
    assertEquals(slice.getFileId(), op.getFileId(), "Same file-id");
    if (slice.getBaseFile().isPresent()) {
        HoodieBaseFile df = slice.getBaseFile().get();
        assertEquals(version == COMPACTION_METADATA_VERSION_1 ? df.getPath() : df.getFileName(), op.getDataFilePath(), "Same data-file");
    }
    List<String> paths = slice.getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList());
    IntStream.range(0, paths.size()).boxed().forEach(idx -> assertEquals(version == COMPACTION_METADATA_VERSION_1 ? paths.get(idx) : new Path(paths.get(idx)).getName(), op.getDeltaFilePaths().get(idx), "Log File Index " + idx));
    assertEquals(METRICS, op.getMetrics(), "Metrics set");
}
Also used : IntStream(java.util.stream.IntStream) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) CompactionPlanMigrator(org.apache.hudi.common.table.timeline.versioning.compaction.CompactionPlanMigrator) CompactionTestUtils.setupAndValidateCompactionOperations(org.apache.hudi.common.testutils.CompactionTestUtils.setupAndValidateCompactionOperations) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HashMap(java.util.HashMap) COMPACTION_METADATA_VERSION_1(org.apache.hudi.common.util.CompactionUtils.COMPACTION_METADATA_VERSION_1) LATEST_COMPACTION_METADATA_VERSION(org.apache.hudi.common.util.CompactionUtils.LATEST_COMPACTION_METADATA_VERSION) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) CompactionTestUtils.createCompactionPlan(org.apache.hudi.common.testutils.CompactionTestUtils.createCompactionPlan) ValueSource(org.junit.jupiter.params.provider.ValueSource) DummyHoodieBaseFile(org.apache.hudi.common.testutils.CompactionTestUtils.DummyHoodieBaseFile) IOException(java.io.IOException) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) CompactionTestUtils.scheduleCompaction(org.apache.hudi.common.testutils.CompactionTestUtils.scheduleCompaction) DEFAULT_PARTITION_PATHS(org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS) Comparator(java.util.Comparator) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) DummyHoodieBaseFile(org.apache.hudi.common.testutils.CompactionTestUtils.DummyHoodieBaseFile) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile)
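As a point of reference, here is a minimal sketch of how such a file slice could be assembled before being compared against a compaction operation with the helper above. It uses only the FileSlice, HoodieBaseFile, HoodieLogFile and FSUtils calls that appear in these examples; the partition path, file id, instant time and log-file path are illustrative placeholders, not values from the Hudi test suite.

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieLogFile;

class FileSliceFixture {

    // Builds a FileSlice with one base file and one log file, the shape the helper
    // above compares field by field against a HoodieCompactionOperation.
    static FileSlice sampleSlice() {
        FileSlice slice = new FileSlice(new HoodieFileGroupId("2022/01/01", "fileid1"), "100");
        // Base file name derived from instant time, write token and file id, as in the tests.
        slice.setBaseFile(new HoodieBaseFile(FSUtils.makeDataFileName("100", "1-0-1", "fileid1")));
        // Placeholder log-file path and size; real tests write actual log files first.
        slice.addLogFile(new HoodieLogFile(new Path("/tmp/2022/01/01/.fileid1_100.log.1_1-0-1"), 128L));
        return slice;
    }
}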

Example 27 with FileSlice

Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

From the class TestClusteringUtils, method generateFileSlice.

private FileSlice generateFileSlice(String partitionPath, String fileId, String baseInstant) {
    FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant);
    fs.setBaseFile(new HoodieBaseFile(FSUtils.makeDataFileName(baseInstant, "1-0-1", fileId)));
    return fs;
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) FileSlice(org.apache.hudi.common.model.FileSlice)
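A hedged usage sketch for the helper above, written as it might appear inside TestClusteringUtils itself (the arguments are arbitrary, the assertions simply restate what generateFileSlice sets, and the JUnit 5 assertion imports are assumed to be present as in the examples on this page).

@Test
void verifyGeneratedFileSlice() {
    FileSlice slice = generateFileSlice("2022/01/01", "fileid1", "100");
    // The file group id carries the partition path and file id.
    assertEquals("fileid1", slice.getFileId());
    assertEquals("100", slice.getBaseInstantTime());
    // generateFileSlice always attaches a base file, so it should be present.
    assertTrue(slice.getBaseFile().isPresent(), "base file should be set");
}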

Example 28 with FileSlice

Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

From the class TestHoodieRealtimeRecordReader, method testLogOnlyReader.

@Test
public void testLogOnlyReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant, HoodieTableType.MERGE_ON_READ);
    FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant);
    // Add the paths
    FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
    FileSlice fileSlice = new FileSlice("default", baseInstant, "fileid1");
    try {
        // update files or generate new log file
        int logVersion = 1;
        int baseInstantTs = Integer.parseInt(baseInstant);
        String instantTime = String.valueOf(baseInstantTs + logVersion);
        HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", baseInstant, instantTime, 100, 0, logVersion);
        long size = writer.getCurrentSize();
        writer.close();
        assertTrue(size > 0, "block - size should be > 0");
        HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
        FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
        // create a split with new log file(s)
        fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
        RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()), basePath.toString(), fileSlice.getLogFiles().collect(Collectors.toList()), false, Option.empty());
        realtimeFileStatus.setMaxCommitTime(instantTime);
        HoodieRealtimePath realtimePath = (HoodieRealtimePath) realtimeFileStatus.getPath();
        HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(realtimePath, 0, 0, new String[] { "" }), realtimePath);
        JobConf newJobConf = new JobConf(baseJobConf);
        List<Schema.Field> fields = schema.getFields();
        setHiveColumnNameProps(fields, newJobConf, false);
        // create a dummy RecordReader to be used by HoodieRealtimeRecordReader
        RecordReader<NullWritable, ArrayWritable> reader = new HoodieRealtimeRecordReader(split, newJobConf, new HoodieEmptyRecordReader(split, newJobConf));
        // use reader to read log file.
        NullWritable key = reader.createKey();
        ArrayWritable value = reader.createValue();
        while (reader.next(key, value)) {
            Writable[] values = value.get();
            assertEquals(instantTime, values[0].toString());
            key = reader.createKey();
            value = reader.createValue();
        }
        reader.close();
    } catch (Exception e) {
        throw new HoodieException(e.getMessage(), e);
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) FileSlice(org.apache.hudi.common.model.FileSlice) Schema(org.apache.avro.Schema) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) ArrayWritable(org.apache.hadoop.io.ArrayWritable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) HoodieException(org.apache.hudi.exception.HoodieException) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Field(org.apache.avro.Schema.Field) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ArrayWritable(org.apache.hadoop.io.ArrayWritable) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) JobConf(org.apache.hadoop.mapred.JobConf) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) NullWritable(org.apache.hadoop.io.NullWritable) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
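The FileSlice-specific part of this test can be isolated into a small sketch: log files registered on the slice are what get collected into the list passed to RealtimeFileStatus. The path and size below are placeholders; the real test obtains them from the log writer.

import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;

class LogFileCollection {

    // Mirrors the step above: attach a log file to the slice, then collect the
    // slice's log files into the list handed to RealtimeFileStatus.
    static List<HoodieLogFile> collectLogFiles() {
        FileSlice fileSlice = new FileSlice("default", "100", "fileid1");
        fileSlice.addLogFile(new HoodieLogFile(new Path("/tmp/.fileid1_100.log.1_1-0-1"), 512L));
        return fileSlice.getLogFiles().collect(Collectors.toList());
    }
}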

Example 29 with FileSlice

Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

From the class HoodieRealtimeInputFormatUtils, method groupLogsByBaseFile.

// Returns, for each file group, the parquet base file (if present) with its list of log files.
public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {
    Set<Path> partitionSet = new HashSet<>(partitionPaths);
    // TODO(vc): Should we handle also non-hoodie splits here?
    Map<Path, HoodieTableMetaClient> partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet);
    // Get all the base-file and log-file pairs in the required partition paths.
    List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList = new ArrayList<>();
    partitionSet.forEach(partitionPath -> {
        // for each partition path obtain the data & log file groupings, then map back to inputsplits
        HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
        HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline());
        String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
        try {
            // Both commit and delta-commits are included - pick the latest completed one
            Option<HoodieInstant> latestCompletedInstant = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
            Stream<FileSlice> latestFileSlices = latestCompletedInstant.map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())).orElse(Stream.empty());
            latestFileSlices.forEach(fileSlice -> {
                List<HoodieLogFile> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
                baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths));
            });
        } catch (Exception e) {
            throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e);
        }
    });
    return baseAndLogsList;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) ColumnProjectionUtils(org.apache.hadoop.hive.serde2.ColumnProjectionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieRealtimeBootstrapBaseFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit) Logger(org.apache.log4j.Logger) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) RealtimeSplit(org.apache.hudi.hadoop.realtime.RealtimeSplit) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) TypeUtils.unsafeCast(org.apache.hudi.TypeUtils.unsafeCast) HoodieVirtualKeyInfo(org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Set(java.util.Set) HoodieRealtimeFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) JobConf(org.apache.hadoop.mapred.JobConf) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HashSet(java.util.HashSet) Pair(org.apache.hudi.common.util.collection.Pair)
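For illustration, a hedged sketch of how a caller might walk the groupings returned by groupLogsByBaseFile. It assumes Hudi's Pair exposes getLeft()/getRight(), and otherwise uses only the Option, HoodieBaseFile and HoodieLogFile accessors seen in these examples.

import java.util.List;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

class GroupingPrinter {

    // Prints one line per file group: the base file path (if any) and the log file count.
    static void printGroupings(List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList) {
        for (Pair<Option<HoodieBaseFile>, List<HoodieLogFile>> group : baseAndLogsList) {
            Option<HoodieBaseFile> baseFile = group.getLeft();
            // Log-only file groups (e.g. before the first compaction) carry no base file.
            String base = baseFile.isPresent() ? baseFile.get().getPath() : "<no base file>";
            System.out.println(base + " -> " + group.getRight().size() + " log file(s)");
        }
    }
}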

Example 30 with FileSlice

Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

From the class DFSHoodieDatasetInputReader, method readColumnarOrLogFiles.

private Iterator<IndexedRecord> readColumnarOrLogFiles(FileSlice fileSlice) throws IOException {
    if (fileSlice.getBaseFile().isPresent()) {
        // Read the base files using the latest writer schema.
        Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
        HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), new Path(fileSlice.getBaseFile().get().getPath()));
        return reader.getRecordIterator(schema);
    } else {
        // If there is no data file, fall back to reading log files
        HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
            .withFileSystem(metaClient.getFs())
            .withBasePath(metaClient.getBasePath())
            .withLogFilePaths(fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()))
            .withReaderSchema(new Schema.Parser().parse(schemaStr))
            .withLatestInstantTime(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant().get().getTimestamp())
            .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
            .withReadBlocksLazily(true)
            .withReverseReader(false)
            .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
            .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
            .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
            .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
            .build();
        // Read Avro records back from the log files
        Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
        Schema schema = new Schema.Parser().parse(schemaStr);
        return StreamSupport.stream(iterable.spliterator(), false).map(e -> {
            try {
                return (IndexedRecord) e.getData().getInsertValue(schema).get();
            } catch (IOException io) {
                throw new UncheckedIOException(io);
            }
        }).iterator();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSlice(org.apache.hudi.common.model.FileSlice) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) Entry.comparingByValue(java.util.Map.Entry.comparingByValue) LinkedHashMap(java.util.LinkedHashMap) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Collectors.toMap(java.util.stream.Collectors.toMap) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) StreamSupport(java.util.stream.StreamSupport) NoSuchElementException(java.util.NoSuchElementException) IndexedRecord(org.apache.avro.generic.IndexedRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) IOException(java.io.IOException) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) UncheckedIOException(java.io.UncheckedIOException) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)
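As a closing sketch, the iterator returned by readColumnarOrLogFiles can simply be drained. Since readColumnarOrLogFiles is private, the hypothetical helper below would have to live inside DFSHoodieDatasetInputReader itself, where the needed imports (IOException, Iterator, IndexedRecord, FileSlice) are already present.

// Hypothetical helper inside DFSHoodieDatasetInputReader: counts the records a
// file slice yields, whether they come from the base file or from the log scanner.
private long countRecords(FileSlice fileSlice) throws IOException {
    Iterator<IndexedRecord> it = readColumnarOrLogFiles(fileSlice);
    long count = 0;
    while (it.hasNext()) {
        it.next();
        count++;
    }
    return count;
}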

Aggregations

FileSlice (org.apache.hudi.common.model.FileSlice): 87
List (java.util.List): 51
ArrayList (java.util.ArrayList): 45
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 45
Map (java.util.Map): 44
Collectors (java.util.stream.Collectors): 43
IOException (java.io.IOException): 39
Path (org.apache.hadoop.fs.Path): 39
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 39
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 38
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 38
Option (org.apache.hudi.common.util.Option): 37
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 36
Pair (org.apache.hudi.common.util.collection.Pair): 35
HashMap (java.util.HashMap): 32
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 32
FSUtils (org.apache.hudi.common.fs.FSUtils): 29
Stream (java.util.stream.Stream): 28
Test (org.junit.jupiter.api.Test): 27
HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup): 26