
Example 76 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class LogReaderUtils method readLatestSchemaFromLogFiles.

public static Schema readLatestSchemaFromLogFiles(String basePath, List<HoodieLogFile> logFiles, Configuration config) throws IOException {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(config).setBasePath(basePath).build();
    List<String> deltaPaths = logFiles.stream().sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString()).collect(Collectors.toList());
    if (deltaPaths.size() > 0) {
        Map<String, HoodieLogFile> deltaFilePathToFileStatus = logFiles.stream().map(entry -> Pair.of(entry.getPath().toString(), entry)).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
        for (String logPath : deltaPaths) {
            FileSystem fs = FSUtils.getFs(logPath, config);
            Schema schemaFromLogFile = readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), deltaFilePathToFileStatus.get(logPath));
            if (schemaFromLogFile != null) {
                return schemaFromLogFile;
            }
        }
    }
    return null;
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Schema(org.apache.avro.Schema) FileSystem(org.apache.hadoop.fs.FileSystem) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) List(java.util.List) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Pair(org.apache.hudi.common.util.collection.Pair)
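
The snippet above builds a lookup map keyed by log-file path by pairing each file with its path string and collecting with Pair::getKey / Pair::getValue. Below is a minimal, self-contained sketch of that same pattern (illustrative string data only, not from the Hudi sources), assuming hudi-common is on the classpath:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

// Minimal sketch: pair each element with a derived key via Pair.of, then collect into a
// Map the same way readLatestSchemaFromLogFiles does. Pair implements Map.Entry, so
// Pair::getKey and Pair::getValue can be used directly with Collectors.toMap.
public class PairToMapSketch {
    public static void main(String[] args) {
        // Hypothetical log-file paths, for illustration only.
        List<String> logPaths = Arrays.asList(
            "/tmp/hudi/part=0/.f1_20220101.log.1",
            "/tmp/hudi/part=0/.f2_20220101.log.1");

        Map<String, Integer> pathToLength = logPaths.stream()
            .map(p -> Pair.of(p, p.length())) // key = path, value = some derived attribute
            .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

        pathToLength.forEach((path, len) -> System.out.println(path + " -> " + len));
    }
}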

Example 77 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class TableSchemaResolver method getTableSchemaFromCommitMetadata.

/**
 * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the last commit with valid schema.
 *
 * @return Avro schema for this table
 */
private Option<Schema> getTableSchemaFromCommitMetadata(boolean includeMetadataFields) {
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = metaClient.getActiveTimeline().getLastCommitMetadataWithValidSchema();
    if (instantAndCommitMetadata.isPresent()) {
        HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
        String schemaStr = commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
        Schema schema = new Schema.Parser().parse(schemaStr);
        if (includeMetadataFields) {
            schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField);
        }
        return Option.of(schema);
    } else {
        return Option.empty();
    }
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Schema(org.apache.avro.Schema) Pair(org.apache.hudi.common.util.collection.Pair)
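
getTableSchemaFromCommitMetadata receives the timeline result as an Option<Pair<HoodieInstant, HoodieCommitMetadata>> and reads the metadata side with getRight(). Here is a minimal sketch of that unpacking pattern, using illustrative placeholder types rather than Hudi's timeline classes, assuming hudi-common is on the classpath:

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

// Minimal sketch: mirrors how getTableSchemaFromCommitMetadata tests isPresent() on an
// Option<Pair<...>> and then reads one side of the pair with getRight()/getLeft().
public class OptionPairSketch {
    public static void main(String[] args) {
        Option<Pair<String, Integer>> instantAndValue = Option.of(Pair.of("20220101000000", 42));
        if (instantAndValue.isPresent()) {
            // Analogous to commitMetadata = instantAndCommitMetadata.get().getRight() above.
            Integer value = instantAndValue.get().getRight();
            System.out.println("instant=" + instantAndValue.get().getLeft() + ", value=" + value);
        } else {
            System.out.println("no commit with a valid payload");
        }
    }
}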

Example 78 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class TableSchemaResolver method getTableParquetSchemaFromDataFile.

/**
 * Gets the schema for a hoodie table. Depending on the type of table, read from any file written in the latest
 * commit. We will assume that the schema has not changed within a single atomic write.
 *
 * @return Parquet schema for this table
 */
private MessageType getTableParquetSchemaFromDataFile() {
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = activeTimeline.getLastCommitMetadataWithValidData();
    try {
        switch(metaClient.getTableType()) {
            case COPY_ON_WRITE:
                // For a COW table, any file with data written must currently be in Parquet or ORC format.
                if (instantAndCommitMetadata.isPresent()) {
                    HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
                    String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
                    return readSchemaFromBaseFile(filePath);
                } else {
                    throw new IllegalArgumentException("Could not find any data file written for commit, " + "so could not get schema for table " + metaClient.getBasePath());
                }
            case MERGE_ON_READ:
                // Determine the file format based on the file name, and then extract schema from it.
                if (instantAndCommitMetadata.isPresent()) {
                    HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
                    String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
                    if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
                        // this is a log file
                        return readSchemaFromLogFile(new Path(filePath));
                    } else {
                        return readSchemaFromBaseFile(filePath);
                    }
                } else {
                    throw new IllegalArgumentException("Could not find any data file written for commit, " + "so could not get schema for table " + metaClient.getBasePath());
                }
            default:
                LOG.error("Unknown table type " + metaClient.getTableType());
                throw new InvalidTableException(metaClient.getBasePath());
        }
    } catch (IOException e) {
        throw new HoodieException("Failed to read data schema", e);
    }
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) InvalidTableException(org.apache.hudi.exception.InvalidTableException) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) Pair(org.apache.hudi.common.util.collection.Pair)
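
The MERGE_ON_READ branch above chooses a schema reader by file extension. A minimal sketch of that dispatch follows, with a hypothetical isLogFile helper; the readSchemaFromBaseFile/readSchemaFromLogFile readers themselves are not reproduced here:

import org.apache.hudi.common.model.HoodieFileFormat;

// Minimal sketch: decide whether a commit's data file is a log file or a base file using
// HoodieFileFormat.HOODIE_LOG.getFileExtension(), the same check used above.
public class SchemaSourceDispatchSketch {
    static boolean isLogFile(String filePath) {
        return filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension());
    }

    public static void main(String[] args) {
        // Hypothetical paths, for illustration only.
        System.out.println(isLogFile("/tmp/tbl/p1/.f1_20220101.log.1"));        // true: read via log blocks
        System.out.println(isLogFile("/tmp/tbl/p1/f1_0-0-0_20220101.parquet")); // false: read base-file footer
    }
}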

Example 79 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class DFSPathSelector method getNextFilePathsAndMaxModificationTime.

/**
 * Get the list of files changed since last checkpoint.
 *
 * @param lastCheckpointStr the last checkpoint time string, empty if first run
 * @param sourceLimit       max bytes to read each time
 * @return the list of files concatenated and their latest modified time
 */
@Deprecated
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(Option<String> lastCheckpointStr, long sourceLimit) {
    try {
        // obtain all eligible files under root folder.
        log.info("Root path => " + props.getString(Config.ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit);
        long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
        List<FileStatus> eligibleFiles = listEligibleFiles(fs, new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), lastCheckpointTime);
        // sort them by modification time.
        eligibleFiles.sort(Comparator.comparingLong(FileStatus::getModificationTime));
        // Filter based on checkpoint & input size, if needed
        long currentBytes = 0;
        long newCheckpointTime = lastCheckpointTime;
        List<FileStatus> filteredFiles = new ArrayList<>();
        for (FileStatus f : eligibleFiles) {
            if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
                // so that some files with the same modification time won't be skipped in next read
                break;
            }
            newCheckpointTime = f.getModificationTime();
            currentBytes += f.getLen();
            filteredFiles.add(f);
        }
        // no data to read
        if (filteredFiles.isEmpty()) {
            return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
        }
        // read the files out.
        String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
        return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
    } catch (IOException ioe) {
        throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) TypedProperties(org.apache.hudi.common.config.TypedProperties) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) DataSourceUtils(org.apache.hudi.DataSourceUtils) FileStatus(org.apache.hadoop.fs.FileStatus) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) List(java.util.List) Configuration(org.apache.hadoop.conf.Configuration) HoodieIOException(org.apache.hudi.exception.HoodieIOException) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
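
Callers of getNextFilePathsAndMaxModificationTime treat the returned Pair as (comma-joined file paths, next checkpoint to persist). A minimal sketch of consuming that result, using stand-in values rather than a real selector run:

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

// Minimal sketch: the left element is the comma-joined file list (empty Option when there
// is nothing new to read), the right element is the checkpoint string for the next round.
public class SelectorResultSketch {
    public static void main(String[] args) {
        // Stand-in for what the selector might return; values are illustrative only.
        Pair<Option<String>, String> result =
            new ImmutablePair<>(Option.of("/in/f1,/in/f2"), "1650000000000");

        Option<String> pathsCsv = result.getLeft();
        String nextCheckpoint = result.getRight();

        if (pathsCsv.isPresent()) {
            for (String path : pathsCsv.get().split(",")) {
                System.out.println("read " + path);
            }
        }
        System.out.println("persist checkpoint " + nextCheckpoint);
    }
}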

Example 80 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class DatePartitionPathSelector method getNextFilePathsAndMaxModificationTime.

@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(JavaSparkContext sparkContext, Option<String> lastCheckpointStr, long sourceLimit) {
    // If not specified, the current date is assumed by default.
    LocalDate currentDate = LocalDate.parse(props.getString(Config.CURRENT_DATE, LocalDate.now().toString()));
    // obtain all eligible files under root folder.
    LOG.info("Root path => " + props.getString(ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit + " depth of day partition => " + datePartitionDepth + " num prev days to list => " + numPrevDaysToList + " from current date => " + currentDate);
    long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
    HoodieSparkEngineContext context = new HoodieSparkEngineContext(sparkContext);
    SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
    List<String> prunedPartitionPaths = pruneDatePartitionPaths(context, fs, props.getString(ROOT_INPUT_PATH_PROP), currentDate);
    List<FileStatus> eligibleFiles = context.flatMap(prunedPartitionPaths, path -> {
        FileSystem fs = new Path(path).getFileSystem(serializedConf.get());
        return listEligibleFiles(fs, new Path(path), lastCheckpointTime).stream();
    }, partitionsListParallelism);
    // sort them by modification time ascending.
    List<FileStatus> sortedEligibleFiles = eligibleFiles.stream().sorted(Comparator.comparingLong(FileStatus::getModificationTime)).collect(Collectors.toList());
    // Filter based on checkpoint & input size, if needed
    long currentBytes = 0;
    long newCheckpointTime = lastCheckpointTime;
    List<FileStatus> filteredFiles = new ArrayList<>();
    for (FileStatus f : sortedEligibleFiles) {
        if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
            // so that some files with the same modification time won't be skipped in next read
            break;
        }
        newCheckpointTime = f.getModificationTime();
        currentBytes += f.getLen();
        filteredFiles.add(f);
    }
    // no data to read
    if (filteredFiles.isEmpty()) {
        return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
    }
    // read the files out.
    String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
    return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) DEFAULT_PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_PARTITIONS_LIST_PARALLELISM) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) Configuration(org.apache.hadoop.conf.Configuration) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) ROOT_INPUT_PATH_PROP(org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP) TypedProperties(org.apache.hudi.common.config.TypedProperties) LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.LOOKBACK_DAYS) DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_PARTITION_DEPTH) DEFAULT_DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_PARTITION_DEPTH) Collectors(java.util.stream.Collectors) DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_FORMAT) DEFAULT_DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_FORMAT) PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.PARTITIONS_LIST_PARALLELISM) List(java.util.List) LocalDate(java.time.LocalDate) DateTimeFormatter(java.time.format.DateTimeFormatter) DEFAULT_LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_LOOKBACK_DAYS) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair)
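
The flatMap above runs on Spark executors, so the Hadoop configuration is shipped wrapped in a SerializableConfiguration and unwrapped with get() inside the lambda. A minimal local sketch of that wrapper follows; the fs.defaultFS value is illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;

// Minimal sketch: SerializableConfiguration wraps a Hadoop Configuration so it can be
// captured by a lambda shipped to executors; get() returns the wrapped Configuration.
public class SerializableConfSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///"); // illustrative setting only
        SerializableConfiguration serialized = new SerializableConfiguration(conf);
        System.out.println(serialized.get().get("fs.defaultFS"));
    }
}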

Aggregations

Pair (org.apache.hudi.common.util.collection.Pair) 147
List (java.util.List) 98
Map (java.util.Map) 91
IOException (java.io.IOException) 89
Collectors (java.util.stream.Collectors) 87
Option (org.apache.hudi.common.util.Option) 87
ArrayList (java.util.ArrayList) 85
Path (org.apache.hadoop.fs.Path) 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 66
HashMap (java.util.HashMap) 65
LogManager (org.apache.log4j.LogManager) 64
Logger (org.apache.log4j.Logger) 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 54
Arrays (java.util.Arrays) 48
HoodieTable (org.apache.hudi.table.HoodieTable) 46
Test (org.junit.jupiter.api.Test) 46