
Example 31 with HoodieTimer

use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

the class HoodieBackedTableMetadata method readFromBaseAndMergeWithLogRecords.

private List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> readFromBaseAndMergeWithLogRecords(HoodieFileReader baseFileReader, List<String> keys, Map<String, Option<HoodieRecord<HoodieMetadataPayload>>> logRecords, List<Long> timings, String partitionName) throws IOException {
    List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> result = new ArrayList<>();
    // merge with base records
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieRecord<HoodieMetadataPayload> hoodieRecord = null;
    // Retrieve record from base file
    if (baseFileReader != null) {
        HoodieTimer readTimer = new HoodieTimer();
        Map<String, GenericRecord> baseFileRecords = baseFileReader.getRecordsByKeys(keys);
        for (String key : keys) {
            readTimer.startTimer();
            if (baseFileRecords.containsKey(key)) {
                hoodieRecord = getRecord(Option.of(baseFileRecords.get(key)), partitionName);
                metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer()));
                // merge base file record w/ log record if present
                if (logRecords.containsKey(key) && logRecords.get(key).isPresent()) {
                    HoodieRecordPayload mergedPayload = logRecords.get(key).get().getData().preCombine(hoodieRecord.getData());
                    result.add(Pair.of(key, Option.of(new HoodieAvroRecord(hoodieRecord.getKey(), mergedPayload))));
                } else {
                    // only base record
                    result.add(Pair.of(key, Option.of(hoodieRecord)));
                }
            } else {
                // only log record
                result.add(Pair.of(key, logRecords.get(key)));
            }
        }
        timings.add(timer.endTimer());
    } else {
        // no base file at all
        timings.add(timer.endTimer());
        for (Map.Entry<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : logRecords.entrySet()) {
            result.add(Pair.of(entry.getKey(), entry.getValue()));
        }
    }
    return result;
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) Option(org.apache.hudi.common.util.Option) GenericRecord(org.apache.avro.generic.GenericRecord) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Pair(org.apache.hudi.common.util.collection.Pair)
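
The per-key timing above reuses one HoodieTimer by restarting it for each key; endTimer() returns the elapsed milliseconds that the metrics callback receives. A minimal sketch of that pattern, with hypothetical keys and a placeholder lookup standing in for the base file read:

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.util.HoodieTimer;

// Minimal sketch of the per-key timing pattern: start the timer before the work,
// read the elapsed milliseconds with endTimer() after it. Keys and the lookup are
// hypothetical stand-ins for the base file record reads.
public class PerKeyTimingSketch {
    public static void main(String[] args) {
        List<String> keys = Arrays.asList("key1", "key2", "key3");
        HoodieTimer readTimer = new HoodieTimer();
        for (String key : keys) {
            readTimer.startTimer();
            String value = key.toUpperCase(); // stands in for the base file record lookup
            long elapsedMs = readTimer.endTimer();
            System.out.println(key + " -> " + value + " (" + elapsedMs + " ms)");
        }
    }
}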

Example 32 with HoodieTimer

use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

the class HoodieBackedTableMetadata method openReadersIfNeeded.

/**
 * Create a file reader and the record scanner for a given partition and file slice
 * if readers are not already available.
 *
 * @param partitionName - Partition name
 * @param slice         - The file slice to open readers for
 * @return File reader and the record scanner pair for the requested file slice
 */
private Pair<HoodieFileReader, HoodieMetadataMergedLogRecordReader> openReadersIfNeeded(String partitionName, FileSlice slice) {
    return partitionReaders.computeIfAbsent(Pair.of(partitionName, slice.getFileId()), k -> {
        try {
            HoodieTimer timer = new HoodieTimer().startTimer();
            // Open base file reader
            Pair<HoodieFileReader, Long> baseFileReaderOpenTimePair = getBaseFileReader(slice, timer);
            HoodieFileReader baseFileReader = baseFileReaderOpenTimePair.getKey();
            final long baseFileOpenMs = baseFileReaderOpenTimePair.getValue();
            // Open the log record scanner using the log files from the latest file slice
            List<HoodieLogFile> logFiles = slice.getLogFiles().collect(Collectors.toList());
            Pair<HoodieMetadataMergedLogRecordReader, Long> logRecordScannerOpenTimePair = getLogRecordScanner(logFiles, partitionName);
            HoodieMetadataMergedLogRecordReader logRecordScanner = logRecordScannerOpenTimePair.getKey();
            final long logScannerOpenMs = logRecordScannerOpenTimePair.getValue();
            metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs));
            return Pair.of(baseFileReader, logRecordScanner);
        } catch (IOException e) {
            throw new HoodieIOException("Error opening readers for metadata table partition " + partitionName, e);
        }
    });
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)
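
openReadersIfNeeded relies on computeIfAbsent so the reader pair is opened (and its open time measured) only on the first request for a given key. A stripped-down sketch of that caching pattern, using String keys and values as hypothetical stand-ins for the real reader types:

import java.util.concurrent.ConcurrentHashMap;

import org.apache.hudi.common.util.HoodieTimer;

// Hypothetical cache showing the computeIfAbsent pattern: the value is created
// (and timed) only when the key is first requested; later calls return it as-is.
public class ReaderCacheSketch {
    private final ConcurrentHashMap<String, String> readers = new ConcurrentHashMap<>();

    public String openIfNeeded(String fileId) {
        return readers.computeIfAbsent(fileId, k -> {
            HoodieTimer timer = new HoodieTimer().startTimer();
            String reader = "reader-for-" + k; // stands in for opening the base file reader and log scanner
            System.out.println("Opened readers for " + k + " in " + timer.endTimer() + " ms");
            return reader;
        });
    }

    public static void main(String[] args) {
        ReaderCacheSketch cache = new ReaderCacheSketch();
        cache.openIfNeeded("file-1"); // opens and times the readers
        cache.openIfNeeded("file-1"); // second call returns the cached value without re-opening
    }
}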

Example 33 with HoodieTimer

use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

the class MetadataCommand method listFiles.

@CliCommand(value = "metadata list-files", help = "Print a list of all files in a partition from the metadata")
public String listFiles(@CliOption(key = { "partition" }, help = "Name of the partition to list files", mandatory = true) final String partition) throws IOException {
    HoodieCLI.getTableMetaClient();
    HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
    HoodieBackedTableMetadata metaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
    if (!metaReader.enabled()) {
        return "[ERROR] Metadata Table not enabled/initialized\n\n";
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    FileStatus[] statuses = metaReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
    LOG.debug("Took " + timer.endTimer() + " ms");
    final List<Comparable[]> rows = new ArrayList<>();
    Arrays.stream(statuses).sorted((p1, p2) -> p2.getPath().getName().compareTo(p1.getPath().getName())).forEach(f -> {
        Comparable[] row = new Comparable[1];
        row[0] = f;
        rows.add(row);
    });
    TableHeader header = new TableHeader().addTableHeaderField("file path");
    return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
Also used : Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) HoodieBackedTableMetadata(org.apache.hudi.metadata.HoodieBackedTableMetadata) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Map(java.util.Map) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) CommandMarker(org.springframework.shell.core.CommandMarker) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) CliCommand(org.springframework.shell.core.annotation.CliCommand) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) TableHeader(org.apache.hudi.cli.TableHeader) Set(java.util.Set) IOException(java.io.IOException) SparkUtil(org.apache.hudi.cli.utils.SparkUtil) FileNotFoundException(java.io.FileNotFoundException) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) List(java.util.List) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FileStatus(org.apache.hadoop.fs.FileStatus) TableHeader(org.apache.hudi.cli.TableHeader) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieBackedTableMetadata(org.apache.hudi.metadata.HoodieBackedTableMetadata) CliCommand(org.springframework.shell.core.annotation.CliCommand)
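
list-files sorts the returned statuses by file name in reverse lexicographic order before printing. A small sketch of that comparator, using plain hypothetical file names instead of Hadoop FileStatus objects:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

// Shows the reverse-lexicographic sort used by list-files ((p1, p2) -> p2.compareTo(p1))
// without the Hadoop FileStatus type; the file names are hypothetical.
public class ReverseNameSortSketch {
    public static void main(String[] args) {
        List<String> names = Arrays.asList("f1.parquet", "f3.parquet", "f2.parquet");
        List<String> sorted = names.stream()
                .sorted((p1, p2) -> p2.compareTo(p1)) // descending by name
                .collect(Collectors.toList());
        System.out.println(sorted); // [f3.parquet, f2.parquet, f1.parquet]
    }
}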

Example 34 with HoodieTimer

use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

the class MetadataCommand method init.

@CliCommand(value = "metadata init", help = "Update the metadata table from commits since the creation")
public String init(@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master, @CliOption(key = { "readonly" }, unspecifiedDefaultValue = "false", help = "Open in read-only mode") final boolean readOnly) throws Exception {
    HoodieCLI.getTableMetaClient();
    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
    try {
        HoodieCLI.fs.listStatus(metadataPath);
    } catch (FileNotFoundException e) {
        // Metadata directory does not exist
        throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") does not exist.");
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    if (!readOnly) {
        HoodieWriteConfig writeConfig = getWriteConfig();
        initJavaSparkContext(Option.of(master));
        SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc));
    }
    String action = readOnly ? "Opened" : "Initialized";
    return String.format(action + " Metadata Table in %s (duration=%.2fsec)", metadataPath, (timer.endTimer()) / 1000.0);
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileNotFoundException(java.io.FileNotFoundException) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) CliCommand(org.springframework.shell.core.annotation.CliCommand)
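
init starts one timer before the optional bootstrap and reports the duration in seconds; endTimer() returns milliseconds, so the value is divided by 1000.0 for the %.2f format. A minimal sketch of that reporting pattern, with Thread.sleep as a stand-in for the real work:

import org.apache.hudi.common.util.HoodieTimer;

// Minimal sketch of the duration reporting above: endTimer() yields milliseconds,
// converted to seconds for the message. The sleep is a hypothetical placeholder
// for the metadata table bootstrap.
public class DurationFormatSketch {
    public static void main(String[] args) throws InterruptedException {
        HoodieTimer timer = new HoodieTimer().startTimer();
        Thread.sleep(250); // hypothetical work being measured
        System.out.println(String.format("Initialized Metadata Table (duration=%.2fsec)",
                timer.endTimer() / 1000.0));
    }
}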

Example 35 with HoodieTimer

use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

the class MetadataCommand method validateFiles.

@CliCommand(value = "metadata validate-files", help = "Validate all files in all partitions from the metadata")
public String validateFiles(@CliOption(key = { "verbose" }, help = "Print all file details", unspecifiedDefaultValue = "false") final boolean verbose) throws IOException {
    HoodieCLI.getTableMetaClient();
    HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
    HoodieBackedTableMetadata metadataReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
    if (!metadataReader.enabled()) {
        return "[ERROR] Metadata Table not enabled/initialized\n\n";
    }
    HoodieMetadataConfig fsConfig = HoodieMetadataConfig.newBuilder().enable(false).build();
    HoodieBackedTableMetadata fsMetaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), fsConfig, HoodieCLI.basePath, "/tmp");
    HoodieTimer timer = new HoodieTimer().startTimer();
    List<String> metadataPartitions = metadataReader.getAllPartitionPaths();
    LOG.debug("Listing partitions Took " + timer.endTimer() + " ms");
    List<String> fsPartitions = fsMetaReader.getAllPartitionPaths();
    Collections.sort(fsPartitions);
    Collections.sort(metadataPartitions);
    Set<String> allPartitions = new HashSet<>();
    allPartitions.addAll(fsPartitions);
    allPartitions.addAll(metadataPartitions);
    if (!fsPartitions.equals(metadataPartitions)) {
        LOG.error("FS partition listing is not matching with metadata partition listing!");
        LOG.error("All FS partitions: " + Arrays.toString(fsPartitions.toArray()));
        LOG.error("All Metadata partitions: " + Arrays.toString(metadataPartitions.toArray()));
    }
    final List<Comparable[]> rows = new ArrayList<>();
    for (String partition : allPartitions) {
        Map<String, FileStatus> fileStatusMap = new HashMap<>();
        Map<String, FileStatus> metadataFileStatusMap = new HashMap<>();
        FileStatus[] metadataStatuses = metadataReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
        Arrays.stream(metadataStatuses).forEach(entry -> metadataFileStatusMap.put(entry.getPath().getName(), entry));
        FileStatus[] fsStatuses = fsMetaReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
        Arrays.stream(fsStatuses).forEach(entry -> fileStatusMap.put(entry.getPath().getName(), entry));
        Set<String> allFiles = new HashSet<>();
        allFiles.addAll(fileStatusMap.keySet());
        allFiles.addAll(metadataFileStatusMap.keySet());
        for (String file : allFiles) {
            Comparable[] row = new Comparable[6];
            row[0] = partition;
            FileStatus fsFileStatus = fileStatusMap.get(file);
            FileStatus metaFileStatus = metadataFileStatusMap.get(file);
            boolean doesFsFileExists = fsFileStatus != null;
            boolean doesMetadataFileExists = metaFileStatus != null;
            long fsFileLength = doesFsFileExists ? fsFileStatus.getLen() : 0;
            long metadataFileLength = doesMetadataFileExists ? metaFileStatus.getLen() : 0;
            row[1] = file;
            row[2] = doesFsFileExists;
            row[3] = doesMetadataFileExists;
            row[4] = fsFileLength;
            row[5] = metadataFileLength;
            if (verbose) {
                // if verbose print all files
                rows.add(row);
            } else if ((doesFsFileExists != doesMetadataFileExists) || (fsFileLength != metadataFileLength)) {
                // if non verbose, print only non matching files
                rows.add(row);
            }
        }
        if (metadataStatuses.length != fsStatuses.length) {
            LOG.error(" FS and metadata files count not matching for " + partition + ". FS files count " + fsStatuses.length + ", metadata base files count " + metadataStatuses.length);
        }
        for (Map.Entry<String, FileStatus> entry : fileStatusMap.entrySet()) {
            if (!metadataFileStatusMap.containsKey(entry.getKey())) {
                LOG.error("FS file not found in metadata " + entry.getKey());
            } else {
                if (entry.getValue().getLen() != metadataFileStatusMap.get(entry.getKey()).getLen()) {
                    LOG.error(" FS file size mismatch " + entry.getKey() + ", size equality " + (entry.getValue().getLen() == metadataFileStatusMap.get(entry.getKey()).getLen()) + ". FS size " + entry.getValue().getLen() + ", metadata size " + metadataFileStatusMap.get(entry.getKey()).getLen());
                }
            }
        }
        for (Map.Entry<String, FileStatus> entry : metadataFileStatusMap.entrySet()) {
            if (!fileStatusMap.containsKey(entry.getKey())) {
                LOG.error("Metadata file not found in FS " + entry.getKey());
            } else {
                if (entry.getValue().getLen() != fileStatusMap.get(entry.getKey()).getLen()) {
                    LOG.error(" Metadata file size mismatch " + entry.getKey() + ", size equality " + (entry.getValue().getLen() == fileStatusMap.get(entry.getKey()).getLen()) + ". Metadata size " + entry.getValue().getLen() + ", FS size " + metadataFileStatusMap.get(entry.getKey()).getLen());
                }
            }
        }
    }
    TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("File Name").addTableHeaderField(" Is Present in FS ").addTableHeaderField(" Is Present in Metadata").addTableHeaderField(" FS size").addTableHeaderField(" Metadata size");
    return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) TableHeader(org.apache.hudi.cli.TableHeader) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieBackedTableMetadata(org.apache.hudi.metadata.HoodieBackedTableMetadata) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet) CliCommand(org.springframework.shell.core.annotation.CliCommand)
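
validate-files builds the union of file names from both listings and flags entries that differ in presence or length. A simplified sketch of that comparison, using plain name-to-length maps with hypothetical data instead of FileStatus objects:

import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

// Stripped-down version of the comparison loop in validate-files: take the union
// of names from both listings and report entries that differ in presence or size.
public class FileListingDiffSketch {
    public static void main(String[] args) {
        // Hypothetical file name -> length maps standing in for the FS and metadata listings.
        Map<String, Long> fsFiles = new HashMap<>();
        fsFiles.put("a.parquet", 100L);
        fsFiles.put("b.parquet", 200L);
        Map<String, Long> metadataFiles = new HashMap<>();
        metadataFiles.put("a.parquet", 100L);
        metadataFiles.put("c.parquet", 300L);

        Set<String> allFiles = new TreeSet<>();
        allFiles.addAll(fsFiles.keySet());
        allFiles.addAll(metadataFiles.keySet());

        for (String file : allFiles) {
            boolean inFs = fsFiles.containsKey(file);
            boolean inMetadata = metadataFiles.containsKey(file);
            long fsLen = inFs ? fsFiles.get(file) : 0L;
            long metaLen = inMetadata ? metadataFiles.get(file) : 0L;
            // Report only entries that differ in presence or size (the non-verbose path above).
            if (inFs != inMetadata || fsLen != metaLen) {
                System.out.printf("Mismatch: %s fs=%d metadata=%d%n", file, fsLen, metaLen);
            }
        }
    }
}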

Aggregations

HoodieTimer (org.apache.hudi.common.util.HoodieTimer)35 ArrayList (java.util.ArrayList)16 Path (org.apache.hadoop.fs.Path)15 IOException (java.io.IOException)14 HashMap (java.util.HashMap)12 Option (org.apache.hudi.common.util.Option)12 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)11 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)10 Map (java.util.Map)9 Pair (org.apache.hudi.common.util.collection.Pair)9 List (java.util.List)8 FileStatus (org.apache.hadoop.fs.FileStatus)8 HoodieIOException (org.apache.hudi.exception.HoodieIOException)7 LogManager (org.apache.log4j.LogManager)7 Logger (org.apache.log4j.Logger)7 Collectors (java.util.stream.Collectors)6 HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext)6 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)6 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)6 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)6