Search in sources:
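
All five examples below share the same measuring pattern: construct a HoodieTimer, chain startTimer() to record a start time, run the work, then read the elapsed milliseconds with endTimer(). A minimal, self-contained sketch of that pattern (Thread.sleep stands in for the work being measured):

import org.apache.hudi.common.util.HoodieTimer;

public class HoodieTimerSketch {
    public static void main(String[] args) throws InterruptedException {
        // startTimer() records the start time and returns the timer, so it can be chained
        HoodieTimer timer = new HoodieTimer().startTimer();
        // Stand-in for the operation being measured
        Thread.sleep(50);
        // endTimer() returns the milliseconds elapsed since the matching startTimer() call
        long elapsedMs = timer.endTimer();
        System.out.println("Elapsed=" + elapsedMs + " ms");
    }
}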

Example 1 with HoodieTimer

Use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

From the class HoodieClientTestHarness, method validateMetadata.

/**
 * Validate the metadata tables contents to ensure it matches what is on the file system.
 */
public void validateMetadata(HoodieTestTable testTable, List<String> inflightCommits, HoodieWriteConfig writeConfig, String metadataTableBasePath, boolean doFullValidation) throws IOException {
    HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
    assertNotNull(tableMetadata, "MetadataReader should have been initialized");
    if (!writeConfig.isMetadataTableEnabled()) {
        return;
    }
    if (!tableMetadata.getSyncedInstantTime().isPresent() || tableMetadata instanceof FileSystemBackedTableMetadata) {
        throw new IllegalStateException("Metadata should have synced some commits and tableMetadata should not be an instance of FileSystemBackedTableMetadata");
    }
    assertEquals(inflightCommits, testTable.inflightCommits());
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    // Partitions should match
    List<java.nio.file.Path> fsPartitionPaths = testTable.getAllPartitionPaths();
    List<String> fsPartitions = new ArrayList<>();
    fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString()));
    if (fsPartitions.isEmpty()) {
        fsPartitions.add("");
    }
    List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
    Collections.sort(fsPartitions);
    Collections.sort(metadataPartitions);
    assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
    assertEquals(fsPartitions, metadataPartitions, "Partitions should match");
    // Files within each partition should match
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(writeConfig, engineContext, true);
    TableFileSystemView tableView = table.getHoodieView();
    List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
    Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
    assertEquals(fsPartitions.size(), partitionToFilesMap.size());
    fsPartitions.forEach(partition -> {
        try {
            validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition);
        } catch (IOException e) {
            fail("Exception should not be raised: " + e);
        }
    });
    if (doFullValidation) {
        runFullValidation(writeConfig, metadataTableBasePath, engineContext);
    }
    LOG.info("Validation time=" + timer.endTimer());
}
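
In this example the timer brackets the entire check: it is started before the partition listings and per-partition files are compared against the metadata table, and read exactly once at the end, so the closing log line reports the total wall-clock validation time in milliseconds.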

Example 2 with HoodieTimer

Use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

From the class AbstractTableFileSystemView, method addFilesToView.

/**
 * Adds the provided statuses to the file system view and also caches them inside this object.
 */
public List<HoodieFileGroup> addFilesToView(FileStatus[] statuses) {
    HoodieTimer timer = new HoodieTimer().startTimer();
    List<HoodieFileGroup> fileGroups = buildFileGroups(statuses, visibleCommitsAndCompactionTimeline, true);
    long fgBuildTimeTakenMs = timer.endTimer();
    timer.startTimer();
    // Group by partition for efficient updates for both InMemory and DiskBased structures.
    fileGroups.stream().collect(Collectors.groupingBy(HoodieFileGroup::getPartitionPath)).forEach((partition, value) -> {
        if (!isPartitionAvailableInStore(partition)) {
            if (bootstrapIndex.useIndex()) {
                try (BootstrapIndex.IndexReader reader = bootstrapIndex.createReader()) {
                    LOG.info("Bootstrap Index available for partition " + partition);
                    List<BootstrapFileMapping> sourceFileMappings = reader.getSourceFileMappingForPartition(partition);
                    addBootstrapBaseFileMapping(sourceFileMappings.stream().map(s -> new BootstrapBaseFileMapping(new HoodieFileGroupId(s.getPartitionPath(), s.getFileId()), s.getBootstrapFileStatus())));
                }
            }
            storePartitionView(partition, value);
        }
    });
    long storePartitionsTs = timer.endTimer();
    LOG.info("addFilesToView: NumFiles=" + statuses.length + ", NumFileGroups=" + fileGroups.size() + ", FileGroupsCreationTime=" + fgBuildTimeTakenMs + ", StoreTimeTaken=" + storePartitionsTs);
    return fileGroups;
}
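
Note how one timer is reused for two consecutive phases: endTimer() reads the elapsed time for building the file groups, and the immediate startTimer() call begins a fresh measurement for storing the partition views. A short sketch of that sequential-phase pattern, with doPhaseOne and doPhaseTwo as hypothetical placeholders for the two steps:

HoodieTimer timer = new HoodieTimer().startTimer();
doPhaseOne();                       // e.g. building the file groups
long phaseOneMs = timer.endTimer(); // elapsed time for phase one
timer.startTimer();                 // begin a fresh measurement
doPhaseTwo();                       // e.g. storing the partition views
long phaseTwoMs = timer.endTimer(); // elapsed time for phase two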

Example 3 with HoodieTimer

Use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

From the class HiveQueryDDLExecutor, method updateHiveSQLs.

private List<CommandProcessorResponse> updateHiveSQLs(List<String> sqls) {
    List<CommandProcessorResponse> responses = new ArrayList<>();
    try {
        for (String sql : sqls) {
            if (hiveDriver != null) {
                HoodieTimer timer = new HoodieTimer().startTimer();
                responses.add(hiveDriver.run(sql));
                LOG.info(String.format("Time taken to execute [%s]: %s ms", sql, timer.endTimer()));
            }
        }
    } catch (Exception e) {
        throw new HoodieHiveSyncException("Failed in executing SQL", e);
    }
    return responses;
}
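
Because the timer is constructed inside the loop, every SQL statement gets its own measurement, so the log reports per-statement latency next to the statement itself rather than one aggregate figure for the whole batch.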

Example 4 with HoodieTimer

Use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

From the class RequestHandler, method jsonifyResult.

/**
 * Serializes the result into JSON String.
 *
 * @param ctx Javalin context
 * @param obj object to serialize
 * @param metricsRegistry {@code Registry} instance for storing metrics
 * @param objectMapper JSON object mapper
 * @param logger {@code Logger} instance
 * @return JSON String from the input object
 * @throws JsonProcessingException if the object cannot be serialized to JSON
 */
public static String jsonifyResult(Context ctx, Object obj, Registry metricsRegistry, ObjectMapper objectMapper, Logger logger) throws JsonProcessingException {
    HoodieTimer timer = new HoodieTimer().startTimer();
    boolean prettyPrint = ctx.queryParam("pretty") != null;
    String result = prettyPrint ? objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj) : objectMapper.writeValueAsString(obj);
    final long jsonifyTime = timer.endTimer();
    metricsRegistry.add("WRITE_VALUE_CNT", 1);
    metricsRegistry.add("WRITE_VALUE_TIME", jsonifyTime);
    if (logger.isDebugEnabled()) {
        logger.debug("Jsonify TimeTaken=" + jsonifyTime);
    }
    return result;
}
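
This is the timer-plus-metrics variant of the pattern: the elapsed serialization time is captured once into jsonifyTime and then reused for both the WRITE_VALUE_TIME metric and the debug log, so the two values can never disagree.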

Example 5 with HoodieTimer

Use of org.apache.hudi.common.util.HoodieTimer in project hudi by apache.

From the class HoodieBackedTableMetadataWriter, method initializeIfNeeded.

/**
 * Initialize the metadata table if needed.
 *
 * @param dataMetaClient           - meta client for the data table
 * @param actionMetadata           - optional action metadata
 * @param inflightInstantTimestamp - timestamp of an instant in progress on the dataset
 * @param <T>                      - action metadata types extending Avro generated SpecificRecordBase
 * @throws IOException on failures accessing the file system
 */
protected <T extends SpecificRecordBase> void initializeIfNeeded(HoodieTableMetaClient dataMetaClient, Option<T> actionMetadata, Option<String> inflightInstantTimestamp) throws IOException {
    HoodieTimer timer = new HoodieTimer().startTimer();
    boolean exists = dataMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(), HoodieTableMetaClient.METAFOLDER_NAME));
    boolean reInitialize = false;
    // If the metadata table already exists, check whether it needs to be re-bootstrapped;
    // if so, the metadata table will need to be initialized again.
    if (exists) {
        HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(metadataWriteConfig.getBasePath()).build();
        if (dataWriteConfig.getMetadataConfig().populateMetaFields() != metadataMetaClient.getTableConfig().populateMetaFields()) {
            LOG.info("Re-initiating metadata table properties since populate meta fields have changed");
            metadataMetaClient = initializeMetaClient(dataWriteConfig.getMetadataConfig().populateMetaFields());
        }
        final Option<HoodieInstant> latestMetadataInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
        reInitialize = isBootstrapNeeded(latestMetadataInstant, actionMetadata);
    }
    if (reInitialize) {
        metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.REBOOTSTRAP_STR, 1));
        LOG.info("Deleting Metadata Table directory so that it can be re-initialized");
        dataMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath()), true);
        exists = false;
    }
    if (!exists) {
        // Initialize for the first time by listing partitions and files directly from the file system
        if (initializeFromFilesystem(dataMetaClient, inflightInstantTimestamp)) {
            metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer()));
        }
    }
}
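
Here the timer spans the existence check, any deletion of a stale metadata table, and the bootstrap from the file system, but its result is only consumed when an initialization actually runs, in which case the total elapsed time is published under the INITIALIZE_STR metric.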

Aggregations

HoodieTimer (org.apache.hudi.common.util.HoodieTimer): 35
ArrayList (java.util.ArrayList): 16
Path (org.apache.hadoop.fs.Path): 15
IOException (java.io.IOException): 14
HashMap (java.util.HashMap): 12
Option (org.apache.hudi.common.util.Option): 12
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 11
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 10
Map (java.util.Map): 9
Pair (org.apache.hudi.common.util.collection.Pair): 9
List (java.util.List): 8
FileStatus (org.apache.hadoop.fs.FileStatus): 8
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 7
LogManager (org.apache.log4j.LogManager): 7
Logger (org.apache.log4j.Logger): 7
Collectors (java.util.stream.Collectors): 6
HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext): 6
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 6
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 6
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 6