Example 1 with SyncableFileSystemView

Use of org.apache.hudi.common.table.view.SyncableFileSystemView in project hudi by apache.

From the class ClusteringPlanStrategy, the method getFileSlicesEligibleForClustering:

/**
 * Return file slices eligible for clustering. FileIds in pending clustering/compaction are not eligible for clustering.
 */
protected Stream<FileSlice> getFileSlicesEligibleForClustering(String partition) {
    SyncableFileSystemView fileSystemView = (SyncableFileSystemView) getHoodieTable().getSliceView();
    // Collect file group ids already claimed by a pending compaction plan.
    Set<HoodieFileGroupId> fgIdsInPendingCompactionAndClustering = fileSystemView.getPendingCompactionOperations()
        .map(instantTimeOpPair -> instantTimeOpPair.getValue().getFileGroupId())
        .collect(Collectors.toSet());
    // Also exclude file groups that are part of a pending clustering plan.
    fgIdsInPendingCompactionAndClustering.addAll(
        fileSystemView.getFileGroupsInPendingClustering().map(Pair::getKey).collect(Collectors.toSet()));
    // Keep only the latest file slices whose file groups are not pending anywhere.
    return hoodieTable.getSliceView().getLatestFileSlices(partition)
        .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId()));
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) FileSliceMetricUtils(org.apache.hudi.client.utils.FileSliceMetricUtils) BaseFile(org.apache.hudi.common.model.BaseFile) Map(java.util.Map) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) Stream(java.util.stream.Stream) ClusteringPlanPartitionFilterMode(org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode) HoodieSliceInfo(org.apache.hudi.avro.model.HoodieSliceInfo) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair)
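For context, here is a minimal, self-contained sketch of the same exclusion pattern, built only from the Hudi calls visible above; the class name ClusteringEligibilitySketch, the helper name eligibleSlices, and the wildcard generics are illustrative, not part of Hudi:

import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.view.SyncableFileSystemView;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.table.HoodieTable;

public class ClusteringEligibilitySketch {

    // Mirrors getFileSlicesEligibleForClustering: gather every file group id
    // already claimed by a pending compaction or clustering plan, then keep
    // only the latest file slices whose groups are unclaimed.
    static Stream<FileSlice> eligibleSlices(HoodieTable<?, ?, ?, ?> table, String partition) {
        SyncableFileSystemView view = (SyncableFileSystemView) table.getSliceView();
        Set<HoodieFileGroupId> pending = view.getPendingCompactionOperations()
            .map(op -> op.getValue().getFileGroupId())
            .collect(Collectors.toSet());
        pending.addAll(view.getFileGroupsInPendingClustering()
            .map(Pair::getKey)
            .collect(Collectors.toSet()));
        return view.getLatestFileSlices(partition)
            .filter(slice -> !pending.contains(slice.getFileGroupId()));
    }
}

The double lookup (pending compaction plus pending clustering) is what keeps a file group from being scheduled into two table services at once.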

Example 2 with SyncableFileSystemView

Use of org.apache.hudi.common.table.view.SyncableFileSystemView in project hudi by apache.

From the class ScheduleCompactionActionExecutor, the method scheduleCompaction:

private HoodieCompactionPlan scheduleCompaction() {
    LOG.info("Checking if compaction needs to be run on " + config.getBasePath());
    // Decide whether compaction is needed, based on the number of delta commits and the time elapsed.
    boolean compactable = needCompact(config.getInlineCompactTriggerStrategy());
    if (compactable) {
        LOG.info("Generating compaction plan for merge on read table " + config.getBasePath());
        try {
            SyncableFileSystemView fileSystemView = (SyncableFileSystemView) table.getSliceView();
            // Collect file group ids already claimed by a pending compaction plan.
            Set<HoodieFileGroupId> fgInPendingCompactionAndClustering = fileSystemView.getPendingCompactionOperations()
                .map(instantTimeOpPair -> instantTimeOpPair.getValue().getFileGroupId())
                .collect(Collectors.toSet());
            // Exclude file groups in pending clustering from compaction as well.
            fgInPendingCompactionAndClustering.addAll(
                fileSystemView.getFileGroupsInPendingClustering().map(Pair::getLeft).collect(Collectors.toSet()));
            context.setJobStatus(this.getClass().getSimpleName(), "Compaction: generating compaction plan");
            return compactor.generateCompactionPlan(context, table, config, instantTime, fgInPendingCompactionAndClustering);
        } catch (IOException e) {
            throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e);
        }
    }
    return new HoodieCompactionPlan();
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) BaseActionExecutor(org.apache.hudi.table.action.BaseActionExecutor) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Logger(org.apache.log4j.Logger) Map(java.util.Map) ParseException(java.text.ParseException) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieCompactionException(org.apache.hudi.exception.HoodieCompactionException) List(java.util.List) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) EngineType(org.apache.hudi.common.engine.EngineType) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
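The needCompact(...) call above is driven entirely by the write config. A sketch of a config that would arm the NUM_COMMITS trigger, assuming the HoodieCompactionConfig builder methods shown here exist in this Hudi version; the path and the threshold of 5 are illustrative:

import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.action.compact.CompactionTriggerStrategy;

public class CompactionTriggerSketch {
    public static void main(String[] args) {
        // Compact inline once 5 delta commits have accumulated; NUM_COMMITS is
        // one of the trigger strategies consulted by needCompact(...).
        HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
            .withPath("/tmp/hudi/sketch_table") // illustrative base path
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                .withInlineCompaction(true)
                .withMaxNumDeltaCommitsBeforeCompaction(5)
                .withInlineCompactionTriggerStrategy(CompactionTriggerStrategy.NUM_COMMITS)
                .build())
            .build();
        System.out.println("Trigger strategy: " + writeConfig.getInlineCompactTriggerStrategy());
    }
}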

Example 3 with SyncableFileSystemView

Use of org.apache.hudi.common.table.view.SyncableFileSystemView in project hudi by apache.

From the class TimelineServerPerf, the method run:

public void run() throws IOException {
    JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster);
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, cfg.basePath, cfg.useFileListingFromMetadata, true);
    Collections.shuffle(allPartitionPaths);
    List<String> selected = allPartitionPaths.stream()
        .filter(p -> !p.contains("error"))
        .limit(cfg.maxPartitions)
        .collect(Collectors.toList());
    if (!useExternalTimelineServer) {
        this.timelineServer.startService();
        setHostAddrFromSparkConf(jsc.getConf());
    } else {
        this.hostAddr = cfg.serverHost;
    }
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(timelineServer.getConf())
        .setBasePath(cfg.basePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
    SyncableFileSystemView fsView = new RemoteHoodieTableFileSystemView(this.hostAddr, cfg.serverPort, metaClient);
    String reportDir = cfg.reportDir;
    metaClient.getFs().mkdirs(new Path(reportDir));
    String dumpPrefix = UUID.randomUUID().toString();
    System.out.println("First Iteration to load all partitions");
    Dumper d = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("1_%s.csv", dumpPrefix)));
    d.init();
    d.dump(runLookups(jsc, selected, fsView, 1, 0));
    d.close();
    System.out.println("\n\n\n First Iteration is done");
    Dumper d2 = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("2_%s.csv", dumpPrefix)));
    d2.init();
    d2.dump(runLookups(jsc, selected, fsView, cfg.numIterations, cfg.numCoresPerExecutor));
    d2.close();
    System.out.println("\n\n\nDumping all File Slices");
    selected.forEach(p -> fsView.getAllFileSlices(p).forEach(s -> System.out.println("\tMyFileSlice=" + s)));
    // Waiting for curl queries
    if (!useExternalTimelineServer && cfg.waitForManualQueries) {
        System.out.println("Timeline Server Host Address=" + hostAddr + ", port=" + timelineServer.getServerPort());
        while (true) {
            try {
                Thread.sleep(60000);
            } catch (InterruptedException e) {
            // skip it
            }
        }
    }
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) IntStream(java.util.stream.IntStream) Histogram(com.codahale.metrics.Histogram) ScheduledFuture(java.util.concurrent.ScheduledFuture) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) FileSlice(org.apache.hudi.common.model.FileSlice) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) Random(java.util.Random) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) TimelineService(org.apache.hudi.timeline.service.TimelineService) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Logger(org.apache.log4j.Logger) UniformReservoir(com.codahale.metrics.UniformReservoir) Configuration(org.apache.hadoop.conf.Configuration) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileSystemViewStorageType(org.apache.hudi.common.table.view.FileSystemViewStorageType) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) UtilHelpers(org.apache.hudi.utilities.UtilHelpers) JCommander(com.beust.jcommander.JCommander) SparkConf(org.apache.spark.SparkConf) IOException(java.io.IOException) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) RemoteHoodieTableFileSystemView(org.apache.hudi.common.table.view.RemoteHoodieTableFileSystemView) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Snapshot(com.codahale.metrics.Snapshot) Serializable(java.io.Serializable) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils)
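Stripped of the benchmarking scaffolding, the core of run() is: build a meta client, point a RemoteHoodieTableFileSystemView at a running timeline server, and query it. A minimal sketch under the assumption that a timeline server is already listening at the given host and port; all concrete values are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.RemoteHoodieTableFileSystemView;
import org.apache.hudi.common.table.view.SyncableFileSystemView;

public class RemoteViewSketch {
    public static void main(String[] args) {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(new Configuration())
            .setBasePath("/tmp/hudi/sketch_table") // illustrative base path
            .setLoadActiveTimelineOnLoad(true)
            .build();
        // The remote view answers file-system-view queries over HTTP, so a
        // timeline server must already be running at this address.
        SyncableFileSystemView view =
            new RemoteHoodieTableFileSystemView("localhost", 26754, metaClient);
        view.getAllFileSlices("2021/01/01") // illustrative partition path
            .forEach(slice -> System.out.println("FileSlice=" + slice));
    }
}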

Example 4 with SyncableFileSystemView

Use of org.apache.hudi.common.table.view.SyncableFileSystemView in project hudi by apache.

From the class RequestHandler, the method syncIfLocalViewBehind:

/**
 * Syncs the dataset view if the local view is behind.
 */
private boolean syncIfLocalViewBehind(Context ctx) {
    if (isLocalViewBehind(ctx)) {
        String basePath = ctx.queryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM);
        String lastKnownInstantFromClient = ctx.queryParam(RemoteHoodieTableFileSystemView.LAST_INSTANT_TS, HoodieTimeline.INVALID_INSTANT_TS);
        SyncableFileSystemView view = viewManager.getFileSystemView(basePath);
        synchronized (view) {
            if (isLocalViewBehind(ctx)) {
                HoodieTimeline localTimeline = viewManager.getFileSystemView(basePath).getTimeline();
                LOG.info("Syncing view as client passed last known instant " + lastKnownInstantFromClient + " as last known instant but server has the following last instant on timeline :" + localTimeline.lastInstant());
                view.sync();
                return true;
            }
        }
    }
    return false;
}
Also used : SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)
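syncIfLocalViewBehind is a double-checked locking idiom: many request threads may observe a stale view, but only the first to win the monitor performs the sync, and the re-check inside the synchronized block keeps the rest from syncing redundantly. A condensed sketch of the idiom; isBehind(...) is a hypothetical stand-in for isLocalViewBehind(ctx), reconstructed from the timeline comparison the handler performs:

import org.apache.hudi.common.table.view.SyncableFileSystemView;

public class ViewSyncSketch {

    // Hypothetical staleness test; RequestHandler derives this from the
    // client's last-known instant versus the server's local timeline.
    static boolean isBehind(SyncableFileSystemView view, String clientLastInstant) {
        return !view.getTimeline().containsOrBeforeTimelineStarts(clientLastInstant);
    }

    static boolean syncIfBehind(SyncableFileSystemView view, String clientLastInstant) {
        if (isBehind(view, clientLastInstant)) {         // cheap unsynchronized check
            synchronized (view) {                        // serialize competing syncs
                if (isBehind(view, clientLastInstant)) { // re-check under the lock
                    view.sync();
                    return true;
                }
            }
        }
        return false;
    }
}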

Example 5 with SyncableFileSystemView

Use of org.apache.hudi.common.table.view.SyncableFileSystemView in project hudi by apache.

From the class HoodieClientRollbackTestBase, the method twoUpsertCommitDataWithTwoPartitions:

protected void twoUpsertCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, boolean commitSecondUpsert) throws IOException {
    // generate just two partitions
    dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
    // 1. prepare data
    HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    // Write 1 (only inserts)
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    client.commit(newCommitTime, statuses);
    // Write 2 (updates)
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, records);
    statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    if (commitSecondUpsert) {
        client.commit(newCommitTime, statuses);
    }
    // 2. assert file group and get the first partition file slice
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
    List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, firstPartitionCommit2FileGroups.size());
    firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 3. assert file group and get the second partition file slice
    List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, secondPartitionCommit2FileGroups.size());
    secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 4. assert file slice
    HoodieTableType tableType = this.getTableType();
    if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) {
        assertEquals(2, firstPartitionCommit2FileSlices.size());
        assertEquals(2, secondPartitionCommit2FileSlices.size());
    } else {
        assertEquals(1, firstPartitionCommit2FileSlices.size());
        assertEquals(1, secondPartitionCommit2FileSlices.size());
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) WriteStatus(org.apache.hudi.client.WriteStatus)
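A sketch of how a concrete test might drive this helper; the test class, method name, and the deliberately minimal write config are illustrative, and the real harness wiring (table type, meta client setup) is elided:

import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.config.HoodieWriteConfig;
import org.junit.jupiter.api.Test;

public class ExampleRollbackTest extends HoodieClientRollbackTestBase {

    @Test
    public void validatesSlicesAfterTwoUpserts() throws Exception {
        // A deliberately minimal write config; real tests configure much more.
        HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        // Out-parameters: the helper fills these with the file slices present
        // after the second (committed) upsert.
        List<FileSlice> firstPartitionSlices = new ArrayList<>();
        List<FileSlice> secondPartitionSlices = new ArrayList<>();
        twoUpsertCommitDataWithTwoPartitions(firstPartitionSlices, secondPartitionSlices, cfg, true);
        // For a COPY_ON_WRITE table each list now holds two slices (one per
        // commit); for MERGE_ON_READ, a single slice carrying log files.
    }
}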

Aggregations

SyncableFileSystemView (org.apache.hudi.common.table.view.SyncableFileSystemView): 10
HoodieTable (org.apache.hudi.table.HoodieTable): 7
List (java.util.List): 6
Collectors (java.util.stream.Collectors): 6
IOException (java.io.IOException): 5
WriteStatus (org.apache.hudi.client.WriteStatus): 5
FileSlice (org.apache.hudi.common.model.FileSlice): 5
HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup): 5
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 5
HoodieTableType (org.apache.hudi.common.model.HoodieTableType): 5
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 5
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 5
Option (org.apache.hudi.common.util.Option): 5
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 5
Map (java.util.Map): 4
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 4
ArrayList (java.util.ArrayList): 3
Collections (java.util.Collections): 3
Set (java.util.Set): 3
Stream (java.util.stream.Stream): 3