Search in sources :

Example 11 with HoodieSparkEngineContext

use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

the class DatePartitionPathSelector method pruneDatePartitionPaths.

/**
 * Prunes date-level partitions to the last few days (configured by 'NUM_PREV_DAYS_TO_LIST') relative to
 * 'CURRENT_DATE'. Parallelizes the listing by leveraging HoodieSparkEngineContext's methods.
 */
public List<String> pruneDatePartitionPaths(HoodieSparkEngineContext context, FileSystem fs, String rootPath, LocalDate currentDate) {
    List<String> partitionPaths = new ArrayList<>();
    // get all partition paths before date partition level
    partitionPaths.add(rootPath);
    if (datePartitionDepth <= 0) {
        return partitionPaths;
    }
    SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
    for (int i = 0; i < datePartitionDepth; i++) {
        partitionPaths = context.flatMap(partitionPaths, path -> {
            Path subDir = new Path(path);
            FileSystem fileSystem = subDir.getFileSystem(serializedConf.get());
            // skip files/dirs whose names start with (_, ., etc)
            FileStatus[] statuses = fileSystem.listStatus(subDir, file -> IGNORE_FILEPREFIX_LIST.stream().noneMatch(pfx -> file.getName().startsWith(pfx)));
            List<String> res = new ArrayList<>();
            for (FileStatus status : statuses) {
                res.add(status.getPath().toString());
            }
            return res.stream();
        }, partitionsListParallelism);
    }
    // Prune date partitions to last few days
    return context.getJavaSparkContext().parallelize(partitionPaths, partitionsListParallelism).filter(s -> {
        LocalDate fromDate = currentDate.minusDays(numPrevDaysToList);
        String[] splits = s.split("/");
        String datePartition = splits[splits.length - 1];
        LocalDate partitionDate;
        DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern(dateFormat);
        if (datePartition.contains("=")) {
            String[] moreSplit = datePartition.split("=");
            ValidationUtils.checkArgument(moreSplit.length == 2, "Partition Field (" + datePartition + ") not in expected format");
            partitionDate = LocalDate.parse(moreSplit[1], dateFormatter);
        } else {
            partitionDate = LocalDate.parse(datePartition, dateFormatter);
        }
        return (partitionDate.isEqual(fromDate) || partitionDate.isAfter(fromDate)) && (partitionDate.isEqual(currentDate) || partitionDate.isBefore(currentDate));
    }).collect();
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) DEFAULT_PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_PARTITIONS_LIST_PARALLELISM) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) ROOT_INPUT_PATH_PROP(org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP) TypedProperties(org.apache.hudi.common.config.TypedProperties) LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.LOOKBACK_DAYS) DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_PARTITION_DEPTH) DEFAULT_DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_PARTITION_DEPTH) Collectors(java.util.stream.Collectors) DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_FORMAT) DEFAULT_DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_FORMAT) PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.PARTITIONS_LIST_PARALLELISM) List(java.util.List) LocalDate(java.time.LocalDate) DateTimeFormatter(java.time.format.DateTimeFormatter) DEFAULT_LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_LOOKBACK_DAYS) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair)
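
Below is a minimal, hedged sketch of how the pruning above might be invoked. The selector instance is assumed to be an already-configured DatePartitionPathSelector, and the root path is a placeholder; neither comes from the example itself.

import java.io.IOException;
import java.time.LocalDate;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector;
import org.apache.spark.api.java.JavaSparkContext;

public class PruneDatePartitionsSketch {

    // Hypothetical caller: 'selector' is assumed to be configured elsewhere (date format, depth, lookback days).
    public static List<String> listRecentPartitions(JavaSparkContext jsc, DatePartitionPathSelector selector, String rootPath) throws IOException {
        // Wrap the JavaSparkContext so the directory listing can be fanned out across executors
        HoodieSparkEngineContext context = new HoodieSparkEngineContext(jsc);
        // Resolve the FileSystem for the (placeholder) root input path
        FileSystem fs = new Path(rootPath).getFileSystem(jsc.hadoopConfiguration());
        // Keep only date partitions within the configured lookback window, ending today
        return selector.pruneDatePartitionPaths(context, fs, rootPath, LocalDate.now());
    }
}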

Example 12 with HoodieSparkEngineContext

use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

the class TimelineServerPerf method runLookups.

public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView, int numIterations, int concurrency) {
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Lookup all performance stats");
    return context.flatMap(partitionPaths, p -> {
        ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
        final List<PerfStats> result = new ArrayList<>();
        final List<ScheduledFuture<PerfStats>> futures = new ArrayList<>();
        List<FileSlice> slices = fsView.getLatestFileSlices(p).collect(Collectors.toList());
        String fileId = slices.isEmpty() ? "dummyId" : slices.get(new Random(Double.doubleToLongBits(Math.random())).nextInt(slices.size())).getFileId();
        IntStream.range(0, concurrency).forEach(i -> futures.add(executor.schedule(() -> runOneRound(fsView, p, fileId, i, numIterations), 0, TimeUnit.NANOSECONDS)));
        futures.forEach(x -> {
            try {
                result.add(x.get());
            } catch (InterruptedException | ExecutionException e) {
                throw new RuntimeException(e);
            }
        });
        System.out.println("SLICES are=");
        slices.forEach(s -> System.out.println("\t\tFileSlice=" + s));
        return result.stream();
    }, cfg.numExecutors);
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) ScheduledFuture(java.util.concurrent.ScheduledFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Random(java.util.Random) ExecutionException(java.util.concurrent.ExecutionException)
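
The engine-context pattern used in runLookups is worth calling out on its own: flatMap fans a per-element function out over Spark and gathers the results back on the driver. The sketch below shows that pattern in isolation; the partition list and the per-partition work are purely illustrative.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Stream;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.spark.api.java.JavaSparkContext;

public class EngineContextFlatMapSketch {

    public static List<String> tagPartitions(JavaSparkContext jsc) {
        HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
        context.setJobStatus("EngineContextFlatMapSketch", "Tag partitions in parallel");
        // Placeholder input; in runLookups this is the list of partition paths
        List<String> partitions = Arrays.asList("2022/01/01", "2022/01/02");
        // Each element is processed on an executor; the function returns a Stream of results
        return context.flatMap(partitions, p -> Stream.of(p + "#processed"), partitions.size());
    }
}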

Example 13 with HoodieSparkEngineContext

use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

the class ITTestClusteringCommand method generateCommits.

private void generateCommits() throws IOException {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    // Create the write client to write some records in
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).withDeleteParallelism(2).forTable(tableName).withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
    SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
    insert(jsc, client, dataGen, "001");
    insert(jsc, client, dataGen, "002");
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload)
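
The insert(...) helper called twice above is not shown on this page. A plausible minimal version, assuming the standard SparkRDDWriteClient and HoodieTestDataGenerator test pattern, could look like the following; the record count and partition count are arbitrary.

import java.io.IOException;
import java.util.List;

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class InsertHelperSketch {

    // Hypothetical helper; uses a raw client type, matching the common Hudi test pattern
    static void insert(JavaSparkContext jsc, SparkRDDWriteClient client, HoodieTestDataGenerator dataGen, String commitTime) throws IOException {
        // Open a new commit on the timeline, then write a small batch of generated records
        client.startCommitWithTime(commitTime);
        List<HoodieRecord> records = dataGen.generateInserts(commitTime, 10);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        client.insert(writeRecords, commitTime).collect();
    }
}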

Example 14 with HoodieSparkEngineContext

use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

the class ITTestCompactionCommand method testRepairCompaction.

/**
 * This function mainly tests the workflow of the 'compaction repair' command.
 * The real test of {@link org.apache.hudi.client.CompactionAdminClient#repairCompaction}
 * is {@link TestCompactionAdminClient#testRepairCompactionPlan}.
 */
@Test
public void testRepairCompaction() throws Exception {
    int numEntriesPerInstant = 10;
    String compactionInstant = "001";
    CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant);
    metaClient.reloadActiveTimeline();
    CompactionAdminClient client = new CompactionAdminClient(new HoodieSparkEngineContext(jsc), metaClient.getBasePath());
    List<Pair<HoodieLogFile, HoodieLogFile>> renameFiles = client.getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, 1, Option.empty(), false);
    renameFiles.forEach(lfPair -> {
        try {
            metaClient.getFs().rename(lfPair.getLeft().getPath(), lfPair.getRight().getPath());
        } catch (IOException e) {
            throw new HoodieIOException(e.getMessage(), e);
        }
    });
    client.unscheduleCompactionPlan(compactionInstant, false, 1, false);
    CommandResult cr = getShell().executeCommand(String.format("compaction repair --instant %s --sparkMaster %s", compactionInstant, "local"));
    // All executions succeeded: the result contains "true" and no "false"
    // Expected:
    // ║ File Id │ Source File Path │ Destination File Path │ Rename Executed? │ Rename Succeeded? │ Error ║
    // ║ *       │     *            │        *              │    true          │     true          │       ║
    assertAll("Command run failed", () -> assertTrue(cr.isSuccess()), () -> assertTrue(removeNonWordAndStripSpace(cr.getResult().toString()).contains("true")), () -> assertFalse(removeNonWordAndStripSpace(cr.getResult().toString()).contains("false")));
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) TestCompactionAdminClient(org.apache.hudi.client.TestCompactionAdminClient) CompactionAdminClient(org.apache.hudi.client.CompactionAdminClient) HoodieIOException(org.apache.hudi.exception.HoodieIOException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Pair(org.apache.hudi.common.util.collection.Pair) CommandResult(org.springframework.shell.core.CommandResult) AbstractShellIntegrationTest(org.apache.hudi.cli.testutils.AbstractShellIntegrationTest) Test(org.junit.jupiter.api.Test)
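
For completeness, the same unscheduling step can be driven directly through CompactionAdminClient, outside the CLI. The sketch below reuses only calls that appear in the test above; the base path and instant are placeholders, and treating the final boolean of unscheduleCompactionPlan as a dry-run switch is an assumption.

import org.apache.hudi.client.CompactionAdminClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.spark.api.java.JavaSparkContext;

public class CompactionAdminSketch {

    public static void unscheduleCompactionPlan(JavaSparkContext jsc, String basePath, String compactionInstant) throws Exception {
        // Same constructor as in the test: Spark engine context plus the table base path
        CompactionAdminClient admin = new CompactionAdminClient(new HoodieSparkEngineContext(jsc), basePath);
        // Same four-argument call as in the test; the last flag is assumed to be a dry-run switch
        admin.unscheduleCompactionPlan(compactionInstant, false, 1, true);
    }
}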

Example 15 with HoodieSparkEngineContext

use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

the class DFSHoodieDatasetInputReader method getPartitions.

protected List<String> getPartitions(Option<Integer> partitionsLimit) throws IOException {
    // Using FSUtils.getFS here instead of metaClient.getFS() since we don't want to count these listStatus
    // calls in metrics, as they are not part of normal Hudi operation.
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    List<String> partitionPaths = FSUtils.getAllPartitionPaths(engineContext, metaClient.getBasePath(), HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false);
    // Sort partitions so we can pick the last N partitions by default
    Collections.sort(partitionPaths);
    if (!partitionPaths.isEmpty()) {
        ValidationUtils.checkArgument(partitionPaths.size() >= partitionsLimit.get(), "Cannot generate updates for more partitions " + "than present in the dataset, partitions " + "requested " + partitionsLimit.get() + ", partitions present " + partitionPaths.size());
        return partitionPaths.subList(0, partitionsLimit.get());
    }
    return partitionPaths;
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext)
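
As a standalone reference, the same listing call can be made against any table base path. This is a minimal sketch using the getAllPartitionPaths arguments shown above; the base path is a placeholder.

import java.io.IOException;
import java.util.List;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.spark.api.java.JavaSparkContext;

public class ListPartitionsSketch {

    public static List<String> listAllPartitions(JavaSparkContext jsc, String basePath) throws IOException {
        HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
        // Same arguments as in the example: engine context, base path, whether to use
        // file listings from the metadata table, and no assume-date-partitioning shortcut
        return FSUtils.getAllPartitionPaths(engineContext, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false);
    }
}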

Aggregations

HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext) 58
Path (org.apache.hadoop.fs.Path) 25
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) 24
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 23
ArrayList (java.util.ArrayList) 19
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 19
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 17
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 17
WriteStatus (org.apache.hudi.client.WriteStatus) 15
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 15
IOException (java.io.IOException) 14
List (java.util.List) 14
Option (org.apache.hudi.common.util.Option) 14
LogManager (org.apache.log4j.LogManager) 14
Logger (org.apache.log4j.Logger) 14
Test (org.junit.jupiter.api.Test) 14
Collectors (java.util.stream.Collectors) 12
FileStatus (org.apache.hadoop.fs.FileStatus) 12
FileSystem (org.apache.hadoop.fs.FileSystem) 12
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) 11