Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class DatePartitionPathSelector, method pruneDatePartitionPaths.
/**
* Prunes date level partitions to last few days configured by 'NUM_PREV_DAYS_TO_LIST' from
* 'CURRENT_DATE'. Parallelizes listing by leveraging HoodieSparkEngineContext's methods.
*/
public List<String> pruneDatePartitionPaths(HoodieSparkEngineContext context, FileSystem fs, String rootPath, LocalDate currentDate) {
  List<String> partitionPaths = new ArrayList<>();
  // get all partition paths before date partition level
  partitionPaths.add(rootPath);
  if (datePartitionDepth <= 0) {
    return partitionPaths;
  }
  SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
  for (int i = 0; i < datePartitionDepth; i++) {
    partitionPaths = context.flatMap(partitionPaths, path -> {
      Path subDir = new Path(path);
      FileSystem fileSystem = subDir.getFileSystem(serializedConf.get());
      // skip files/dirs whose names start with (_, ., etc)
      FileStatus[] statuses = fileSystem.listStatus(subDir,
          file -> IGNORE_FILEPREFIX_LIST.stream().noneMatch(pfx -> file.getName().startsWith(pfx)));
      List<String> res = new ArrayList<>();
      for (FileStatus status : statuses) {
        res.add(status.getPath().toString());
      }
      return res.stream();
    }, partitionsListParallelism);
  }
  // Prune date partitions to last few days
  return context.getJavaSparkContext().parallelize(partitionPaths, partitionsListParallelism).filter(s -> {
    LocalDate fromDate = currentDate.minusDays(numPrevDaysToList);
    String[] splits = s.split("/");
    String datePartition = splits[splits.length - 1];
    LocalDate partitionDate;
    DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern(dateFormat);
    if (datePartition.contains("=")) {
      String[] moreSplit = datePartition.split("=");
      ValidationUtils.checkArgument(moreSplit.length == 2, "Partition Field (" + datePartition + ") not in expected format");
      partitionDate = LocalDate.parse(moreSplit[1], dateFormatter);
    } else {
      partitionDate = LocalDate.parse(datePartition, dateFormatter);
    }
    return (partitionDate.isEqual(fromDate) || partitionDate.isAfter(fromDate))
        && (partitionDate.isEqual(currentDate) || partitionDate.isBefore(currentDate));
  }).collect();
}
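For context, a minimal sketch of how this selector might be driven, assuming a JavaSparkContext and an already-constructed DatePartitionPathSelector (the helper shape and the root path below are illustrative, not taken from the snippet):

// Hypothetical driver; selector construction is omitted and the root path is made up.
static List<String> listRecentDatePartitions(DatePartitionPathSelector selector, JavaSparkContext jsc) throws IOException {
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  FileSystem fs = FileSystem.get(jsc.hadoopConfiguration());
  // Keep only the date partitions inside the configured lookback window, ending today.
  return selector.pruneDatePartitionPaths(engineContext, fs, "hdfs://namenode:8020/data/source_table", LocalDate.now());
}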
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class TimelineServerPerf, method runLookups.
public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView, int numIterations, int concurrency) {
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  context.setJobStatus(this.getClass().getSimpleName(), "Lookup all performance stats");
  return context.flatMap(partitionPaths, p -> {
    ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
    final List<PerfStats> result = new ArrayList<>();
    final List<ScheduledFuture<PerfStats>> futures = new ArrayList<>();
    List<FileSlice> slices = fsView.getLatestFileSlices(p).collect(Collectors.toList());
    String fileId = slices.isEmpty() ? "dummyId"
        : slices.get(new Random(Double.doubleToLongBits(Math.random())).nextInt(slices.size())).getFileId();
    IntStream.range(0, concurrency).forEach(i ->
        futures.add(executor.schedule(() -> runOneRound(fsView, p, fileId, i, numIterations), 0, TimeUnit.NANOSECONDS)));
    futures.forEach(x -> {
      try {
        result.add(x.get());
      } catch (InterruptedException | ExecutionException e) {
        throw new RuntimeException(e);
      }
    });
    System.out.println("SLICES are=");
    slices.forEach(s -> System.out.println("\t\tFileSlice=" + s));
    return result.stream();
  }, cfg.numExecutors);
}
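The context.flatMap call above is the same fan-out primitive used across these snippets: a list of inputs, a function producing a Stream of results per input, and a parallelism hint, with the combined results returned as a list. A stripped-down sketch of that pattern (the inputs and parallelism are illustrative; the shape of the call is inferred from the snippets on this page):

// Minimal sketch of the HoodieEngineContext.flatMap fan-out; inputs are illustrative.
HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
context.setJobStatus("FlatMapExample", "Tagging partitions in parallel");
List<String> partitions = Arrays.asList("2021/01/01", "2021/01/02");
List<String> tagged = context.flatMap(partitions, partition -> {
  // Runs on executors; each input may yield zero or more outputs.
  return Stream.of(partition + " processed");
}, 2);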
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class ITTestClusteringCommand, method generateCommits.
private void generateCommits() throws IOException {
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
  // Create the write client to write some records in
  HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
      .withPath(tablePath)
      .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
      .withParallelism(2, 2)
      .withDeleteParallelism(2)
      .forTable(tableName)
      .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
      .build();
  SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
  insert(jsc, client, dataGen, "001");
  insert(jsc, client, dataGen, "002");
}
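The insert helper is not shown in this snippet; a plausible sketch of what it might look like, using the write client's standard startCommitWithTime/insert flow (the helper signature and the record count are assumptions):

// Hypothetical sketch of the insert helper referenced above; the record count is arbitrary.
private void insert(JavaSparkContext jsc, SparkRDDWriteClient<HoodieAvroPayload> client,
    HoodieTestDataGenerator dataGen, String commitTime) {
  client.startCommitWithTime(commitTime);
  List<HoodieRecord> records = dataGen.generateInserts(commitTime, 100);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 2);
  // Collect to force the write and surface any failures immediately.
  client.insert(writeRecords, commitTime).collect();
}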
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class ITTestCompactionCommand, method testRepairCompaction.
/**
* This function mainly tests the workflow of 'compaction repair' command.
* The real test of {@link org.apache.hudi.client.CompactionAdminClient#repairCompaction}
* is {@link TestCompactionAdminClient#testRepairCompactionPlan}.
*/
@Test
public void testRepairCompaction() throws Exception {
  int numEntriesPerInstant = 10;
  String compactionInstant = "001";
  CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant);
  metaClient.reloadActiveTimeline();
  CompactionAdminClient client = new CompactionAdminClient(new HoodieSparkEngineContext(jsc), metaClient.getBasePath());
  List<Pair<HoodieLogFile, HoodieLogFile>> renameFiles =
      client.getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, 1, Option.empty(), false);
  renameFiles.forEach(lfPair -> {
    try {
      metaClient.getFs().rename(lfPair.getLeft().getPath(), lfPair.getRight().getPath());
    } catch (IOException e) {
      throw new HoodieIOException(e.getMessage(), e);
    }
  });
  client.unscheduleCompactionPlan(compactionInstant, false, 1, false);
  CommandResult cr = getShell().executeCommand(
      String.format("compaction repair --instant %s --sparkMaster %s", compactionInstant, "local"));
  // All renames succeeded: the result contains "true" and no "false".
  // Expected:
  // ║ File Id │ Source File Path │ Destination File Path │ Rename Executed? │ Rename Succeeded? │ Error ║
  // ║ *       │ *                │ *                     │ true             │ true              │       ║
  assertAll("Command run failed",
      () -> assertTrue(cr.isSuccess()),
      () -> assertTrue(removeNonWordAndStripSpace(cr.getResult().toString()).contains("true")),
      () -> assertFalse(removeNonWordAndStripSpace(cr.getResult().toString()).contains("false")));
}
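The shell command above delegates to CompactionAdminClient#repairCompaction (referenced in the javadoc); a rough sketch of invoking it directly, with the caveat that the parameter list shown here is an assumption and should be checked against the Hudi version in use:

// Rough sketch; the (instant, parallelism, dryRun) parameters are assumed, not confirmed by the snippet.
CompactionAdminClient adminClient =
    new CompactionAdminClient(new HoodieSparkEngineContext(jsc), metaClient.getBasePath());
adminClient.repairCompaction(compactionInstant, 1, false);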
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class DFSHoodieDatasetInputReader, method getPartitions.
protected List<String> getPartitions(Option<Integer> partitionsLimit) throws IOException {
  // Using FSUtils.getFS here instead of metaClient.getFS() since we don't want to count these listStatus
  // calls in metrics as they are not part of normal HUDI operation.
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  List<String> partitionPaths = FSUtils.getAllPartitionPaths(engineContext, metaClient.getBasePath(), HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false);
  // Sort partitions so we can pick the last N partitions by default
  Collections.sort(partitionPaths);
  if (!partitionPaths.isEmpty()) {
    ValidationUtils.checkArgument(partitionPaths.size() >= partitionsLimit.get(),
        "Cannot generate updates for more partitions than present in the dataset, partitions requested "
            + partitionsLimit.get() + ", partitions present " + partitionPaths.size());
    return partitionPaths.subList(0, partitionsLimit.get());
  }
  return partitionPaths;
}
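Note that once partitions exist, the method calls partitionsLimit.get() unconditionally, so callers are expected to pass a present limit. A short illustrative call, as it might appear inside the reader (getPartitions is protected; the limit of 3 is arbitrary):

// Illustrative call from within the reader; returns at most the first 3 sorted partition paths.
List<String> chosenPartitions = getPartitions(Option.of(3));
chosenPartitions.forEach(System.out::println);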