Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestIncrementalFSViewSync, method generateDataForInstant.
private List<Pair<String, HoodieWriteStat>> generateDataForInstant(String baseInstant, String instant, boolean deltaCommit, List<String> fileIds) {
  return partitions.stream().flatMap(p -> fileIds.stream().map(f -> {
    try {
      File file = new File(basePath + "/" + p + "/"
          + (deltaCommit
              ? FSUtils.makeLogFileName(f, ".log", baseInstant, Integer.parseInt(instant), TEST_WRITE_TOKEN)
              : FSUtils.makeDataFileName(instant, TEST_WRITE_TOKEN, f)));
      file.createNewFile();
      HoodieWriteStat w = new HoodieWriteStat();
      w.setFileId(f);
      w.setPath(String.format("%s/%s", p, file.getName()));
      return Pair.of(p, w);
    } catch (IOException e) {
      throw new HoodieException(e);
    }
  })).collect(Collectors.toList());
}
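For illustration, a hedged sketch (not part of the test class) of how the returned partition/write-stat pairs could be regrouped by partition path; it assumes Hudi's Pair exposes getLeft()/getRight() in the style of commons-lang3:

// Hypothetical helper: index the generated HoodieWriteStats by partition path.
private static Map<String, List<HoodieWriteStat>> groupStatsByPartition(List<Pair<String, HoodieWriteStat>> writes) {
  return writes.stream()
      .collect(Collectors.groupingBy(Pair::getLeft,
          Collectors.mapping(Pair::getRight, Collectors.toList())));
}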

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class CompactionTestUtils, method setupAndValidateCompactionOperations.
public static Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> setupAndValidateCompactionOperations(HoodieTableMetaClient metaClient, boolean inflight, int numEntriesInPlan1, int numEntriesInPlan2, int numEntriesInPlan3, int numEntriesInPlan4) throws IOException {
  HoodieCompactionPlan plan1 = createCompactionPlan(metaClient, "000", "001", numEntriesInPlan1, true, true);
  HoodieCompactionPlan plan2 = createCompactionPlan(metaClient, "002", "003", numEntriesInPlan2, false, true);
  HoodieCompactionPlan plan3 = createCompactionPlan(metaClient, "004", "005", numEntriesInPlan3, true, false);
  HoodieCompactionPlan plan4 = createCompactionPlan(metaClient, "006", "007", numEntriesInPlan4, false, false);
  if (inflight) {
    scheduleInflightCompaction(metaClient, "001", plan1);
    scheduleInflightCompaction(metaClient, "003", plan2);
    scheduleInflightCompaction(metaClient, "005", plan3);
    scheduleInflightCompaction(metaClient, "007", plan4);
  } else {
    scheduleCompaction(metaClient, "001", plan1);
    scheduleCompaction(metaClient, "003", plan2);
    scheduleCompaction(metaClient, "005", plan3);
    scheduleCompaction(metaClient, "007", plan4);
  }
  createDeltaCommit(metaClient, "000");
  createDeltaCommit(metaClient, "002");
  createDeltaCommit(metaClient, "004");
  createDeltaCommit(metaClient, "006");
  Map<String, String> baseInstantsToCompaction = new HashMap<String, String>() {
    {
      put("000", "001");
      put("002", "003");
      put("004", "005");
      put("006", "007");
    }
  };
  List<Integer> expectedNumEntries = Arrays.asList(numEntriesInPlan1, numEntriesInPlan2, numEntriesInPlan3, numEntriesInPlan4);
  List<HoodieCompactionPlan> plans = CollectionUtils.createImmutableList(plan1, plan2, plan3, plan4);
  IntStream.range(0, 4).boxed().forEach(idx -> {
    if (expectedNumEntries.get(idx) > 0) {
      assertEquals(expectedNumEntries.get(idx).longValue(), plans.get(idx).getOperations().size(),
          "check if plan " + idx + " has exp entries");
    } else {
      assertNull(plans.get(idx).getOperations(), "Plan " + idx + " has null ops");
    }
  });
  metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf())
      .setBasePath(metaClient.getBasePath()).setLoadActiveTimelineOnLoad(true).build();
  Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pendingCompactionMap =
      CompactionUtils.getAllPendingCompactionOperations(metaClient);
  Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> expPendingCompactionMap =
      generateExpectedCompactionOperations(Arrays.asList(plan1, plan2, plan3, plan4), baseInstantsToCompaction);
  // Ensure Compaction operations are fine.
  assertEquals(expPendingCompactionMap, pendingCompactionMap);
  return expPendingCompactionMap;
}
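As a usage note, the returned map keys each HoodieFileGroupId to a (compaction instant, operation) pair; a minimal caller-side sketch, assuming getLeft()/getRight() accessors in the style of commons-lang3 and the usual getFileId() getters on the key and the Avro operation:

// Sketch: walk the pending-compaction map returned above.
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pending =
    setupAndValidateCompactionOperations(metaClient, false, 10, 10, 10, 10);
pending.forEach((fileGroupId, instantAndOp) -> {
  String compactionInstant = instantAndOp.getLeft();   // e.g. "001"
  HoodieCompactionOperation op = instantAndOp.getRight();
  // The operation is expected to target the same file id as the key it is stored under.
  assertEquals(fileGroupId.getFileId(), op.getFileId());
});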

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestRecordReaderValueIterator, method testValueIterator.
@Test
public void testValueIterator() {
  String[] values = new String[] {"hoodie", "efficient", "new project", "realtime", "spark", "table"};
  List<Pair<Integer, String>> entries = IntStream.range(0, values.length).boxed()
      .map(idx -> Pair.of(idx, values[idx])).collect(Collectors.toList());
  TestRecordReader reader = new TestRecordReader(entries);
  RecordReaderValueIterator<IntWritable, Text> itr = new RecordReaderValueIterator<IntWritable, Text>(reader);
  for (int i = 0; i < values.length; i++) {
    assertTrue(itr.hasNext());
    Text val = itr.next();
    assertEquals(values[i], val.toString());
  }
  assertFalse(itr.hasNext());
}
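TestRecordReader here is a local test stub; a small sketch of how the (index, value) pairs map onto the Hadoop writables the iterator unwraps, assuming Pair implements Map.Entry so getKey()/getValue() delegate to the left/right elements:

// Sketch: convert the plain (Integer, String) pairs into the writable types a RecordReader would emit.
List<Pair<IntWritable, Text>> writableEntries = entries.stream()
    .map(e -> Pair.of(new IntWritable(e.getKey()), new Text(e.getValue())))
    .collect(Collectors.toList());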

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class HoodieRealtimeInputFormatUtils, method groupLogsByBaseFile.
// Return each parquet file with the list of log files in the same file group.
public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {
  Set<Path> partitionSet = new HashSet<>(partitionPaths);
  // TODO(vc): Should we handle also non-hoodie splits here?
  Map<Path, HoodieTableMetaClient> partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet);
  // Get all the base file and its log file pairs in the required partition paths.
  List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList = new ArrayList<>();
  partitionSet.forEach(partitionPath -> {
    // For each partition path, obtain the data & log file groupings, then map back to input splits.
    HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline());
    String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
    try {
      // Both commits and delta-commits are included - pick the latest completed one.
      Option<HoodieInstant> latestCompletedInstant =
          metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
      Stream<FileSlice> latestFileSlices = latestCompletedInstant
          .map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp()))
          .orElse(Stream.empty());
      latestFileSlices.forEach(fileSlice -> {
        List<HoodieLogFile> logFilePaths = fileSlice.getLogFiles()
            .sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
        baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths));
      });
    } catch (Exception e) {
      throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e);
    }
  });
  return baseAndLogsList;
}
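A hedged consumer-side sketch: each pair carries an optional base (parquet) file on the left and the sorted log files of the same file group on the right, so callers typically branch on whether the slice has a base file; getLeft()/getRight() accessors are assumed as in commons-lang3:

// Sketch: iterate the grouping returned by groupLogsByBaseFile.
List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groups =
    HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, partitionPaths);
for (Pair<Option<HoodieBaseFile>, List<HoodieLogFile>> group : groups) {
  Option<HoodieBaseFile> baseFile = group.getLeft();
  List<HoodieLogFile> logFiles = group.getRight();
  if (baseFile.isPresent()) {
    // Regular slice: a base parquet file plus delta log files to merge at read time.
    System.out.println(baseFile.get().getPath() + " with " + logFiles.size() + " log file(s)");
  } else {
    // Log-only slice: no base file has been written for this file group yet.
    System.out.println("log-only file group with " + logFiles.size() + " log file(s)");
  }
}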

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class DFSTestSuitePathSelector, method getNextFilePathsAndMaxModificationTime.
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(Option<String> lastCheckpointStr, long sourceLimit) {
  Integer lastBatchId;
  Integer nextBatchId;
  try {
    if (lastCheckpointStr.isPresent()) {
      lastBatchId = Integer.parseInt(lastCheckpointStr.get());
      nextBatchId = lastBatchId + 1;
    } else {
      lastBatchId = 0;
      nextBatchId = 1;
    }
    // Obtain all eligible files for the batch.
    List<FileStatus> eligibleFiles = new ArrayList<>();
    FileStatus[] fileStatuses = fs.globStatus(new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*"));
    // Say the input data is input/1, input/2, input/5, where 3 and 4 were rolled back and 5 is newly generated data.
    // The checkpoint from the latest commit metadata will be 2 since 3 and 4 have been rolled back, so we need to set
    // the next batch id correctly as 5 instead of 3.
    Option<String> correctBatchIdDueToRollback = Option.fromJavaOptional(Arrays.stream(fileStatuses)
        .map(f -> f.getPath().toString().split("/")[f.getPath().toString().split("/").length - 1])
        .filter(bid1 -> Integer.parseInt(bid1) > lastBatchId)
        .min((bid1, bid2) -> Integer.min(Integer.parseInt(bid1), Integer.parseInt(bid2))));
    if (correctBatchIdDueToRollback.isPresent() && Integer.parseInt(correctBatchIdDueToRollback.get()) > nextBatchId) {
      nextBatchId = Integer.parseInt(correctBatchIdDueToRollback.get());
    }
    log.info("Using DFSTestSuitePathSelector, checkpoint: " + lastCheckpointStr + " sourceLimit: " + sourceLimit
        + " lastBatchId: " + lastBatchId + " nextBatchId: " + nextBatchId);
    for (FileStatus fileStatus : fileStatuses) {
      if (!fileStatus.isDirectory()
          || IGNORE_FILEPREFIX_LIST.stream().anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
        continue;
      } else if (Integer.parseInt(fileStatus.getPath().getName()) > lastBatchId
          && Integer.parseInt(fileStatus.getPath().getName()) <= nextBatchId) {
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(fileStatus.getPath(), true);
        while (files.hasNext()) {
          eligibleFiles.add(files.next());
        }
      }
    }
    // No data to read.
    if (eligibleFiles.size() == 0) {
      return new ImmutablePair<>(Option.empty(), lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
    }
    // Read the files out.
    String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
    return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId));
  } catch (IOException ioe) {
    throw new HoodieIOException("Unable to readAvro from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
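A minimal caller-side sketch (pathSelector, lastCheckpoint and sourceLimit are assumed to be in scope): the left element is the comma-joined list of file paths for the batch, if any, and the right element is the checkpoint string to persist for the next run:

// Sketch: consume the (paths, checkpoint) pair returned by the selector.
Pair<Option<String>, String> selected =
    pathSelector.getNextFilePathsAndMaxModificationTime(lastCheckpoint, sourceLimit);
String nextCheckpoint = selected.getRight();
if (selected.getLeft().isPresent()) {
  // Comma-joined file paths for this batch; split and hand them to the reader.
  String[] filePaths = selected.getLeft().get().split(",");
  // ... read filePaths, then persist nextCheckpoint with the resulting commit.
}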