
Example 36 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class TestIncrementalFSViewSync, method generateDataForInstant:

private List<Pair<String, HoodieWriteStat>> generateDataForInstant(String baseInstant, String instant, boolean deltaCommit, List<String> fileIds) {
    return partitions.stream().flatMap(p -> fileIds.stream().map(f -> {
        try {
            File file = new File(basePath + "/" + p + "/" + (deltaCommit ? FSUtils.makeLogFileName(f, ".log", baseInstant, Integer.parseInt(instant), TEST_WRITE_TOKEN) : FSUtils.makeDataFileName(instant, TEST_WRITE_TOKEN, f)));
            file.createNewFile();
            HoodieWriteStat w = new HoodieWriteStat();
            w.setFileId(f);
            w.setPath(String.format("%s/%s", p, file.getName()));
            return Pair.of(p, w);
        } catch (IOException e) {
            throw new HoodieException(e);
        }
    })).collect(Collectors.toList());
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) COMPACTION_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) IntStream(java.util.stream.IntStream) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) ArrayList(java.util.ArrayList) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) Files(java.nio.file.Files) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Paths(java.nio.file.Paths) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) HoodieRollbackStat(org.apache.hudi.common.HoodieRollbackStat) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) File(java.io.File)
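
The returned (partition, HoodieWriteStat) pairs are the raw material for commit metadata; a minimal sketch of a caller folding them into a HoodieCommitMetadata (the fileIds variable is hypothetical, not from the snippet above):

// Each Pair carries the partition path on the left and the write stat for one file on the right.
List<Pair<String, HoodieWriteStat>> writeStats = generateDataForInstant("000", "001", true, fileIds);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
writeStats.forEach(stat -> commitMetadata.addWriteStat(stat.getLeft(), stat.getRight()));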

Example 37 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class CompactionTestUtils, method setupAndValidateCompactionOperations:

public static Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> setupAndValidateCompactionOperations(HoodieTableMetaClient metaClient, boolean inflight, int numEntriesInPlan1, int numEntriesInPlan2, int numEntriesInPlan3, int numEntriesInPlan4) throws IOException {
    HoodieCompactionPlan plan1 = createCompactionPlan(metaClient, "000", "001", numEntriesInPlan1, true, true);
    HoodieCompactionPlan plan2 = createCompactionPlan(metaClient, "002", "003", numEntriesInPlan2, false, true);
    HoodieCompactionPlan plan3 = createCompactionPlan(metaClient, "004", "005", numEntriesInPlan3, true, false);
    HoodieCompactionPlan plan4 = createCompactionPlan(metaClient, "006", "007", numEntriesInPlan4, false, false);
    if (inflight) {
        scheduleInflightCompaction(metaClient, "001", plan1);
        scheduleInflightCompaction(metaClient, "003", plan2);
        scheduleInflightCompaction(metaClient, "005", plan3);
        scheduleInflightCompaction(metaClient, "007", plan4);
    } else {
        scheduleCompaction(metaClient, "001", plan1);
        scheduleCompaction(metaClient, "003", plan2);
        scheduleCompaction(metaClient, "005", plan3);
        scheduleCompaction(metaClient, "007", plan4);
    }
    createDeltaCommit(metaClient, "000");
    createDeltaCommit(metaClient, "002");
    createDeltaCommit(metaClient, "004");
    createDeltaCommit(metaClient, "006");
    Map<String, String> baseInstantsToCompaction = new HashMap<String, String>() {

        {
            put("000", "001");
            put("002", "003");
            put("004", "005");
            put("006", "007");
        }
    };
    List<Integer> expectedNumEntries = Arrays.asList(numEntriesInPlan1, numEntriesInPlan2, numEntriesInPlan3, numEntriesInPlan4);
    List<HoodieCompactionPlan> plans = CollectionUtils.createImmutableList(plan1, plan2, plan3, plan4);
    IntStream.range(0, 4).boxed().forEach(idx -> {
        if (expectedNumEntries.get(idx) > 0) {
            assertEquals(expectedNumEntries.get(idx).longValue(), plans.get(idx).getOperations().size(), "check if plan " + idx + " has exp entries");
        } else {
            assertNull(plans.get(idx).getOperations(), "Plan " + idx + " has null ops");
        }
    });
    metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()).setLoadActiveTimelineOnLoad(true).build();
    Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pendingCompactionMap = CompactionUtils.getAllPendingCompactionOperations(metaClient);
    Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> expPendingCompactionMap = generateExpectedCompactionOperations(Arrays.asList(plan1, plan2, plan3, plan4), baseInstantsToCompaction);
    // Ensure Compaction operations are fine.
    assertEquals(expPendingCompactionMap, pendingCompactionMap);
    return expPendingCompactionMap;
}
Also used : HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HashMap(java.util.HashMap) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) Pair(org.apache.hudi.common.util.collection.Pair)
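
Each map entry pairs the compaction instant time (left) with the planned HoodieCompactionOperation (right). A minimal sketch of reading the result back, assuming metaClient is an initialized HoodieTableMetaClient:

// Iterate the pending-compaction map returned above.
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pendingOps =
    setupAndValidateCompactionOperations(metaClient, false, 10, 10, 10, 10);
pendingOps.forEach((fileGroupId, instantAndOperation) -> {
    String compactionInstantTime = instantAndOperation.getLeft();
    HoodieCompactionOperation operation = instantAndOperation.getRight();
    System.out.println(fileGroupId + " pending at " + compactionInstantTime + " for file " + operation.getFileId());
});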

Example 38 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class TestRecordReaderValueIterator, method testValueIterator:

@Test
public void testValueIterator() {
    String[] values = new String[] { "hoodie", "efficient", "new project", "realtime", "spark", "table" };
    List<Pair<Integer, String>> entries = IntStream.range(0, values.length).boxed().map(idx -> Pair.of(idx, values[idx])).collect(Collectors.toList());
    TestRecordReader reader = new TestRecordReader(entries);
    RecordReaderValueIterator<IntWritable, Text> itr = new RecordReaderValueIterator<IntWritable, Text>(reader);
    for (int i = 0; i < values.length; i++) {
        assertTrue(itr.hasNext());
        Text val = itr.next();
        assertEquals(values[i], val.toString());
    }
    assertFalse(itr.hasNext());
}
Also used : Test(org.junit.jupiter.api.Test) IntStream(java.util.stream.IntStream) List(java.util.List) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) Text(org.apache.hadoop.io.Text) RecordReader(org.apache.hadoop.mapred.RecordReader) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Collectors(java.util.stream.Collectors) IntWritable(org.apache.hadoop.io.IntWritable) Pair(org.apache.hudi.common.util.collection.Pair) Text(org.apache.hadoop.io.Text) IntWritable(org.apache.hadoop.io.IntWritable) Pair(org.apache.hudi.common.util.collection.Pair) Test(org.junit.jupiter.api.Test)
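
The TestRecordReader used above is a helper defined elsewhere in the test class. A hedged sketch of what a Pair-backed stub could look like, assuming the org.apache.hadoop.mapred.RecordReader contract from the imports above; the actual TestRecordReader in Hudi may differ:

// Hypothetical stub: each Pair<Integer, String> supplies one (key, value) record.
static class PairBackedReader implements RecordReader<IntWritable, Text> {

    private final java.util.Iterator<Pair<Integer, String>> source;

    PairBackedReader(List<Pair<Integer, String>> entries) {
        this.source = entries.iterator();
    }

    @Override
    public boolean next(IntWritable key, Text value) {
        if (!source.hasNext()) {
            return false;
        }
        Pair<Integer, String> entry = source.next();
        key.set(entry.getLeft());
        value.set(entry.getRight());
        return true;
    }

    @Override
    public IntWritable createKey() {
        return new IntWritable();
    }

    @Override
    public Text createValue() {
        return new Text();
    }

    @Override
    public long getPos() {
        return 0L;
    }

    @Override
    public void close() {
    }

    @Override
    public float getProgress() {
        return source.hasNext() ? 0.0f : 1.0f;
    }
}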

Example 39 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class HoodieRealtimeInputFormatUtils, method groupLogsByBaseFile:

// Returns each base (parquet) file paired with the list of log files in the same file group.
public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {
    Set<Path> partitionSet = new HashSet<>(partitionPaths);
    // TODO(vc): Should we handle also non-hoodie splits here?
    Map<Path, HoodieTableMetaClient> partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet);
    // Get all base file and log file pairs in the required partition paths.
    List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList = new ArrayList<>();
    partitionSet.forEach(partitionPath -> {
        // For each partition path, obtain the data & log file groupings, then map back to input splits
        HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
        HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline());
        String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
        try {
            // Both commit and delta-commits are included - pick the latest completed one
            Option<HoodieInstant> latestCompletedInstant = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
            Stream<FileSlice> latestFileSlices = latestCompletedInstant.map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())).orElse(Stream.empty());
            latestFileSlices.forEach(fileSlice -> {
                List<HoodieLogFile> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
                baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths));
            });
        } catch (Exception e) {
            throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e);
        }
    });
    return baseAndLogsList;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) ColumnProjectionUtils(org.apache.hadoop.hive.serde2.ColumnProjectionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieRealtimeBootstrapBaseFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit) Logger(org.apache.log4j.Logger) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) RealtimeSplit(org.apache.hudi.hadoop.realtime.RealtimeSplit) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) TypeUtils.unsafeCast(org.apache.hudi.TypeUtils.unsafeCast) HoodieVirtualKeyInfo(org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Set(java.util.Set) HoodieRealtimeFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) JobConf(org.apache.hadoop.mapred.JobConf) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HashSet(java.util.HashSet) Pair(org.apache.hudi.common.util.collection.Pair)
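
A minimal sketch of consuming the returned groupings, assuming conf and partitionPaths are already available; the left side of each Pair is the (possibly absent) base file, the right side its ordered log files:

// Unpack each grouping into a base-file path plus its log-file paths.
List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groups =
    HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, partitionPaths);
for (Pair<Option<HoodieBaseFile>, List<HoodieLogFile>> group : groups) {
    String basePath = group.getLeft().isPresent() ? group.getLeft().get().getPath() : "<no base file>";
    List<String> logPaths = group.getRight().stream()
        .map(logFile -> logFile.getPath().toString())
        .collect(Collectors.toList());
    System.out.println(basePath + " -> " + logPaths);
}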

Example 40 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class DFSTestSuitePathSelector, method getNextFilePathsAndMaxModificationTime:

@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(Option<String> lastCheckpointStr, long sourceLimit) {
    Integer lastBatchId;
    Integer nextBatchId;
    try {
        if (lastCheckpointStr.isPresent()) {
            lastBatchId = Integer.parseInt(lastCheckpointStr.get());
            nextBatchId = lastBatchId + 1;
        } else {
            lastBatchId = 0;
            nextBatchId = 1;
        }
        // obtain all eligible files for the batch
        List<FileStatus> eligibleFiles = new ArrayList<>();
        FileStatus[] fileStatuses = fs.globStatus(new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*"));
        // Say the input data is input/1, input/2, input/5, where 3 and 4 were rolled back and 5 is newly generated data.
        // The checkpoint from the latest commit metadata will be 2 since 3 and 4 have been rolled back. We need to set the
        // next batch id correctly as 5 instead of 3.
        Option<String> correctBatchIdDueToRollback = Option.fromJavaOptional(Arrays.stream(fileStatuses).map(f -> f.getPath().toString().split("/")[f.getPath().toString().split("/").length - 1]).filter(bid1 -> Integer.parseInt(bid1) > lastBatchId).min((bid1, bid2) -> Integer.compare(Integer.parseInt(bid1), Integer.parseInt(bid2))));
        if (correctBatchIdDueToRollback.isPresent() && Integer.parseInt(correctBatchIdDueToRollback.get()) > nextBatchId) {
            nextBatchId = Integer.parseInt(correctBatchIdDueToRollback.get());
        }
        log.info("Using DFSTestSuitePathSelector, checkpoint: " + lastCheckpointStr + " sourceLimit: " + sourceLimit + " lastBatchId: " + lastBatchId + " nextBatchId: " + nextBatchId);
        for (FileStatus fileStatus : fileStatuses) {
            if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream().anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
                continue;
            } else if (Integer.parseInt(fileStatus.getPath().getName()) > lastBatchId && Integer.parseInt(fileStatus.getPath().getName()) <= nextBatchId) {
                RemoteIterator<LocatedFileStatus> files = fs.listFiles(fileStatus.getPath(), true);
                while (files.hasNext()) {
                    eligibleFiles.add(files.next());
                }
            }
        }
        // no data to read
        if (eligibleFiles.size() == 0) {
            return new ImmutablePair<>(Option.empty(), lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
        }
        // read the files out.
        String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
        return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId));
    } catch (IOException ioe) {
        throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) HoodieTestSuiteJob(org.apache.hudi.integ.testsuite.HoodieTestSuiteJob) Logger(org.slf4j.Logger) TypedProperties(org.apache.hudi.common.config.TypedProperties) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) FileStatus(org.apache.hadoop.fs.FileStatus) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) List(java.util.List) DFSPathSelector(org.apache.hudi.utilities.sources.helpers.DFSPathSelector) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) HoodieIOException(org.apache.hudi.exception.HoodieIOException) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair)
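
A minimal sketch of a caller, assuming selector is a configured DFSTestSuitePathSelector and lastCheckpoint is the checkpoint from the previous batch (both hypothetical names); the left side carries the comma-joined file paths to read, the right side the checkpoint to persist for the next run:

// Ask the selector for the next batch of files and the checkpoint to record.
Pair<Option<String>, String> batch =
    selector.getNextFilePathsAndMaxModificationTime(lastCheckpoint, 1024L);
if (batch.getLeft().isPresent()) {
    String[] filesToRead = batch.getLeft().get().split(",");
    String nextCheckpoint = batch.getRight();
    // hand filesToRead to the reader and persist nextCheckpoint with the commit
}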

Aggregations

Pair (org.apache.hudi.common.util.collection.Pair): 147
List (java.util.List): 98
Map (java.util.Map): 91
IOException (java.io.IOException): 89
Collectors (java.util.stream.Collectors): 87
Option (org.apache.hudi.common.util.Option): 87
ArrayList (java.util.ArrayList): 85
Path (org.apache.hadoop.fs.Path): 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 66
HashMap (java.util.HashMap): 65
LogManager (org.apache.log4j.LogManager): 64
Logger (org.apache.log4j.Logger): 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 54
Arrays (java.util.Arrays): 48
HoodieTable (org.apache.hudi.table.HoodieTable): 46
Test (org.junit.jupiter.api.Test): 46
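
For reference, Hudi's Pair mirrors the commons-lang3 tuple; a minimal sketch of the core API exercised in every example above:

// Build an immutable pair and read back both sides.
Pair<String, Integer> partitionAndCount = Pair.of("2021/01/01", 42);
String partition = partitionAndCount.getLeft();   // left element; also exposed as getKey()
Integer count = partitionAndCount.getRight();     // right element; also exposed as getValue()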