
Example 61 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class HoodieDataSourceHelpers, method getClusteringPlan:

@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
public static Option<HoodieClusteringPlan> getClusteringPlan(FileSystem fs, String basePath, String instantTime) {
    // Load the table's meta client, eagerly loading the active timeline so the instant is visible.
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(fs.getConf())
        .setBasePath(basePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
    // A clustering plan is stored under the "requested" replace-commit instant for that time.
    HoodieInstant hoodieInstant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
    Option<Pair<HoodieInstant, HoodieClusteringPlan>> clusteringPlan = ClusteringUtils.getClusteringPlan(metaClient, hoodieInstant);
    // Unwrap the (instant, plan) pair, keeping only the plan.
    if (clusteringPlan.isPresent()) {
        return Option.of(clusteringPlan.get().getValue());
    } else {
        return Option.empty();
    }
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Pair(org.apache.hudi.common.util.collection.Pair)
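
A minimal caller sketch for this helper; the filesystem setup, table path, and instant time below are hypothetical placeholders:

FileSystem fs = FileSystem.get(new Configuration());
// Returns an empty Option when no clustering plan was scheduled at that instant.
Option<HoodieClusteringPlan> plan =
    HoodieDataSourceHelpers.getClusteringPlan(fs, "/tmp/hoodie_table", "20220101000000");
System.out.println("clustering plan present: " + plan.isPresent());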

Example 62 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class SparkFullBootstrapDataProviderBase, method generateInputRecords:

@Override
public JavaRDD<HoodieRecord> generateInputRecords(String tableName, String sourceBasePath, List<Pair<String, List<HoodieFileStatus>>> partitionPathsWithFiles) {
    // Flatten the (partition, files) pairs into a single array of source file paths.
    String[] filePaths = partitionPathsWithFiles.stream()
        .map(Pair::getValue)
        .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString()))
        .toArray(String[]::new);
    Dataset inputDataset = sparkSession.read().format(getFormat()).load(filePaths);
    try {
        KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props);
        String structName = tableName + "_record";
        String namespace = "hoodie." + tableName;
        // Convert the Dataset rows into Avro GenericRecords.
        RDD<GenericRecord> genericRecords = HoodieSparkUtils.createRdd(inputDataset, structName, namespace, false, Option.empty());
        return genericRecords.toJavaRDD().map(gr -> {
            // Pull the precombine (ordering) value out of each record.
            String orderingVal = HoodieAvroUtils.getNestedFieldValAsString(
                gr,
                props.getString("hoodie.datasource.write.precombine.field"),
                false,
                props.getBoolean(
                    KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
                    Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())));
            try {
                // Wrap each Avro record as a HoodieRecord with its key and payload class.
                return DataSourceUtils.createHoodieRecord(gr, orderingVal, keyGenerator.getKey(gr),
                    props.getString("hoodie.datasource.write.payload.class"));
            } catch (IOException ioe) {
                throw new HoodieIOException(ioe.getMessage(), ioe);
            }
        });
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) TypedProperties(org.apache.hudi.common.config.TypedProperties) Dataset(org.apache.spark.sql.Dataset) KeyGeneratorOptions(org.apache.hudi.keygen.constant.KeyGeneratorOptions) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) DataSourceUtils(org.apache.hudi.DataSourceUtils) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) List(java.util.List) HoodieSparkUtils(org.apache.hudi.HoodieSparkUtils) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RDD(org.apache.spark.rdd.RDD) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) JavaRDD(org.apache.spark.api.java.JavaRDD) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) Pair(org.apache.hudi.common.util.collection.Pair) SparkSession(org.apache.spark.sql.SparkSession)
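
A hedged sketch of how the partition-to-files argument might be assembled before calling generateInputRecords; the provider instance, partition names, and empty file listings are illustrative assumptions, and Pair.of is assumed to behave like the commons-lang3 factory it mirrors:

List<HoodieFileStatus> noFiles = Collections.emptyList(); // real bootstrap source listings elided
List<Pair<String, List<HoodieFileStatus>>> partitionPathsWithFiles = new ArrayList<>();
partitionPathsWithFiles.add(Pair.of("2022/01/01", noFiles));
partitionPathsWithFiles.add(Pair.of("2022/01/02", noFiles));
// 'provider' is a hypothetical SparkFullBootstrapDataProviderBase subclass instance.
JavaRDD<HoodieRecord> records =
    provider.generateInputRecords("my_table", "s3://bucket/source", partitionPathsWithFiles);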

Example 63 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class HoodieTestCommitGenerator, method generateCommitMetadata:

public static HoodieCommitMetadata generateCommitMetadata(Map<String, List<Pair<String, String>>> partitionPathToFileIdAndNameMap, Map<String, String> extraMetadata) {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();
    for (Map.Entry<String, String> entry : extraMetadata.entrySet()) {
        metadata.addMetadata(entry.getKey(), entry.getValue());
    }
    partitionPathToFileIdAndNameMap.forEach((partitionPath, fileInfoList) -> fileInfoList.forEach(fileInfo -> {
        HoodieWriteStat writeStat = new HoodieWriteStat();
        writeStat.setPartitionPath(partitionPath);
        writeStat.setPath(new Path(partitionPath, fileInfo.getValue()).toString());
        writeStat.setFileId(fileInfo.getKey());
        // Below are dummy values
        writeStat.setTotalWriteBytes(10000);
        writeStat.setPrevCommit("000");
        writeStat.setNumWrites(10);
        writeStat.setNumUpdateWrites(15);
        writeStat.setTotalLogBlocks(2);
        writeStat.setTotalLogRecords(100);
        metadata.addWriteStat(partitionPath, writeStat);
    }));
    return metadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSystem(org.apache.hadoop.fs.FileSystem) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) IOException(java.io.IOException) HashMap(java.util.HashMap) UUID(java.util.UUID) StandardCharsets(java.nio.charset.StandardCharsets) ArrayList(java.util.ArrayList) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Logger(org.apache.log4j.Logger) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) DEFAULT_WRITE_TOKEN(org.apache.hudi.common.table.log.HoodieLogFormat.DEFAULT_WRITE_TOKEN) Path(org.apache.hadoop.fs.Path) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Pair(org.apache.hudi.common.util.collection.Pair)
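
A short usage sketch for this generator; the partition path, file ID, and file name are made-up placeholders:

Map<String, List<Pair<String, String>>> partitionToFileIdAndName = new HashMap<>();
// Each pair maps a file ID (left) to the base file name it was written as (right).
partitionToFileIdAndName.put("2022/01/01",
    Collections.singletonList(Pair.of(UUID.randomUUID().toString(), "base-file.parquet")));
HoodieCommitMetadata metadata =
    HoodieTestCommitGenerator.generateCommitMetadata(partitionToFileIdAndName, Collections.emptyMap());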

Example 64 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class TestRepairUtils, method testTagInstantsOfBaseAndLogFiles:

@Test
public void testTagInstantsOfBaseAndLogFiles() {
    Map<String, List<String>> expectedResult = new HashMap<>();
    List<Path> inputPathList = new ArrayList<>();
    // Build the expected mapping (instant time -> base file paths) and collect the input paths.
    for (Map.Entry<String, List<Pair<String, String>>> entry : BASE_FILE_INFO.entrySet()) {
        String instantTime = entry.getKey();
        List<String> fileNameList = entry.getValue().stream().map(e -> {
            String partitionPath = e.getKey();
            String fileId = e.getValue();
            return new Path(new Path(partitionPath), getBaseFilename(instantTime, fileId)).toString();
        }).collect(Collectors.toList());
        List<String> expectedList = expectedResult.computeIfAbsent(instantTime, k -> new ArrayList<>());
        expectedList.addAll(fileNameList);
        inputPathList.addAll(fileNameList.stream().map(path -> new Path(basePath, path)).collect(Collectors.toList()));
    }
    // Repeat for log files, merging into the same expected mapping.
    for (Map.Entry<String, List<Pair<String, String>>> entry : LOG_FILE_INFO.entrySet()) {
        String instantTime = entry.getKey();
        List<String> fileNameList = entry.getValue().stream().map(e -> {
            String partitionPath = e.getKey();
            String fileId = e.getValue();
            return new Path(new Path(partitionPath), getLogFilename(instantTime, fileId)).toString();
        }).collect(Collectors.toList());
        List<String> expectedList = expectedResult.computeIfAbsent(instantTime, k -> new ArrayList<>());
        expectedList.addAll(fileNameList);
        inputPathList.addAll(fileNameList.stream().map(path -> new Path(basePath, path)).collect(Collectors.toList()));
    }
    assertEquals(expectedResult, RepairUtils.tagInstantsOfBaseAndLogFiles(basePath, inputPathList));
}
Also used : Path(org.apache.hadoop.fs.Path) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) HoodieTestCommitGenerator(org.apache.hudi.HoodieTestCommitGenerator) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieTestCommitGenerator.getLogFilename(org.apache.hudi.HoodieTestCommitGenerator.getLogFilename) HoodieTestCommitGenerator.initCommitInfoForRepairTests(org.apache.hudi.HoodieTestCommitGenerator.initCommitInfoForRepairTests) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) BeforeAll(org.junit.jupiter.api.BeforeAll) HoodieTestCommitGenerator.getBaseFilename(org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) List(java.util.List) TempDir(org.junit.jupiter.api.io.TempDir) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Pair(org.apache.hudi.common.util.collection.Pair)
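
For reference, a hedged sketch of the shape the BASE_FILE_INFO and LOG_FILE_INFO fixtures must have for this test, i.e. instant time mapped to (partitionPath, fileId) pairs; the literal values are illustrative:

Map<String, List<Pair<String, String>>> baseFileInfo = new HashMap<>();
baseFileInfo.put("001", Arrays.asList(
    Pair.of("2022/01/01", UUID.randomUUID().toString()),
    Pair.of("2022/01/02", UUID.randomUUID().toString())));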

Example 65 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From class TestRepairUtils, method testFindInstantFilesToRemove:

@Test
public void testFindInstantFilesToRemove() throws IOException {
    setupTimelineInFS();
    HoodieInstant existingInstant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "001");
    Map<String, List<Pair<String, String>>> partitionToFileIdAndNameMap = instantInfoMap.get(existingInstant.getTimestamp());
    List<String> fileListFromFs = partitionToFileIdAndNameMap.entrySet().stream()
        .flatMap(entry -> entry.getValue().stream()
            .map(fileInfo -> new Path(entry.getKey(), fileInfo.getValue()).toString()))
        .collect(Collectors.toList());
    String danglingFilePath = new Path("2022/01/02", getBaseFilename(existingInstant.getTimestamp(), UUID.randomUUID().toString())).toString();
    fileListFromFs.add(danglingFilePath);
    // Existing instant
    assertEquals(CollectionUtils.createImmutableList(danglingFilePath), RepairUtils.findInstantFilesToRemove(existingInstant.getTimestamp(), fileListFromFs, metaClient.getActiveTimeline(), metaClient.getArchivedTimeline()));
    // Non-existing instant
    assertEquals(fileListFromFs, RepairUtils.findInstantFilesToRemove("004", fileListFromFs, metaClient.getActiveTimeline(), metaClient.getArchivedTimeline()));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) HoodieException(org.apache.hudi.exception.HoodieException) HoodieTestCommitGenerator(org.apache.hudi.HoodieTestCommitGenerator) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieTestCommitGenerator.getLogFilename(org.apache.hudi.HoodieTestCommitGenerator.getLogFilename) HoodieTestCommitGenerator.initCommitInfoForRepairTests(org.apache.hudi.HoodieTestCommitGenerator.initCommitInfoForRepairTests) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) BeforeAll(org.junit.jupiter.api.BeforeAll) HoodieTestCommitGenerator.getBaseFilename(org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) List(java.util.List) TempDir(org.junit.jupiter.api.io.TempDir) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Pair(org.apache.hudi.common.util.collection.Pair)
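
The contract exercised above, restated: findInstantFilesToRemove flags files that cannot be matched to a write committed at the given instant, and flags every file when the instant exists in neither timeline. The same call from the test, with its arguments annotated:

List<String> toRemove = RepairUtils.findInstantFilesToRemove(
    "001",                              // instant time under repair
    fileListFromFs,                     // file paths actually found on storage
    metaClient.getActiveTimeline(),     // instants not yet archived
    metaClient.getArchivedTimeline());  // instants moved out of the active timeline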

Aggregations

Pair (org.apache.hudi.common.util.collection.Pair): 147
List (java.util.List): 98
Map (java.util.Map): 91
IOException (java.io.IOException): 89
Collectors (java.util.stream.Collectors): 87
Option (org.apache.hudi.common.util.Option): 87
ArrayList (java.util.ArrayList): 85
Path (org.apache.hadoop.fs.Path): 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 66
HashMap (java.util.HashMap): 65
LogManager (org.apache.log4j.LogManager): 64
Logger (org.apache.log4j.Logger): 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 54
Arrays (java.util.Arrays): 48
HoodieTable (org.apache.hudi.table.HoodieTable): 46
Test (org.junit.jupiter.api.Test): 46
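
Since every example above threads values through this class, a minimal sketch of the accessors they rely on; Pair.of is assumed to behave like the commons-lang3 factory it mirrors:

Pair<String, String> fileInfo = Pair.of("2022/01/01", "file-id-123");
String partitionPath = fileInfo.getKey();   // left element, as used in the examples above
String fileId = fileInfo.getValue();        // right element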