
Example 41 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class AbstractTableFileSystemView, method getBaseFileOn.

@Override
public final Option<HoodieBaseFile> getBaseFileOn(String partitionStr, String instantTime, String fileId) {
    try {
        readLock.lock();
        String partitionPath = formatPartitionKey(partitionStr);
        ensurePartitionLoadedCorrectly(partitionPath);
        if (isFileGroupReplacedBeforeOrOn(new HoodieFileGroupId(partitionPath, fileId), instantTime)) {
            return Option.empty();
        } else {
            // Pick the base file whose commit time equals instantTime, skipping files produced by
            // pending compaction or clustering, and attach the bootstrap base file if one exists.
            return fetchHoodieFileGroup(partitionPath, fileId)
                .map(fileGroup -> fileGroup.getAllBaseFiles()
                    .filter(baseFile -> HoodieTimeline.compareTimestamps(baseFile.getCommitTime(), HoodieTimeline.EQUALS, instantTime))
                    .filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df))
                    .findFirst()
                    .orElse(null))
                .map(df -> addBootstrapBaseFileIfPresent(new HoodieFileGroupId(partitionPath, fileId), df));
        }
    } finally {
        readLock.unlock();
    }
}
Also used : BootstrapBaseFileMapping(org.apache.hudi.common.model.BootstrapBaseFileMapping) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) ReadLock(java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) WriteLock(java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock) Predicate(java.util.function.Predicate) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) AbstractMap(java.util.AbstractMap) List(java.util.List) GREATER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS) Stream(java.util.stream.Stream) ClusteringUtils(org.apache.hudi.common.util.ClusteringUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) METADATA_BOOTSTRAP_INSTANT_TS(org.apache.hudi.common.table.timeline.HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) GREATER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId)
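For orientation, a minimal caller-side sketch of the same lookup follows; it is not code from the project. It assumes a Hudi table already exists under basePath, and the partition path, instant time, and file-group id below are illustrative placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;

public class GetBaseFileOnSketch {

    public static void main(String[] args) {
        // Hypothetical table location; replace with a real base path.
        String basePath = "/tmp/hudi_table";
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(new Configuration())
            .setBasePath(basePath)
            .build();
        // Build a view over completed commits, as the surrounding examples do.
        HoodieTableFileSystemView view = new HoodieTableFileSystemView(
            metaClient, metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
        // Look up the base file written at a specific instant for a specific file group.
        Option<HoodieBaseFile> baseFile = view.getBaseFileOn("2021/01/01", "20210101000000", "file-group-id-1");
        if (baseFile.isPresent()) {
            System.out.println("Base file path: " + baseFile.get().getPath());
        }
    }
}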

Example 42 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class HoodieSnapshotCopier, method snapshot.

public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning, final boolean useFileListingFromMetadata) throws IOException {
    FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
    final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    final HoodieTableMetaClient tableMetadata = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir).build();
    final BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata, tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants());
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    // Get the latest commit
    Option<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getWriteTimeline().filterCompletedInstants().lastInstant();
    if (!latestCommit.isPresent()) {
        LOG.warn("No commits present. Nothing to snapshot");
        return;
    }
    final String latestCommitTimestamp = latestCommit.get().getTimestamp();
    LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp));
    List<String> partitions = FSUtils.getAllPartitionPaths(context, baseDir, useFileListingFromMetadata, shouldAssumeDatePartitioning);
    if (partitions.size() > 0) {
        LOG.info(String.format("The job needs to copy %d partitions.", partitions.size()));
        // Make sure the output directory is empty
        Path outputPath = new Path(outputDir);
        if (fs.exists(outputPath)) {
            LOG.warn(String.format("The output path %s targetBasePath already exists, deleting", outputPath));
            fs.delete(new Path(outputDir), true);
        }
        context.setJobStatus(this.getClass().getSimpleName(), "Creating a snapshot");
        List<Tuple2<String, String>> filesToCopy = context.flatMap(partitions, partition -> {
            // Only take latest version files <= latestCommit.
            FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
            List<Tuple2<String, String>> filePaths = new ArrayList<>();
            Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
            dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
            // also need to copy over partition metadata
            Path partitionMetaFile = new Path(FSUtils.getPartitionPath(baseDir, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
            if (fs1.exists(partitionMetaFile)) {
                filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
            }
            return filePaths.stream();
        }, partitions.size());
        context.foreach(filesToCopy, tuple -> {
            String partition = tuple._1();
            Path sourceFilePath = new Path(tuple._2());
            Path toPartitionPath = FSUtils.getPartitionPath(outputDir, partition);
            FileSystem ifs = FSUtils.getFs(baseDir, serConf.newCopy());
            if (!ifs.exists(toPartitionPath)) {
                ifs.mkdirs(toPartitionPath);
            }
            FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), false, ifs.getConf());
        }, filesToCopy.size());
        // Also copy the .commit files
        LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
        FileStatus[] commitFilesToCopy = fs.listStatus(new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
            if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
                return true;
            } else {
                String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
                return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
            }
        });
        for (FileStatus commitStatus : commitFilesToCopy) {
            Path targetFilePath = new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
            if (!fs.exists(targetFilePath.getParent())) {
                fs.mkdirs(targetFilePath.getParent());
            }
            if (fs.exists(targetFilePath)) {
                LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
            }
            FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
        }
    } else {
        LOG.info("The job has 0 partition to copy.");
    }
    // Create the _SUCCESS tag
    Path successTagPath = new Path(outputDir + "/_SUCCESS");
    if (!fs.exists(successTagPath)) {
        LOG.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
        fs.createNewFile(successTagPath);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) ArrayList(java.util.ArrayList) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)
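A hedged driver-side sketch of invoking this copier is shown below. It assumes a local Spark context, an existing Hudi table at the source path, and a no-argument HoodieSnapshotCopier constructor; all paths are placeholders, not values from the project.

import org.apache.hudi.utilities.HoodieSnapshotCopier;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SnapshotCopierDriver {

    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf()
            .setAppName("hoodie-snapshot-copier-sketch")
            // Local mode for illustration only.
            .setMaster("local[2]");
        try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
            String sourceBasePath = "/data/hudi/source_table";        // hypothetical source table
            String snapshotOutputPath = "/data/hudi/snapshots/run1";  // hypothetical snapshot target
            new HoodieSnapshotCopier().snapshot(
                jsc,
                sourceBasePath,
                snapshotOutputPath,
                /* shouldAssumeDatePartitioning */ false,
                /* useFileListingFromMetadata */ true);
        }
    }
}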

Example 43 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class HoodieSnapshotExporter, method exportAsNonHudi.

private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
    Partitioner defaultPartitioner = dataset -> {
        Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
        return StringUtils.isNullOrEmpty(cfg.outputPartitionField) ? hoodieDroppedDataset.write() : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
    };
    Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner) ? defaultPartitioner : ReflectionUtils.loadClass(cfg.outputPartitioner);
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");
    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    Iterator<String> exportingFilePaths = jsc.parallelize(partitions, partitions.size()).flatMap(partition -> fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp).map(HoodieBaseFile::getPath).iterator()).toLocalIterator();
    Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
    partitioner.partition(sourceDataset).format(cfg.outputFormat).mode(SaveMode.Overwrite).save(cfg.targetOutputPath);
}
Also used : ParameterException(com.beust.jcommander.ParameterException) Dataset(org.apache.spark.sql.Dataset) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileUtil(org.apache.hadoop.fs.FileUtil) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) JavaConversions(scala.collection.JavaConversions) SaveMode(org.apache.spark.sql.SaveMode) Iterator(java.util.Iterator) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) JCommander(com.beust.jcommander.JCommander) SparkConf(org.apache.spark.SparkConf) HoodieSnapshotExporterException(org.apache.hudi.utilities.exception.HoodieSnapshotExporterException) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) Tuple2(scala.Tuple2) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) DataFrameWriter(org.apache.spark.sql.DataFrameWriter) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) LogManager(org.apache.log4j.LogManager) IValueValidator(com.beust.jcommander.IValueValidator) FSUtils(org.apache.hudi.common.fs.FSUtils)
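As a sketch of the extension point used above, a custom Partitioner could be supplied through the outputPartitioner config. The example below assumes the Partitioner interface nested in HoodieSnapshotExporter, with a single partition(Dataset<Row>) method returning a DataFrameWriter<Row>, as the call partitioner.partition(sourceDataset) suggests; the "region" column is an illustrative placeholder.

import org.apache.hudi.utilities.HoodieSnapshotExporter.Partitioner;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;

public class RegionPartitioner implements Partitioner {

    @Override
    public DataFrameWriter<Row> partition(Dataset<Row> source) {
        // Cluster the data by a business column before writing, then lay the output out by it.
        // The "region" column is hypothetical; it must exist in the exported dataset.
        return source
            .repartition(functions.col("region"))
            .write()
            .partitionBy("region");
    }
}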

Example 44 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class HiveTestUtil, method createLogFiles.

private static HoodieCommitMetadata createLogFiles(Map<String, List<HoodieWriteStat>> partitionWriteStats, boolean isLogSchemaSimple, boolean useSchemaFromCommitMetadata) throws InterruptedException, IOException, URISyntaxException {
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
        String partitionPath = wEntry.getKey();
        for (HoodieWriteStat wStat : wEntry.getValue()) {
            Path path = new Path(wStat.getPath());
            HoodieBaseFile dataFile = new HoodieBaseFile(fileSystem.getFileStatus(path));
            HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple);
            HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
            writeStat.setFileId(dataFile.getFileId());
            writeStat.setPath(logFile.getPath().toString());
            commitMetadata.addWriteStat(partitionPath, writeStat);
        }
    }
    addSchemaToCommitMetadata(commitMetadata, isLogSchemaSimple, useSchemaFromCommitMetadata);
    return commitMetadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Path(org.apache.hadoop.fs.Path) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) List(java.util.List) ArrayList(java.util.ArrayList) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
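As a side note on the HoodieBaseFile wrapper used above, the sketch below shows what it parses out of a base file name. It assumes the single-String-path constructor and a name following the usual <fileId>_<writeToken>_<instantTime>.parquet convention; the path is a made-up placeholder.

import org.apache.hudi.common.model.HoodieBaseFile;

public class BaseFileNameSketch {

    public static void main(String[] args) {
        // Hypothetical path following the <fileId>_<writeToken>_<instantTime>.parquet convention.
        String path = "/tmp/hudi_table/2021/01/01/abc123-0000_1-2-3_20210101000000.parquet";
        HoodieBaseFile baseFile = new HoodieBaseFile(path);
        System.out.println(baseFile.getFileId());     // abc123-0000
        System.out.println(baseFile.getCommitTime()); // 20210101000000
        System.out.println(baseFile.getFileName());   // abc123-0000_1-2-3_20210101000000.parquet
    }
}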

Example 45 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestHoodieBackedMetadata, method testVirtualKeysInBaseFiles.

/**
 * Tests that virtual key configs are honored in base files after compaction in metadata table.
 *
 * @throws Exception
 */
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testVirtualKeysInBaseFiles(boolean populateMetaFields) throws Exception {
    HoodieTableType tableType = MERGE_ON_READ;
    init(tableType, false);
    writeConfig = getWriteConfigBuilder(true, true, false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).enableFullScan(true).enableMetrics(false).withPopulateMetaFields(populateMetaFields).withMaxNumDeltaCommitsBeforeCompaction(2).build()).build();
    initWriteConfigAndMetatableWriter(writeConfig, true);
    doWriteOperation(testTable, "0000001", INSERT);
    doClean(testTable, "0000003", Arrays.asList("0000001"));
    // this should have triggered compaction in metadata table
    doWriteOperation(testTable, "0000004", UPSERT);
    HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
    assertTrue(tableMetadata.getLatestCompactionTime().isPresent());
    assertEquals(tableMetadata.getLatestCompactionTime().get(), "0000003001");
    HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
    HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig);
    metadataMetaClient.reloadActiveTimeline();
    HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient);
    table.getHoodieView().sync();
    List<FileSlice> fileSlices = table.getSliceView().getLatestFileSlices("files").collect(Collectors.toList());
    HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get();
    HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), new Path(baseFile.getPath()), new CacheConfig(context.getHadoopConf().get()));
    List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
    records.forEach(entry -> {
        if (populateMetaFields) {
            assertNotNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
        } else {
            assertNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
        }
    });
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieHFileReader(org.apache.hudi.io.storage.HoodieHFileReader) CacheConfig(org.apache.hadoop.hbase.io.hfile.CacheConfig) Pair(org.apache.hadoop.hbase.util.Pair) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
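To complement the test above, here is a hedged sketch of walking the latest file slices of a regular table partition and separating each slice's base file from its log files, using the same file-system-view classes; the table and partition paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class LatestFileSlicesSketch {

    public static void main(String[] args) {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(new Configuration())
            .setBasePath("/tmp/hudi_table")   // hypothetical table location
            .build();
        HoodieTableFileSystemView view = new HoodieTableFileSystemView(
            metaClient, metaClient.getActiveTimeline().getWriteTimeline().filterCompletedInstants());
        view.getLatestFileSlices("2021/01/01").forEach(slice -> {
            // A slice may not have a base file yet if only log files have been written so far.
            if (slice.getBaseFile().isPresent()) {
                System.out.println("base file: " + slice.getBaseFile().get().getPath());
            }
            System.out.println("log files in slice: " + slice.getLogFiles().count());
        });
    }
}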

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 71
Path (org.apache.hadoop.fs.Path): 40
ArrayList (java.util.ArrayList): 33
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 31
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 31
FileSlice (org.apache.hudi.common.model.FileSlice): 29
List (java.util.List): 27
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 27
IOException (java.io.IOException): 26
FileStatus (org.apache.hadoop.fs.FileStatus): 25
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 24
Pair (org.apache.hudi.common.util.collection.Pair): 24
Option (org.apache.hudi.common.util.Option): 23
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 23
Collectors (java.util.stream.Collectors): 21
Test (org.junit.jupiter.api.Test): 21
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 21
Map (java.util.Map): 20
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20
HoodieTable (org.apache.hudi.table.HoodieTable): 20