
Example 1 with ReplacePartitions

Use of org.apache.iceberg.ReplacePartitions in the Apache Hive project.

From the class HiveIcebergOutputCommitter, method commitTable:

/**
 * Collects the additions to a single table and adds/commits the new files to the Iceberg table.
 * @param io The FileIO used to read the forCommit files
 * @param executor The executor used to read the forCommit files
 * @param jobContext The job context
 * @param name The name of the table used for loading from the catalog
 * @param location The location of the table used for loading from the catalog
 * @param catalogName The name of the catalog that contains the table
 */
private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location, String catalogName) {
    JobConf conf = jobContext.getJobConf();
    Properties catalogProperties = new Properties();
    catalogProperties.put(Catalogs.NAME, name);
    catalogProperties.put(Catalogs.LOCATION, location);
    if (catalogName != null) {
        catalogProperties.put(InputFormatConfig.CATALOG_NAME, catalogName);
    }
    Table table = Catalogs.loadTable(conf, catalogProperties);
    long startTime = System.currentTimeMillis();
    LOG.info("Committing job has started for table: {}, using location: {}", table, generateJobLocation(location, conf, jobContext.getJobID()));
    int numTasks = SessionStateUtil.getCommitInfo(conf, name).map(info -> info.getTaskNum()).orElseGet(() -> {
        // Fallback logic if the number of tasks is not available in the config.
        // If there are reducers, then every reducer will generate a result file.
        // If this is a map only task, then every mapper will generate a result file.
        LOG.info("Number of tasks not available in session state for jobID: {}, table: {}. Falling back to jobConf " + "numReduceTasks/numMapTasks", jobContext.getJobID(), name);
        return conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks();
    });
    Collection<DataFile> dataFiles = dataFiles(numTasks, executor, location, jobContext, io, true);
    boolean isOverwrite = conf.getBoolean(InputFormatConfig.IS_OVERWRITE, false);
    if (isOverwrite) {
        if (!dataFiles.isEmpty()) {
            ReplacePartitions overwrite = table.newReplacePartitions();
            dataFiles.forEach(overwrite::addFile);
            overwrite.commit();
            LOG.info("Overwrite commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        } else if (table.spec().isUnpartitioned()) {
            // TODO: we won't get here if we have a formerly-partitioned table whose partition specs have been voided
            table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
            LOG.info("Cleared table contents as part of empty overwrite for unpartitioned table. " + "Commit took {} ms for table: {}", System.currentTimeMillis() - startTime, table);
        }
        LOG.debug("Overwrote partitions with files {}", dataFiles);
    } else if (!dataFiles.isEmpty()) {
        // Appending data files to the table
        // We only create a new commit if there's something to append
        AppendFiles append = table.newAppend();
        dataFiles.forEach(append::appendFile);
        append.commit();
        LOG.info("Append commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        LOG.debug("Added files {}", dataFiles);
    } else {
        LOG.info("Not creating a new commit for table: {}, jobID: {}, since there were no new files to append", table, jobContext.getJobID());
    }
}
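
For quick reference, here is a minimal sketch that distills the commit decision above, stripped of Hive's job, config, and catalog plumbing. The helper name commitFiles and its parameters are illustrative assumptions, not part of the Hive or Iceberg API; the Iceberg calls themselves (newReplacePartitions, newAppend, newDelete) are the same ones used in the method.

import java.util.Collection;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.ReplacePartitions;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;

public class IcebergCommitSketch {

    // Hypothetical helper condensing the overwrite-vs-append logic above;
    // `table`, `dataFiles`, and `isOverwrite` are assumed inputs.
    static void commitFiles(Table table, Collection<DataFile> dataFiles, boolean isOverwrite) {
        if (isOverwrite) {
            if (!dataFiles.isEmpty()) {
                // Dynamic partition overwrite: atomically replaces the contents
                // of exactly those partitions covered by the new data files.
                ReplacePartitions overwrite = table.newReplacePartitions();
                dataFiles.forEach(overwrite::addFile);
                overwrite.commit();
            } else if (table.spec().isUnpartitioned()) {
                // An empty overwrite of an unpartitioned table clears all rows.
                table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
            }
        } else if (!dataFiles.isEmpty()) {
            // Append-only path: new files are added, nothing is replaced.
            AppendFiles append = table.newAppend();
            dataFiles.forEach(append::appendFile);
            append.commit();
        }
    }
}

A ReplacePartitions commit is Iceberg's dynamic partition overwrite: in one atomic operation it drops the existing files in every partition touched by the incoming data files and adds the new files, while partitions not touched by the write keep their data. This is why the empty-overwrite case above needs the separate newDelete path for unpartitioned tables.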