
Example 1 with Tasks

Use of org.apache.iceberg.util.Tasks in project hive by apache.

From the class HiveIcebergOutputCommitter, method commitTable.

/**
 * Collects the additions to a single table and adds/commits the new files to the Iceberg table.
 * @param io The FileIO used to read the forCommit files
 * @param executor The executor used to read the forCommit files
 * @param jobContext The job context
 * @param name The name of the table used for loading from the catalog
 * @param location The location of the table used for loading from the catalog
 * @param catalogName The name of the catalog that contains the table
 */
private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location, String catalogName) {
    JobConf conf = jobContext.getJobConf();
    Properties catalogProperties = new Properties();
    catalogProperties.put(Catalogs.NAME, name);
    catalogProperties.put(Catalogs.LOCATION, location);
    if (catalogName != null) {
        catalogProperties.put(InputFormatConfig.CATALOG_NAME, catalogName);
    }
    Table table = Catalogs.loadTable(conf, catalogProperties);
    long startTime = System.currentTimeMillis();
    LOG.info("Committing job has started for table: {}, using location: {}", table, generateJobLocation(location, conf, jobContext.getJobID()));
    int numTasks = SessionStateUtil.getCommitInfo(conf, name).map(info -> info.getTaskNum()).orElseGet(() -> {
        // Fallback logic if the number of tasks is not available in the config.
        // If there are reducers, then every reducer will generate a result file.
        // If this is a map only task, then every mapper will generate a result file.
        LOG.info("Number of tasks not available in session state for jobID: {}, table: {}. Falling back to jobConf " + "numReduceTasks/numMapTasks", jobContext.getJobID(), name);
        return conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks();
    });
    Collection<DataFile> dataFiles = dataFiles(numTasks, executor, location, jobContext, io, true);
    boolean isOverwrite = conf.getBoolean(InputFormatConfig.IS_OVERWRITE, false);
    if (isOverwrite) {
        if (!dataFiles.isEmpty()) {
            ReplacePartitions overwrite = table.newReplacePartitions();
            dataFiles.forEach(overwrite::addFile);
            overwrite.commit();
            LOG.info("Overwrite commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        } else if (table.spec().isUnpartitioned()) {
            // TODO: we won't get here if we have a formerly-partitioned table, whose partition specs have been turned void
            table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
            LOG.info("Cleared table contents as part of empty overwrite for unpartitioned table. " + "Commit took {} ms for table: {}", System.currentTimeMillis() - startTime, table);
        }
        LOG.debug("Overwrote partitions with files {}", dataFiles);
    } else if (dataFiles.size() > 0) {
        // Appending data files to the table
        // We only create a new commit if there's something to append
        AppendFiles append = table.newAppend();
        dataFiles.forEach(append::appendFile);
        append.commit();
        LOG.info("Append commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        LOG.debug("Added files {}", dataFiles);
    } else {
        LOG.info("Not creating a new commit for table: {}, jobID: {}, since there were no new files to append", table, jobContext.getJobID());
    }
}
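
The dataFiles(numTasks, executor, ...) call above is where org.apache.iceberg.util.Tasks actually does its work: the per-task forCommit files are read back in parallel on the supplied executor before the single Iceberg commit is made. The sketch below only illustrates that pattern; it is not the Hive implementation, and the forCommit path construction plus the readDataFiles helper are assumptions introduced for illustration.

import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.util.Tasks;

// Sketch only: read one forCommit file per writer task in parallel and collect the DataFiles.
private Collection<DataFile> dataFilesSketch(int numTasks, ExecutorService executor, String jobLocation, FileIO io) {
    Collection<DataFile> dataFiles = new ConcurrentLinkedQueue<>();
    Tasks.range(numTasks)
        .executeWith(executor)
        .retry(3)
        .throwFailureWhenFinished()
        .run(taskId -> {
            // Hypothetical naming scheme for the per-task forCommit file; the real layout may differ.
            String forCommitPath = jobLocation + "/task-" + taskId + ".forCommit";
            dataFiles.addAll(readDataFiles(io, forCommitPath));
        });
    return dataFiles;
}

// Hypothetical helper: deserialize a DataFile[] that a writer task serialized with ObjectOutputStream.
private static Collection<DataFile> readDataFiles(FileIO io, String path) {
    try (ObjectInputStream in = new ObjectInputStream(io.newInputFile(path).newStream())) {
        return Arrays.asList((DataFile[]) in.readObject());
    } catch (IOException | ClassNotFoundException e) {
        throw new RuntimeException("Failed to read forCommit file: " + path, e);
    }
}

Because throwFailureWhenFinished() is set, any unreadable forCommit file fails the whole job commit; the retry(3) setting is an illustrative choice, not taken from the Hive source.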

Example 2 with Tasks

Use of org.apache.iceberg.util.Tasks in project hive by apache.

From the class HiveIcebergOutputCommitter, method listForCommits.

/**
 * Lists the forCommit files under a job location. This should only be used by {@link #abortJob(JobContext, int)},
 * since on the Tez AM-side it will have no access to the correct number of writer tasks otherwise. The commitJob
 * should not need to use this listing as it should have access to the vertex status info on the HS2-side.
 * @param jobConf jobConf used for getting the FS
 * @param jobLocation The job location that we should list
 * @return The set of forCommit files under the job location
 * @throws IOException if the listing fails
 */
private Set<FileStatus> listForCommits(JobConf jobConf, String jobLocation) throws IOException {
    Path path = new Path(jobLocation);
    LOG.debug("Listing job location to get forCommits for abort: {}", jobLocation);
    FileStatus[] children = path.getFileSystem(jobConf).listStatus(path);
    LOG.debug("Listing the job location: {} yielded these files: {}", jobLocation, Arrays.toString(children));
    return Arrays.stream(children).filter(child -> !child.isDirectory() && child.getPath().getName().endsWith(FOR_COMMIT_EXTENSION)).collect(Collectors.toSet());
}
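
Since listForCommits() is meant for the abort path, a natural consumer is a cleanup routine that deletes every listed forCommit file with org.apache.iceberg.util.Tasks. The following is a hedged sketch of such a routine; the method name cleanupForCommits and the best-effort failure handling are assumptions, not the actual abortJob code.

import java.io.IOException;
import java.util.Set;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapred.JobConf;
import org.apache.iceberg.util.Tasks;

// Sketch only: best-effort deletion of the forCommit files found during an abort.
private void cleanupForCommits(JobConf jobConf, String jobLocation) throws IOException {
    Set<FileStatus> forCommits = listForCommits(jobConf, jobLocation);
    Tasks.foreach(forCommits)
        .suppressFailureWhenFinished()
        .onFailure((file, exc) -> LOG.warn("Failed to delete forCommit file {}", file.getPath(), exc))
        .run(file -> {
            try {
                // Delete the single forCommit file; directories are already filtered out by listForCommits.
                file.getPath().getFileSystem(jobConf).delete(file.getPath(), false);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
}

suppressFailureWhenFinished() keeps the abort going even if individual deletes fail, which is usually the desired behavior for cleanup; a stricter variant could call throwFailureWhenFinished() instead.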
