Use of org.apache.iceberg.ReplacePartitions in project hive by apache.
From the class HiveIcebergOutputCommitter, method commitTable.
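The commitTable method below gathers the data files produced by the job's tasks and commits them to the Iceberg table in a single operation: when the job is an overwrite (InputFormatConfig.IS_OVERWRITE), it uses ReplacePartitions to replace the affected partitions, or clears an unpartitioned table when there is nothing to write; otherwise it appends the new files with AppendFiles.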
/**
* Collects the additions to a single table and adds/commits the new files to the Iceberg table.
* @param io The io to read the forCommit files
* @param executor The executor used to read the forCommit files
* @param jobContext The job context
* @param name The name of the table used for loading from the catalog
* @param location The location of the table used for loading from the catalog
* @param catalogName The name of the catalog that contains the table
*/
private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location,
    String catalogName) {
  JobConf conf = jobContext.getJobConf();
  Properties catalogProperties = new Properties();
  catalogProperties.put(Catalogs.NAME, name);
  catalogProperties.put(Catalogs.LOCATION, location);
  if (catalogName != null) {
    catalogProperties.put(InputFormatConfig.CATALOG_NAME, catalogName);
  }
  Table table = Catalogs.loadTable(conf, catalogProperties);

  long startTime = System.currentTimeMillis();
  LOG.info("Committing job has started for table: {}, using location: {}",
      table, generateJobLocation(location, conf, jobContext.getJobID()));

  int numTasks = SessionStateUtil.getCommitInfo(conf, name).map(info -> info.getTaskNum()).orElseGet(() -> {
    // Fallback logic, if number of tasks are not available in the config
    // If there are reducers, then every reducer will generate a result file.
    // If this is a map only task, then every mapper will generate a result file.
    LOG.info("Number of tasks not available in session state for jobID: {}, table: {}. Falling back to jobConf " +
        "numReduceTasks/numMapTasks", jobContext.getJobID(), name);
    return conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks();
  });
  Collection<DataFile> dataFiles = dataFiles(numTasks, executor, location, jobContext, io, true);

  boolean isOverwrite = conf.getBoolean(InputFormatConfig.IS_OVERWRITE, false);
  if (isOverwrite) {
    if (!dataFiles.isEmpty()) {
      ReplacePartitions overwrite = table.newReplacePartitions();
      dataFiles.forEach(overwrite::addFile);
      overwrite.commit();
      LOG.info("Overwrite commit took {} ms for table: {} with {} file(s)",
          System.currentTimeMillis() - startTime, table, dataFiles.size());
    } else if (table.spec().isUnpartitioned()) {
      // TODO: we won't get here if we have a formerly-partitioned table, whose partition specs have been turned void
      table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
      LOG.info("Cleared table contents as part of empty overwrite for unpartitioned table. " +
          "Commit took {} ms for table: {}", System.currentTimeMillis() - startTime, table);
    }
    LOG.debug("Overwrote partitions with files {}", dataFiles);
  } else if (dataFiles.size() > 0) {
    // Appending data files to the table
    // We only create a new commit if there's something to append
    AppendFiles append = table.newAppend();
    dataFiles.forEach(append::appendFile);
    append.commit();
    LOG.info("Append commit took {} ms for table: {} with {} file(s)",
        System.currentTimeMillis() - startTime, table, dataFiles.size());
    LOG.debug("Added files {}", dataFiles);
  } else {
    LOG.info("Not creating a new commit for table: {}, jobID: {}, since there were no new files to append",
        table, jobContext.getJobID());
  }
}
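For context, ReplacePartitions (obtained from Table.newReplacePartitions()) is Iceberg's dynamic partition overwrite: the commit atomically replaces exactly those partitions that received at least one new data file and leaves every other partition untouched, which is why the committer above only takes this path when dataFiles is non-empty. The standalone sketch below shows the same API outside of Hive; the table location, the dt partition column, and the data file metadata are hypothetical placeholders, and it assumes a Hadoop-catalog table that already exists at that path.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.ReplacePartitions;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

public class ReplacePartitionsSketch {
  public static void main(String[] args) {
    // Hypothetical location of an existing Iceberg table managed by HadoopTables.
    Table table = new HadoopTables(new Configuration()).load("/tmp/warehouse/db/tbl");

    // Hypothetical data file written for a single partition; the partition path must
    // match the table's partition spec (here assumed to be partitioned by dt).
    DataFile dataFile = DataFiles.builder(table.spec())
        .withPath("/tmp/warehouse/db/tbl/data/dt=2021-01-01/part-00000.parquet")
        .withFileSizeInBytes(1024L)
        .withRecordCount(100L)
        .withPartitionPath("dt=2021-01-01")
        .build();

    // Dynamic partition overwrite: partitions touched by the added files are replaced
    // atomically in one new snapshot; untouched partitions keep their existing data.
    ReplacePartitions overwrite = table.newReplacePartitions();
    overwrite.addFile(dataFile);
    overwrite.commit();
  }
}

Note that when there are no new files, commitTable does not issue a ReplacePartitions commit at all; for an unpartitioned table it instead clears the contents with table.newDelete() and Expressions.alwaysTrue(), preserving overwrite semantics.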