use of org.apache.iceberg.Table in project hive by apache.
the class HiveIcebergOutputCommitter method commitJob.
/**
 * Commits the whole job: for every output table configured for the job, the commit files
 * written by the tasks are read back and the data files they list are committed to the table.
 * The per-table commits are driven through the table executor, the commit-file handling through
 * the file executor. Once all tables are processed, the collected temporary job locations are
 * handed to {@code cleanup}.
 *
 * @param originalContext The job context
 * @throws IOException if there is a failure accessing the files
 */
@Override
public void commitJob(JobContext originalContext) throws IOException {
  JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext);
  JobConf conf = jobContext.getJobConf();

  long commitStart = System.currentTimeMillis();
  LOG.info("Committing job {} has started", jobContext.getJobID());

  Collection<String> targetTables = HiveIcebergStorageHandler.outputTables(conf);
  // Filled concurrently by the per-table commit tasks, hence the thread-safe collection
  Collection<String> tempJobDirs = new ConcurrentLinkedQueue<>();

  ExecutorService filePool = fileExecutor(conf);
  ExecutorService tablePool = tableExecutor(conf, targetTables.size());
  try {
    // Commits the changes for the output tables in parallel
    Tasks.foreach(targetTables)
        .throwFailureWhenFinished()
        .stopOnFailure()
        .executeWith(tablePool)
        .run(tableName -> {
          // Prefer a Table object registered as a session resource, otherwise load it from the conf
          Table table = SessionStateUtil.getResource(conf, tableName)
              .filter(Table.class::isInstance)
              .map(Table.class::cast)
              .orElseGet(() -> HiveIcebergStorageHandler.table(conf, tableName));

          if (table == null) {
            LOG.info("CommitJob found no table object in QueryState or conf for: {}. Skipping job commit.", tableName);
            return;
          }

          String catalogName = HiveIcebergStorageHandler.catalogName(conf, tableName);
          String tableLocation = table.location();
          tempJobDirs.add(generateJobLocation(tableLocation, conf, jobContext.getJobID()));
          commitTable(table.io(), filePool, jobContext, tableName, tableLocation, catalogName);
        });
  } finally {
    filePool.shutdown();
    // The table executor is allowed to be null, so guard the shutdown
    if (tablePool != null) {
      tablePool.shutdown();
    }
  }

  long elapsedMs = System.currentTimeMillis() - commitStart;
  LOG.info("Commit took {} ms for job {}", elapsedMs, jobContext.getJobID());

  cleanup(jobContext, tempJobDirs);
}
use of org.apache.iceberg.Table in project hive by apache.
the class HiveIcebergOutputCommitter method commitTask.
/**
 * Commits a single task attempt: for each output table of the job, gathers the data files
 * produced by this attempt's writer (an empty list if the attempt has no writer for the table)
 * and persists that list into a commit file under the table location. Afterwards the writers
 * registered for the attempt are removed.
 *
 * @param originalContext The task attempt context
 * @throws IOException Thrown if there is an error writing the commit file
 */
@Override
public void commitTask(TaskAttemptContext originalContext) throws IOException {
  TaskAttemptContext attemptContext = TezUtil.enrichContextWithAttemptWrapper(originalContext);
  TaskAttemptID attemptID = attemptContext.getTaskAttemptID();
  JobConf conf = attemptContext.getJobConf();
  Collection<String> targetTables = HiveIcebergStorageHandler.outputTables(conf);

  // Fall back to an empty map when nothing has been registered for this attempt
  Map<String, HiveIcebergRecordWriter> registeredWriters = HiveIcebergRecordWriter.getWriters(attemptID);
  final Map<String, HiveIcebergRecordWriter> writersByTable;
  if (registeredWriters != null) {
    writersByTable = registeredWriters;
  } else {
    LOG.info("CommitTask found no writers for output tables: {}, attemptID: {}", targetTables, attemptID);
    writersByTable = ImmutableMap.of();
  }

  ExecutorService tablePool = tableExecutor(conf, targetTables.size());
  try {
    // Generates commit files for the target tables in parallel
    Tasks.foreach(targetTables)
        .retry(3)
        .stopOnFailure()
        .throwFailureWhenFinished()
        .executeWith(tablePool)
        .run(tableName -> {
          Table table = HiveIcebergStorageHandler.table(conf, tableName);
          if (table == null) {
            // When using Tez multi-table inserts, we could have more output tables in config than
            // the actual tables this task has written to and has serialized in its config
            LOG.info("CommitTask found no serialized table in config for table: {}.", tableName);
            return;
          }

          HiveIcebergRecordWriter tableWriter = writersByTable.get(tableName);
          DataFile[] writtenFiles;
          if (tableWriter == null) {
            LOG.info("CommitTask found no writer for specific table: {}, attemptID: {}", tableName, attemptID);
            writtenFiles = new DataFile[0];
          } else {
            writtenFiles = tableWriter.dataFiles().toArray(new DataFile[0]);
          }

          String commitFilePath = generateFileForCommitLocation(
              table.location(), conf, attemptID.getJobID(), attemptID.getTaskID().getId());

          // Creating the file containing the data files generated by this task for this table
          createFileForCommit(writtenFiles, commitFilePath, table.io());
        }, IOException.class);
  } finally {
    // The table executor is allowed to be null, so guard the shutdown
    if (tablePool != null) {
      tablePool.shutdown();
    }
  }

  // remove the writer to release the object
  HiveIcebergRecordWriter.removeWriters(attemptID);
}
use of org.apache.iceberg.Table in project hive by apache.
the class HiveIcebergOutputCommitter method commitTable.
/**
 * Commits the outcome of the job for one table. The table is loaded through {@code Catalogs},
 * the data files listed in the forCommit files are collected, and then:
 * <ul>
 *   <li>in overwrite mode with new files, the files are committed as a partition replacement;</li>
 *   <li>in overwrite mode without new files, an unpartitioned table is emptied;</li>
 *   <li>otherwise the files are appended, and no commit is created when there are none.</li>
 * </ul>
 *
 * @param io The io to read the forCommit files
 * @param executor The executor used to read the forCommit files
 * @param jobContext The job context
 * @param name The name of the table used for loading from the catalog
 * @param location The location of the table used for loading from the catalog
 * @param catalogName The name of the catalog that contains the table
 */
private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location, String catalogName) {
  JobConf conf = jobContext.getJobConf();

  Properties loadProps = new Properties();
  loadProps.put(Catalogs.NAME, name);
  loadProps.put(Catalogs.LOCATION, location);
  // The catalog name is optional; it is only passed on when one was supplied
  if (catalogName != null) {
    loadProps.put(InputFormatConfig.CATALOG_NAME, catalogName);
  }
  Table table = Catalogs.loadTable(conf, loadProps);

  long startTime = System.currentTimeMillis();
  LOG.info("Committing job has started for table: {}, using location: {}",
      table, generateJobLocation(location, conf, jobContext.getJobID()));

  int expectedTasks = SessionStateUtil.getCommitInfo(conf, name)
      .map(info -> info.getTaskNum())
      .orElseGet(() -> {
        // Fallback logic, if number of tasks are not available in the config
        // If there are reducers, then every reducer will generate a result file.
        // If this is a map only task, then every mapper will generate a result file.
        LOG.info("Number of tasks not available in session state for jobID: {}, table: {}. Falling back to jobConf " +
            "numReduceTasks/numMapTasks", jobContext.getJobID(), name);
        int reducers = conf.getNumReduceTasks();
        return reducers > 0 ? reducers : conf.getNumMapTasks();
      });

  Collection<DataFile> committedFiles = dataFiles(expectedTasks, executor, location, jobContext, io, true);

  if (conf.getBoolean(InputFormatConfig.IS_OVERWRITE, false)) {
    if (committedFiles.isEmpty()) {
      if (table.spec().isUnpartitioned()) {
        // TODO: we won't get here if we have a formerly-partitioned table, whose partition specs have been turned void
        table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
        LOG.info("Cleared table contents as part of empty overwrite for unpartitioned table. " +
            "Commit took {} ms for table: {}", System.currentTimeMillis() - startTime, table);
      }
    } else {
      ReplacePartitions replace = table.newReplacePartitions();
      for (DataFile file : committedFiles) {
        replace.addFile(file);
      }
      replace.commit();
      LOG.info("Overwrite commit took {} ms for table: {} with {} file(s)",
          System.currentTimeMillis() - startTime, table, committedFiles.size());
    }
    LOG.debug("Overwrote partitions with files {}", committedFiles);
    return;
  }

  if (committedFiles.isEmpty()) {
    LOG.info("Not creating a new commit for table: {}, jobID: {}, since there were no new files to append",
        table, jobContext.getJobID());
    return;
  }

  // Appending data files to the table
  // We only create a new commit if there's something to append
  AppendFiles appendOp = table.newAppend();
  for (DataFile file : committedFiles) {
    appendOp.appendFile(file);
  }
  appendOp.commit();
  LOG.info("Append commit took {} ms for table: {} with {} file(s)",
      System.currentTimeMillis() - startTime, table, committedFiles.size());
  LOG.debug("Added files {}", committedFiles);
}
use of org.apache.iceberg.Table in project hive by apache.
the class HiveCreateReplaceTableTest method testReplaceTableTxn.
/**
 * Creates a table with {@code SPEC}, replaces it through a replace-table transaction that is
 * given only {@code SCHEMA}, and verifies that the reloaded table's current spec is spec id 1
 * with the {@code id} field turned into a void (always-null) field.
 */
@Test
public void testReplaceTableTxn() {
  catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
  Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));

  // Flag is false here; the sibling test passes true (presumably "create if missing" - confirm against the catalog API)
  catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, false).commitTransaction();

  Table replaced = catalog.loadTable(TABLE_IDENTIFIER);
  PartitionSpec expectedSpec = PartitionSpec.builderFor(replaced.schema())
      .alwaysNull("id", "id")
      .withSpecId(1)
      .build();
  Assert.assertEquals("Table should have a spec with one void field", expectedSpec, replaced.spec());
}
use of org.apache.iceberg.Table in project hive by apache.
the class HiveCreateReplaceTableTest method testCreateOrReplaceTableTxnTableExists.
/**
 * Same scenario as a plain replace, but the replace-table transaction is requested with the
 * boolean flag set to {@code true} while the table already exists. The reloaded table is
 * expected to have spec id 1 with the {@code id} field turned into a void (always-null) field.
 */
@Test
public void testCreateOrReplaceTableTxnTableExists() {
  catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
  Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));

  // Flag is true here (presumably "create if missing" - confirm against the catalog API)
  catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, true).commitTransaction();

  Table replaced = catalog.loadTable(TABLE_IDENTIFIER);
  PartitionSpec expectedSpec = PartitionSpec.builderFor(replaced.schema())
      .alwaysNull("id", "id")
      .withSpecId(1)
      .build();
  Assert.assertEquals("Table should have a spec with one void field", expectedSpec, replaced.spec());
}
Aggregations