
Example 11 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveIcebergOutputCommitter method commitJob.

/**
 * Reads the commit files stored in the temp directories and collects the generated committed data files.
 * Appends the data files to the tables. At the end removes the temporary directories.
 * @param originalContext The job context
 * @throws IOException if there is a failure accessing the files
 */
@Override
public void commitJob(JobContext originalContext) throws IOException {
    JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext);
    JobConf jobConf = jobContext.getJobConf();
    long startTime = System.currentTimeMillis();
    LOG.info("Committing job {} has started", jobContext.getJobID());
    Collection<String> outputs = HiveIcebergStorageHandler.outputTables(jobContext.getJobConf());
    Collection<String> jobLocations = new ConcurrentLinkedQueue<>();
    ExecutorService fileExecutor = fileExecutor(jobConf);
    ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size());
    try {
        // Commits the changes for the output tables in parallel
        Tasks.foreach(outputs).throwFailureWhenFinished().stopOnFailure().executeWith(tableExecutor).run(output -> {
            Table table = SessionStateUtil.getResource(jobConf, output).filter(o -> o instanceof Table).map(o -> (Table) o).orElseGet(() -> HiveIcebergStorageHandler.table(jobConf, output));
            if (table != null) {
                String catalogName = HiveIcebergStorageHandler.catalogName(jobConf, output);
                jobLocations.add(generateJobLocation(table.location(), jobConf, jobContext.getJobID()));
                commitTable(table.io(), fileExecutor, jobContext, output, table.location(), catalogName);
            } else {
                LOG.info("CommitJob found no table object in QueryState or conf for: {}. Skipping job commit.", output);
            }
        });
    } finally {
        fileExecutor.shutdown();
        if (tableExecutor != null) {
            tableExecutor.shutdown();
        }
    }
    LOG.info("Commit took {} ms for job {}", System.currentTimeMillis() - startTime, jobContext.getJobID());
    cleanup(jobContext, jobLocations);
}
Also used : Table(org.apache.iceberg.Table) ExecutorService(java.util.concurrent.ExecutorService) JobContext(org.apache.hadoop.mapred.JobContext) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) JobConf(org.apache.hadoop.mapred.JobConf)
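The per-table work above is fanned out through Iceberg's org.apache.iceberg.util.Tasks utility onto thread pools created by the fileExecutor/tableExecutor helpers, which are not part of this snippet. As a rough, self-contained sketch of the same fail-fast pattern (class name, pool sizing, and the action body are invented for illustration, not taken from Hive):

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.iceberg.util.Tasks;

public class ParallelTableCommitSketch {
    static void commitAll(List<String> outputTables) {
        // bound the pool by the number of output tables, similar in spirit to tableExecutor(jobConf, outputs.size())
        ExecutorService pool = Executors.newFixedThreadPool(Math.max(1, outputTables.size()));
        try {
            Tasks.foreach(outputTables)
                .throwFailureWhenFinished()  // rethrow the first failure once the loop ends
                .stopOnFailure()             // stop scheduling further tables after a failure
                .executeWith(pool)           // run the per-table actions in parallel
                .run(table -> System.out.println("committing " + table));  // placeholder action
        } finally {
            pool.shutdown();
        }
    }
}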

Example 12 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveIcebergOutputCommitter method commitTask.

/**
 * Collects the generated data files and creates a commit file storing the data file list.
 * @param originalContext The task attempt context
 * @throws IOException Thrown if there is an error writing the commit file
 */
@Override
public void commitTask(TaskAttemptContext originalContext) throws IOException {
    TaskAttemptContext context = TezUtil.enrichContextWithAttemptWrapper(originalContext);
    TaskAttemptID attemptID = context.getTaskAttemptID();
    JobConf jobConf = context.getJobConf();
    Collection<String> outputs = HiveIcebergStorageHandler.outputTables(context.getJobConf());
    Map<String, HiveIcebergRecordWriter> writers = Optional.ofNullable(HiveIcebergRecordWriter.getWriters(attemptID)).orElseGet(() -> {
        LOG.info("CommitTask found no writers for output tables: {}, attemptID: {}", outputs, attemptID);
        return ImmutableMap.of();
    });
    ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size());
    try {
        // Generates commit files for the target tables in parallel
        Tasks.foreach(outputs).retry(3).stopOnFailure().throwFailureWhenFinished().executeWith(tableExecutor).run(output -> {
            Table table = HiveIcebergStorageHandler.table(context.getJobConf(), output);
            if (table != null) {
                HiveIcebergRecordWriter writer = writers.get(output);
                DataFile[] closedFiles;
                if (writer != null) {
                    closedFiles = writer.dataFiles().toArray(new DataFile[0]);
                } else {
                    LOG.info("CommitTask found no writer for specific table: {}, attemptID: {}", output, attemptID);
                    closedFiles = new DataFile[0];
                }
                String fileForCommitLocation = generateFileForCommitLocation(table.location(), jobConf, attemptID.getJobID(), attemptID.getTaskID().getId());
                // Creating the file containing the data files generated by this task for this table
                createFileForCommit(closedFiles, fileForCommitLocation, table.io());
            } else {
                // When using Tez multi-table inserts, we could have more output tables in config than
                // the actual tables this task has written to and has serialized in its config
                LOG.info("CommitTask found no serialized table in config for table: {}.", output);
            }
        }, IOException.class);
    } finally {
        if (tableExecutor != null) {
            tableExecutor.shutdown();
        }
    }
    // remove the writer to release the object
    HiveIcebergRecordWriter.removeWriters(attemptID);
}
Also used : DataFile(org.apache.iceberg.DataFile) Table(org.apache.iceberg.Table) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) ExecutorService(java.util.concurrent.ExecutorService) TaskAttemptContext(org.apache.hadoop.mapred.TaskAttemptContext) JobConf(org.apache.hadoop.mapred.JobConf)
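The createFileForCommit call above is defined elsewhere in HiveIcebergOutputCommitter and is not shown here. A minimal sketch of how such a commit file could be written with the FileIO API, assuming plain Java serialization of the DataFile array (method name and error handling are placeholders, not the actual Hive implementation):

import java.io.IOException;
import java.io.ObjectOutputStream;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.OutputFile;

class CommitFileSketch {
    // Hypothetical helper: serialize the task's data files to the forCommit location
    static void writeCommitFile(DataFile[] closedFiles, String location, FileIO io) throws IOException {
        OutputFile commitFile = io.newOutputFile(location);
        try (ObjectOutputStream out = new ObjectOutputStream(commitFile.createOrOverwrite())) {
            out.writeObject(closedFiles);
        }
    }
}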

Example 13 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveIcebergOutputCommitter method commitTable.

/**
 * Collects the additions to a single table and adds/commits the new files to the Iceberg table.
 * @param io The io to read the forCommit files
 * @param executor The executor used to read the forCommit files
 * @param jobContext The job context
 * @param name The name of the table used for loading from the catalog
 * @param location The location of the table used for loading from the catalog
 * @param catalogName The name of the catalog that contains the table
 */
private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location, String catalogName) {
    JobConf conf = jobContext.getJobConf();
    Properties catalogProperties = new Properties();
    catalogProperties.put(Catalogs.NAME, name);
    catalogProperties.put(Catalogs.LOCATION, location);
    if (catalogName != null) {
        catalogProperties.put(InputFormatConfig.CATALOG_NAME, catalogName);
    }
    Table table = Catalogs.loadTable(conf, catalogProperties);
    long startTime = System.currentTimeMillis();
    LOG.info("Committing job has started for table: {}, using location: {}", table, generateJobLocation(location, conf, jobContext.getJobID()));
    int numTasks = SessionStateUtil.getCommitInfo(conf, name).map(info -> info.getTaskNum()).orElseGet(() -> {
        // Fallback logic, if number of tasks are not available in the config
        // If there are reducers, then every reducer will generate a result file.
        // If this is a map only task, then every mapper will generate a result file.
        LOG.info("Number of tasks not available in session state for jobID: {}, table: {}. Falling back to jobConf " + "numReduceTasks/numMapTasks", jobContext.getJobID(), name);
        return conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks();
    });
    Collection<DataFile> dataFiles = dataFiles(numTasks, executor, location, jobContext, io, true);
    boolean isOverwrite = conf.getBoolean(InputFormatConfig.IS_OVERWRITE, false);
    if (isOverwrite) {
        if (!dataFiles.isEmpty()) {
            ReplacePartitions overwrite = table.newReplacePartitions();
            dataFiles.forEach(overwrite::addFile);
            overwrite.commit();
            LOG.info("Overwrite commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        } else if (table.spec().isUnpartitioned()) {
            // TODO: we won't get here if we have a formerly-partitioned table, whose partition specs have been turned void
            table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
            LOG.info("Cleared table contents as part of empty overwrite for unpartitioned table. " + "Commit took {} ms for table: {}", System.currentTimeMillis() - startTime, table);
        }
        LOG.debug("Overwrote partitions with files {}", dataFiles);
    } else if (dataFiles.size() > 0) {
        // Appending data files to the table
        // We only create a new commit if there's something to append
        AppendFiles append = table.newAppend();
        dataFiles.forEach(append::appendFile);
        append.commit();
        LOG.info("Append commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        LOG.debug("Added files {}", dataFiles);
    } else {
        LOG.info("Not creating a new commit for table: {}, jobID: {}, since there were no new files to append", table, jobContext.getJobID());
    }
}
Also used : DataFile(org.apache.iceberg.DataFile) Table(org.apache.iceberg.Table) AppendFiles(org.apache.iceberg.AppendFiles) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) ReplacePartitions(org.apache.iceberg.ReplacePartitions)
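Stripped of the overwrite branch and logging, the commit path above amounts to loading the table through org.apache.iceberg.mr.Catalogs and committing a single AppendFiles snapshot. A condensed sketch of that flow (table name and location are placeholders):

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.Catalogs;

class AppendCommitSketch {
    static void appendAll(Configuration conf, Iterable<DataFile> dataFiles) {
        Properties props = new Properties();
        props.put(Catalogs.NAME, "default.target_table");         // placeholder identifier
        props.put(Catalogs.LOCATION, "file:///tmp/target_table"); // placeholder location
        Table table = Catalogs.loadTable(conf, props);
        AppendFiles append = table.newAppend();  // one atomic snapshot for all new files
        dataFiles.forEach(append::appendFile);
        append.commit();
    }
}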

Example 14 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveCreateReplaceTableTest method testReplaceTableTxn.

@Test
public void testReplaceTableTxn() {
    catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
    Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));
    Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, false);
    txn.commitTransaction();
    Table table = catalog.loadTable(TABLE_IDENTIFIER);
    PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()).alwaysNull("id", "id").withSpecId(1).build();
    Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec());
}
Also used : Table(org.apache.iceberg.Table) Transaction(org.apache.iceberg.Transaction) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)

Example 15 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveCreateReplaceTableTest method testCreateOrReplaceTableTxnTableExists.

@Test
public void testCreateOrReplaceTableTxnTableExists() {
    catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
    Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));
    Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, true);
    txn.commitTransaction();
    Table table = catalog.loadTable(TABLE_IDENTIFIER);
    PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()).alwaysNull("id", "id").withSpecId(1).build();
    Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec());
}
Also used : Table(org.apache.iceberg.Table) Transaction(org.apache.iceberg.Transaction) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
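Both tests above replace a table that was originally created with a partition spec, this time without one, so the original id partition field survives only as a void transform, which is what the alwaysNull expectation asserts. As a loose illustration of the same transaction API (catalog wiring, identifier, schema, and property values are placeholders):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Transaction;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.types.Types;

class ReplaceTableSketch {
    static void createOrReplace(Catalog catalog) {
        TableIdentifier id = TableIdentifier.of("default", "tbl");  // placeholder identifier
        Schema schema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));
        // orCreate = true behaves like CREATE OR REPLACE: it also succeeds when the table is missing
        Transaction txn = catalog.newReplaceTableTransaction(id, schema, PartitionSpec.unpartitioned(), true);
        txn.updateProperties().set("write.format.default", "parquet").commit();  // staged inside the transaction
        txn.commitTransaction();  // swaps in the new table metadata atomically
    }
}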

Aggregations

Table (org.apache.iceberg.Table): 188
Test (org.junit.Test): 132
Schema (org.apache.iceberg.Schema): 66
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 56
Record (org.apache.iceberg.data.Record): 56
PartitionSpec (org.apache.iceberg.PartitionSpec): 51
IOException (java.io.IOException): 27
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 27
List (java.util.List): 22
Map (java.util.Map): 20
DataFile (org.apache.iceberg.DataFile): 19
NoSuchTableException (org.apache.iceberg.exceptions.NoSuchTableException): 19
Collectors (java.util.stream.Collectors): 18
BaseTable (org.apache.iceberg.BaseTable): 18
Types (org.apache.iceberg.types.Types): 18
Properties (java.util.Properties): 17
Configuration (org.apache.hadoop.conf.Configuration): 17
Path (org.apache.hadoop.fs.Path): 17
FileFormat (org.apache.iceberg.FileFormat): 16
ArrayList (java.util.ArrayList): 15