Search in sources :

Example 1 with ThreadFactoryBuilder

use of org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder in project hive by apache.

In the class HiveTableUtil, the method importFiles.

/**
 * Import files from given partitions to an Iceberg table.
 * <p>
 * The Iceberg table is created from {@code icebergTableProperties}, the data files found under the
 * source location (unpartitioned source) or under each partition location (partitioned source) are
 * added to a single append operation, and that append is committed once at the end. Partitions are
 * processed in parallel on a fixed-size thread pool; if any partition task fails, nothing is committed.
 * @param sourceLocation location of the HMS table
 * @param format inputformat class name of the HMS table
 * @param partitionSpecProxy  list of HMS table partitions wrapped in partitionSpecProxy
 * @param partitionKeys list of partition keys
 * @param icebergTableProperties destination iceberg table properties
 * @param conf a Hadoop configuration
 * @throws MetaException if reading the source files fails, if any partition import task fails, or if the
 *     calling thread is interrupted while waiting for the partition tasks
 */
public static void importFiles(String sourceLocation, String format, PartitionSpecProxy partitionSpecProxy, List<FieldSchema> partitionKeys, Properties icebergTableProperties, Configuration conf) throws MetaException {
    RemoteIterator<LocatedFileStatus> filesIterator = null;
    // this operation must be done before the iceberg table is created
    if (partitionSpecProxy.size() == 0) {
        filesIterator = getFilesIterator(new Path(sourceLocation), conf);
    }
    Table icebergTable = Catalogs.createTable(conf, icebergTableProperties);
    AppendFiles append = icebergTable.newAppend();
    PartitionSpec spec = icebergTable.spec();
    MetricsConfig metricsConfig = MetricsConfig.fromProperties(icebergTable.properties());
    String nameMappingString = icebergTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
    try {
        if (partitionSpecProxy.size() == 0) {
            // NOTE(review): this branch passes format unchanged while the partitioned branch lower-cases it;
            // confirm whether getDataFiles is case-sensitive before unifying the two.
            List<DataFile> dataFiles = getDataFiles(filesIterator, Collections.emptyMap(), format, spec, metricsConfig, nameMapping, conf);
            dataFiles.forEach(append::appendFile);
        } else {
            PartitionSpecProxy.PartitionIterator partitionIterator = partitionSpecProxy.getPartitionIterator();
            List<Callable<Void>> tasks = new ArrayList<>();
            while (partitionIterator.hasNext()) {
                Partition partition = partitionIterator.next();
                Callable<Void> task = () -> {
                    Path partitionPath = new Path(partition.getSd().getLocation());
                    String partitionName = Warehouse.makePartName(partitionKeys, partition.getValues());
                    Map<String, String> partitionSpec = Warehouse.makeSpecFromName(partitionName);
                    RemoteIterator<LocatedFileStatus> iterator = getFilesIterator(partitionPath, conf);
                    List<DataFile> dataFiles = getDataFiles(iterator, partitionSpec, format.toLowerCase(), spec, metricsConfig, nameMapping, conf);
                    // the append is shared by all worker threads, so additions are serialized on it
                    synchronized (append) {
                        dataFiles.forEach(append::appendFile);
                    }
                    return null;
                };
                tasks.add(task);
            }
            int numThreads = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_SERVER2_ICEBERG_METADATA_GENERATOR_THREADS);
            ExecutorService executor = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setNameFormat("iceberg-metadata-generator-%d").setDaemon(true).build());
            try {
                // invokeAll returns once every task has finished; get() surfaces a task's exception,
                // which would otherwise be lost and lead to a partial import being committed
                for (java.util.concurrent.Future<Void> result : executor.invokeAll(tasks)) {
                    result.get();
                }
            } finally {
                // always release the pool, including when waiting is interrupted or a task failed
                executor.shutdown();
            }
        }
        append.commit();
    } catch (java.util.concurrent.ExecutionException e) {
        Throwable cause = e.getCause() != null ? e.getCause() : e;
        if (cause instanceof MetaException) {
            // same outcome as the unpartitioned branch, where a MetaException propagates unchanged
            throw (MetaException) cause;
        }
        MetaException failure = new MetaException("Cannot import hive data into iceberg table.\n" + cause.getMessage());
        failure.initCause(cause);
        throw failure;
    } catch (IOException | InterruptedException e) {
        if (e instanceof InterruptedException) {
            // restore the interrupt flag so callers further up the stack can observe the interruption
            Thread.currentThread().interrupt();
        }
        MetaException failure = new MetaException("Cannot import hive data into iceberg table.\n" + e.getMessage());
        failure.initCause(e);
        throw failure;
    }
}
Also used : NameMapping(org.apache.iceberg.mapping.NameMapping) AppendFiles(org.apache.iceberg.AppendFiles) ArrayList(java.util.ArrayList) MetricsConfig(org.apache.iceberg.MetricsConfig) Callable(java.util.concurrent.Callable) DataFile(org.apache.iceberg.DataFile) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) ThreadFactoryBuilder(org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder) ArrayList(java.util.ArrayList) List(java.util.List) PartitionSpecProxy(org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.iceberg.Table) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) IOException(java.io.IOException) PartitionSpec(org.apache.iceberg.PartitionSpec) ExecutorService(java.util.concurrent.ExecutorService) Map(java.util.Map)

Example 2 with ThreadFactoryBuilder

use of org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder in project hive by apache.

In the class HiveIcebergOutputCommitter, the method tableExecutor.

/**
 * Creates the thread pool used to handle table manipulation in parallel.
 * The pool size is the smaller of the configured commit-table pool size and {@code maxThreadNum};
 * when that value is 1 or less no pool is created and {@code null} is returned.
 * @param conf The configuration containing the pool size
 * @param maxThreadNum The number of requests we want to handle (might be decreased further by configuration)
 * @return The generated executor service, or null if executor is not needed.
 */
private static ExecutorService tableExecutor(Configuration conf, int maxThreadNum) {
    int configuredSize = conf.getInt(InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE, InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE_DEFAULT);
    int poolSize = Math.min(maxThreadNum, configuredSize);
    // a single worker (or fewer) gives no parallelism, so callers get null instead of a pool
    if (poolSize <= 1) {
        return null;
    }
    return Executors.newFixedThreadPool(
        poolSize,
        new ThreadFactoryBuilder()
            .setDaemon(true)
            .setPriority(Thread.NORM_PRIORITY)
            .setNameFormat("iceberg-commit-table-pool-%d")
            .build());
}
Also used : ThreadFactoryBuilder(org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder)

Aggregations

ThreadFactoryBuilder (org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder)2 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Map (java.util.Map)1 Callable (java.util.concurrent.Callable)1 ExecutorService (java.util.concurrent.ExecutorService)1 LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus)1 Path (org.apache.hadoop.fs.Path)1 RemoteIterator (org.apache.hadoop.fs.RemoteIterator)1 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)1 Partition (org.apache.hadoop.hive.metastore.api.Partition)1 PartitionSpecProxy (org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy)1 AppendFiles (org.apache.iceberg.AppendFiles)1 DataFile (org.apache.iceberg.DataFile)1 MetricsConfig (org.apache.iceberg.MetricsConfig)1 PartitionSpec (org.apache.iceberg.PartitionSpec)1 Table (org.apache.iceberg.Table)1 NameMapping (org.apache.iceberg.mapping.NameMapping)1