Search in sources :

Example 1 with MetricsConfig

use of org.apache.iceberg.MetricsConfig in project hive by apache.

The method importFiles of the class HiveTableUtil.

/**
 * Import files from given partitions to an Iceberg table.
 * <p>
 * The Iceberg table is created from {@code icebergTableProperties}. When {@code partitionSpecProxy} is empty the
 * files under {@code sourceLocation} are imported; otherwise the files of every partition are collected in
 * parallel. All collected data files are appended in a single commit, and that commit is only issued when
 * every partition task finished successfully.
 * @param sourceLocation location of the HMS table
 * @param format inputformat class name of the HMS table
 * @param partitionSpecProxy  list of HMS table partitions wrapped in partitionSpecProxy
 * @param partitionKeys list of partition keys
 * @param icebergTableProperties destination iceberg table properties
 * @param conf a Hadoop configuration
 * @throws MetaException if collecting the data files fails for the table or for any partition, or if the
 *                       calling thread is interrupted while waiting for the partition tasks
 */
public static void importFiles(String sourceLocation, String format, PartitionSpecProxy partitionSpecProxy, List<FieldSchema> partitionKeys, Properties icebergTableProperties, Configuration conf) throws MetaException {
    RemoteIterator<LocatedFileStatus> filesIterator = null;
    // this operation must be done before the iceberg table is created
    if (partitionSpecProxy.size() == 0) {
        filesIterator = getFilesIterator(new Path(sourceLocation), conf);
    }
    Table icebergTable = Catalogs.createTable(conf, icebergTableProperties);
    AppendFiles append = icebergTable.newAppend();
    PartitionSpec spec = icebergTable.spec();
    MetricsConfig metricsConfig = MetricsConfig.fromProperties(icebergTable.properties());
    String nameMappingString = icebergTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
    try {
        if (partitionSpecProxy.size() == 0) {
            List<DataFile> dataFiles = getDataFiles(filesIterator, Collections.emptyMap(), format, spec, metricsConfig, nameMapping, conf);
            dataFiles.forEach(append::appendFile);
        } else {
            PartitionSpecProxy.PartitionIterator partitionIterator = partitionSpecProxy.getPartitionIterator();
            List<Callable<Void>> tasks = new ArrayList<>();
            while (partitionIterator.hasNext()) {
                Partition partition = partitionIterator.next();
                Callable<Void> task = () -> {
                    Path partitionPath = new Path(partition.getSd().getLocation());
                    String partitionName = Warehouse.makePartName(partitionKeys, partition.getValues());
                    Map<String, String> partitionSpec = Warehouse.makeSpecFromName(partitionName);
                    RemoteIterator<LocatedFileStatus> iterator = getFilesIterator(partitionPath, conf);
                    List<DataFile> dataFiles = getDataFiles(iterator, partitionSpec, format.toLowerCase(), spec, metricsConfig, nameMapping, conf);
                    // the pending append is shared by all tasks, so registrations are serialized on it
                    synchronized (append) {
                        dataFiles.forEach(append::appendFile);
                    }
                    return null;
                };
                tasks.add(task);
            }
            int numThreads = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_SERVER2_ICEBERG_METADATA_GENERATOR_THREADS);
            ExecutorService executor = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setNameFormat("iceberg-metadata-generator-%d").setDaemon(true).build());
            try {
                // invokeAll waits for every task, but a task failure is only reported through its Future;
                // inspect each one so a failed partition aborts the import instead of being committed partially
                for (java.util.concurrent.Future<Void> result : executor.invokeAll(tasks)) {
                    result.get();
                }
            } finally {
                // always release the worker threads, even when waiting was interrupted or a task failed
                executor.shutdown();
            }
        }
        append.commit();
    } catch (IOException e) {
        throw importFailure(e);
    } catch (InterruptedException e) {
        // restore the interrupt flag so callers further up the stack can still observe the interruption
        Thread.currentThread().interrupt();
        throw importFailure(e);
    } catch (java.util.concurrent.ExecutionException e) {
        // report the exception thrown inside the partition task rather than the executor's wrapper
        throw importFailure(e.getCause() != null ? e.getCause() : e);
    }
}

/**
 * Builds the MetaException reported for a failed import, keeping the original failure as its cause.
 * @param cause the failure that aborted the import
 * @return the exception to throw to the caller
 */
private static MetaException importFailure(Throwable cause) {
    MetaException failure = new MetaException("Cannot import hive data into iceberg table.\n" + cause.getMessage());
    failure.initCause(cause);
    return failure;
}
Also used : NameMapping(org.apache.iceberg.mapping.NameMapping) AppendFiles(org.apache.iceberg.AppendFiles) ArrayList(java.util.ArrayList) MetricsConfig(org.apache.iceberg.MetricsConfig) Callable(java.util.concurrent.Callable) DataFile(org.apache.iceberg.DataFile) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) ThreadFactoryBuilder(org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder) ArrayList(java.util.ArrayList) List(java.util.List) PartitionSpecProxy(org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.iceberg.Table) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) IOException(java.io.IOException) PartitionSpec(org.apache.iceberg.PartitionSpec) ExecutorService(java.util.concurrent.ExecutorService) Map(java.util.Map)

Aggregations

IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Map (java.util.Map)1 Callable (java.util.concurrent.Callable)1 ExecutorService (java.util.concurrent.ExecutorService)1 LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus)1 Path (org.apache.hadoop.fs.Path)1 RemoteIterator (org.apache.hadoop.fs.RemoteIterator)1 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)1 Partition (org.apache.hadoop.hive.metastore.api.Partition)1 PartitionSpecProxy (org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy)1 AppendFiles (org.apache.iceberg.AppendFiles)1 DataFile (org.apache.iceberg.DataFile)1 MetricsConfig (org.apache.iceberg.MetricsConfig)1 PartitionSpec (org.apache.iceberg.PartitionSpec)1 Table (org.apache.iceberg.Table)1 NameMapping (org.apache.iceberg.mapping.NameMapping)1 ThreadFactoryBuilder (org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder)1