
Example 1 with NameMapping

Use of org.apache.iceberg.mapping.NameMapping in project hive by apache.

From the class HiveTableUtil, method importFiles:

/**
 * Import files from given partitions to an Iceberg table.
 * @param sourceLocation location of the HMS table
 * @param format input format class name of the HMS table
 * @param partitionSpecProxy list of HMS table partitions wrapped in a PartitionSpecProxy
 * @param partitionKeys list of partition keys
 * @param icebergTableProperties destination iceberg table properties
 * @param conf a Hadoop configuration
 */
public static void importFiles(String sourceLocation, String format, PartitionSpecProxy partitionSpecProxy, List<FieldSchema> partitionKeys, Properties icebergTableProperties, Configuration conf) throws MetaException {
    RemoteIterator<LocatedFileStatus> filesIterator = null;
    // this operation must be done before the iceberg table is created
    if (partitionSpecProxy.size() == 0) {
        filesIterator = getFilesIterator(new Path(sourceLocation), conf);
    }
    Table icebergTable = Catalogs.createTable(conf, icebergTableProperties);
    AppendFiles append = icebergTable.newAppend();
    PartitionSpec spec = icebergTable.spec();
    MetricsConfig metricsConfig = MetricsConfig.fromProperties(icebergTable.properties());
    String nameMappingString = icebergTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
    try {
        if (partitionSpecProxy.size() == 0) {
            List<DataFile> dataFiles = getDataFiles(filesIterator, Collections.emptyMap(), format, spec, metricsConfig, nameMapping, conf);
            dataFiles.forEach(append::appendFile);
        } else {
            PartitionSpecProxy.PartitionIterator partitionIterator = partitionSpecProxy.getPartitionIterator();
            List<Callable<Void>> tasks = new ArrayList<>();
            while (partitionIterator.hasNext()) {
                Partition partition = partitionIterator.next();
                Callable<Void> task = () -> {
                    Path partitionPath = new Path(partition.getSd().getLocation());
                    String partitionName = Warehouse.makePartName(partitionKeys, partition.getValues());
                    Map<String, String> partitionSpec = Warehouse.makeSpecFromName(partitionName);
                    RemoteIterator<LocatedFileStatus> iterator = getFilesIterator(partitionPath, conf);
                    List<DataFile> dataFiles = getDataFiles(iterator, partitionSpec, format.toLowerCase(), spec, metricsConfig, nameMapping, conf);
                    synchronized (append) {
                        dataFiles.forEach(append::appendFile);
                    }
                    return null;
                };
                tasks.add(task);
            }
            int numThreads = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_SERVER2_ICEBERG_METADATA_GENERATOR_THREADS);
            ExecutorService executor = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setNameFormat("iceberg-metadata-generator-%d").setDaemon(true).build());
            executor.invokeAll(tasks);
            executor.shutdown();
        }
        append.commit();
    } catch (IOException | InterruptedException e) {
        throw new MetaException("Cannot import hive data into iceberg table.\n" + e.getMessage());
    }
}
Also used :
NameMapping(org.apache.iceberg.mapping.NameMapping)
AppendFiles(org.apache.iceberg.AppendFiles)
ArrayList(java.util.ArrayList)
MetricsConfig(org.apache.iceberg.MetricsConfig)
Callable(java.util.concurrent.Callable)
DataFile(org.apache.iceberg.DataFile)
RemoteIterator(org.apache.hadoop.fs.RemoteIterator)
ThreadFactoryBuilder(org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder)
List(java.util.List)
PartitionSpecProxy(org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy)
MetaException(org.apache.hadoop.hive.metastore.api.MetaException)
Path(org.apache.hadoop.fs.Path)
Partition(org.apache.hadoop.hive.metastore.api.Partition)
Table(org.apache.iceberg.Table)
LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus)
IOException(java.io.IOException)
PartitionSpec(org.apache.iceberg.PartitionSpec)
ExecutorService(java.util.concurrent.ExecutorService)
Map(java.util.Map)
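
The snippet above resolves the table's name mapping from the schema.name-mapping.default property (TableProperties.DEFAULT_NAME_MAPPING) and passes it down to the data-file readers. Below is a minimal, self-contained sketch of that lookup in isolation; the fallback to a mapping derived from the current schema is an illustrative assumption and is not part of the Hive code above.

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;

public class NameMappingLookup {

    /**
     * Resolves the name mapping of an Iceberg table, mirroring the lookup in importFiles above.
     * Falls back to a mapping derived from the current schema when the property is not set
     * (the fallback is an assumption for illustration, not taken from the Hive source).
     */
    public static NameMapping resolveNameMapping(Table table) {
        String json = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
        if (json != null) {
            // Parse the JSON mapping stored in the table properties
            return NameMappingParser.fromJson(json);
        }
        // No explicit mapping stored: derive one from the table's current schema
        return MappingUtil.create(table.schema());
    }
}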

Example 2 with NameMapping

Use of org.apache.iceberg.mapping.NameMapping in project hive by apache.

From the class HiveIcebergMetaHook, method preAlterTable:

@Override
public void preAlterTable(org.apache.hadoop.hive.metastore.api.Table hmsTable, EnvironmentContext context) throws MetaException {
    setupAlterOperationType(hmsTable, context);
    catalogProperties = getCatalogProperties(hmsTable);
    try {
        icebergTable = IcebergTableUtil.getTable(conf, catalogProperties);
    } catch (NoSuchTableException nte) {
        context.getProperties().put(MIGRATE_HIVE_TO_ICEBERG, "true");
        // If the iceberg table does not exist, then this is an ALTER command aimed at migrating the table to iceberg
        // First we must check whether it's eligible for migration to iceberg
        // If so, we will create the iceberg table in commitAlterTable and go ahead with the migration
        assertTableCanBeMigrated(hmsTable);
        isTableMigration = true;
        StorageDescriptor sd = hmsTable.getSd();
        preAlterTableProperties = new PreAlterTableProperties();
        preAlterTableProperties.tableLocation = sd.getLocation();
        preAlterTableProperties.format = sd.getInputFormat();
        preAlterTableProperties.schema = schema(catalogProperties, hmsTable);
        preAlterTableProperties.partitionKeys = hmsTable.getPartitionKeys();
        context.getProperties().put(HiveMetaHook.ALLOW_PARTITION_KEY_CHANGE, "true");
        // If there are partition keys specified remove them from the HMS table and add them to the column list
        if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) {
            List<PartitionTransformSpec> spec = PartitionTransform.getPartitionTransformSpec(hmsTable.getPartitionKeys());
            if (!SessionStateUtil.addResource(conf, hive_metastoreConstants.PARTITION_TRANSFORM_SPEC, spec)) {
                throw new MetaException("Query state attached to Session state must be not null. " + "Partition transform metadata cannot be saved.");
            }
            hmsTable.getSd().getCols().addAll(hmsTable.getPartitionKeys());
            hmsTable.setPartitionKeysIsSet(false);
        }
        preAlterTableProperties.spec = spec(conf, preAlterTableProperties.schema, hmsTable);
        sd.setInputFormat(HiveIcebergInputFormat.class.getCanonicalName());
        sd.setOutputFormat(HiveIcebergOutputFormat.class.getCanonicalName());
        sd.setSerdeInfo(new SerDeInfo("icebergSerde", HiveIcebergSerDe.class.getCanonicalName(), Collections.emptyMap()));
        setCommonHmsTablePropertiesForIceberg(hmsTable);
        // set an additional table prop to designate that this table has been migrated to Iceberg, i.e.
        // all or some of its data files have not been written out using the Iceberg writer, and therefore those data
        // files do not contain Iceberg field IDs. This makes certain schema evolution operations problematic, so we
        // want to disable these ops for now using this new table prop
        hmsTable.getParameters().put(MIGRATED_TO_ICEBERG, "true");
        NameMapping nameMapping = MappingUtil.create(preAlterTableProperties.schema);
        hmsTable.getParameters().put(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping));
    }
    if (AlterTableType.ADDCOLS.equals(currentAlterTableOp)) {
        handleAddColumns(hmsTable);
    } else if (AlterTableType.REPLACE_COLUMNS.equals(currentAlterTableOp)) {
        assertNotMigratedTable(hmsTable.getParameters(), currentAlterTableOp.getName().toUpperCase());
        handleReplaceColumns(hmsTable);
    } else if (AlterTableType.RENAME_COLUMN.equals(currentAlterTableOp)) {
        // passing in the "CHANGE COLUMN" string instead, since RENAME COLUMN is not part of SQL syntax (not to mention
        // that users can change data types or reorder columns too with this alter op type, so its name is misleading..)
        assertNotMigratedTable(hmsTable.getParameters(), "CHANGE COLUMN");
        handleChangeColumn(hmsTable);
    }
}
Also used :
NameMapping(org.apache.iceberg.mapping.NameMapping)
NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException)
SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo)
StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor)
ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList)
List(java.util.List)
MetaException(org.apache.hadoop.hive.metastore.api.MetaException)
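
Here the hook derives a NameMapping from the migrated table's schema with MappingUtil.create and stores its JSON form in the HMS table parameters, so that readers can later resolve columns by name in data files that were written without Iceberg field IDs. The following is a minimal sketch of that create/serialize/parse round trip; the two-column schema and the plain property map are made up for illustration.

import java.util.HashMap;
import java.util.Map;

import org.apache.iceberg.Schema;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.types.Types;

public class NameMappingRoundTrip {

    public static void main(String[] args) {
        // Hypothetical schema standing in for the migrated Hive table's schema
        Schema schema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

        // Derive a name-based mapping from the schema, as preAlterTable does during migration
        NameMapping nameMapping = MappingUtil.create(schema);

        // Serialize it under the standard table property, mirroring
        // hmsTable.getParameters().put(TableProperties.DEFAULT_NAME_MAPPING, ...) above
        Map<String, String> tableParameters = new HashMap<>();
        tableParameters.put(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping));

        // A reader can recover the same mapping from the stored JSON
        NameMapping restored = NameMappingParser.fromJson(
            tableParameters.get(TableProperties.DEFAULT_NAME_MAPPING));
        System.out.println(NameMappingParser.toJson(restored));
    }
}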

Aggregations

List (java.util.List) 2
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 2
NameMapping (org.apache.iceberg.mapping.NameMapping) 2
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
Map (java.util.Map) 1
Callable (java.util.concurrent.Callable) 1
ExecutorService (java.util.concurrent.ExecutorService) 1
LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus) 1
Path (org.apache.hadoop.fs.Path) 1
RemoteIterator (org.apache.hadoop.fs.RemoteIterator) 1
Partition (org.apache.hadoop.hive.metastore.api.Partition) 1
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo) 1
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor) 1
PartitionSpecProxy (org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy) 1
AppendFiles (org.apache.iceberg.AppendFiles) 1
DataFile (org.apache.iceberg.DataFile) 1
MetricsConfig (org.apache.iceberg.MetricsConfig) 1
PartitionSpec (org.apache.iceberg.PartitionSpec) 1
Table (org.apache.iceberg.Table) 1