
Example 26 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

The class TestHiveIcebergPartitions, method testBucketTransform.

@Test
public void testBucketTransform() throws IOException {
    Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "part_field", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("part_field", 2).build();
    List<Record> records = TestHelper.RecordsBuilder.newInstance(schema).add(1L, "Part1").add(2L, "Part2").add(3L, "Art3").build();
    Table table = testTables.createTable(shell, "part_test", schema, spec, fileFormat, records);
    HiveIcebergTestUtils.validateData(table, records, 0);
    HiveIcebergTestUtils.validateDataWithSQL(shell, "part_test", records, "id");
}
Also used: Table (org.apache.iceberg.Table), Schema (org.apache.iceberg.Schema), Record (org.apache.iceberg.data.Record), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
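
For context, bucket("part_field", 2) hashes each value of part_field and routes the row to one of two buckets. A minimal standalone sketch of the same spec, using only the iceberg-api classes seen above (the class name BucketSpecSketch is hypothetical, for illustration only):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.optional;

public class BucketSpecSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
                optional(1, "id", Types.LongType.get()),
                optional(2, "part_field", Types.StringType.get()));
        // Two buckets: rows are routed by a hash of part_field modulo 2,
        // so "Part1", "Part2" and "Art3" may share or split buckets.
        PartitionSpec spec = PartitionSpec.builderFor(schema)
                .bucket("part_field", 2)
                .build();
        // Prints the derived partition field, e.g. "part_field_bucket: bucket[2]".
        spec.fields().forEach(f -> System.out.println(f.name() + ": " + f.transform()));
    }
}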

Example 27 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

The class HiveTableUtil, method importFiles.

/**
 * Import files from given partitions to an Iceberg table.
 * @param sourceLocation location of the HMS table
 * @param format inputformat class name of the HMS table
 * @param partitionSpecProxy list of HMS table partitions wrapped in partitionSpecProxy
 * @param partitionKeys list of partition keys
 * @param icebergTableProperties destination iceberg table properties
 * @param conf a Hadoop configuration
 * @throws MetaException if listing the source files or appending them to the Iceberg table fails
 */
public static void importFiles(String sourceLocation, String format, PartitionSpecProxy partitionSpecProxy, List<FieldSchema> partitionKeys, Properties icebergTableProperties, Configuration conf) throws MetaException {
    RemoteIterator<LocatedFileStatus> filesIterator = null;
    // this operation must be done before the iceberg table is created
    if (partitionSpecProxy.size() == 0) {
        filesIterator = getFilesIterator(new Path(sourceLocation), conf);
    }
    Table icebergTable = Catalogs.createTable(conf, icebergTableProperties);
    // a single append operation collects the data files from every partition
    // and commits them as one snapshot at the end
    AppendFiles append = icebergTable.newAppend();
    PartitionSpec spec = icebergTable.spec();
    MetricsConfig metricsConfig = MetricsConfig.fromProperties(icebergTable.properties());
    // an optional name mapping resolves Iceberg field IDs for data files that were written without them
    String nameMappingString = icebergTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
    try {
        if (partitionSpecProxy.size() == 0) {
            List<DataFile> dataFiles = getDataFiles(filesIterator, Collections.emptyMap(), format, spec, metricsConfig, nameMapping, conf);
            dataFiles.forEach(append::appendFile);
        } else {
            PartitionSpecProxy.PartitionIterator partitionIterator = partitionSpecProxy.getPartitionIterator();
            List<Callable<Void>> tasks = new ArrayList<>();
            while (partitionIterator.hasNext()) {
                Partition partition = partitionIterator.next();
                Callable<Void> task = () -> {
                    Path partitionPath = new Path(partition.getSd().getLocation());
                    String partitionName = Warehouse.makePartName(partitionKeys, partition.getValues());
                    Map<String, String> partitionSpec = Warehouse.makeSpecFromName(partitionName);
                    RemoteIterator<LocatedFileStatus> iterator = getFilesIterator(partitionPath, conf);
                    List<DataFile> dataFiles = getDataFiles(iterator, partitionSpec, format.toLowerCase(), spec, metricsConfig, nameMapping, conf);
                    synchronized (append) {
                        dataFiles.forEach(append::appendFile);
                    }
                    return null;
                };
                tasks.add(task);
            }
            int numThreads = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_SERVER2_ICEBERG_METADATA_GENERATOR_THREADS);
            ExecutorService executor = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setNameFormat("iceberg-metadata-generator-%d").setDaemon(true).build());
            // invokeAll blocks until every partition task has finished appending its files
            executor.invokeAll(tasks);
            executor.shutdown();
        }
        append.commit();
    } catch (IOException | InterruptedException e) {
        throw new MetaException("Cannot import hive data into iceberg table.\n" + e.getMessage());
    }
}
Also used: NameMapping (org.apache.iceberg.mapping.NameMapping), AppendFiles (org.apache.iceberg.AppendFiles), ArrayList (java.util.ArrayList), MetricsConfig (org.apache.iceberg.MetricsConfig), Callable (java.util.concurrent.Callable), DataFile (org.apache.iceberg.DataFile), RemoteIterator (org.apache.hadoop.fs.RemoteIterator), ThreadFactoryBuilder (org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder), List (java.util.List), PartitionSpecProxy (org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy), MetaException (org.apache.hadoop.hive.metastore.api.MetaException), Path (org.apache.hadoop.fs.Path), Partition (org.apache.hadoop.hive.metastore.api.Partition), Table (org.apache.iceberg.Table), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), IOException (java.io.IOException), PartitionSpec (org.apache.iceberg.PartitionSpec), ExecutorService (java.util.concurrent.ExecutorService), Map (java.util.Map)
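
The method above funnels every partition's files into one AppendFiles operation so the whole import commits as a single snapshot. A minimal sketch of that commit pattern in isolation (AppendSketch and appendAll are hypothetical names; the Table and DataFile values are assumed to exist already):

import java.util.List;

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

class AppendSketch {
    // Appends pre-built DataFile entries atomically: nothing becomes
    // visible to readers until commit() produces the new snapshot.
    static void appendAll(Table table, List<DataFile> dataFiles) {
        AppendFiles append = table.newAppend();
        dataFiles.forEach(append::appendFile);
        append.commit();
    }
}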

Example 28 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

The class TestHiveCatalog, method testCreateTableBuilder.

@Test
public void testCreateTableBuilder() throws Exception {
    Schema schema = new Schema(required(1, "id", Types.IntegerType.get(), "unique ID"), required(2, "data", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build();
    TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl");
    String location = temp.newFolder("tbl").toString();
    try {
        Table table = catalog.buildTable(tableIdent, schema).withPartitionSpec(spec).withLocation(location).withProperty("key1", "value1").withProperty("key2", "value2").create();
        Assert.assertEquals(location, table.location());
        Assert.assertEquals(2, table.schema().columns().size());
        Assert.assertEquals(1, table.spec().fields().size());
        Assert.assertEquals("value1", table.properties().get("key1"));
        Assert.assertEquals("value2", table.properties().get("key2"));
    } finally {
        catalog.dropTable(tableIdent);
    }
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), Table (org.apache.iceberg.Table), Schema (org.apache.iceberg.Schema), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
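
The fluent builder shown here has a positional equivalent on the Catalog interface; a minimal sketch, assuming the same catalog, identifier, schema, spec, and location as the test (CreateTableSketch is a hypothetical wrapper):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

class CreateTableSketch {
    // Positional form of the builder chain used in the test above.
    static Table create(Catalog catalog, TableIdentifier ident, Schema schema,
                        PartitionSpec spec, String location) {
        return catalog.createTable(ident, schema, spec, location,
                ImmutableMap.of("key1", "value1", "key2", "value2"));
    }
}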

Example 29 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

The class TestHiveCatalog, method testReplaceTxnBuilder.

@Test
public void testReplaceTxnBuilder() throws Exception {
    Schema schema = new Schema(required(1, "id", Types.IntegerType.get(), "unique ID"), required(2, "data", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build();
    TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl");
    String location = temp.newFolder("tbl").toString();
    try {
        Transaction createTxn = catalog.buildTable(tableIdent, schema).withPartitionSpec(spec).withLocation(location).withProperty("key1", "value1").createOrReplaceTransaction();
        createTxn.commitTransaction();
        Table table = catalog.loadTable(tableIdent);
        Assert.assertEquals(1, table.spec().fields().size());
        String newLocation = temp.newFolder("tbl-2").toString();
        Transaction replaceTxn = catalog.buildTable(tableIdent, schema).withProperty("key2", "value2").withLocation(newLocation).replaceTransaction();
        replaceTxn.commitTransaction();
        table = catalog.loadTable(tableIdent);
        Assert.assertEquals(newLocation, table.location());
        Assert.assertNull(table.currentSnapshot());
        PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()).alwaysNull("data", "data_bucket").withSpecId(1).build();
        Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec());
        Assert.assertEquals("value1", table.properties().get("key1"));
        Assert.assertEquals("value2", table.properties().get("key2"));
    } finally {
        catalog.dropTable(tableIdent);
    }
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), Table (org.apache.iceberg.Table), Transaction (org.apache.iceberg.Transaction), Schema (org.apache.iceberg.Schema), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
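
The v1Expected assertion reflects v1 table behavior: a replaced partition field cannot simply be dropped, so it is kept as a void (alwaysNull) transform to preserve field ordering. A minimal sketch that builds the same spec directly (VoidSpecSketch is a hypothetical name):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.required;

public class VoidSpecSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
                required(1, "id", Types.IntegerType.get(), "unique ID"),
                required(2, "data", Types.StringType.get()));
        // alwaysNull writes null for every row of the voided field,
        // keeping the spec shape stable across the replace.
        PartitionSpec voided = PartitionSpec.builderFor(schema)
                .alwaysNull("data", "data_bucket")
                .withSpecId(1)
                .build();
        System.out.println(voided);
    }
}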

Example 30 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

The class TestHiveCatalog, method testCreateTableWithCaching.

@Test
public void testCreateTableWithCaching() throws Exception {
    Schema schema = new Schema(required(1, "id", Types.IntegerType.get(), "unique ID"), required(2, "data", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build();
    TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl");
    String location = temp.newFolder("tbl").toString();
    ImmutableMap<String, String> properties = ImmutableMap.of("key1", "value1", "key2", "value2");
    Catalog cachingCatalog = CachingCatalog.wrap(catalog);
    try {
        Table table = cachingCatalog.createTable(tableIdent, schema, spec, location, properties);
        Assert.assertEquals(location, table.location());
        Assert.assertEquals(2, table.schema().columns().size());
        Assert.assertEquals(1, table.spec().fields().size());
        Assert.assertEquals("value1", table.properties().get("key1"));
        Assert.assertEquals("value2", table.properties().get("key2"));
    } finally {
        cachingCatalog.dropTable(tableIdent);
    }
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), Table (org.apache.iceberg.Table), Schema (org.apache.iceberg.Schema), PartitionSpec (org.apache.iceberg.PartitionSpec), Catalog (org.apache.iceberg.catalog.Catalog), CachingCatalog (org.apache.iceberg.CachingCatalog), Test (org.junit.Test)
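
CachingCatalog.wrap decorates any Catalog so that repeated loads are served from an in-memory cache instead of re-reading table metadata. A minimal sketch of that behavior (CachingSketch and isCached are hypothetical names; that two loads return the same Table instance is an assumption about the current cache implementation):

import org.apache.iceberg.CachingCatalog;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;

class CachingSketch {
    // With the caching decorator, the second load should be served from
    // cache rather than re-reading table metadata from the metastore.
    static boolean isCached(Catalog backing, TableIdentifier ident) {
        Catalog caching = CachingCatalog.wrap(backing);
        Table first = caching.loadTable(ident);
        Table second = caching.loadTable(ident);
        return first == second;
    }
}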

Aggregations

PartitionSpec (org.apache.iceberg.PartitionSpec): 63 uses
Table (org.apache.iceberg.Table): 40 uses
Test (org.junit.Test): 39 uses
Schema (org.apache.iceberg.Schema): 38 uses
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 19 uses
Record (org.apache.iceberg.data.Record): 19 uses
List (java.util.List): 10 uses
ArrayList (java.util.ArrayList): 9 uses
FileFormat (org.apache.iceberg.FileFormat): 9 uses
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 8 uses
IOException (java.io.IOException): 7 uses
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 7 uses
UpdateSchema (org.apache.iceberg.UpdateSchema): 6 uses
Path (org.apache.hadoop.fs.Path): 5 uses
BaseTable (org.apache.iceberg.BaseTable): 5 uses
DataFile (org.apache.iceberg.DataFile): 5 uses
PartitionField (org.apache.iceberg.PartitionField): 4 uses
Types (org.apache.iceberg.types.Types): 4 uses
HdfsContext (com.facebook.presto.hive.HdfsContext): 3 uses
PrestoException (com.facebook.presto.spi.PrestoException): 3 uses