
Example 11 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class Catalogs method createTable.

/**
 * Creates an Iceberg table using the catalog specified by the configuration.
 * <p>
 * The properties should contain the following values:
 * <ul>
 * <li>Table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION}) is required
 * <li>Table schema ({@link InputFormatConfig#TABLE_SCHEMA}) is required
 * <li>Partition specification ({@link InputFormatConfig#PARTITION_SPEC}) is optional. Table will be unpartitioned if
 *  not provided
 * </ul><p>
 * Other properties will be handed over to the Table creation. The controlling properties above will not be
 * propagated.
 * @param conf a Hadoop conf
 * @param props the controlling properties
 * @return the created Iceberg table
 */
public static Table createTable(Configuration conf, Properties props) {
    String schemaString = props.getProperty(InputFormatConfig.TABLE_SCHEMA);
    Preconditions.checkNotNull(schemaString, "Table schema not set");
    Schema schema = SchemaParser.fromJson(schemaString);
    String specString = props.getProperty(InputFormatConfig.PARTITION_SPEC);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    if (specString != null) {
        spec = PartitionSpecParser.fromJson(schema, specString);
    }
    String location = props.getProperty(LOCATION);
    String catalogName = props.getProperty(InputFormatConfig.CATALOG_NAME);
    // Create a table property map without the controlling properties
    Map<String, String> map = Maps.newHashMapWithExpectedSize(props.size());
    for (Object key : props.keySet()) {
        if (!PROPERTIES_TO_REMOVE.contains(key)) {
            map.put(key.toString(), props.get(key).toString());
        }
    }
    Optional<Catalog> catalog = loadCatalog(conf, catalogName);
    if (catalog.isPresent()) {
        String name = props.getProperty(NAME);
        Preconditions.checkNotNull(name, "Table identifier not set");
        return catalog.get().createTable(TableIdentifier.parse(name), schema, spec, location, map);
    }
    Preconditions.checkNotNull(location, "Table location not set");
    return new HadoopTables(conf).create(schema, spec, map, location);
}
Also used: Schema (org.apache.iceberg.Schema), HadoopTables (org.apache.iceberg.hadoop.HadoopTables), PartitionSpec (org.apache.iceberg.PartitionSpec), Catalog (org.apache.iceberg.catalog.Catalog)
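A minimal caller sketch for this method, a hedged illustration rather than code from the Hive sources: the schema, the "default.customers" identifier, and the bare Configuration are all illustrative; the Catalogs and InputFormatConfig constants are the ones the javadoc above refers to.

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.types.Types;

public class CreateTableExample {
    public static void main(String[] args) {
        Schema schema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "last_name", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("last_name").build();

        Properties props = new Properties();
        // Required: table identifier (or Catalogs.LOCATION for a HadoopTables path)
        props.setProperty(Catalogs.NAME, "default.customers");
        // Required: the schema, serialized to JSON
        props.setProperty(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
        // Optional: omit this property to create an unpartitioned table
        props.setProperty(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));

        Table table = Catalogs.createTable(new Configuration(), props);
        System.out.println("Created table at " + table.location());
    }
}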

Example 12 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergStatistics method testStatsWithPartitionedInsert.

@Test
public void testStatsWithPartitionedInsert() {
    TableIdentifier identifier = TableIdentifier.of("default", "customers");
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build();
    shell.setHiveSessionValue(HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname, true);
    testTables.createTable(shell, identifier.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, ImmutableList.of());
    if (testTableType != TestTables.TestTableType.HIVE_CATALOG) {
        // If the location is set and we have to gather stats, then we have to update the table stats now
        shell.executeStatement("ANALYZE TABLE " + identifier + " COMPUTE STATISTICS FOR COLUMNS");
    }
    String insert = testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, identifier, false);
    shell.executeStatement(insert);
    checkColStat(identifier.name(), "customer_id", true);
    checkColStat(identifier.name(), "first_name", true);
    checkColStatMinMaxValue(identifier.name(), "customer_id", 0, 2);
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
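For reference, a self-contained sketch of the identity spec built in this test; the schema here is an illustrative stand-in for HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, not its actual definition.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class IdentitySpecExample {
    public static void main(String[] args) {
        // Illustrative stand-in for the test's CUSTOMER_SCHEMA
        Schema schema = new Schema(
            Types.NestedField.optional(1, "customer_id", Types.LongType.get()),
            Types.NestedField.optional(2, "first_name", Types.StringType.get()),
            Types.NestedField.optional(3, "last_name", Types.StringType.get()));

        // Identity partitioning: one partition per distinct last_name value,
        // so each inserted record lands in the partition of its last_name
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("last_name").build();

        spec.fields().forEach(f -> System.out.println(f.name() + " <- " + f.transform()));
    }
}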

Example 13 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergStorageHandlerNoScan method testPartitionEvolution.

@Test
public void testPartitionEvolution() {
    Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "ts", Types.TimestampType.withZone()));
    TableIdentifier identifier = TableIdentifier.of("default", "part_test");
    shell.executeStatement("CREATE EXTERNAL TABLE " + identifier + " STORED BY ICEBERG " + testTables.locationForCreateTableSQL(identifier) + " TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(schema) + "', " + "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')");
    shell.executeStatement("ALTER TABLE " + identifier + " SET PARTITION SPEC (month(ts))");
    PartitionSpec spec = PartitionSpec.builderFor(schema).withSpecId(1).month("ts").build();
    Table table = testTables.loadTable(identifier);
    Assert.assertEquals(spec, table.spec());
    shell.executeStatement("ALTER TABLE " + identifier + " SET PARTITION SPEC (day(ts))");
    spec = PartitionSpec.builderFor(schema).withSpecId(2).alwaysNull("ts", "ts_month").day("ts").build();
    table.refresh();
    Assert.assertEquals(spec, table.spec());
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), BaseTable (org.apache.iceberg.BaseTable), Table (org.apache.iceberg.Table), UpdateSchema (org.apache.iceberg.UpdateSchema), Schema (org.apache.iceberg.Schema), FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
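Hive's ALTER TABLE ... SET PARTITION SPEC presumably delegates to Iceberg's UpdatePartitionSpec API. A hedged sketch of the equivalent programmatic evolution; the table is assumed to be loaded already, and the method name is ours.

import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;

public class PartitionEvolutionExample {
    // Equivalent of the two SET PARTITION SPEC statements in the test above
    static void evolvePartitioning(Table table) {
        // ALTER TABLE ... SET PARTITION SPEC (month(ts))
        table.updateSpec()
            .addField(Expressions.month("ts"))
            .commit();

        // ALTER TABLE ... SET PARTITION SPEC (day(ts)):
        // removing month(ts) from a v1 table keeps the old field around as a void
        // transform, which the expected spec expresses as alwaysNull("ts", "ts_month")
        table.updateSpec()
            .removeField(Expressions.month("ts"))
            .addField(Expressions.day("ts"))
            .commit();
    }
}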

Example 14 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergStorageHandlerNoScan method testCreatePartitionedTableWithPropertiesAndWithColumnSpecification.

@Test
public void testCreatePartitionedTableWithPropertiesAndWithColumnSpecification() {
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build();
    AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, "Provide only one of the following", () -> {
        shell.executeStatement("CREATE EXTERNAL TABLE customers (customer_id BIGINT) " + "PARTITIONED BY (first_name STRING) " + "STORED BY ICEBERG " + testTables.locationForCreateTableSQL(TableIdentifier.of("default", "customers")) + testTables.propertiesForCreateTableSQL(ImmutableMap.of(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec))));
    });
}
Also used: PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
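In other words, the partitioning must come from exactly one source. A hedged sketch of the two accepted variants as plain SQL strings; the table name and literals are illustrative, and the property key is the assumed string value of InputFormatConfig.PARTITION_SPEC.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class PartitionSourceExample {
    public static void main(String[] args) {
        Schema schema = new Schema(
            Types.NestedField.optional(1, "customer_id", Types.LongType.get()),
            Types.NestedField.optional(2, "last_name", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("last_name").build();

        // Variant 1: the spec travels only in the table property
        // (key assumed to be the value of InputFormatConfig.PARTITION_SPEC)
        String viaProperty = "CREATE EXTERNAL TABLE customers (customer_id BIGINT, last_name STRING) "
            + "STORED BY ICEBERG "
            + "TBLPROPERTIES ('iceberg.mr.table.partition.spec'='"
            + PartitionSpecParser.toJson(spec) + "')";

        // Variant 2: the spec comes only from PARTITIONED BY; never combine the two
        String viaClause = "CREATE EXTERNAL TABLE customers (customer_id BIGINT) "
            + "PARTITIONED BY (last_name STRING) STORED BY ICEBERG";

        System.out.println(viaProperty);
        System.out.println(viaClause);
    }
}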

Example 15 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergStorageHandlerNoScan method testSetPartitionTransformSameField.

@Test
public void testSetPartitionTransformSameField() {
    Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "truncate_field", Types.StringType.get()), optional(3, "bucket_field", Types.StringType.get()));
    TableIdentifier identifier = TableIdentifier.of("default", "part_test");
    shell.executeStatement("CREATE EXTERNAL TABLE " + identifier + " PARTITIONED BY SPEC (truncate(2, truncate_field), bucket(2, bucket_field))" + " STORED BY ICEBERG " + testTables.locationForCreateTableSQL(identifier) + "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(schema) + "', " + "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')");
    PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("truncate_field", 2).bucket("bucket_field", 2).build();
    Table table = testTables.loadTable(identifier);
    Assert.assertEquals(spec, table.spec());
    // Change one, keep one
    shell.executeStatement("ALTER TABLE default.part_test " + "SET PARTITION SPEC (truncate(3, truncate_field), bucket(2, bucket_field) )");
    spec = PartitionSpec.builderFor(schema)
        .withSpecId(1)
        .alwaysNull("truncate_field", "truncate_field_trunc")
        .bucket("bucket_field", 2)
        .truncate("truncate_field", 3, "truncate_field_trunc_3")
        .build();
    table.refresh();
    Assert.assertEquals(spec, table.spec());
    // Change one again, keep the other one
    shell.executeStatement("ALTER TABLE default.part_test " + "SET PARTITION SPEC (truncate(4, truncate_field), bucket(2, bucket_field) )");
    spec = PartitionSpec.builderFor(schema)
        .withSpecId(2)
        .alwaysNull("truncate_field", "truncate_field_trunc")
        .bucket("bucket_field", 2)
        .alwaysNull("truncate_field", "truncate_field_trunc_3")
        .truncate("truncate_field", 4, "truncate_field_trunc_4")
        .build();
    table.refresh();
    Assert.assertEquals(spec, table.spec());
    // Keep the already changed, change the other one (change the order of clauses in the spec)
    shell.executeStatement("ALTER TABLE default.part_test " + "SET PARTITION SPEC (bucket(3, bucket_field), truncate(4, truncate_field))");
    spec = PartitionSpec.builderFor(schema)
        .withSpecId(3)
        .alwaysNull("truncate_field", "truncate_field_trunc")
        .alwaysNull("bucket_field", "bucket_field_bucket")
        .alwaysNull("truncate_field", "truncate_field_trunc_3")
        .truncate("truncate_field", 4, "truncate_field_trunc_4")
        .bucket("bucket_field", 3, "bucket_field_bucket_3")
        .build();
    table.refresh();
    Assert.assertEquals(spec, table.spec());
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), BaseTable (org.apache.iceberg.BaseTable), Table (org.apache.iceberg.Table), UpdateSchema (org.apache.iceberg.UpdateSchema), Schema (org.apache.iceberg.Schema), FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
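A hedged sketch of the first evolution step via the Java API (the table is assumed loaded, the method name is ours), showing why the replaced transform survives as an alwaysNull field and the replacement gets a suffixed name.

import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;

public class SameFieldEvolutionExample {
    // Equivalent of: ALTER TABLE ... SET PARTITION SPEC (truncate(3, truncate_field), bucket(2, bucket_field))
    static void widenTruncate(Table table) {
        table.updateSpec()
            // dropping truncate(truncate_field, 2) from a v1 table leaves a void
            // transform behind: alwaysNull("truncate_field", "truncate_field_trunc")
            .removeField(Expressions.truncate("truncate_field", 2))
            // the new field cannot reuse the taken name "truncate_field_trunc",
            // so it gets the width-suffixed name "truncate_field_trunc_3"
            .addField(Expressions.truncate("truncate_field", 3))
            .commit();
    }
}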

Aggregations

PartitionSpec (org.apache.iceberg.PartitionSpec): 63
Table (org.apache.iceberg.Table): 40
Test (org.junit.Test): 39
Schema (org.apache.iceberg.Schema): 38
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 19
Record (org.apache.iceberg.data.Record): 19
List (java.util.List): 10
ArrayList (java.util.ArrayList): 9
FileFormat (org.apache.iceberg.FileFormat): 9
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 8
IOException (java.io.IOException): 7
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 7
UpdateSchema (org.apache.iceberg.UpdateSchema): 6
Path (org.apache.hadoop.fs.Path): 5
BaseTable (org.apache.iceberg.BaseTable): 5
DataFile (org.apache.iceberg.DataFile): 5
PartitionField (org.apache.iceberg.PartitionField): 4
Types (org.apache.iceberg.types.Types): 4
HdfsContext (com.facebook.presto.hive.HdfsContext): 3
PrestoException (com.facebook.presto.spi.PrestoException): 3