
Example 21 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergStorageHandlerNoScan method testPartitionTransform.

@Test
public void testPartitionTransform() {
    Schema schema = new Schema(
        optional(1, "id", Types.LongType.get()),
        optional(2, "year_field", Types.DateType.get()),
        optional(3, "month_field", Types.TimestampType.withZone()),
        optional(4, "day_field", Types.TimestampType.withoutZone()),
        optional(5, "hour_field", Types.TimestampType.withoutZone()),
        optional(6, "truncate_field", Types.StringType.get()),
        optional(7, "bucket_field", Types.StringType.get()),
        optional(8, "identity_field", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema)
        .year("year_field")
        .month("month_field")
        .day("day_field")
        .hour("hour_field")
        .truncate("truncate_field", 2)
        .bucket("bucket_field", 2)
        .identity("identity_field")
        .build();
    TableIdentifier identifier = TableIdentifier.of("default", "part_test");
    shell.executeStatement("CREATE EXTERNAL TABLE " + identifier +
        " PARTITIONED BY SPEC (year(year_field), month(month_field), day(day_field), hour(hour_field), " +
        "truncate(2, truncate_field), bucket(2, bucket_field), identity_field)" +
        " STORED BY ICEBERG " + testTables.locationForCreateTableSQL(identifier) +
        " TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(schema) + "', " +
        "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')");
    Table table = testTables.loadTable(identifier);
    Assert.assertEquals(spec, table.spec());
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) BaseTable(org.apache.iceberg.BaseTable) Table(org.apache.iceberg.Table) UpdateSchema(org.apache.iceberg.UpdateSchema) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
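
The final assertion holds because a PartitionSpec survives a JSON round trip intact, which is how specs travel through table properties such as InputFormatConfig.PARTITION_SPEC (see Example 22 below). A minimal sketch of that round trip; the two-field schema is invented for illustration, and PartitionSpecParser is the same Iceberg JSON helper used in Example 22:

Schema schema = new Schema(
    optional(1, "id", Types.LongType.get()),
    optional(2, "truncate_field", Types.StringType.get()));
PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("truncate_field", 2).build();
// Serialize to JSON and parse back; the parsed spec compares equal to the original
String json = PartitionSpecParser.toJson(spec);
PartitionSpec roundTripped = PartitionSpecParser.fromJson(schema, json);
Assert.assertEquals(spec, roundTripped);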

Example 22 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class HiveIcebergSerDe method initialize.

@Override
public void initialize(@Nullable Configuration configuration, Properties serDeProperties, Properties partitionProperties) throws SerDeException {
    super.initialize(configuration, serDeProperties, partitionProperties);
    if (serDeProperties.get(InputFormatConfig.TABLE_SCHEMA) != null) {
        this.tableSchema = SchemaParser.fromJson((String) serDeProperties.get(InputFormatConfig.TABLE_SCHEMA));
        if (serDeProperties.get(InputFormatConfig.PARTITION_SPEC) != null) {
            PartitionSpec spec = PartitionSpecParser.fromJson(tableSchema, serDeProperties.getProperty(InputFormatConfig.PARTITION_SPEC));
            this.partitionColumns = spec.fields().stream().map(PartitionField::name).collect(Collectors.toList());
        } else {
            this.partitionColumns = ImmutableList.of();
        }
    } else {
        try {
            Table table = IcebergTableUtil.getTable(configuration, serDeProperties);
            // always prefer the original table schema if there is one
            this.tableSchema = table.schema();
            this.partitionColumns = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toList());
            LOG.info("Using schema from existing table {}", SchemaParser.toJson(tableSchema));
        } catch (Exception e) {
            // During table creation we might not have the schema information from the Iceberg table, nor from the HMS
            // table. In this case we have to generate the schema using the serdeProperties which contains the info
            // provided in the CREATE TABLE query.
            boolean autoConversion = configuration.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false);
            // If we cannot load the table, try the provided Hive schema
            this.tableSchema = hiveSchemaOrThrow(e, autoConversion);
            // This is only for table creation, it is ok to have an empty partition column list
            this.partitionColumns = ImmutableList.of();
            // create table for CTAS
            if (e instanceof NoSuchTableException && Boolean.parseBoolean(serDeProperties.getProperty(hive_metastoreConstants.TABLE_IS_CTAS))) {
                if (!Catalogs.hiveCatalog(configuration, serDeProperties)) {
                    throw new SerDeException(CTAS_EXCEPTION_MSG);
                }
                createTableForCTAS(configuration, serDeProperties);
            }
        }
    }
    Schema projectedSchema;
    if (serDeProperties.get(HiveIcebergStorageHandler.WRITE_KEY) != null) {
        // when writing out data, we should not do projection pushdown
        projectedSchema = tableSchema;
    } else {
        configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false);
        String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration);
        // When the same table is joined multiple times, some of the selected columns may be duplicated;
        // in that case a wrong recordStructField position leads to a wrong value or an ArrayIndexOutOfBoundsException
        String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new);
        projectedSchema = distinctSelectedColumns.length > 0 ? tableSchema.caseInsensitiveSelect(distinctSelectedColumns) : tableSchema;
        // fall back to the full schema, otherwise the select operator's columns cannot be found in the inspector
        if (projectedSchema.columns().size() != distinctSelectedColumns.length) {
            projectedSchema = tableSchema;
        }
    }
    try {
        this.inspector = IcebergObjectInspector.create(projectedSchema);
    } catch (Exception e) {
        throw new SerDeException(e);
    }
}
Also used : PartitionField(org.apache.iceberg.PartitionField) Table(org.apache.iceberg.Table) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) PartitionSpec(org.apache.iceberg.PartitionSpec) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
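
When InputFormatConfig.PARTITION_SPEC is set, the partition column names come straight from the parsed spec's fields, as in the first branch above. A standalone sketch of that logic with an invented schema (identity fields keep the source column name by default):

Schema schema = new Schema(
    optional(1, "id", Types.LongType.get()),
    optional(2, "dept", Types.StringType.get()));
PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dept").build();
// The JSON form is what the PARTITION_SPEC serde property would carry
String json = PartitionSpecParser.toJson(spec);
List<String> partitionColumns = PartitionSpecParser.fromJson(schema, json).fields().stream()
    .map(PartitionField::name)
    .collect(Collectors.toList()); // ["dept"]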

Example 23 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergPartitions method testHourTransform.

@Test
public void testHourTransform() throws IOException {
    Schema schema = new Schema(
        optional(1, "id", Types.LongType.get()),
        optional(2, "part_field", Types.TimestampType.withoutZone()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).hour("part_field").build();
    List<Record> records = TestHelper.RecordsBuilder.newInstance(schema)
        .add(1L, LocalDateTime.of(2019, 2, 22, 9, 44, 54))
        .add(2L, LocalDateTime.of(2019, 2, 22, 10, 44, 54))
        .add(3L, LocalDateTime.of(2019, 2, 23, 9, 44, 54))
        .build();
    Table table = testTables.createTable(shell, "part_test", schema, spec, fileFormat, records);
    HiveIcebergTestUtils.validateData(table, records, 0);
    HiveIcebergTestUtils.validateDataWithSQL(shell, "part_test", records, "id");
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
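
Iceberg's hour transform partitions on whole hours since the Unix epoch, so the three records above land in three distinct partitions. A short sketch of the derived value, assuming Iceberg's default field naming for .hour("part_field"):

// spec.fields().get(0).name() defaults to "part_field_hour"
long hours = java.time.temporal.ChronoUnit.HOURS.between(
    java.time.LocalDateTime.of(1970, 1, 1, 0, 0),
    java.time.LocalDateTime.of(2019, 2, 22, 9, 44, 54)); // partition value of the first record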

Example 24 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergPartitions method testDayTransform.

@Test
public void testDayTransform() throws IOException {
    Schema schema = new Schema(
        optional(1, "id", Types.LongType.get()),
        optional(2, "part_field", Types.TimestampType.withoutZone()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).day("part_field").build();
    List<Record> records = TestHelper.RecordsBuilder.newInstance(schema)
        .add(1L, LocalDateTime.of(2019, 2, 22, 9, 44, 54))
        .add(2L, LocalDateTime.of(2019, 2, 22, 10, 44, 54))
        .add(3L, LocalDateTime.of(2019, 2, 23, 9, 44, 54))
        .build();
    Table table = testTables.createTable(shell, "part_test", schema, spec, fileFormat, records);
    HiveIcebergTestUtils.validateData(table, records, 0);
    HiveIcebergTestUtils.validateDataWithSQL(shell, "part_test", records, "id");
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
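
The only change from the hour test is the transform: .day("part_field") partitions on days since the Unix epoch (default field name "part_field_day", under the same naming assumption as above), so the two Feb 22 records now share a partition while the third gets its own. A sketch of the shared value:

long days = java.time.temporal.ChronoUnit.DAYS.between(
    java.time.LocalDate.of(1970, 1, 1),
    java.time.LocalDate.of(2019, 2, 22)); // shared by the first two records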

Example 25 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class TestHiveIcebergPartitions method testPartitionPruning.

@Test
public void testPartitionPruning() throws IOException {
    Schema salesSchema = new Schema(
        required(1, "ss_item_sk", Types.IntegerType.get()),
        required(2, "ss_sold_date_sk", Types.IntegerType.get()));
    PartitionSpec salesSpec = PartitionSpec.builderFor(salesSchema).identity("ss_sold_date_sk").build();
    Schema dimSchema = new Schema(
        required(1, "d_date_sk", Types.IntegerType.get()),
        required(2, "d_moy", Types.IntegerType.get()));
    List<Record> salesRecords = TestHelper.RecordsBuilder.newInstance(salesSchema)
        .add(51, 5).add(61, 6).add(71, 7).add(81, 8).add(91, 9).build();
    List<Record> dimRecords = TestHelper.RecordsBuilder.newInstance(dimSchema)
        .add(1, 10).add(2, 20).add(3, 30).add(4, 40).add(5, 50).build();
    Table salesTable = testTables.createTable(shell, "x1_store_sales", salesSchema, salesSpec, fileFormat, null);
    PartitionKey partitionKey = new PartitionKey(salesSpec, salesSchema);
    for (Record r : salesRecords) {
        partitionKey.partition(r);
        testTables.appendIcebergTable(shell.getHiveConf(), salesTable, fileFormat, partitionKey, ImmutableList.of(r));
    }
    testTables.createTable(shell, "x1_date_dim", dimSchema, fileFormat, dimRecords);
    String query = "select s.ss_item_sk from x1_store_sales s, x1_date_dim d " + "where s.ss_sold_date_sk=d.d_date_sk*2 and d.d_moy=30";
    // Check the query results
    List<Object[]> rows = shell.executeStatement(query);
    Assert.assertEquals(1, rows.size());
    Assert.assertArrayEquals(new Object[] { 61 }, rows.get(0));
    // Check if Dynamic Partitioning is used
    Assert.assertTrue(shell.executeStatement("explain " + query).stream()
        .filter(a -> ((String) a[0]).contains("Dynamic Partitioning Event Operator"))
        .findAny().isPresent());
}
Also used : Types(org.apache.iceberg.types.Types) Table(org.apache.iceberg.Table) LocalDateTime(java.time.LocalDateTime) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) OffsetDateTime(java.time.OffsetDateTime) NestedField.required(org.apache.iceberg.types.Types.NestedField.required) LocalDate(java.time.LocalDate) PartitionSpec(org.apache.iceberg.PartitionSpec) PartitionKey(org.apache.iceberg.PartitionKey) Assume(org.junit.Assume) ZoneOffset(java.time.ZoneOffset) Assert(org.junit.Assert)
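
PartitionKey drives the per-record appends in the loop above: partition(r) recomputes the key from the record, and under an identity spec the key value is just the source column. A short sketch of what one iteration sees, reusing the test's objects (PartitionKey implements StructLike, so get(pos, Class) reads the computed value):

PartitionKey partitionKey = new PartitionKey(salesSpec, salesSchema);
partitionKey.partition(salesRecords.get(0)); // record (ss_item_sk=51, ss_sold_date_sk=5)
int soldDateSk = partitionKey.get(0, Integer.class); // 5, the identity partition value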

Aggregations

PartitionSpec (org.apache.iceberg.PartitionSpec): 63
Table (org.apache.iceberg.Table): 40
Test (org.junit.Test): 39
Schema (org.apache.iceberg.Schema): 38
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 19
Record (org.apache.iceberg.data.Record): 19
List (java.util.List): 10
ArrayList (java.util.ArrayList): 9
FileFormat (org.apache.iceberg.FileFormat): 9
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 8
IOException (java.io.IOException): 7
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 7
UpdateSchema (org.apache.iceberg.UpdateSchema): 6
Path (org.apache.hadoop.fs.Path): 5
BaseTable (org.apache.iceberg.BaseTable): 5
DataFile (org.apache.iceberg.DataFile): 5
PartitionField (org.apache.iceberg.PartitionField): 4
Types (org.apache.iceberg.types.Types): 4
HdfsContext (com.facebook.presto.hive.HdfsContext): 3
PrestoException (com.facebook.presto.spi.PrestoException): 3