
Example 36 with Schema

Use of org.apache.iceberg.Schema in project hive by apache.

The class TestHiveIcebergSchemaEvolution, method testRemoveColumnFromIcebergTable.

@Test
public void testRemoveColumnFromIcebergTable() throws IOException {
    // Create an Iceberg table with the columns customer_id, first_name and last_name, with some initial data.
    Table icebergTable = testTables.createTable(shell, "customers",
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat,
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // Remove the first_name column from the table.
    icebergTable.updateSchema().deleteColumn("first_name").commit();
    Schema customerSchemaWithoutFirstName = new Schema(
        optional(1, "customer_id", Types.LongType.get()),
        optional(2, "last_name", Types.StringType.get(), "This is last name"));
    TestHelper.RecordsBuilder customersWithoutFirstNameBuilder =
        TestHelper.RecordsBuilder.newInstance(customerSchemaWithoutFirstName)
            .add(0L, "Brown")
            .add(1L, "Green")
            .add(2L, "Pink");
    List<Record> customersWithoutFirstName = customersWithoutFirstNameBuilder.build();
    // Run a 'select *' from Hive and verify that the result no longer contains the first_name column.
    List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithoutFirstName,
        HiveIcebergTestUtils.valueForRow(customerSchemaWithoutFirstName, rows), 0);
    // Run a 'select first_name' and verify that an exception is thrown.
    AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class,
        "Invalid table alias or column reference 'first_name'",
        () -> shell.executeStatement("SELECT first_name FROM default.customers"));
    // Insert a row from Hive to verify that inserts work without the first_name column.
    shell.executeStatement("INSERT INTO default.customers values (4L, 'Magenta')");
    rows = shell.executeStatement("SELECT * FROM default.customers");
    customersWithoutFirstNameBuilder.add(4L, "Magenta");
    customersWithoutFirstName = customersWithoutFirstNameBuilder.build();
    HiveIcebergTestUtils.validateData(customersWithoutFirstName,
        HiveIcebergTestUtils.valueForRow(customerSchemaWithoutFirstName, rows), 0);
}
Also used : TestHelper(org.apache.iceberg.mr.TestHelper) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
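
The deleteColumn call above is metadata-only schema evolution: no data files are rewritten, because Iceberg readers resolve columns by field ID rather than by position. A minimal standalone sketch of the same operation, assuming a HadoopTables catalog and a hypothetical warehouse path:

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

public class DropColumnSketch {
    public static void main(String[] args) {
        // Hypothetical table location; adjust to your environment.
        Table table = new HadoopTables(new Configuration())
            .load("file:///tmp/warehouse/default/customers");
        // Dropping the column only commits new table metadata; existing
        // data files stay valid and the column simply disappears from reads.
        table.updateSchema().deleteColumn("first_name").commit();
        System.out.println(table.schema().asStruct());
    }
}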

Example 37 with Schema

Use of org.apache.iceberg.Schema in project hive by apache.

The class TestHiveIcebergStorageHandlerNoScan, method testPartitionTransform.

@Test
public void testPartitionTransform() {
    Schema schema = new Schema(
        optional(1, "id", Types.LongType.get()),
        optional(2, "year_field", Types.DateType.get()),
        optional(3, "month_field", Types.TimestampType.withZone()),
        optional(4, "day_field", Types.TimestampType.withoutZone()),
        optional(5, "hour_field", Types.TimestampType.withoutZone()),
        optional(6, "truncate_field", Types.StringType.get()),
        optional(7, "bucket_field", Types.StringType.get()),
        optional(8, "identity_field", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema)
        .year("year_field")
        .month("month_field")
        .day("day_field")
        .hour("hour_field")
        .truncate("truncate_field", 2)
        .bucket("bucket_field", 2)
        .identity("identity_field")
        .build();
    TableIdentifier identifier = TableIdentifier.of("default", "part_test");
    shell.executeStatement("CREATE EXTERNAL TABLE " + identifier +
        " PARTITIONED BY SPEC (year(year_field), month(month_field), day(day_field), hour(hour_field), " +
        "truncate(2, truncate_field), bucket(2, bucket_field), identity_field)" +
        " STORED BY ICEBERG " + testTables.locationForCreateTableSQL(identifier) +
        " TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(schema) + "', " +
        "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')");
    Table table = testTables.loadTable(identifier);
    Assert.assertEquals(spec, table.spec());
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) BaseTable(org.apache.iceberg.BaseTable) Table(org.apache.iceberg.Table) UpdateSchema(org.apache.iceberg.UpdateSchema) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
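
Each transform in the Hive PARTITIONED BY SPEC clause maps one to one onto a PartitionSpec builder call, and the test asserts that the round trip through DDL yields an identical spec. A minimal sketch of the builder on its own, using made-up field names (event_time, category):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.optional;

public class PartitionSpecSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
            optional(1, "id", Types.LongType.get()),
            optional(2, "event_time", Types.TimestampType.withZone()),
            optional(3, "category", Types.StringType.get()));
        // day(event_time) in Hive DDL corresponds to .day("event_time") here,
        // and bucket(16, category) corresponds to .bucket("category", 16).
        PartitionSpec spec = PartitionSpec.builderFor(schema)
            .day("event_time")
            .bucket("category", 16)
            .build();
        System.out.println(spec);
    }
}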

Example 38 with Schema

Use of org.apache.iceberg.Schema in project hive by apache.

The class TestIcebergRecordObjectInspector, method testIcebergRecordObjectInspector.

@Test
public void testIcebergRecordObjectInspector() {
    Schema schema = new Schema(
        required(1, "integer_field", Types.IntegerType.get()),
        required(2, "struct_field", Types.StructType.of(
            Types.NestedField.required(3, "string_field", Types.StringType.get()))));
    Record record = RandomGenericData.generate(schema, 1, 0L).get(0);
    Record innerRecord = record.get(1, Record.class);
    StructObjectInspector soi = (StructObjectInspector) IcebergObjectInspector.create(schema);
    Assert.assertEquals(ImmutableList.of(record.get(0), record.get(1)), soi.getStructFieldsDataAsList(record));
    StructField integerField = soi.getStructFieldRef("integer_field");
    Assert.assertEquals(record.get(0), soi.getStructFieldData(record, integerField));
    StructField structField = soi.getStructFieldRef("struct_field");
    Object innerData = soi.getStructFieldData(record, structField);
    Assert.assertEquals(innerRecord, innerData);
    StructObjectInspector innerSoi = (StructObjectInspector) structField.getFieldObjectInspector();
    StructField stringField = innerSoi.getStructFieldRef("string_field");
    Assert.assertEquals(ImmutableList.of(innerRecord.get(0)), innerSoi.getStructFieldsDataAsList(innerRecord));
    Assert.assertEquals(innerRecord.get(0), innerSoi.getStructFieldData(innerData, stringField));
}
Also used : StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
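
IcebergObjectInspector.create walks the Iceberg schema and builds the matching Hive ObjectInspector tree, so Hive operators can read Iceberg Records in place. A minimal sketch without the random-data generator; the import path shown for IcebergObjectInspector is an assumption based on the iceberg-handler module layout:

import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.mr.hive.serde.objectinspector.IcebergObjectInspector;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.required;

public class InspectorSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(required(1, "name", Types.StringType.get()));
        Record record = GenericRecord.create(schema);
        record.set(0, "alice");
        // The inspector exposes the record's fields to Hive without copying them.
        StructObjectInspector soi = (StructObjectInspector) IcebergObjectInspector.create(schema);
        StructField nameField = soi.getStructFieldRef("name");
        System.out.println(soi.getStructFieldData(record, nameField)); // prints: alice
    }
}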

Example 39 with Schema

Use of org.apache.iceberg.Schema in project hive by apache.

The class HiveIcebergSerDe, method initialize.

@Override
public void initialize(@Nullable Configuration configuration, Properties serDeProperties, Properties partitionProperties) throws SerDeException {
    super.initialize(configuration, serDeProperties, partitionProperties);
    if (serDeProperties.get(InputFormatConfig.TABLE_SCHEMA) != null) {
        this.tableSchema = SchemaParser.fromJson((String) serDeProperties.get(InputFormatConfig.TABLE_SCHEMA));
        if (serDeProperties.get(InputFormatConfig.PARTITION_SPEC) != null) {
            PartitionSpec spec = PartitionSpecParser.fromJson(tableSchema, serDeProperties.getProperty(InputFormatConfig.PARTITION_SPEC));
            this.partitionColumns = spec.fields().stream().map(PartitionField::name).collect(Collectors.toList());
        } else {
            this.partitionColumns = ImmutableList.of();
        }
    } else {
        try {
            Table table = IcebergTableUtil.getTable(configuration, serDeProperties);
            // always prefer the original table schema if there is one
            this.tableSchema = table.schema();
            this.partitionColumns = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toList());
            LOG.info("Using schema from existing table {}", SchemaParser.toJson(tableSchema));
        } catch (Exception e) {
            // During table creation we might not have the schema information from the Iceberg table, nor from the HMS
            // table. In this case we have to generate the schema using the serdeProperties which contains the info
            // provided in the CREATE TABLE query.
            boolean autoConversion = configuration.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false);
            // If we cannot load the table, fall back to the Hive schema provided in the query.
            this.tableSchema = hiveSchemaOrThrow(e, autoConversion);
            // This is only for table creation, it is ok to have an empty partition column list
            this.partitionColumns = ImmutableList.of();
            // create table for CTAS
            if (e instanceof NoSuchTableException && Boolean.parseBoolean(serDeProperties.getProperty(hive_metastoreConstants.TABLE_IS_CTAS))) {
                if (!Catalogs.hiveCatalog(configuration, serDeProperties)) {
                    throw new SerDeException(CTAS_EXCEPTION_MSG);
                }
                createTableForCTAS(configuration, serDeProperties);
            }
        }
    }
    Schema projectedSchema;
    if (serDeProperties.get(HiveIcebergStorageHandler.WRITE_KEY) != null) {
        // when writing out data, we should not do projection pushdown
        projectedSchema = tableSchema;
    } else {
        configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false);
        String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration);
        // When the same table is joined multiple times, some selected columns may be duplicated.
        // In that case a wrong recordStructField position leads to a wrong value or an ArrayIndexOutOfBoundsException.
        String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new);
        projectedSchema = distinctSelectedColumns.length > 0 ? tableSchema.caseInsensitiveSelect(distinctSelectedColumns) : tableSchema;
        // If the projection could not resolve every selected column, fall back to the full table schema;
        // otherwise the select operator's columns cannot be found in the inspector.
        if (projectedSchema.columns().size() != distinctSelectedColumns.length) {
            projectedSchema = tableSchema;
        }
    }
    try {
        this.inspector = IcebergObjectInspector.create(projectedSchema);
    } catch (Exception e) {
        throw new SerDeException(e);
    }
}
Also used : PartitionField(org.apache.iceberg.PartitionField) Table(org.apache.iceberg.Table) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) PartitionSpec(org.apache.iceberg.PartitionSpec) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
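
The schema JSON stored in the InputFormatConfig.TABLE_SCHEMA property is produced and consumed by SchemaParser, which preserves field IDs, types and nullability across the round trip. A minimal sketch of that round trip, independent of the SerDe:

import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.optional;

public class SchemaJsonSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
            optional(1, "id", Types.LongType.get()),
            optional(2, "data", Types.StringType.get()));
        // The SerDe receives this JSON as a table property and parses it back.
        String json = SchemaParser.toJson(schema);
        Schema parsed = SchemaParser.fromJson(json);
        System.out.println(parsed.sameSchema(schema)); // prints: true
    }
}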

Example 40 with Schema

Use of org.apache.iceberg.Schema in project hive by apache.

The class TestInputFormatReaderDeletes, method rowSet.

@Override
public StructLikeSet rowSet(String name, Table table, String... columns) {
    InputFormatConfig.ConfigBuilder builder = new InputFormatConfig.ConfigBuilder(conf).readFrom(table.location());
    Schema projected = table.schema().select(columns);
    StructLikeSet set = StructLikeSet.create(projected.asStruct());
    set.addAll(TestIcebergInputFormats.TESTED_INPUT_FORMATS.stream()
        .filter(recordFactory -> recordFactory.name().equals(inputFormat))
        .map(recordFactory -> recordFactory.create(builder.project(projected).conf()).getRecords())
        .flatMap(List::stream)
        .map(record -> new InternalRecordWrapper(projected.asStruct()).wrap(record))
        .collect(Collectors.toList()));
    return set;
}
Also used : BaseTable(org.apache.iceberg.BaseTable) InternalRecordWrapper(org.apache.iceberg.data.InternalRecordWrapper) Table(org.apache.iceberg.Table) StructLikeSet(org.apache.iceberg.util.StructLikeSet) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) RunWith(org.junit.runner.RunWith) IOException(java.io.IOException) Schema(org.apache.iceberg.Schema) TestIcebergInputFormats(org.apache.iceberg.mr.hive.TestIcebergInputFormats) Collectors(java.util.stream.Collectors) FileFormat(org.apache.iceberg.FileFormat) File(java.io.File) TableMetadata(org.apache.iceberg.TableMetadata) List(java.util.List) TableOperations(org.apache.iceberg.TableOperations) Configuration(org.apache.hadoop.conf.Configuration) PartitionSpec(org.apache.iceberg.PartitionSpec) DeleteReadTests(org.apache.iceberg.data.DeleteReadTests) Assert(org.junit.Assert) Parameterized(org.junit.runners.Parameterized) Before(org.junit.Before)
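
Schema.select produces a projection that keeps the original field IDs, and StructLikeSet compares rows by field values rather than object identity, which is what makes the order-insensitive row comparison above work. A minimal sketch of both, using a made-up two-column schema:

import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.StructLikeSet;

import static org.apache.iceberg.types.Types.NestedField.required;

public class ProjectionSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
            required(1, "id", Types.LongType.get()),
            required(2, "data", Types.StringType.get()));
        // select() keeps only the named columns, preserving their field IDs.
        Schema projected = schema.select("id");
        // The set hashes and compares by field values, not by object identity.
        StructLikeSet set = StructLikeSet.create(projected.asStruct());
        Record row = GenericRecord.create(projected);
        row.set(0, 1L);
        set.add(row);
        System.out.println(set.size()); // prints: 1
    }
}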

Aggregations

Schema (org.apache.iceberg.Schema): 126
Test (org.junit.Test): 93
Record (org.apache.iceberg.data.Record): 68
Table (org.apache.iceberg.Table): 55
PartitionSpec (org.apache.iceberg.PartitionSpec): 39
GenericRecord (org.apache.iceberg.data.GenericRecord): 36
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 30
List (java.util.List): 21
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 20
IOException (java.io.IOException): 16
Types (org.apache.iceberg.types.Types): 16
ArrayList (java.util.ArrayList): 15
Map (java.util.Map): 14
HashMap (java.util.HashMap): 13
FileFormat (org.apache.iceberg.FileFormat): 13
UpdateSchema (org.apache.iceberg.UpdateSchema): 12
Path (org.apache.hadoop.fs.Path): 11
Collectors (java.util.stream.Collectors): 10
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
TestHelper (org.apache.iceberg.mr.TestHelper): 9