Example 51 with Schema

Use of org.apache.iceberg.Schema in project metacat by Netflix.

The class IcebergTableHandler, method update.

/**
 * Updates the Iceberg schema if the provided tableInfo contains updated field comments.
 *
 * @param tableInfo table information
 * @return true if an update was performed
 */
public boolean update(final TableInfo tableInfo) {
    boolean result = false;
    final List<FieldInfo> fields = tableInfo.getFields();
    // This parameter is only sent during a data change and not during a schema change.
    if (fields != null && !fields.isEmpty()
            && Strings.isNullOrEmpty(tableInfo.getMetadata().get(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION))) {
        final QualifiedName tableName = tableInfo.getName();
        final String tableMetadataLocation = HiveTableUtil.getIcebergTableMetadataLocation(tableInfo);
        if (Strings.isNullOrEmpty(tableMetadataLocation)) {
            final String message = String.format("No metadata location specified for table %s", tableName);
            log.error(message);
            throw new MetacatBadRequestException(message);
        }
        final IcebergMetastoreTables icebergMetastoreTables = new IcebergMetastoreTables(new IcebergTableOps(conf, tableMetadataLocation, connectorContext.getConfig(), icebergTableOpsProxy));
        final Table table = icebergMetastoreTables.loadTable(HiveTableUtil.qualifiedNameToTableIdentifier(tableName));
        final UpdateSchema updateSchema = table.updateSchema();
        final Schema schema = table.schema();
        for (FieldInfo field : fields) {
            final Types.NestedField iField = schema.findField(field.getName());
            if (iField != null && !Objects.equals(field.getComment(), iField.doc())) {
                updateSchema.updateColumnDoc(field.getName(), field.getComment());
                result = true;
            }
        }
        if (result) {
            updateSchema.commit();
            final String newTableMetadataLocation = icebergMetastoreTables.getTableOps().currentMetadataLocation();
            if (!tableMetadataLocation.equalsIgnoreCase(newTableMetadataLocation)) {
                tableInfo.getMetadata().put(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION, tableMetadataLocation);
                tableInfo.getMetadata().put(DirectSqlTable.PARAM_METADATA_LOCATION, newTableMetadataLocation);
            }
        }
    }
    return result;
}
Also used: Types(org.apache.iceberg.types.Types) DirectSqlTable(com.netflix.metacat.connector.hive.sql.DirectSqlTable) Table(org.apache.iceberg.Table) UpdateSchema(org.apache.iceberg.UpdateSchema) QualifiedName(com.netflix.metacat.common.QualifiedName) Schema(org.apache.iceberg.Schema) MetacatBadRequestException(com.netflix.metacat.common.exception.MetacatBadRequestException) FieldInfo(com.netflix.metacat.common.server.connectors.model.FieldInfo)
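
A minimal usage sketch for this method. The handler instance, table name, and metadata location below are hypothetical; it assumes metacat's Lombok-style builders on TableInfo and FieldInfo, and that HiveTableUtil resolves the metadata location from the table's metadata map:

// Hypothetical caller: push an updated column comment through the handler.
final Map<String, String> metadata = Maps.newHashMap();
metadata.put(DirectSqlTable.PARAM_METADATA_LOCATION, "s3://bucket/warehouse/mytable/metadata/00001.metadata.json");
final TableInfo tableInfo = TableInfo.builder()
    .name(QualifiedName.ofTable("prodhive", "mydb", "mytable"))
    .fields(Lists.newArrayList(FieldInfo.builder().name("event_ts").comment("event time in UTC").build()))
    .metadata(metadata)
    .build();
if (icebergTableHandler.update(tableInfo)) {
    // The metadata map now holds both the previous and the new metadata location,
    // ready to be written back to the Hive metastore.
}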

Example 52 with Schema

Use of org.apache.iceberg.Schema in project metacat by Netflix.

The class HiveTypeConverter, method icebergeSchemaTofieldDtos.

/**
 * Converts an Iceberg schema to a list of field DTOs.
 *
 * @param schema          the Iceberg schema
 * @param partitionFields the partition fields
 * @return list of FieldInfo
 */
public List<FieldInfo> icebergeSchemaTofieldDtos(final Schema schema, final List<PartitionField> partitionFields) {
    final List<FieldInfo> fields = Lists.newArrayList();
    final List<String> partitionNames = partitionFields.stream().map(f -> schema.findField(f.sourceId()).name()).collect(Collectors.toList());
    for (Types.NestedField field : schema.columns()) {
        final FieldInfo fieldInfo = new FieldInfo();
        fieldInfo.setName(field.name());
        final org.apache.iceberg.types.Type fieldType = field.type();
        fieldInfo.setSourceType(fieldType.toString());
        fieldInfo.setType(toMetacatType(fromIcebergToHiveType(fieldType)));
        fieldInfo.setIsNullable(field.isOptional());
        fieldInfo.setComment(field.doc());
        fieldInfo.setPartitionKey(partitionNames.contains(field.name()));
        fields.add(fieldInfo);
    }
    return fields;
}
Also used: DecimalType(com.netflix.metacat.common.type.DecimalType) TypeInfoUtils(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) Types(org.apache.iceberg.types.Types) PrimitiveCategory(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory) StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) Type(com.netflix.metacat.common.type.Type) ConnectorTypeConverter(com.netflix.metacat.common.server.connectors.ConnectorTypeConverter) PartitionField(org.apache.iceberg.PartitionField) ArrayList(java.util.ArrayList) FieldInfo(com.netflix.metacat.common.server.connectors.model.FieldInfo) CharType(com.netflix.metacat.common.type.CharType) Lists(com.google.common.collect.Lists) ImmutableList(com.google.common.collect.ImmutableList) VarcharType(com.netflix.metacat.common.type.VarcharType) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) TypeEnum(com.netflix.metacat.common.type.TypeEnum) ParametricType(com.netflix.metacat.common.type.ParametricType) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) org.apache.hadoop.hive.serde.serdeConstants(org.apache.hadoop.hive.serde.serdeConstants) BaseType(com.netflix.metacat.common.type.BaseType) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Schema(org.apache.iceberg.Schema) TypeUtils(com.netflix.metacat.common.type.TypeUtils) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) Collectors(java.util.stream.Collectors) TypeRegistry(com.netflix.metacat.common.type.TypeRegistry) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) Slf4j(lombok.extern.slf4j.Slf4j) List(java.util.List) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) TypeSignature(com.netflix.metacat.common.type.TypeSignature) RowType(com.netflix.metacat.common.type.RowType) CharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) MapType(com.netflix.metacat.common.type.MapType) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField)
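
A short usage sketch, assuming a no-arg HiveTypeConverter constructor; the schema and partition column are hypothetical, and PartitionSpec.builderFor(...).build().fields() supplies the PartitionField list:

// Hypothetical usage: convert an Iceberg schema plus its partition fields into metacat FieldInfos.
final Schema schema = new Schema(
    Types.NestedField.required(1, "id", Types.LongType.get(), "unique ID"),
    Types.NestedField.optional(2, "dateint", Types.IntegerType.get()));
final PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dateint").build();
final List<FieldInfo> fieldInfos = new HiveTypeConverter().icebergeSchemaTofieldDtos(schema, spec.fields());
// The FieldInfo for "dateint" has its partition-key flag set; the one for "id" does not.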

Example 53 with Schema

Use of org.apache.iceberg.Schema in project incubator-gobblin by Apache.

The class IcebergUtils, method getIcebergSchema.

/**
 * Given an Avro schema string and a Hive table, calculate the Iceberg table schema and the
 * partition schema. (E.g. if 'datepartition' is the partition column and is not part of the
 * data schema, that column must be added to the data schema to construct the table schema.)
 */
public static IcebergDataAndPartitionSchema getIcebergSchema(String schema, org.apache.hadoop.hive.metastore.api.Table table) {
    org.apache.iceberg.shaded.org.apache.avro.Schema icebergDataSchema = new org.apache.iceberg.shaded.org.apache.avro.Schema.Parser().parse(schema);
    Types.StructType dataStructType = AvroSchemaUtil.convert(icebergDataSchema).asStructType();
    List<Types.NestedField> dataFields = Lists.newArrayList(dataStructType.fields());
    org.apache.iceberg.shaded.org.apache.avro.Schema icebergPartitionSchema = parseSchemaFromCols(table.getPartitionKeys(), table.getDbName(), table.getTableName(), true);
    Types.StructType partitionStructType = AvroSchemaUtil.convert(icebergPartitionSchema).asStructType();
    List<Types.NestedField> partitionFields = partitionStructType.fields();
    Preconditions.checkArgument(partitionFields.stream().allMatch(f -> f.type().isPrimitiveType()), "Only primitive fields are supported for partition columns");
    dataFields.addAll(partitionFields);
    Types.StructType updatedStructType = Types.StructType.of(dataFields);
    updatedStructType = (Types.StructType) TypeUtil.assignFreshIds(updatedStructType, new AtomicInteger(0)::incrementAndGet);
    return new IcebergDataAndPartitionSchema(new org.apache.iceberg.Schema(updatedStructType.fields()), new org.apache.iceberg.Schema(partitionFields));
}
Also used: TypeInfoUtils(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils) TypeUtil(org.apache.iceberg.types.TypeUtil) Types(org.apache.iceberg.types.Types) HashMap(java.util.HashMap) Metrics(org.apache.iceberg.Metrics) StructLike(org.apache.iceberg.StructLike) ByteBuffer(java.nio.ByteBuffer) ArrayList(java.util.ArrayList) Lists(com.google.common.collect.Lists) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DataFiles(org.apache.iceberg.DataFiles) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) IntegerLongPair(org.apache.gobblin.metadata.IntegerLongPair) DataFile(org.apache.iceberg.DataFile) RuntimeIOException(org.apache.iceberg.exceptions.RuntimeIOException) State(org.apache.gobblin.configuration.State) IOException(java.io.IOException) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) FileFormat(org.apache.iceberg.FileFormat) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) AvroSchemaUtil(org.apache.iceberg.avro.AvroSchemaUtil) Conversions(org.apache.iceberg.types.Conversions) IntegerBytesPair(org.apache.gobblin.metadata.IntegerBytesPair) PartitionSpec(org.apache.iceberg.PartitionSpec) Preconditions(com.google.common.base.Preconditions) Collections(java.util.Collections)
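
A sketch of a call site, assuming a minimal Thrift Table carrying a single string partition key; the Avro schema string and all names are illustrative:

// Hypothetical usage: derive the Iceberg data and partition schemas from an Avro schema string
// and a Hive metastore table that supplies the partition keys.
final String avroSchema =
    "{\"type\":\"record\",\"name\":\"evt\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}";
final org.apache.hadoop.hive.metastore.api.Table hiveTable = new org.apache.hadoop.hive.metastore.api.Table();
hiveTable.setDbName("db");
hiveTable.setTableName("events");
hiveTable.setPartitionKeys(Collections.singletonList(new FieldSchema("datepartition", "string", null)));
final IcebergDataAndPartitionSchema schemas = IcebergUtils.getIcebergSchema(avroSchema, hiveTable);
// schemas now carries the combined data-plus-partition table schema and the partition-only schema.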

Example 54 with Schema

Use of org.apache.iceberg.Schema in project hive by Apache.

The class HiveTableTest, method testColumnTypeChangeInMetastore.

@Test
public void testColumnTypeChangeInMetastore() throws TException {
    Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
    Schema expectedSchema = new Schema(Types.StructType.of(required(1, "id", Types.LongType.get()), optional(2, "data", Types.LongType.get()), optional(3, "string", Types.StringType.get()), optional(4, "int", Types.IntegerType.get())).fields());
    // Add columns with different types, then verify we can delete one column in the Hive metastore,
    // since the hive conf METASTORE_DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES was set to false. If it were
    // set to true, an InvalidOperationException would be thrown in MetaStoreUtils#throwExceptionIfIncompatibleColTypeChange().
    icebergTable.updateSchema().addColumn("data", Types.LongType.get()).addColumn("string", Types.StringType.get()).addColumn("int", Types.IntegerType.get()).commit();
    Assert.assertEquals("Schema should match expected", expectedSchema.asStruct(), icebergTable.schema().asStruct());
    expectedSchema = new Schema(Types.StructType.of(required(1, "id", Types.LongType.get()), optional(2, "data", Types.LongType.get()), optional(4, "int", Types.IntegerType.get())).fields());
    icebergTable.updateSchema().deleteColumn("string").commit();
    Assert.assertEquals("Schema should match expected", expectedSchema.asStruct(), icebergTable.schema().asStruct());
}
Also used: Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Test(org.junit.Test)
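
The deleteColumn() call only succeeds because the embedded metastore permits incompatible column type changes. A sketch of that piece of configuration follows; the HiveConf variable is real, but wiring it into this test's catalog setup is assumed:

// Test-setup assumption: permit incompatible column type changes in the metastore.
final HiveConf hiveConf = new HiveConf();
hiveConf.setBoolVar(HiveConf.ConfVars.METASTORE_DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES, false);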

Example 55 with Schema

Use of org.apache.iceberg.Schema in project hive by Apache.

The class TestHiveCatalog, method testCreateTableBuilder.

@Test
public void testCreateTableBuilder() throws Exception {
    Schema schema = new Schema(required(1, "id", Types.IntegerType.get(), "unique ID"), required(2, "data", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build();
    TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl");
    String location = temp.newFolder("tbl").toString();
    try {
        Table table = catalog.buildTable(tableIdent, schema).withPartitionSpec(spec).withLocation(location).withProperty("key1", "value1").withProperty("key2", "value2").create();
        Assert.assertEquals(location, table.location());
        Assert.assertEquals(2, table.schema().columns().size());
        Assert.assertEquals(1, table.spec().fields().size());
        Assert.assertEquals("value1", table.properties().get("key1"));
        Assert.assertEquals("value2", table.properties().get("key2"));
    } finally {
        catalog.dropTable(tableIdent);
    }
}
Also used: TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
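
For comparison, the same table can be created in a single call through the Catalog interface's createTable overload; the properties map below is illustrative:

// Equivalent creation without the fluent builder.
final Map<String, String> properties = ImmutableMap.of("key1", "value1", "key2", "value2");
final Table table = catalog.createTable(tableIdent, schema, spec, location, properties);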

Aggregations

Schema (org.apache.iceberg.Schema): 126 usages
Test (org.junit.Test): 93
Record (org.apache.iceberg.data.Record): 68
Table (org.apache.iceberg.Table): 55
PartitionSpec (org.apache.iceberg.PartitionSpec): 39
GenericRecord (org.apache.iceberg.data.GenericRecord): 36
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 30
List (java.util.List): 21
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 20
IOException (java.io.IOException): 16
Types (org.apache.iceberg.types.Types): 16
ArrayList (java.util.ArrayList): 15
Map (java.util.Map): 14
HashMap (java.util.HashMap): 13
FileFormat (org.apache.iceberg.FileFormat): 13
UpdateSchema (org.apache.iceberg.UpdateSchema): 12
Path (org.apache.hadoop.fs.Path): 11
Collectors (java.util.stream.Collectors): 10
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
TestHelper (org.apache.iceberg.mr.TestHelper): 9