Example 1 with Schema

use of org.apache.iceberg.Schema in project drill by apache.

the class IcebergTableSchema method of.

/**
 * Based on given class fields annotated with {@link MetastoreFieldDefinition}
 * generates Iceberg table schema and its partition specification.
 *
 * @param clazz base class for Iceberg schema
 * @param partitionKeys list of partition keys
 * @return instance of Iceberg table schema
 */
public static IcebergTableSchema of(Class<?> clazz, List<MetastoreColumn> partitionKeys) {
    List<Types.NestedField> tableSchemaFields = new ArrayList<>();
    Types.NestedField[] partitionSpecSchemaFields = new Types.NestedField[partitionKeys.size()];
    int schemaIndex = STARTING_SCHEMA_INDEX;
    int complexTypesIndex = STARTING_COMPLEX_TYPES_INDEX;
    for (Field field : clazz.getDeclaredFields()) {
        MetastoreFieldDefinition definition = field.getAnnotation(MetastoreFieldDefinition.class);
        if (definition == null) {
            continue;
        }
        MetastoreColumn column = definition.column();
        String typeSimpleName = field.getType().getSimpleName().toLowerCase();
        org.apache.iceberg.types.Type icebergType = JAVA_TO_ICEBERG_TYPE_MAP.get(typeSimpleName);
        if (icebergType == null && field.getAnnotatedType().getType() instanceof ParameterizedType) {
            Type[] actualTypeArguments = ((ParameterizedType) field.getAnnotatedType().getType()).getActualTypeArguments();
            switch(typeSimpleName) {
                case "list":
                    org.apache.iceberg.types.Type listIcebergType = getGenericsType(actualTypeArguments[0]);
                    icebergType = Types.ListType.ofOptional(complexTypesIndex++, listIcebergType);
                    break;
                case "map":
                    org.apache.iceberg.types.Type keyIcebergType = getGenericsType(actualTypeArguments[0]);
                    org.apache.iceberg.types.Type valueIcebergType = getGenericsType(actualTypeArguments[1]);
                    icebergType = Types.MapType.ofOptional(complexTypesIndex++, complexTypesIndex++, keyIcebergType, valueIcebergType);
                    break;
                default:
                    throw new IcebergMetastoreException(String.format("Unexpected parametrized type for class [%s]: %s", clazz.getCanonicalName(), typeSimpleName));
            }
        }
        if (icebergType == null) {
            throw new IcebergMetastoreException(String.format("Unexpected type for class [%s]: %s", clazz.getCanonicalName(), typeSimpleName));
        }
        Types.NestedField icebergField = Types.NestedField.optional(schemaIndex++, column.columnName(), icebergType);
        tableSchemaFields.add(icebergField);
        int partitionIndex = partitionKeys.indexOf(column);
        if (partitionIndex != -1) {
            partitionSpecSchemaFields[partitionIndex] = icebergField;
        }
    }
    if (Stream.of(partitionSpecSchemaFields).anyMatch(Objects::isNull)) {
        throw new IcebergMetastoreException(String.format("Some of partition fields are missing in the class [%s]. Partition keys: %s. Partition values: %s.", clazz.getCanonicalName(), partitionKeys, Arrays.asList(partitionSpecSchemaFields)));
    }
    Schema tableSchema = new Schema(tableSchemaFields);
    PartitionSpec partitionSpec = buildPartitionSpec(partitionSpecSchemaFields);
    logger.debug("Constructed Iceberg table schema for class [{}]. Table schema : {}. Partition spec: {}.", clazz.getCanonicalName(), tableSchema, partitionSpec);
    return new IcebergTableSchema(tableSchema, partitionSpec);
}
Also used : IcebergMetastoreException(org.apache.drill.metastore.iceberg.exceptions.IcebergMetastoreException) Types(org.apache.iceberg.types.Types) Schema(org.apache.iceberg.Schema) ArrayList(java.util.ArrayList) MetastoreFieldDefinition(org.apache.drill.metastore.MetastoreFieldDefinition) PartitionSpec(org.apache.iceberg.PartitionSpec) MetastoreColumn(org.apache.drill.metastore.MetastoreColumn) ParameterizedType(java.lang.reflect.ParameterizedType) Field(java.lang.reflect.Field) ParameterizedType(java.lang.reflect.ParameterizedType) Type(java.lang.reflect.Type) Objects(java.util.Objects)
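
To make the two index counters concrete, here is a minimal sketch (not from the Drill sources) of the schema that of(...) would assemble for a class with a String field and a List<String> field. The field names and the starting id values are assumptions for illustration; only the Types API calls mirror the method above.

static Schema exampleSchema() {
    // Top-level fields take consecutive ids from STARTING_SCHEMA_INDEX (assumed to be 1 here).
    Types.NestedField plugin = Types.NestedField.optional(1, "storagePlugin", Types.StringType.get());
    // Element/key/value types of lists and maps draw ids from the separate
    // STARTING_COMPLEX_TYPES_INDEX counter (assumed to be 10_000 here), so nested
    // type ids never collide with top-level field ids.
    Types.NestedField columns = Types.NestedField.optional(2, "interestingColumns",
        Types.ListType.ofOptional(10_000, Types.StringType.get()));
    return new Schema(Arrays.asList(plugin, columns));
}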

Example 2 with Schema

use of org.apache.iceberg.Schema in project drill by apache.

the class TestIcebergTableSchema method testPartitionedPartitionSpec.

@Test
public void testPartitionedPartitionSpec() {
    Class<?> clazz = new ClassGenerator(getClass().getSimpleName() + "PartitionedPartitionSpec") {

        @Override
        void addFields(ClassWriter classWriter) {
            FieldVisitor partKey1 = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.STORAGE_PLUGIN, String.class);
            annotate(partKey1, MetastoreColumn.STORAGE_PLUGIN, MetadataType.ALL);
            FieldVisitor partKey2 = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.WORKSPACE, String.class);
            annotate(partKey2, MetastoreColumn.WORKSPACE, MetadataType.ALL);
            FieldVisitor partKey3 = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.TABLE_NAME, String.class);
            annotate(partKey3, MetastoreColumn.TABLE_NAME, MetadataType.ALL);
            FieldVisitor integerField = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.ROW_GROUP_INDEX, Integer.class);
            annotate(integerField, MetastoreColumn.ROW_GROUP_INDEX, MetadataType.ROW_GROUP);
            FieldVisitor booleanField = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.OWNER, Boolean.class);
            annotate(booleanField, MetastoreColumn.OWNER, MetadataType.TABLE);
        }
    }.generate();
    IcebergTableSchema schema = IcebergTableSchema.of(clazz, Arrays.asList(MetastoreColumn.STORAGE_PLUGIN, MetastoreColumn.WORKSPACE, MetastoreColumn.TABLE_NAME));
    Types.NestedField partKey1 = schema.tableSchema().findField(MetastoreColumn.STORAGE_PLUGIN.columnName());
    assertNotNull(partKey1);
    Types.NestedField partKey2 = schema.tableSchema().findField(MetastoreColumn.WORKSPACE.columnName());
    assertNotNull(partKey2);
    Types.NestedField partKey3 = schema.tableSchema().findField(MetastoreColumn.TABLE_NAME.columnName());
    assertNotNull(partKey3);
    assertNotNull(schema.tableSchema().findField(MetastoreColumn.ROW_GROUP_INDEX.columnName()));
    assertNotNull(schema.tableSchema().findField(MetastoreColumn.OWNER.columnName()));
    Schema partitionSchema = new Schema(partKey1, partKey2, partKey3);
    PartitionSpec expectedPartitionSpec = PartitionSpec.builderFor(partitionSchema).identity(partKey1.name()).identity(partKey2.name()).identity(partKey3.name()).build();
    assertEquals(expectedPartitionSpec, schema.partitionSpec());
}
Also used : Types(org.apache.iceberg.types.Types) Schema(org.apache.iceberg.Schema) FieldVisitor(org.objectweb.asm.FieldVisitor) PartitionSpec(org.apache.iceberg.PartitionSpec) ClassWriter(org.objectweb.asm.ClassWriter) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) Test(org.junit.Test)
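
The buildPartitionSpec(...) helper invoked at the end of Example 1 is not shown in these snippets. Given the expected spec asserted above (one identity(...) transform per partition field), a plausible shape for it is the sketch below; treat the exact body as an assumption, not the actual Drill source.

static PartitionSpec buildPartitionSpec(Types.NestedField[] partitionFields) {
    // Identity-partition on every collected partition field, in partition-key order.
    Schema schema = new Schema(Arrays.asList(partitionFields));
    PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);
    for (Types.NestedField field : partitionFields) {
        builder.identity(field.name());
    }
    return builder.build();
}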

Example 3 with Schema

use of org.apache.iceberg.Schema in project drill by apache.

the class TestParquetFileWriter method testTypeMismatch.

@Test
public void testTypeMismatch() throws Exception {
    Schema schema = new Schema(Types.NestedField.optional(1, "int_field", Types.IntegerType.get()));
    Record record = GenericRecord.create(schema);
    record.setField("int_field", 1);
    record.setField("int_field", "abc");
    String location = defaultFolder.newFolder("testTypeMismatch").toURI().getPath();
    Table table = tables.create(schema, location);
    thrown.expect(IcebergMetastoreException.class);
    new ParquetFileWriter(table).records(Collections.singletonList(record)).location(location).name("typeMismatch").write();
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) Test(org.junit.Test)
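
The shared tables fixture is not shown in these snippets. Judging by the Tables, HadoopTables, and BeforeClass imports listed under Example 5, it is presumably initialized along these lines (a sketch, not the test's exact code):

private static Tables tables;

@BeforeClass
public static void init() {
    // HadoopTables resolves table locations as plain filesystem paths;
    // the no-arg constructor falls back to a default Hadoop Configuration.
    tables = new HadoopTables();
}

Note that GenericRecord.setField(...) performs no type checking, so assigning "abc" to int_field succeeds; the mismatch only surfaces when write() tries to encode the value as an Iceberg IntegerType, which is why the expected IcebergMetastoreException is registered immediately before the write.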

Example 4 with Schema

use of org.apache.iceberg.Schema in project drill by apache.

the class TestParquetFileWriter method testAllTypes.

@Test
public void testAllTypes() throws Exception {
    Schema schema = new Schema(
        Types.NestedField.optional(1, "int_field", Types.IntegerType.get()),
        Types.NestedField.optional(2, "long_field", Types.LongType.get()),
        Types.NestedField.optional(3, "float_field", Types.FloatType.get()),
        Types.NestedField.optional(4, "double_field", Types.DoubleType.get()),
        Types.NestedField.optional(5, "string_field", Types.StringType.get()),
        Types.NestedField.optional(6, "boolean_field", Types.BooleanType.get()),
        Types.NestedField.optional(7, "list_field", Types.ListType.ofOptional(9, Types.StringType.get())),
        Types.NestedField.optional(8, "map_field", Types.MapType.ofOptional(10, 11, Types.StringType.get(), Types.FloatType.get())));
    List<String> listValue = Arrays.asList("a", "b", "c");
    Map<String, Float> mapValue = new HashMap<>();
    mapValue.put("a", 0.1F);
    mapValue.put("b", 0.2F);
    Record record = GenericRecord.create(schema);
    record.setField("int_field", 1);
    record.setField("long_field", 100L);
    record.setField("float_field", 0.5F);
    record.setField("double_field", 1.5D);
    record.setField("string_field", "abc");
    record.setField("boolean_field", true);
    record.setField("list_field", listValue);
    record.setField("map_field", mapValue);
    String location = defaultFolder.newFolder("testAllTypes").toURI().getPath();
    String fileName = "allTypes";
    Table table = tables.create(schema, location);
    org.apache.drill.metastore.iceberg.write.File result = new ParquetFileWriter(table).records(Collections.singletonList(record)).location(location).name(fileName).write();
    String writePath = new Path(location, FileFormat.PARQUET.addExtension(fileName)).toUri().getPath();
    assertEquals(new Path(FileFormat.PARQUET.addExtension(writePath)), new Path(result.location()));
    assertEquals(Long.valueOf(1), result.metrics().recordCount());
    List<Record> rows = readData(result.input(), schema);
    assertEquals(1, rows.size());
    Record row = rows.get(0);
    assertEquals(1, row.getField("int_field"));
    assertEquals(100L, row.getField("long_field"));
    assertEquals(0.5F, row.getField("float_field"));
    assertEquals(1.5D, row.getField("double_field"));
    assertEquals("abc", row.getField("string_field"));
    assertEquals(true, row.getField("boolean_field"));
    assertEquals(listValue, row.getField("list_field"));
    assertEquals(mapValue, row.getField("map_field"));
}
Also used : Path(org.apache.hadoop.fs.Path) Table(org.apache.iceberg.Table) HashMap(java.util.HashMap) Schema(org.apache.iceberg.Schema) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) Test(org.junit.Test)
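
The readData(...) helper is also not shown here. Below is a minimal sketch that matches the reader-side imports listed under Example 5 (Parquet, GenericParquetReaders, CloseableIterable, InputFile, Lists); the exact body is an assumption.

static List<Record> readData(InputFile inputFile, Schema schema) throws IOException {
    // Project the original schema and materialize rows as generic Records.
    try (CloseableIterable<Record> reader = Parquet.read(inputFile)
        .project(schema)
        .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema))
        .build()) {
        return Lists.newArrayList(reader);
    }
}

One write-path detail: FileFormat.addExtension(...) returns its argument unchanged when the name already ends in ".parquet", so applying it to both fileName and writePath in the assertions above does not double the extension.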

Example 5 with Schema

use of org.apache.iceberg.Schema in project drill by apache.

the class TestParquetFileWriter method testSeveralRecords.

@Test
public void testSeveralRecords() throws Exception {
    int fieldIndex = 1;
    Schema schema = new Schema(Types.NestedField.optional(fieldIndex, "int_field", Types.IntegerType.get()));
    List<Integer> values = Arrays.asList(1, 2, 3, 3, null, null, null);
    List<Record> records = values.stream().map(value -> {
        Record record = GenericRecord.create(schema);
        record.setField("int_field", value);
        return record;
    }).collect(Collectors.toList());
    String location = defaultFolder.newFolder("testSeveralRecords").toURI().getPath();
    Table table = tables.create(schema, location);
    org.apache.drill.metastore.iceberg.write.File result = new ParquetFileWriter(table).records(records).location(location).name("severalRecords").write();
    assertEquals(Long.valueOf(7), result.metrics().recordCount());
    assertEquals(Long.valueOf(7), result.metrics().valueCounts().get(fieldIndex));
    assertEquals(Long.valueOf(3), result.metrics().nullValueCounts().get(fieldIndex));
    List<Record> rows = readData(result.input(), schema);
    assertEquals(7, rows.size());
    List<Integer> actual = rows.stream().map(row -> (Integer) row.getField("int_field")).collect(Collectors.toList());
    assertEquals(values, actual);
}
Also used : Tables(org.apache.iceberg.Tables) Arrays(java.util.Arrays) Types(org.apache.iceberg.types.Types) BeforeClass(org.junit.BeforeClass) HashMap(java.util.HashMap) IcebergMetastoreException(org.apache.drill.metastore.iceberg.exceptions.IcebergMetastoreException) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) GenericRecord(org.apache.iceberg.data.GenericRecord) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) CloseableIterable(org.apache.iceberg.io.CloseableIterable) Files(java.nio.file.Files) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Parquet(org.apache.iceberg.parquet.Parquet) Test(org.junit.Test) IOException(java.io.IOException) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) Collectors(java.util.stream.Collectors) File(java.io.File) Record(org.apache.iceberg.data.Record) List(java.util.List) Lists(org.apache.drill.shaded.guava.com.google.common.collect.Lists) Assert.assertNull(org.junit.Assert.assertNull) Paths(java.nio.file.Paths) GenericParquetReaders(org.apache.iceberg.data.parquet.GenericParquetReaders) InputFile(org.apache.iceberg.io.InputFile) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) Test(org.junit.Test)
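
A note on the metrics assertions: result.metrics() presumably exposes Iceberg's org.apache.iceberg.Metrics (the recordCount(), valueCounts(), and nullValueCounts() signatures match), and those count maps are keyed by schema field id rather than column name. A short sketch of what the lookups above compute:

// Metrics maps are keyed by field id, which is why the test indexes with fieldIndex.
Map<Integer, Long> valueCounts = result.metrics().valueCounts();
Map<Integer, Long> nullValueCounts = result.metrics().nullValueCounts();
// 7 total values, 3 of them null, leaves 4 non-null entries for int_field:
long nonNullCount = valueCounts.get(fieldIndex) - nullValueCounts.get(fieldIndex);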

Aggregations

Schema (org.apache.iceberg.Schema): 126
Test (org.junit.Test): 93
Record (org.apache.iceberg.data.Record): 68
Table (org.apache.iceberg.Table): 55
PartitionSpec (org.apache.iceberg.PartitionSpec): 39
GenericRecord (org.apache.iceberg.data.GenericRecord): 36
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 30
List (java.util.List): 21
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 20
IOException (java.io.IOException): 16
Types (org.apache.iceberg.types.Types): 16
ArrayList (java.util.ArrayList): 15
Map (java.util.Map): 14
HashMap (java.util.HashMap): 13
FileFormat (org.apache.iceberg.FileFormat): 13
UpdateSchema (org.apache.iceberg.UpdateSchema): 12
Path (org.apache.hadoop.fs.Path): 11
Collectors (java.util.stream.Collectors): 10
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
TestHelper (org.apache.iceberg.mr.TestHelper): 9