Example usage of org.apache.iceberg.Schema in the Apache Drill project:
the IcebergTableSchema class, method of.
/**
 * Based on given class fields annotated with {@link MetastoreFieldDefinition}
 * generates Iceberg table schema and its partition specification.
 * Only annotated fields become schema columns, and every generated Iceberg
 * field is optional. Parameterized field types are supported for List and Map
 * only; any other parameterized type is rejected.
 *
 * @param clazz base class for Iceberg schema
 * @param partitionKeys list of partition keys; each key must match an
 *                      annotated field of {@code clazz}
 * @return instance of Iceberg table schema
 * @throws IcebergMetastoreException if a field type cannot be mapped to an
 *         Iceberg type, or if any partition key has no matching field
 */
public static IcebergTableSchema of(Class<?> clazz, List<MetastoreColumn> partitionKeys) {
List<Types.NestedField> tableSchemaFields = new ArrayList<>();
// Slots are filled by the column's position in partitionKeys, so the spec
// preserves the caller-supplied key order.
Types.NestedField[] partitionSpecSchemaFields = new Types.NestedField[partitionKeys.size()];
// Top-level fields and nested (complex) type elements draw ids from two
// separate counters; presumably the starting constants are spaced to avoid
// id clashes — they are defined elsewhere in this class.
int schemaIndex = STARTING_SCHEMA_INDEX;
int complexTypesIndex = STARTING_COMPLEX_TYPES_INDEX;
for (Field field : clazz.getDeclaredFields()) {
MetastoreFieldDefinition definition = field.getAnnotation(MetastoreFieldDefinition.class);
if (definition == null) {
// Fields without the metastore annotation are not part of the schema.
continue;
}
MetastoreColumn column = definition.column();
String typeSimpleName = field.getType().getSimpleName().toLowerCase();
org.apache.iceberg.types.Type icebergType = JAVA_TO_ICEBERG_TYPE_MAP.get(typeSimpleName);
// Not a directly mapped simple type: try to resolve List/Map generic arguments.
if (icebergType == null && field.getAnnotatedType().getType() instanceof ParameterizedType) {
Type[] actualTypeArguments = ((ParameterizedType) field.getAnnotatedType().getType()).getActualTypeArguments();
switch(typeSimpleName) {
case "list":
org.apache.iceberg.types.Type listIcebergType = getGenericsType(actualTypeArguments[0]);
icebergType = Types.ListType.ofOptional(complexTypesIndex++, listIcebergType);
break;
case "map":
org.apache.iceberg.types.Type keyIcebergType = getGenericsType(actualTypeArguments[0]);
org.apache.iceberg.types.Type valueIcebergType = getGenericsType(actualTypeArguments[1]);
// Java evaluates arguments left to right, so the key id is assigned
// before the value id.
icebergType = Types.MapType.ofOptional(complexTypesIndex++, complexTypesIndex++, keyIcebergType, valueIcebergType);
break;
default:
throw new IcebergMetastoreException(String.format("Unexpected parametrized type for class [%s]: %s", clazz.getCanonicalName(), typeSimpleName));
}
}
if (icebergType == null) {
throw new IcebergMetastoreException(String.format("Unexpected type for class [%s]: %s", clazz.getCanonicalName(), typeSimpleName));
}
Types.NestedField icebergField = Types.NestedField.optional(schemaIndex++, column.columnName(), icebergType);
tableSchemaFields.add(icebergField);
// If this column is one of the partition keys, record it at the key's position.
int partitionIndex = partitionKeys.indexOf(column);
if (partitionIndex != -1) {
partitionSpecSchemaFields[partitionIndex] = icebergField;
}
}
// Every requested partition key must have been matched by some annotated field.
if (Stream.of(partitionSpecSchemaFields).anyMatch(Objects::isNull)) {
throw new IcebergMetastoreException(String.format("Some of partition fields are missing in the class [%s]. Partition keys: %s. Partition values: %s.", clazz.getCanonicalName(), partitionKeys, Arrays.asList(partitionSpecSchemaFields)));
}
Schema tableSchema = new Schema(tableSchemaFields);
PartitionSpec partitionSpec = buildPartitionSpec(partitionSpecSchemaFields);
logger.debug("Constructed Iceberg table schema for class [{}]. Table schema : {}. Partition spec: {}.", clazz.getCanonicalName(), tableSchema, partitionSpec);
return new IcebergTableSchema(tableSchema, partitionSpec);
}
Example usage of org.apache.iceberg.Schema in the Apache Drill project:
the TestIcebergTableSchema class, method testPartitionedPartitionSpec.
/**
 * Verifies that {@code IcebergTableSchema.of} builds an identity partition
 * spec whose fields follow the order of the supplied partition keys, while
 * non-partition fields still appear in the table schema.
 */
@Test
public void testPartitionedPartitionSpec() {
Class<?> clazz = new ClassGenerator(getClass().getSimpleName() + "PartitionedPartitionSpec") {
@Override
void addFields(ClassWriter classWriter) {
FieldVisitor partKey1 = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.STORAGE_PLUGIN, String.class);
annotate(partKey1, MetastoreColumn.STORAGE_PLUGIN, MetadataType.ALL);
FieldVisitor partKey2 = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.WORKSPACE, String.class);
annotate(partKey2, MetastoreColumn.WORKSPACE, MetadataType.ALL);
FieldVisitor partKey3 = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.TABLE_NAME, String.class);
annotate(partKey3, MetastoreColumn.TABLE_NAME, MetadataType.ALL);
FieldVisitor integerField = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.ROW_GROUP_INDEX, Integer.class);
annotate(integerField, MetastoreColumn.ROW_GROUP_INDEX, MetadataType.ROW_GROUP);
// Renamed from the misleading "stringField": this field is generated
// with Boolean.class, matching the naming of integerField above.
FieldVisitor booleanField = addField(classWriter, Opcodes.ACC_PRIVATE, MetastoreColumn.OWNER, Boolean.class);
annotate(booleanField, MetastoreColumn.OWNER, MetadataType.TABLE);
}
}.generate();
IcebergTableSchema schema = IcebergTableSchema.of(clazz, Arrays.asList(MetastoreColumn.STORAGE_PLUGIN, MetastoreColumn.WORKSPACE, MetastoreColumn.TABLE_NAME));
// All three partition key columns must be present in the table schema.
Types.NestedField partKey1 = schema.tableSchema().findField(MetastoreColumn.STORAGE_PLUGIN.columnName());
assertNotNull(partKey1);
Types.NestedField partKey2 = schema.tableSchema().findField(MetastoreColumn.WORKSPACE.columnName());
assertNotNull(partKey2);
Types.NestedField partKey3 = schema.tableSchema().findField(MetastoreColumn.TABLE_NAME.columnName());
assertNotNull(partKey3);
// Non-partition columns are part of the table schema as well.
assertNotNull(schema.tableSchema().findField(MetastoreColumn.ROW_GROUP_INDEX.columnName()));
assertNotNull(schema.tableSchema().findField(MetastoreColumn.OWNER.columnName()));
// Expected spec: identity transform on each key, in partition-key order.
Schema partitionSchema = new Schema(partKey1, partKey2, partKey3);
PartitionSpec expectedPartitionSpec = PartitionSpec.builderFor(partitionSchema).identity(partKey1.name()).identity(partKey2.name()).identity(partKey3.name()).build();
assertEquals(expectedPartitionSpec, schema.partitionSpec());
}
Example usage of org.apache.iceberg.Schema in the Apache Drill project:
the TestParquetFileWriter class, method testTypeMismatch.
/**
 * Verifies that writing a record whose field value does not match the
 * declared Iceberg column type fails with {@code IcebergMetastoreException}.
 */
@Test
public void testTypeMismatch() throws Exception {
// Single optional integer column.
Schema schema = new Schema(Types.NestedField.optional(1, "int_field", Types.IntegerType.get()));
Record record = GenericRecord.create(schema);
record.setField("int_field", 1);
// Overwrite with a String value, violating the declared integer type.
record.setField("int_field", "abc");
String location = defaultFolder.newFolder("testTypeMismatch").toURI().getPath();
Table table = tables.create(schema, location);
thrown.expect(IcebergMetastoreException.class);
ParquetFileWriter writer = new ParquetFileWriter(table);
writer.records(Collections.singletonList(record)).location(location).name("typeMismatch").write();
}
Example usage of org.apache.iceberg.Schema in the Apache Drill project:
the TestParquetFileWriter class, method testAllTypes.
/**
 * Writes one record containing every supported column type (int, long,
 * float, double, string, boolean, list, map) and reads it back, asserting
 * each field round-trips unchanged.
 */
@Test
public void testAllTypes() throws Exception {
// Ids 1-8 are the top-level columns; 9-11 are the nested list/map element ids.
Schema schema = new Schema(Types.NestedField.optional(1, "int_field", Types.IntegerType.get()), Types.NestedField.optional(2, "long_field", Types.LongType.get()), Types.NestedField.optional(3, "float_field", Types.FloatType.get()), Types.NestedField.optional(4, "double_field", Types.DoubleType.get()), Types.NestedField.optional(5, "string_field", Types.StringType.get()), Types.NestedField.optional(6, "boolean_field", Types.BooleanType.get()), Types.NestedField.optional(7, "list_field", Types.ListType.ofOptional(9, Types.StringType.get())), Types.NestedField.optional(8, "map_field", Types.MapType.ofOptional(10, 11, Types.StringType.get(), Types.FloatType.get())));
List<String> listValue = Arrays.asList("a", "b", "c");
Map<String, Float> mapValue = new HashMap<>();
mapValue.put("a", 0.1F);
mapValue.put("b", 0.2F);
Record record = GenericRecord.create(schema);
record.setField("int_field", 1);
record.setField("long_field", 100L);
record.setField("float_field", 0.5F);
record.setField("double_field", 1.5D);
record.setField("string_field", "abc");
record.setField("boolean_field", true);
record.setField("list_field", listValue);
record.setField("map_field", mapValue);
String location = defaultFolder.newFolder("testAllTypes").toURI().getPath();
String fileName = "allTypes";
Table table = tables.create(schema, location);
org.apache.drill.metastore.iceberg.write.File result = new ParquetFileWriter(table).records(Collections.singletonList(record)).location(location).name(fileName).write();
String writePath = new Path(location, FileFormat.PARQUET.addExtension(fileName)).toUri().getPath();
// NOTE(review): addExtension is applied twice — once for writePath above and
// again here — so the expected location ends in ".parquet.parquet". Looks
// suspicious; confirm against ParquetFileWriter's location convention.
assertEquals(new Path(FileFormat.PARQUET.addExtension(writePath)), new Path(result.location()));
assertEquals(Long.valueOf(1), result.metrics().recordCount());
// Read the file back and verify every field value round-tripped.
List<Record> rows = readData(result.input(), schema);
assertEquals(1, rows.size());
Record row = rows.get(0);
assertEquals(1, row.getField("int_field"));
assertEquals(100L, row.getField("long_field"));
assertEquals(0.5F, row.getField("float_field"));
assertEquals(1.5D, row.getField("double_field"));
assertEquals("abc", row.getField("string_field"));
assertEquals(true, row.getField("boolean_field"));
assertEquals(listValue, row.getField("list_field"));
assertEquals(mapValue, row.getField("map_field"));
}
Example usage of org.apache.iceberg.Schema in the Apache Drill project:
the TestParquetFileWriter class, method testSeveralRecords.
/**
 * Writes several records — including duplicates and nulls — and checks the
 * collected metrics (record count, value count, null count) as well as the
 * round-tripped values in their original order.
 */
@Test
public void testSeveralRecords() throws Exception {
int fieldIndex = 1;
Schema schema = new Schema(Types.NestedField.optional(fieldIndex, "int_field", Types.IntegerType.get()));
// Mix of distinct values, a duplicate, and three nulls: 7 rows total.
List<Integer> values = Arrays.asList(1, 2, 3, 3, null, null, null);
List<Record> records = new ArrayList<>();
for (Integer value : values) {
Record record = GenericRecord.create(schema);
record.setField("int_field", value);
records.add(record);
}
String location = defaultFolder.newFolder("testSeveralRecords").toURI().getPath();
Table table = tables.create(schema, location);
org.apache.drill.metastore.iceberg.write.File result = new ParquetFileWriter(table).records(records).location(location).name("severalRecords").write();
// Metrics: 7 values total for the column, 3 of them null.
assertEquals(Long.valueOf(7), result.metrics().recordCount());
assertEquals(Long.valueOf(7), result.metrics().valueCounts().get(fieldIndex));
assertEquals(Long.valueOf(3), result.metrics().nullValueCounts().get(fieldIndex));
List<Record> rows = readData(result.input(), schema);
assertEquals(7, rows.size());
// Values must come back in write order, nulls included.
List<Integer> actual = new ArrayList<>();
for (Record row : rows) {
actual.add((Integer) row.getField("int_field"));
}
assertEquals(values, actual);
}
Aggregations