Example 6 with Record

Use of org.apache.iceberg.data.Record in project drill by apache.

In class TestParquetFileWriter, method testAllTypes.

@Test
public void testAllTypes() throws Exception {
    Schema schema = new Schema(
        Types.NestedField.optional(1, "int_field", Types.IntegerType.get()),
        Types.NestedField.optional(2, "long_field", Types.LongType.get()),
        Types.NestedField.optional(3, "float_field", Types.FloatType.get()),
        Types.NestedField.optional(4, "double_field", Types.DoubleType.get()),
        Types.NestedField.optional(5, "string_field", Types.StringType.get()),
        Types.NestedField.optional(6, "boolean_field", Types.BooleanType.get()),
        Types.NestedField.optional(7, "list_field",
            Types.ListType.ofOptional(9, Types.StringType.get())),
        Types.NestedField.optional(8, "map_field",
            Types.MapType.ofOptional(10, 11, Types.StringType.get(), Types.FloatType.get())));
    List<String> listValue = Arrays.asList("a", "b", "c");
    Map<String, Float> mapValue = new HashMap<>();
    mapValue.put("a", 0.1F);
    mapValue.put("b", 0.2F);
    Record record = GenericRecord.create(schema);
    record.setField("int_field", 1);
    record.setField("long_field", 100L);
    record.setField("float_field", 0.5F);
    record.setField("double_field", 1.5D);
    record.setField("string_field", "abc");
    record.setField("boolean_field", true);
    record.setField("list_field", listValue);
    record.setField("map_field", mapValue);
    String location = defaultFolder.newFolder("testAllTypes").toURI().getPath();
    String fileName = "allTypes";
    Table table = tables.create(schema, location);
    org.apache.drill.metastore.iceberg.write.File result = new ParquetFileWriter(table)
        .records(Collections.singletonList(record))
        .location(location)
        .name(fileName)
        .write();
    String writePath = new Path(location, FileFormat.PARQUET.addExtension(fileName)).toUri().getPath();
    assertEquals(new Path(FileFormat.PARQUET.addExtension(writePath)), new Path(result.location()));
    assertEquals(Long.valueOf(1), result.metrics().recordCount());
    List<Record> rows = readData(result.input(), schema);
    assertEquals(1, rows.size());
    Record row = rows.get(0);
    assertEquals(1, row.getField("int_field"));
    assertEquals(100L, row.getField("long_field"));
    assertEquals(0.5F, row.getField("float_field"));
    assertEquals(1.5D, row.getField("double_field"));
    assertEquals("abc", row.getField("string_field"));
    assertEquals(true, row.getField("boolean_field"));
    assertEquals(listValue, row.getField("list_field"));
    assertEquals(mapValue, row.getField("map_field"));
}
Also used : Path(org.apache.hadoop.fs.Path) Table(org.apache.iceberg.Table) HashMap(java.util.HashMap) Schema(org.apache.iceberg.Schema) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) Test(org.junit.Test)

Example 7 with Record

Use of org.apache.iceberg.data.Record in project drill by apache.

In class TestParquetFileWriter, method testSeveralRecords.

@Test
public void testSeveralRecords() throws Exception {
    int fieldIndex = 1;
    Schema schema = new Schema(Types.NestedField.optional(fieldIndex, "int_field", Types.IntegerType.get()));
    List<Integer> values = Arrays.asList(1, 2, 3, 3, null, null, null);
    List<Record> records = values.stream().map(value -> {
        Record record = GenericRecord.create(schema);
        record.setField("int_field", value);
        return record;
    }).collect(Collectors.toList());
    String location = defaultFolder.newFolder("testSeveralRecords").toURI().getPath();
    Table table = tables.create(schema, location);
    org.apache.drill.metastore.iceberg.write.File result = new ParquetFileWriter(table)
        .records(records)
        .location(location)
        .name("severalRecords")
        .write();
    assertEquals(Long.valueOf(7), result.metrics().recordCount());
    assertEquals(Long.valueOf(7), result.metrics().valueCounts().get(fieldIndex));
    assertEquals(Long.valueOf(3), result.metrics().nullValueCounts().get(fieldIndex));
    List<Record> rows = readData(result.input(), schema);
    assertEquals(7, rows.size());
    List<Integer> actual = rows.stream().map(row -> (Integer) row.getField("int_field")).collect(Collectors.toList());
    assertEquals(values, actual);
}
Also used : Tables(org.apache.iceberg.Tables) Arrays(java.util.Arrays) Types(org.apache.iceberg.types.Types) BeforeClass(org.junit.BeforeClass) HashMap(java.util.HashMap) IcebergMetastoreException(org.apache.drill.metastore.iceberg.exceptions.IcebergMetastoreException) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) GenericRecord(org.apache.iceberg.data.GenericRecord) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) CloseableIterable(org.apache.iceberg.io.CloseableIterable) Files(java.nio.file.Files) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Parquet(org.apache.iceberg.parquet.Parquet) Test(org.junit.Test) IOException(java.io.IOException) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) Collectors(java.util.stream.Collectors) File(java.io.File) Record(org.apache.iceberg.data.Record) List(java.util.List) Lists(org.apache.drill.shaded.guava.com.google.common.collect.Lists) Assert.assertNull(org.junit.Assert.assertNull) Paths(java.nio.file.Paths) GenericParquetReaders(org.apache.iceberg.data.parquet.GenericParquetReaders) InputFile(org.apache.iceberg.io.InputFile) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) IcebergBaseTest(org.apache.drill.metastore.iceberg.IcebergBaseTest) Test(org.junit.Test)
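The readData(InputFile, Schema) helper used in Examples 6 and 7 is not shown in these snippets. Based on the Parquet, GenericParquetReaders, CloseableIterable and Lists imports listed above, a minimal sketch of such a helper might look like the following; this is an assumption for illustration, not the actual Drill test code.

private static List<Record> readData(InputFile inputFile, Schema schema) throws IOException {
    // open the written Parquet file with Iceberg's generic reader and project the test schema
    try (CloseableIterable<Record> reader = Parquet.read(inputFile)
            .project(schema)
            .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema))
            .build()) {
        // materialize all rows so callers can assert on them after the iterable is closed
        return Lists.newArrayList(reader);
    }
}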

Example 8 with Record

Use of org.apache.iceberg.data.Record in project drill by apache.

In class MapColumnConverter, method convert.

@Override
public void convert(Object value) {
    if (value == null) {
        return;
    }
    Record record = (Record) value;
    if (converters.isEmpty()) {
        buildMapMembers(record, providedSchema, tupleWriter, converters);
    }
    record.struct().fields().forEach(field -> processValue(field.name(), record.getField(field.name())));
}
Also used : Record(org.apache.iceberg.data.Record)
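For context, convert() receives an Iceberg Record representing a struct value and walks its fields by name via record.struct().fields(). A minimal, hypothetical sketch of building such a value with Iceberg's generic API is shown below; the field names ("outer", "inner_int", "inner_str") are illustrative only and not taken from the Drill sources.

Schema schema = new Schema(
    Types.NestedField.optional(1, "outer", Types.StructType.of(
        Types.NestedField.optional(2, "inner_int", Types.IntegerType.get()),
        Types.NestedField.optional(3, "inner_str", Types.StringType.get()))));

// create a record for the nested struct and populate it
Record outer = GenericRecord.create(schema.findType("outer").asStructType());
outer.setField("inner_int", 42);
outer.setField("inner_str", "abc");

// this is the traversal convert() performs before handing each field off to processValue(...)
outer.struct().fields().forEach(field ->
    System.out.println(field.name() + " = " + outer.getField(field.name())));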

Example 9 with Record

Use of org.apache.iceberg.data.Record in project hive by apache.

In class TestHiveIcebergStorageHandlerNoScan, method testIcebergAndHmsTableProperties.

@Test
public void testIcebergAndHmsTableProperties() throws Exception {
    TableIdentifier identifier = TableIdentifier.of("default", "customers");
    shell.executeStatement(String.format("CREATE EXTERNAL TABLE default.customers " +
        "STORED BY ICEBERG %s" +
        "TBLPROPERTIES ('%s'='%s', '%s'='%s', '%s'='%s', '%s'='%s')",
        // we need the location for HadoopTable based tests only
        testTables.locationForCreateTableSQL(identifier),
        InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA),
        InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(SPEC),
        "custom_property", "initial_val",
        InputFormatConfig.CATALOG_NAME, testTables.catalogName()));
    // Check the Iceberg table parameters
    org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier);
    Map<String, String> expectedIcebergProperties = new HashMap<>();
    expectedIcebergProperties.put("custom_property", "initial_val");
    expectedIcebergProperties.put("EXTERNAL", "TRUE");
    expectedIcebergProperties.put("storage_handler", HiveIcebergStorageHandler.class.getName());
    expectedIcebergProperties.put(serdeConstants.SERIALIZATION_FORMAT, "1");
    // Check the HMS table parameters
    org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable("default", "customers");
    Map<String, String> hmsParams = hmsTable.getParameters().entrySet().stream()
        .filter(e -> !IGNORED_PARAMS.contains(e.getKey()))
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    Properties tableProperties = new Properties();
    tableProperties.putAll(hmsParams);
    if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) {
        expectedIcebergProperties.put(TableProperties.ENGINE_HIVE_ENABLED, "true");
    }
    if (MetastoreUtil.hive3PresentOnClasspath()) {
        expectedIcebergProperties.put("bucketing_version", "2");
    }
    Assert.assertEquals(expectedIcebergProperties, icebergTable.properties());
    if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) {
        Assert.assertEquals(10, hmsParams.size());
        Assert.assertEquals("initial_val", hmsParams.get("custom_property"));
        Assert.assertEquals("TRUE", hmsParams.get(InputFormatConfig.EXTERNAL_TABLE_PURGE));
        Assert.assertEquals("TRUE", hmsParams.get("EXTERNAL"));
        Assert.assertEquals("true", hmsParams.get(TableProperties.ENGINE_HIVE_ENABLED));
        Assert.assertEquals(HiveIcebergStorageHandler.class.getName(), hmsParams.get(hive_metastoreConstants.META_TABLE_STORAGE));
        Assert.assertEquals(BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase(), hmsParams.get(BaseMetastoreTableOperations.TABLE_TYPE_PROP));
        Assert.assertEquals(hmsParams.get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP), getCurrentSnapshotForHiveCatalogTable(icebergTable));
        Assert.assertNull(hmsParams.get(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP));
        Assert.assertNotNull(hmsParams.get(hive_metastoreConstants.DDL_TIME));
        Assert.assertNotNull(hmsParams.get(serdeConstants.SERIALIZATION_FORMAT));
    } else {
        Assert.assertEquals(7, hmsParams.size());
        Assert.assertNull(hmsParams.get(TableProperties.ENGINE_HIVE_ENABLED));
    }
    // Check HMS inputformat/outputformat/serde
    Assert.assertEquals(HiveIcebergInputFormat.class.getName(), hmsTable.getSd().getInputFormat());
    Assert.assertEquals(HiveIcebergOutputFormat.class.getName(), hmsTable.getSd().getOutputFormat());
    Assert.assertEquals(HiveIcebergSerDe.class.getName(), hmsTable.getSd().getSerdeInfo().getSerializationLib());
    // Add two new properties to the Iceberg table and update an existing one
    icebergTable.updateProperties().set("new_prop_1", "true").set("new_prop_2", "false").set("custom_property", "new_val").commit();
    // Refresh the HMS table to see if new Iceberg properties got synced into HMS
    hmsParams = shell.metastore().getTable("default", "customers").getParameters().entrySet().stream()
        .filter(e -> !IGNORED_PARAMS.contains(e.getKey()))
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) {
        // 2 newly-added properties + previous_metadata_location prop
        Assert.assertEquals(13, hmsParams.size());
        Assert.assertEquals("true", hmsParams.get("new_prop_1"));
        Assert.assertEquals("false", hmsParams.get("new_prop_2"));
        Assert.assertEquals("new_val", hmsParams.get("custom_property"));
        String prevSnapshot = getCurrentSnapshotForHiveCatalogTable(icebergTable);
        icebergTable.refresh();
        String newSnapshot = getCurrentSnapshotForHiveCatalogTable(icebergTable);
        Assert.assertEquals(hmsParams.get(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP), prevSnapshot);
        Assert.assertEquals(hmsParams.get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP), newSnapshot);
    } else {
        Assert.assertEquals(7, hmsParams.size());
    }
    // Remove some Iceberg props and see if they're removed from HMS table props as well
    if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) {
        icebergTable.updateProperties().remove("custom_property").remove("new_prop_1").commit();
        hmsParams = shell.metastore().getTable("default", "customers").getParameters();
        Assert.assertFalse(hmsParams.containsKey("custom_property"));
        Assert.assertFalse(hmsParams.containsKey("new_prop_1"));
        Assert.assertTrue(hmsParams.containsKey("new_prop_2"));
    }
    // append some data and check whether HMS stats are aligned with snapshot summary
    if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) {
        List<Record> records = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS;
        testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, FileFormat.PARQUET, null, records);
        hmsParams = shell.metastore().getTable("default", "customers").getParameters();
        Map<String, String> summary = icebergTable.currentSnapshot().summary();
        Assert.assertEquals(summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP), hmsParams.get(StatsSetupConst.NUM_FILES));
        Assert.assertEquals(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP), hmsParams.get(StatsSetupConst.ROW_COUNT));
        Assert.assertEquals(summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP), hmsParams.get(StatsSetupConst.TOTAL_SIZE));
    }
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Types(org.apache.iceberg.types.Types) UpdateSchema(org.apache.iceberg.UpdateSchema) FileSystem(org.apache.hadoop.fs.FileSystem) URISyntaxException(java.net.URISyntaxException) HiveSchemaUtil(org.apache.iceberg.hive.HiveSchemaUtil) Catalogs(org.apache.iceberg.mr.Catalogs) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) PartitionField(org.apache.iceberg.PartitionField) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) StatsSetupConst(org.apache.hadoop.hive.common.StatsSetupConst) Map(java.util.Map) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) After(org.junit.After) Path(org.apache.hadoop.fs.Path) URI(java.net.URI) Parameterized(org.junit.runners.Parameterized) AssertHelpers(org.apache.iceberg.AssertHelpers) CommitFailedException(org.apache.iceberg.exceptions.CommitFailedException) AfterClass(org.junit.AfterClass) BaseTable(org.apache.iceberg.BaseTable) Collection(java.util.Collection) org.apache.hadoop.hive.serde.serdeConstants(org.apache.hadoop.hive.serde.serdeConstants) InputFormatConfig(org.apache.iceberg.mr.InputFormatConfig) Set(java.util.Set) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) PartitionSpecParser(org.apache.iceberg.PartitionSpecParser) SchemaParser(org.apache.iceberg.SchemaParser) Type(org.apache.iceberg.types.Type) Util(org.apache.iceberg.hadoop.Util) List(java.util.List) MetastoreUtil(org.apache.iceberg.hive.MetastoreUtil) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) BeforeClass(org.junit.BeforeClass) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) RunWith(org.junit.runner.RunWith) Parameters(org.junit.runners.Parameterized.Parameters) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) GC_ENABLED(org.apache.iceberg.TableProperties.GC_ENABLED) BaseMetastoreTableOperations(org.apache.iceberg.BaseMetastoreTableOperations) Assume(org.junit.Assume) Before(org.junit.Before) Properties(java.util.Properties) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) Parameter(org.junit.runners.Parameterized.Parameter) EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) TException(org.apache.thrift.TException) IOException(java.io.IOException) Test(org.junit.Test) FileFormat(org.apache.iceberg.FileFormat) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SnapshotSummary(org.apache.iceberg.SnapshotSummary) Record(org.apache.iceberg.data.Record) Rule(org.junit.Rule) Assert(org.junit.Assert) Collections(java.util.Collections) org.apache.hadoop.hive.metastore.api.hive_metastoreConstants(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants) TemporaryFolder(org.junit.rules.TemporaryFolder) HashMap(java.util.HashMap) Table(org.apache.iceberg.Table) TableProperties(org.apache.iceberg.TableProperties) Properties(java.util.Properties) Record(org.apache.iceberg.data.Record) Map(java.util.Map) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 10 with Record

Use of org.apache.iceberg.data.Record in project hive by apache.

In class TestHiveIcebergV2, method testReadAndWriteFormatV2UnpartitionedWithEqDelete.

@Test
public void testReadAndWriteFormatV2UnpartitionedWithEqDelete() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    Table tbl = testTables.createTable(shell, "customers",
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, PartitionSpec.unpartitioned(),
        fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // delete one of the rows
    List<Record> toDelete = TestHelper.RecordsBuilder
        .newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .add(1L, "Bob", null)
        .build();
    DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(tbl, "dummyPath",
        ImmutableList.of("customer_id", "first_name"), fileFormat, toDelete);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
    // only the other two rows are present
    Assert.assertEquals(2, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(1));
}
Also used : Table(org.apache.iceberg.Table) Record(org.apache.iceberg.data.Record) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
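Outside of Hive, the effect of the equality delete committed above can also be checked with Iceberg's generic table scan from the iceberg-data module, which applies delete files while reading. A minimal sketch, not part of the Hive test, assuming IcebergGenerics and CloseableIterable are on the classpath and the surrounding method handles IOException:

// scan the V2 table with deletes applied; the row for customer_id = 1 ("Bob") should no longer appear
try (CloseableIterable<Record> rows = IcebergGenerics.read(tbl).build()) {
    for (Record row : rows) {
        System.out.println(row.getField("customer_id") + " " + row.getField("first_name"));
    }
}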

Aggregations

Record (org.apache.iceberg.data.Record): 114
Test (org.junit.Test): 99
Schema (org.apache.iceberg.Schema): 68
Table (org.apache.iceberg.Table): 51
GenericRecord (org.apache.iceberg.data.GenericRecord): 51
PartitionSpec (org.apache.iceberg.PartitionSpec): 19
ArrayList (java.util.ArrayList): 14
List (java.util.List): 13
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 12
HashMap (java.util.HashMap): 11
IcebergBaseTest (org.apache.drill.metastore.iceberg.IcebergBaseTest): 11
TestHelper (org.apache.iceberg.mr.TestHelper): 11
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
Types (org.apache.iceberg.types.Types): 10
Map (java.util.Map): 9
IOException (java.io.IOException): 8
ImmutableMap (org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap): 8
FileFormat (org.apache.iceberg.FileFormat): 7
DeleteFile (org.apache.iceberg.DeleteFile): 6
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 6