Example 11 with Record

Use of org.apache.iceberg.data.Record in project hive by apache.

From class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied:

@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files is currently only supported in " +
        "non-vectorized mode and only for Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
        spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add some more data to the same partition
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " +
        "(0, 'Blake', 'Blue')");
    tbl.refresh();
    // find the newly-added data file (partition customer_id=0, three rows),
    // then delete its first and third rows
    DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
        .filter(file -> file.partition().get(0, Long.class) == 0L)
        .filter(file -> file.recordCount() == 3)
        .findAny()
        .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
    List<Record> rowsToDel = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .add(0L, "Laura", "Yellow").add(0L, "Blake", "Blue").build();
    List<PositionDelete<Record>> deletes = ImmutableList.of(
        positionDelete(dataFile.path(), 0L, rowsToDel.get(0)),
        positionDelete(dataFile.path(), 2L, rowsToDel.get(1)));
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
        ImmutableMap.of("customer_id", 0L), deletes);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
    Assert.assertEquals(4, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
    Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
Also used : DataFile(org.apache.iceberg.DataFile) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) StreamSupport(java.util.stream.StreamSupport) DeleteFile(org.apache.iceberg.DeleteFile) Assume(org.junit.Assume) Assert(org.junit.Assert) PositionDelete(org.apache.iceberg.deletes.PositionDelete)
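
The positionDelete(...) helper referenced above is not shown in this snippet. A minimal sketch of what such a helper plausibly looks like, built on Iceberg's PositionDelete container (the helper name and body are assumptions, not the verbatim Hive implementation):

// Hypothetical helper: packs a (file path, row position, deleted row) triple
// into Iceberg's PositionDelete container.
private static PositionDelete<Record> positionDelete(CharSequence path, long pos, Record row) {
    PositionDelete<Record> delete = PositionDelete.create();
    // set(...) fills all three components and returns the same instance
    return delete.set(path, pos, row);
}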

Example 12 with Record

Use of org.apache.iceberg.data.Record in project hive by apache.

From class TestIcebergInputFormats, method testFilterExp:

@Test
public void testFilterExp() throws Exception {
    helper.createTable();
    List<Record> expectedRecords = helper.generateRandomRecords(2, 0L);
    expectedRecords.get(0).set(2, "2020-03-20");
    expectedRecords.get(1).set(2, "2020-03-20");
    // dataFile1 matches the filter below; dataFile2 (date 2020-03-21) should be pruned
    DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), expectedRecords);
    DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L));
    helper.appendToTable(dataFile1, dataFile2);
    builder.filter(Expressions.equal("date", "2020-03-20"));
    testInputFormat.create(builder.conf()).validate(expectedRecords);
}
Also used : DataFile(org.apache.iceberg.DataFile) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
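
Filters are built with the same org.apache.iceberg.expressions.Expressions factory and can be composed. A small sketch reusing the builder from the test above (the "id" column is assumed to exist in the table schema, as it does in Example 15 below):

// Composite filter: date = '2020-03-20' AND id > 5
builder.filter(
    Expressions.and(
        Expressions.equal("date", "2020-03-20"),
        Expressions.greaterThan("id", 5)));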

Example 13 with Record

Use of org.apache.iceberg.data.Record in project hive by apache.

From class TestIcebergInputFormats, method testIdentityPartitionProjections:

@Test
public void testIdentityPartitionProjections() throws Exception {
    helper.createTable(LOG_SCHEMA, IDENTITY_PARTITION_SPEC);
    List<Record> inputRecords = helper.generateRandomRecords(10, 0L);
    int idx = 0;
    AppendFiles append = helper.table().newAppend();
    for (Record record : inputRecords) {
        record.set(1, "2020-03-2" + idx);
        record.set(2, String.valueOf(idx));
        append.appendFile(helper.writeFile(
            Row.of("2020-03-2" + idx, String.valueOf(idx)), ImmutableList.of(record)));
        idx += 1;
    }
    append.commit();
    // individual fields
    validateIdentityPartitionProjections(withColumns("date"), inputRecords);
    validateIdentityPartitionProjections(withColumns("level"), inputRecords);
    validateIdentityPartitionProjections(withColumns("message"), inputRecords);
    validateIdentityPartitionProjections(withColumns("id"), inputRecords);
    // field pairs
    validateIdentityPartitionProjections(withColumns("date", "message"), inputRecords);
    validateIdentityPartitionProjections(withColumns("level", "message"), inputRecords);
    validateIdentityPartitionProjections(withColumns("date", "level"), inputRecords);
    // out-of-order pairs
    validateIdentityPartitionProjections(withColumns("message", "date"), inputRecords);
    validateIdentityPartitionProjections(withColumns("message", "level"), inputRecords);
    validateIdentityPartitionProjections(withColumns("level", "date"), inputRecords);
    // full projection
    validateIdentityPartitionProjections(LOG_SCHEMA, inputRecords);
    // out-of-order triplets
    validateIdentityPartitionProjections(withColumns("date", "level", "message"), inputRecords);
    validateIdentityPartitionProjections(withColumns("level", "date", "message"), inputRecords);
    validateIdentityPartitionProjections(withColumns("date", "message", "level"), inputRecords);
    validateIdentityPartitionProjections(withColumns("level", "message", "date"), inputRecords);
    validateIdentityPartitionProjections(withColumns("message", "date", "level"), inputRecords);
    validateIdentityPartitionProjections(withColumns("message", "level", "date"), inputRecords);
}
Also used : AppendFiles(org.apache.iceberg.AppendFiles) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
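
The withColumns(...) helper is not shown above. A plausible sketch, assuming it simply projects the test schema by column name via Schema.select (the actual Hive helper may differ, for example in how it handles field ordering):

// Hypothetical projection helper over the test table's schema
private static Schema withColumns(String... columns) {
    // Schema.select keeps only the named columns, in schema order
    return LOG_SCHEMA.select(columns);
}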

Example 14 with Record

Use of org.apache.iceberg.data.Record in project hive by apache.

From class TestIcebergInputFormats, method testCustomCatalog:

@Test
public void testCustomCatalog() throws IOException {
    String warehouseLocation = temp.newFolder("hadoop_catalog").getAbsolutePath();
    conf.set("warehouse.location", warehouseLocation);
    conf.set(InputFormatConfig.CATALOG_NAME, Catalogs.ICEBERG_DEFAULT_CATALOG_NAME);
    conf.set(InputFormatConfig.catalogPropertyConfigKey(
        Catalogs.ICEBERG_DEFAULT_CATALOG_NAME, CatalogUtil.ICEBERG_CATALOG_TYPE),
        CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP);
    conf.set(InputFormatConfig.catalogPropertyConfigKey(
        Catalogs.ICEBERG_DEFAULT_CATALOG_NAME, CatalogProperties.WAREHOUSE_LOCATION),
        warehouseLocation);
    Catalog catalog = new HadoopCatalog(conf, conf.get("warehouse.location"));
    TableIdentifier identifier = TableIdentifier.of("db", "t");
    Table table = catalog.createTable(identifier, SCHEMA, SPEC, helper.properties());
    helper.setTable(table);
    List<Record> expectedRecords = helper.generateRandomRecords(1, 0L);
    expectedRecords.get(0).set(2, "2020-03-20");
    helper.appendToTable(Row.of("2020-03-20", 0), expectedRecords);
    builder.readFrom(identifier);
    testInputFormat.create(builder.conf()).validate(expectedRecords);
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) Record(org.apache.iceberg.data.Record) HadoopCatalog(org.apache.iceberg.hadoop.HadoopCatalog) Catalog(org.apache.iceberg.catalog.Catalog) Test(org.junit.Test)
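
Because the catalog type and warehouse location are registered in the Configuration, the table can also be resolved through the iceberg-mr Catalogs entry point instead of constructing a HadoopCatalog by hand. A hedged sketch (assuming InputFormatConfig.TABLE_IDENTIFIER is the conf key Catalogs reads; verify against your Iceberg version):

// Hypothetical follow-up to the test above: load the same table via the
// catalog registered in the Configuration rather than a hand-built catalog.
conf.set(InputFormatConfig.TABLE_IDENTIFIER, identifier.toString());
Table loaded = Catalogs.loadTable(conf);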

Example 15 with Record

Use of org.apache.iceberg.data.Record in project hive by apache.

From class TestIcebergInputFormats, method testResidualsUnserialized:

@Test
public void testResidualsUnserialized() throws Exception {
    helper.createUnpartitionedTable();
    List<Record> expectedRecords = helper.generateRandomRecords(10, 0L);
    helper.appendToTable(null, expectedRecords);
    builder.filter(Expressions.greaterThan("id", 123));
    for (InputSplit split : testInputFormat.create(builder.conf()).getSplits()) {
        HiveIcebergSplit originalSplit = new HiveIcebergSplit((IcebergSplit) split, "noop");
        // in the original split, the residual should still be the filter expression set above
        assertNotEquals(Expressions.alwaysTrue(),
            originalSplit.icebergSplit().task().files().stream().findFirst().get().residual());
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(baos);
        originalSplit.write(out);
        HiveIcebergSplit deserializedSplit = new HiveIcebergSplit();
        ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
        DataInputStream in = new DataInputStream(bais);
        deserializedSplit.readFields(in);
        // after the serialization round-trip, the residual should collapse to always-true
        assertEquals(Expressions.alwaysTrue(),
            deserializedSplit.icebergSplit().task().files().stream().findFirst().get().residual());
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) DataOutputStream(java.io.DataOutputStream) Record(org.apache.iceberg.data.Record) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataInputStream(java.io.DataInputStream) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)

Aggregations

Record (org.apache.iceberg.data.Record): 114
Test (org.junit.Test): 99
Schema (org.apache.iceberg.Schema): 68
Table (org.apache.iceberg.Table): 51
GenericRecord (org.apache.iceberg.data.GenericRecord): 51
PartitionSpec (org.apache.iceberg.PartitionSpec): 19
ArrayList (java.util.ArrayList): 14
List (java.util.List): 13
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 12
HashMap (java.util.HashMap): 11
IcebergBaseTest (org.apache.drill.metastore.iceberg.IcebergBaseTest): 11
TestHelper (org.apache.iceberg.mr.TestHelper): 11
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
Types (org.apache.iceberg.types.Types): 10
Map (java.util.Map): 9
IOException (java.io.IOException): 8
ImmutableMap (org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap): 8
FileFormat (org.apache.iceberg.FileFormat): 7
DeleteFile (org.apache.iceberg.DeleteFile): 6
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 6
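
As the counts show, Record is almost always paired with GenericRecord, its standard concrete implementation. A minimal, self-contained sketch of building one (the schema below is illustrative, mirroring the customer table shape used in the Hive tests above):

import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.types.Types;
import static org.apache.iceberg.types.Types.NestedField.optional;

// Illustrative schema with three optional fields
Schema schema = new Schema(
    optional(1, "customer_id", Types.LongType.get()),
    optional(2, "first_name", Types.StringType.get()),
    optional(3, "last_name", Types.StringType.get()));

// GenericRecord.create produces an empty record bound to the schema;
// setField assigns values by field name.
Record record = GenericRecord.create(schema);
record.setField("customer_id", 0L);
record.setField("first_name", "Alice");
record.setField("last_name", "Brown");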