Example 6 with DeleteFile

Use of org.apache.iceberg.DeleteFile in project hive by apache.

From the class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_EqDelete_OnlyEqColumnsSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_EqDelete_OnlyEqColumnsSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add one more row to the same partition
    shell.executeStatement("insert into customers values (1, 'Bob', 'Hoover')");
    // delete all rows with id=1 and first_name=Bob
    Schema shorterSchema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "name", Types.StringType.get()));
    List<Record> toDelete = TestHelper.RecordsBuilder.newInstance(shorterSchema).add(1L, "Bob").build();
    DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(tbl, "dummyPath", ImmutableList.of("customer_id", "first_name"), fileFormat, toDelete);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
    Assert.assertEquals(2, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(1));
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
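
Note that the delete records are built on a short, standalone schema ("id", "name") rather than on the table schema; the helper receives the equality column names separately. A minimal sketch of how such names would be resolved to Iceberg equality field ids (an assumption about what createEqualityDeleteFile does internally, not its actual body; tbl is the table from the test above):

// Hypothetical sketch: derive the delete-row schema and the equality field ids from
// column names. Schema.select(), columns() and fieldId() are the real Iceberg Schema APIs.
Schema deleteRowSchema = tbl.schema().select("customer_id", "first_name");
int[] equalityFieldIds = deleteRowSchema.columns().stream()
    .mapToInt(Types.NestedField::fieldId)
    .toArray();
// an equality delete writer records these ids so readers know which columns to compare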

Example 7 with DeleteFile

Use of org.apache.iceberg.DeleteFile in project hive by apache.

From the class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add some more data to the same partition
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
    tbl.refresh();
    // delete the first and third rows from the newly-added data file - with row supplied
    DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false).filter(file -> file.partition().get(0, Long.class) == 0L).filter(file -> file.recordCount() == 3).findAny().orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
    List<PositionDelete<Record>> deletes = ImmutableList.of(positionDelete(dataFile.path(), 0L, null), positionDelete(dataFile.path(), 2L, null));
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat, ImmutableMap.of("customer_id", 0L), deletes);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
    Assert.assertEquals(4, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
    Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
Also used : DataFile(org.apache.iceberg.DataFile) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) StreamSupport(java.util.stream.StreamSupport) DeleteFile(org.apache.iceberg.DeleteFile) Assume(org.junit.Assume) Assert(org.junit.Assert) PositionDelete(org.apache.iceberg.deletes.PositionDelete)
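
The positionDelete(path, pos, row) call above is a test-local helper. A plausible sketch of it on top of Iceberg's real org.apache.iceberg.deletes.PositionDelete API (create() and set() are the actual methods):

// Hedged sketch of the test helper; not the verbatim TestHiveIcebergV2 code
static <T> PositionDelete<T> positionDelete(CharSequence path, long position, T row) {
    PositionDelete<T> posDelete = PositionDelete.create();
    // row may be null, as in this test: a position delete needs only file path + row offset
    return posDelete.set(path, position, row);
}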

Example 8 with DeleteFile

Use of org.apache.iceberg.DeleteFile in project hive by apache.

From the class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_EqDelete_AllColumnsSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_EqDelete_AllColumnsSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add one more row to the same partition
    shell.executeStatement("insert into customers values (1, 'Bob', 'Hoover')");
    // delete all rows with id=1 and first_name=Bob
    List<Record> toDelete = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).add(1L, "Bob", null).build();
    DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(tbl, "dummyPath", ImmutableList.of("customer_id", "first_name"), fileFormat, toDelete);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
    Assert.assertEquals(2, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(1));
}
Also used : Table(org.apache.iceberg.Table) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
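
Compared with Example 6, the delete rows here use the full customer schema, passing null for the non-equality last_name column; since only the equality fields (customer_id, first_name) participate in matching, both variants delete the same rows. The same delete row could be built directly with Iceberg's GenericRecord (a sketch; GenericRecord.create and setField are the real org.apache.iceberg.data APIs):

// Build the delete row without the RecordsBuilder test helper
Record deleteRow = GenericRecord.create(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA);
deleteRow.setField("customer_id", 1L);
deleteRow.setField("first_name", "Bob");
// last_name stays null; non-equality columns are ignored during matching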

Example 9 with DeleteFile

Use of org.apache.iceberg.DeleteFile in project hive by apache.

From the class DeleteReadTests, method testMixedPositionAndEqualityDeletes.

@Test
public void testMixedPositionAndEqualityDeletes() throws IOException {
    Schema dataSchema = table.schema().select("data");
    Record dataDelete = GenericRecord.create(dataSchema);
    List<Record> dataDeletes = Lists.newArrayList(
        dataDelete.copy("data", "a"),   // id = 29
        dataDelete.copy("data", "d"),   // id = 89
        dataDelete.copy("data", "g"));  // id = 122
    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(
        table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema);
    List<Pair<CharSequence, Long>> deletes = Lists.newArrayList(
        Pair.of(dataFile.path(), 3L),   // id = 89
        Pair.of(dataFile.path(), 5L));  // id = 121
    Pair<DeleteFile, Set<CharSequence>> posDeletes = FileHelpers.writeDeleteFile(
        table, Files.localOutput(temp.newFile()), Row.of(0), deletes);
    table.newRowDelta()
        .addDeletes(eqDeletes)
        .addDeletes(posDeletes.first())
        .validateDataFilesExist(posDeletes.second())
        .commit();
    StructLikeSet expected = rowSetWithoutIds(29, 89, 121, 122);
    StructLikeSet actual = rowSet(tableName, table, "*");
    Assert.assertEquals("Table should contain expected rows", expected, actual);
}
Also used : StructLikeSet(org.apache.iceberg.util.StructLikeSet) Set(java.util.Set) Schema(org.apache.iceberg.Schema) DeleteFile(org.apache.iceberg.DeleteFile) Pair(org.apache.iceberg.util.Pair) Test(org.junit.Test)
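
After a commit like this, both delete files travel with the matching data file at scan-planning time. One way to observe that (a sketch, not part of the test; newScan().planFiles() and FileScanTask.deletes() are the real Iceberg scan APIs, and the enclosing method is assumed to declare throws IOException):

// Print how many delete files a reader must apply to each data file
try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    for (FileScanTask task : tasks) {
        System.out.println(task.file().path() + " -> " + task.deletes().size() + " delete file(s)");
    }
}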

Example 10 with DeleteFile

Use of org.apache.iceberg.DeleteFile in project hive by apache.

From the class DeleteReadTests, method testEqualityDeletesSpanningMultipleDataFiles.

@Test
public void testEqualityDeletesSpanningMultipleDataFiles() throws IOException {
    // Add another DataFile with common values
    GenericRecord record = GenericRecord.create(table.schema());
    records.add(record.copy("id", 144, "data", "a"));
    this.dataFile = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0), records);
    table.newAppend().appendFile(dataFile).commit();
    Schema deleteRowSchema = table.schema().select("data");
    Record dataDelete = GenericRecord.create(deleteRowSchema);
    List<Record> dataDeletes = Lists.newArrayList(
        dataDelete.copy("data", "a"),   // id = 29, 144
        dataDelete.copy("data", "d"),   // id = 89
        dataDelete.copy("data", "g"));  // id = 122
    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(
        table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema);
    table.newRowDelta().addDeletes(eqDeletes).commit();
    StructLikeSet expected = rowSetWithoutIds(29, 89, 122, 144);
    StructLikeSet actual = rowSet(tableName, table, "*");
    Assert.assertEquals("Table should contain expected rows", expected, actual);
}
Also used : Schema(org.apache.iceberg.Schema) StructLikeSet(org.apache.iceberg.util.StructLikeSet) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
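
The rowSetWithoutIds helper is not shown on this page. A plausible shape for it, assuming a records field holding the table's expected rows (an assumption about the test fixture; StructLikeSet.create and Schema.asStruct are the real Iceberg APIs, and java.util.Arrays plus java.util.stream.Collectors are assumed imported):

// Hypothetical reconstruction: the expected row set minus the deleted ids
private StructLikeSet rowSetWithoutIds(int... deletedIds) {
    Set<Integer> deleted = Arrays.stream(deletedIds).boxed().collect(Collectors.toSet());
    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
    records.stream()
        .filter(rec -> !deleted.contains(rec.getField("id")))
        .forEach(set::add);
    return set;
}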

Aggregations

DeleteFile (org.apache.iceberg.DeleteFile): 14 usages
Test (org.junit.Test): 13 usages
Schema (org.apache.iceberg.Schema): 10 usages
Table (org.apache.iceberg.Table): 7 usages
StructLikeSet (org.apache.iceberg.util.StructLikeSet): 7 usages
Record (org.apache.iceberg.data.Record): 6 usages
DataFile (org.apache.iceberg.DataFile): 4 usages
PartitionSpec (org.apache.iceberg.PartitionSpec): 4 usages
PositionDelete (org.apache.iceberg.deletes.PositionDelete): 4 usages
IOException (java.io.IOException): 3 usages
List (java.util.List): 3 usages
FileFormat (org.apache.iceberg.FileFormat): 3 usages
Types (org.apache.iceberg.types.Types): 3 usages
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 3 usages
Assert (org.junit.Assert): 3 usages
Set (java.util.Set): 2 usages
StreamSupport (java.util.stream.StreamSupport): 2 usages
File (java.io.File): 1 usage
BigDecimal (java.math.BigDecimal): 1 usage
ByteBuffer (java.nio.ByteBuffer): 1 usage