Use of org.apache.iceberg.DeleteFile in project hive by apache.
Class TestHiveIcebergV2, method testReadAndWriteFormatV2UnpartitionedWithEqDelete.
@Test
public void testReadAndWriteFormatV2UnpartitionedWithEqDelete() throws IOException {
  Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " +
      "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
  Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      PartitionSpec.unpartitioned(), fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
  // delete one of the rows
  List<Record> toDelete = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(1L, "Bob", null)
      .build();
  DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(tbl, "dummyPath",
      ImmutableList.of("customer_id", "first_name"), fileFormat, toDelete);
  tbl.newRowDelta().addDeletes(deleteFile).commit();
  List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
  // only the other two rows are present
  Assert.assertEquals(2, objects.size());
  Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
  Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(1));
}
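For reference, the same outcome could also be verified directly against the Iceberg table rather than through a Hive query. The following is a minimal hedged sketch, not part of the original test; it assumes the generic reader from iceberg-data (IcebergGenerics, CloseableIterable) is on the classpath and that it applies equality deletes when scanning.

// Hypothetical extra check: read the table back through the Iceberg generics API
// and confirm that the equality-deleted "Bob" row is no longer returned.
try (CloseableIterable<Record> rows = IcebergGenerics.read(tbl).build()) {
  long bobCount = StreamSupport.stream(rows.spliterator(), false)
      .filter(r -> "Bob".equals(r.getField("first_name")))
      .count();
  Assert.assertEquals(0, bobCount);
}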
Use of org.apache.iceberg.DeleteFile in project hive by apache.
Class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied.
@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
  Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " +
      "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("customer_id")
      .build();
  Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec,
      fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
  // add some more data to the same partition
  shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " +
      "(0, 'Blake', 'Blue')");
  tbl.refresh();
  // delete the first and third rows from the newly-added data file
  DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
      .filter(file -> file.partition().get(0, Long.class) == 0L)
      .filter(file -> file.recordCount() == 3)
      .findAny()
      .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
  List<Record> rowsToDel = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(0L, "Laura", "Yellow")
      .add(0L, "Blake", "Blue")
      .build();
  List<PositionDelete<Record>> deletes = ImmutableList.of(
      positionDelete(dataFile.path(), 0L, rowsToDel.get(0)),
      positionDelete(dataFile.path(), 2L, rowsToDel.get(1)));
  DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
      ImmutableMap.of("customer_id", 0L), deletes);
  tbl.newRowDelta().addDeletes(deleteFile).commit();
  List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
  Assert.assertEquals(4, objects.size());
  Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
  Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
  Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
  Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
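The positionDelete(path, pos, row) calls above rely on a small helper in the test class that is not included in this listing. A plausible sketch is shown below, assuming it simply wraps Iceberg's PositionDelete API (PositionDelete.create() and set(path, pos, row) are part of org.apache.iceberg.deletes); the actual helper in TestHiveIcebergV2 may differ in detail.

// Hedged sketch of the positionDelete helper used by the tests above.
private static PositionDelete<Record> positionDelete(CharSequence path, long position, Record row) {
  PositionDelete<Record> positionDelete = PositionDelete.create();
  return positionDelete.set(path, position, row);
}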
Use of org.apache.iceberg.DeleteFile in project hive by apache.
Class DeleteReadTests, method testPositionDeletes.
@Test
public void testPositionDeletes() throws IOException {
  List<Pair<CharSequence, Long>> deletes = Lists.newArrayList(
      // id = 29
      Pair.of(dataFile.path(), 0L),
      // id = 89
      Pair.of(dataFile.path(), 3L),
      // id = 122
      Pair.of(dataFile.path(), 6L));
  Pair<DeleteFile, Set<CharSequence>> posDeletes =
      FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), deletes);
  table.newRowDelta()
      .addDeletes(posDeletes.first())
      .validateDataFilesExist(posDeletes.second())
      .commit();
  StructLikeSet expected = rowSetWithoutIds(29, 89, 122);
  StructLikeSet actual = rowSet(tableName, table, "*");
  Assert.assertEquals("Table should contain expected rows", expected, actual);
}
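For comparison, the same three rows could be removed with an equality delete instead of (file, position) pairs. The following is a hedged sketch only, not part of the original test; it assumes the table schema has an id column (as the comments above indicate) and that a FileHelpers.writeDeleteFile overload taking delete records plus a delete-row schema is available in the test helpers.

// Hypothetical equality-delete variant: delete by id value rather than by position.
Schema deleteRowSchema = table.schema().select("id");
Record idDelete = GenericRecord.create(deleteRowSchema);
List<Record> idDeletes = Lists.newArrayList(
    idDelete.copy("id", 29), idDelete.copy("id", 89), idDelete.copy("id", 122));
DeleteFile eqDeletes = FileHelpers.writeDeleteFile(
    table, Files.localOutput(temp.newFile()), Row.of(0), idDeletes, deleteRowSchema);
table.newRowDelta().addDeletes(eqDeletes).commit();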
Use of org.apache.iceberg.DeleteFile in project hive by apache.
Class HiveIcebergTestUtils, method createEqualityDeleteFile.
/**
* @param table The table to create the delete file for
* @param deleteFilePath The path where the delete file should be created, relative to the table location root
* @param equalityFields List of field names that should play a role in the equality check
* @param fileFormat The file format that should be used for writing out the delete file
* @param rowsToDelete The rows that should be deleted. It's enough to fill out the fields that are relevant for the
* equality check, as listed in equalityFields; the rest of the fields are ignored
* @return The DeleteFile created
* @throws IOException If there is an error during DeleteFile write
*/
public static DeleteFile createEqualityDeleteFile(Table table, String deleteFilePath, List<String> equalityFields,
    FileFormat fileFormat, List<Record> rowsToDelete) throws IOException {
  // Resolve the equality field names to their Iceberg field ids and build the delete-row schema from them
  List<Integer> equalityFieldIds = equalityFields.stream()
      .map(id -> table.schema().findField(id).fieldId())
      .collect(Collectors.toList());
  Schema eqDeleteRowSchema = table.schema().select(equalityFields.toArray(new String[] {}));
  FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(),
      ArrayUtil.toIntArray(equalityFieldIds), eqDeleteRowSchema, null);
  EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(
      new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
  // Derive the partition key from the first row to be deleted
  PartitionKey part = new PartitionKey(table.spec(), eqDeleteRowSchema);
  part.partition(rowsToDelete.get(0));
  // Write all rows into a single equality delete file and return its metadata
  EqualityDeleteWriter<Record> eqWriter = appenderFactory.newEqDeleteWriter(outputFile, fileFormat, part);
  try (EqualityDeleteWriter<Record> writer = eqWriter) {
    writer.deleteAll(rowsToDelete);
  }
  return eqWriter.toDeleteFile();
}
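The positional counterpart, HiveIcebergTestUtils.createPositionalDeleteFile, is called by the tests above but its body is not included in this listing. Below is a hedged sketch of how such a helper can be built on the same GenericAppenderFactory; the signature is inferred from the call sites, the partition handling is simplified, and the real implementation may differ.

// Assumed shape of createPositionalDeleteFile. Positional deletes can carry the
// deleted row, so the full table schema is passed as the position-delete row schema.
public static DeleteFile createPositionalDeleteFile(Table table, String deleteFilePath, FileFormat fileFormat,
    Map<String, Object> partitionValues, List<PositionDelete<Record>> deletes) throws IOException {
  FileAppenderFactory<Record> appenderFactory =
      new GenericAppenderFactory(table.schema(), table.spec(), null, null, table.schema());
  EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(
      new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
  PartitionKey part = null;
  if (partitionValues != null) {
    // Build a partition key from the supplied column values (e.g. customer_id -> 0L)
    Record partitionRecord = GenericRecord.create(table.schema()).copy(partitionValues);
    part = new PartitionKey(table.spec(), table.schema());
    part.partition(partitionRecord);
  }
  // Write each (path, position, row) triple into a single position delete file
  PositionDeleteWriter<Record> posWriter = appenderFactory.newPosDeleteWriter(outputFile, fileFormat, part);
  try (PositionDeleteWriter<Record> writer = posWriter) {
    deletes.forEach(del -> writer.delete(del.path(), del.pos(), del.row()));
  }
  return posWriter.toDeleteFile();
}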
Use of org.apache.iceberg.DeleteFile in project hive by apache.
Class TestHiveIcebergV2, method testReadAndWriteFormatV2Unpartitioned_PosDelete.
@Test
public void testReadAndWriteFormatV2Unpartitioned_PosDelete() throws IOException {
  Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " +
      "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
  Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      PartitionSpec.unpartitioned(), fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
  // delete one of the rows
  DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
      .findFirst()
      .orElseThrow(() -> new RuntimeException("Did not find any data files for test table"));
  List<PositionDelete<Record>> deletes = ImmutableList.of(
      positionDelete(dataFile.path(), 2L, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
  DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat, null, deletes);
  tbl.newRowDelta().addDeletes(deleteFile).commit();
  List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
  // only the other two rows are present
  Assert.assertEquals(2, objects.size());
  Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
  Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(1));
}
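A small additional sanity check could be placed after the commit in this test (hypothetical, not in the original): assert that the committed file is registered as a position-delete file rather than an equality-delete or data file.

// FileContent.POSITION_DELETES distinguishes positional delete files from
// equality delete files (FileContent.EQUALITY_DELETES) and data files.
Assert.assertEquals(FileContent.POSITION_DELETES, deleteFile.content());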