Use of org.apache.iceberg.data.Record in project hive by apache.
Class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied.
@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
  Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " +
      "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("customer_id")
      .build();
  Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
  // add some more data to the same partition
  shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " +
      "(0, 'Blake', 'Blue')");
  tbl.refresh();
  // delete the first and third rows from the newly-added data file
  DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
      .filter(file -> file.partition().get(0, Long.class) == 0L)
      .filter(file -> file.recordCount() == 3)
      .findAny()
      .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
  List<Record> rowsToDel = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(0L, "Laura", "Yellow")
      .add(0L, "Blake", "Blue")
      .build();
  List<PositionDelete<Record>> deletes = ImmutableList.of(
      positionDelete(dataFile.path(), 0L, rowsToDel.get(0)),
      positionDelete(dataFile.path(), 2L, rowsToDel.get(1)));
  DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
      ImmutableMap.of("customer_id", 0L), deletes);
  tbl.newRowDelta().addDeletes(deleteFile).commit();
  List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
  Assert.assertEquals(4, objects.size());
  Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
  Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
  Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
  Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
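The positionDelete(...) helper used above is not part of the snippet. A minimal sketch of what such a helper might look like, assuming Iceberg's org.apache.iceberg.deletes.PositionDelete API (the method name and signature mirror the calls above but are not taken from the Hive source):

import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDelete;

// Hypothetical helper: pairs the target data file's path with the 0-based row
// position to delete; the deleted row itself is supplied as well ("RowSupplied").
private static PositionDelete<Record> positionDelete(CharSequence path, long position, Record row) {
  PositionDelete<Record> delete = PositionDelete.create();
  return delete.set(path, position, row);
}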
Use of org.apache.iceberg.data.Record in project hive by apache.
Class TestIcebergInputFormats, method testFilterExp.
@Test
public void testFilterExp() throws Exception {
  helper.createTable();
  List<Record> expectedRecords = helper.generateRandomRecords(2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");
  // write one data file per partition; the filter below should only match the first one
  DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), expectedRecords);
  DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L));
  helper.appendToTable(dataFile1, dataFile2);
  builder.filter(Expressions.equal("date", "2020-03-20"));
  testInputFormat.create(builder.conf()).validate(expectedRecords);
}
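For comparison, the same predicate can be applied to a core Iceberg table scan, where it prunes files at planning time. A minimal illustrative sketch, assuming the Table handle comes from helper.table(); this method is not part of the test:

import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;

// Plan the scan with the same equality filter used by builder.filter(...) above;
// only the data file written for the "2020-03-20" partition is expected to survive pruning.
static void printPlannedFiles(Table table) throws IOException {
  try (CloseableIterable<FileScanTask> tasks =
      table.newScan().filter(Expressions.equal("date", "2020-03-20")).planFiles()) {
    tasks.forEach(task -> System.out.println(task.file().path()));
  }
}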
Use of org.apache.iceberg.data.Record in project hive by apache.
Class TestIcebergInputFormats, method testIdentityPartitionProjections.
@Test
public void testIdentityPartitionProjections() throws Exception {
  helper.createTable(LOG_SCHEMA, IDENTITY_PARTITION_SPEC);
  List<Record> inputRecords = helper.generateRandomRecords(10, 0L);
  Integer idx = 0;
  AppendFiles append = helper.table().newAppend();
  // give each record unique identity-partition values and write it into its own data file
  for (Record record : inputRecords) {
    record.set(1, "2020-03-2" + idx);
    record.set(2, idx.toString());
    append.appendFile(helper.writeFile(Row.of("2020-03-2" + idx, idx.toString()), ImmutableList.of(record)));
    idx += 1;
  }
  append.commit();
  // individual fields
  validateIdentityPartitionProjections(withColumns("date"), inputRecords);
  validateIdentityPartitionProjections(withColumns("level"), inputRecords);
  validateIdentityPartitionProjections(withColumns("message"), inputRecords);
  validateIdentityPartitionProjections(withColumns("id"), inputRecords);
  // field pairs
  validateIdentityPartitionProjections(withColumns("date", "message"), inputRecords);
  validateIdentityPartitionProjections(withColumns("level", "message"), inputRecords);
  validateIdentityPartitionProjections(withColumns("date", "level"), inputRecords);
  // out-of-order pairs
  validateIdentityPartitionProjections(withColumns("message", "date"), inputRecords);
  validateIdentityPartitionProjections(withColumns("message", "level"), inputRecords);
  validateIdentityPartitionProjections(withColumns("level", "date"), inputRecords);
  // full projection
  validateIdentityPartitionProjections(LOG_SCHEMA, inputRecords);
  // out-of-order triplets
  validateIdentityPartitionProjections(withColumns("date", "level", "message"), inputRecords);
  validateIdentityPartitionProjections(withColumns("level", "date", "message"), inputRecords);
  validateIdentityPartitionProjections(withColumns("date", "message", "level"), inputRecords);
  validateIdentityPartitionProjections(withColumns("level", "message", "date"), inputRecords);
  validateIdentityPartitionProjections(withColumns("message", "date", "level"), inputRecords);
  validateIdentityPartitionProjections(withColumns("message", "level", "date"), inputRecords);
}
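The withColumns(...) helper is not shown in the snippet. One plausible implementation, assuming Iceberg's Schema.select and the test's LOG_SCHEMA constant (the real helper in the Hive test may differ):

import org.apache.iceberg.Schema;

// Hypothetical projection helper: builds a projected schema containing only the named
// columns of LOG_SCHEMA; field ids are preserved, so identity partition values can
// still be filled in from file metadata.
private static Schema withColumns(String... columns) {
  return LOG_SCHEMA.select(columns);
}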
Use of org.apache.iceberg.data.Record in project hive by apache.
Class TestIcebergInputFormats, method testCustomCatalog.
@Test
public void testCustomCatalog() throws IOException {
  String warehouseLocation = temp.newFolder("hadoop_catalog").getAbsolutePath();
  conf.set("warehouse.location", warehouseLocation);
  // register a Hadoop catalog under the default catalog name
  conf.set(InputFormatConfig.CATALOG_NAME, Catalogs.ICEBERG_DEFAULT_CATALOG_NAME);
  conf.set(InputFormatConfig.catalogPropertyConfigKey(
      Catalogs.ICEBERG_DEFAULT_CATALOG_NAME, CatalogUtil.ICEBERG_CATALOG_TYPE),
      CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP);
  conf.set(InputFormatConfig.catalogPropertyConfigKey(
      Catalogs.ICEBERG_DEFAULT_CATALOG_NAME, CatalogProperties.WAREHOUSE_LOCATION),
      warehouseLocation);
  // create the table through the catalog, then point the input format at it
  Catalog catalog = new HadoopCatalog(conf, conf.get("warehouse.location"));
  TableIdentifier identifier = TableIdentifier.of("db", "t");
  Table table = catalog.createTable(identifier, SCHEMA, SPEC, helper.properties());
  helper.setTable(table);
  List<Record> expectedRecords = helper.generateRandomRecords(1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  helper.appendToTable(Row.of("2020-03-20", 0), expectedRecords);
  builder.readFrom(identifier);
  testInputFormat.create(builder.conf()).validate(expectedRecords);
}
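The catalogPropertyConfigKey(...) calls resolve to plain configuration keys. Assuming the iceberg.catalog.<catalog_name>.<property> key convention of the Hive/MR integration and that Catalogs.ICEBERG_DEFAULT_CATALOG_NAME resolves to "default_iceberg", a rough string-based equivalent would be:

import org.apache.hadoop.conf.Configuration;

// Rough string equivalents of the constant-based conf.set(...) calls above;
// the catalog name and keys are assumptions, and the warehouse path is illustrative.
static Configuration hadoopCatalogConf(String warehouseLocation) {
  Configuration conf = new Configuration();
  conf.set("iceberg.catalog.default_iceberg.type", "hadoop");
  conf.set("iceberg.catalog.default_iceberg.warehouse", warehouseLocation);
  return conf;
}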
Use of org.apache.iceberg.data.Record in project hive by apache.
Class TestIcebergInputFormats, method testResidualsUnserialized.
@Test
public void testResidualsUnserialized() throws Exception {
  helper.createUnpartitionedTable();
  List<Record> expectedRecords = helper.generateRandomRecords(10, 0L);
  helper.appendToTable(null, expectedRecords);
  builder.filter(Expressions.greaterThan("id", 123));
  for (InputSplit split : testInputFormat.create(builder.conf()).getSplits()) {
    HiveIcebergSplit originalSplit = new HiveIcebergSplit((IcebergSplit) split, "noop");
    // In the original split, residual should still be there as per above expression
    assertNotEquals(Expressions.alwaysTrue(),
        originalSplit.icebergSplit().task().files().stream().findFirst().get().residual());
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(baos);
    originalSplit.write(out);
    HiveIcebergSplit deserializedSplit = new HiveIcebergSplit();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    DataInputStream in = new DataInputStream(bais);
    deserializedSplit.readFields(in);
    // After ser/de the expression should be always-true
    assertEquals(Expressions.alwaysTrue(),
        deserializedSplit.icebergSplit().task().files().stream().findFirst().get().residual());
  }
}
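The residual checked in this test is the part of the scan filter that planning could not prove from partition values, so a reader would still have to apply it row by row; on the unpartitioned table above it should remain "id > 123". A minimal sketch of inspecting residuals with the core Iceberg scan API, assuming the Table handle comes from helper.table():

import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;

// Print each planned file together with the residual predicate a reader would still
// need to evaluate; the Hive split intentionally drops it during serialization,
// as the assertions above show.
static void printResiduals(Table table) throws IOException {
  try (CloseableIterable<FileScanTask> tasks =
      table.newScan().filter(Expressions.greaterThan("id", 123)).planFiles()) {
    tasks.forEach(task -> System.out.println(task.file().path() + " -> " + task.residual()));
  }
}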