Use of org.apache.iceberg.DataFile in project hive by apache.
The class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied.
@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied() throws IOException {
Assume.assumeFalse("Reading V2 tables with delete files is currently only supported in non-vectorized mode and only for Parquet/Avro",
    isVectorized || fileFormat == FileFormat.ORC);
PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
    spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
// add some more data to the same partition
shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
tbl.refresh();
// delete the first and third rows from the newly-added data file - without supplying the deleted rows
DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
    .filter(file -> file.partition().get(0, Long.class) == 0L)
    .filter(file -> file.recordCount() == 3)
    .findAny()
    .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
List<PositionDelete<Record>> deletes = ImmutableList.of(positionDelete(dataFile.path(), 0L, null), positionDelete(dataFile.path(), 2L, null));
DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat, ImmutableMap.of("customer_id", 0L), deletes);
tbl.newRowDelta().addDeletes(deleteFile).commit();
List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
Assert.assertEquals(4, objects.size());
Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
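The positionDelete(path, pos, row) calls above come from a test helper. A minimal sketch of such a helper, assuming only the public org.apache.iceberg.deletes.PositionDelete API (the helper itself is illustrative, not the actual Hive test utility):

import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDelete;

static PositionDelete<Record> positionDelete(CharSequence path, long pos, Record row) {
  // a positional delete identifies a row by its data file path and row ordinal;
  // the deleted row's content is optional and is passed as null in the test above
  PositionDelete<Record> delete = PositionDelete.create();
  return delete.set(path, pos, row);
}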
Use of org.apache.iceberg.DataFile in project hive by apache.
The class TestIcebergInputFormats, method testResiduals.
@Test
public void testResiduals() throws Exception {
helper.createTable();
List<Record> writeRecords = helper.generateRandomRecords(2, 0L);
writeRecords.get(0).set(1, 123L);
writeRecords.get(0).set(2, "2020-03-20");
writeRecords.get(1).set(1, 456L);
writeRecords.get(1).set(2, "2020-03-20");
List<Record> expectedRecords = new ArrayList<>();
expectedRecords.add(writeRecords.get(0));
DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), writeRecords);
DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L));
helper.appendToTable(dataFile1, dataFile2);
builder.filter(Expressions.and(Expressions.equal("date", "2020-03-20"), Expressions.equal("id", 123)));
testInputFormat.create(builder.conf()).validate(expectedRecords);
// skip residual filtering
builder.skipResidualFiltering();
testInputFormat.create(builder.conf()).validate(writeRecords);
}
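Residuals are the parts of the scan filter that cannot be proven true or false from file-level metadata alone. A small sketch, assuming a Table named table inside a method that throws IOException, of planning the same filter directly to inspect each matched DataFile and its residual expression (the scan calls are standard Iceberg API; the variable names are assumptions):

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;

try (CloseableIterable<FileScanTask> tasks = table.newScan()
    .filter(Expressions.and(Expressions.equal("date", "2020-03-20"), Expressions.equal("id", 123)))
    .planFiles()) {
  for (FileScanTask task : tasks) {
    DataFile file = task.file();
    // task.residual() is what a reader still has to evaluate row by row for this file
    System.out.println(file.path() + " -> residual: " + task.residual());
  }
}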
Use of org.apache.iceberg.DataFile in project hive by apache.
The class DeleteReadTests, method testEqualityDeleteByNull.
@Test
public void testEqualityDeleteByNull() throws IOException {
// data is required in the test table; make it optional for this test
table.updateSchema().makeColumnOptional("data").commit();
// add a new data file with a record where data is null
Record record = GenericRecord.create(table.schema());
DataFile dataFileWithNull = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0),
    Lists.newArrayList(record.copy("id", 131, "data", null)));
table.newAppend().appendFile(dataFileWithNull).commit();
// delete where data is null
Schema dataSchema = table.schema().select("data");
Record dataDelete = GenericRecord.create(dataSchema);
List<Record> dataDeletes = Lists.newArrayList(dataDelete.copy("data", null)); // id = 131
DeleteFile eqDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema);
table.newRowDelta().addDeletes(eqDeletes).commit();
StructLikeSet expected = rowSetWithoutIds(131);
StructLikeSet actual = rowSet(tableName, table, "*");
Assert.assertEquals("Table should contain expected rows", expected, actual);
}
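As a follow-up check, a minimal sketch assuming the iceberg-data module's IcebergGenerics reader, which should apply the committed delete files on read, to confirm the row with the null data value no longer comes back:

import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.io.CloseableIterable;

try (CloseableIterable<Record> rows = IcebergGenerics.read(table).build()) {
  for (Record row : rows) {
    // the equality delete should have removed every row whose "data" column was null
    Assert.assertNotNull(row.getField("data"));
  }
}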
Use of org.apache.iceberg.DataFile in project presto by prestodb.
The class TableStatisticsMaker, method makeTableStatistics.
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
if (!tableHandle.getSnapshotId().isPresent() || constraint.getSummary().isNone()) {
return TableStatistics.empty();
}
TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transform(IcebergColumnHandle.class::cast).intersect(tableHandle.getPredicate());
if (intersection.isNone()) {
return TableStatistics.empty();
}
List<Types.NestedField> columns = icebergTable.schema().columns();
Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream()
    .filter(column -> column.type().isPrimitiveType())
    .collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
List<PartitionField> partitionFields = icebergTable.spec().fields();
Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream().map(PartitionField::sourceId).collect(toSet());
List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream()
    .filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType())
    .collect(toImmutableList());
List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toImmutableMap(IcebergColumnHandle::getId, identity()));
ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
for (int index = 0; index < partitionFields.size(); index++) {
PartitionField field = partitionFields.get(index);
Type type = icebergPartitionTypes.get(index);
idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()),
    type, toPrestoType(type, typeManager), type.typeId().javaClass()));
}
Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
TableScan tableScan = icebergTable.newScan()
    .filter(toIcebergExpression(intersection))
    .useSnapshot(tableHandle.getSnapshotId().get())
    .includeColumnStats();
Partition summary = null;
try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
for (FileScanTask fileScanTask : fileScanTasks) {
DataFile dataFile = fileScanTask.file();
if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
continue;
}
if (summary == null) {
summary = new Partition(idToTypeMapping, nonPartitionPrimitiveColumns, dataFile.partition(),
    dataFile.recordCount(), dataFile.fileSizeInBytes(),
    toMap(idToTypeMapping, dataFile.lowerBounds()), toMap(idToTypeMapping, dataFile.upperBounds()),
    dataFile.nullValueCounts(), dataFile.columnSizes());
} else {
summary.incrementFileCount();
summary.incrementRecordCount(dataFile.recordCount());
summary.incrementSize(dataFile.fileSizeInBytes());
updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()),
    dataFile.nullValueCounts(), dataFile.recordCount());
updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()),
    dataFile.nullValueCounts(), dataFile.recordCount());
summary.updateNullCount(dataFile.nullValueCounts());
updateColumnSizes(summary, dataFile.columnSizes());
}
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
if (summary == null) {
return TableStatistics.empty();
}
double recordCount = summary.getRecordCount();
TableStatistics.Builder result = TableStatistics.builder();
result.setRowCount(Estimate.of(recordCount));
result.setTotalSize(Estimate.of(summary.getSize()));
for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
int fieldId = columnHandle.getId();
ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
Long nullCount = summary.getNullCounts().get(fieldId);
if (nullCount != null) {
columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
}
if (summary.getColumnSizes() != null) {
Long columnSize = summary.getColumnSizes().get(fieldId);
if (columnSize != null) {
columnBuilder.setDataSize(Estimate.of(columnSize));
}
}
Object min = summary.getMinValues().get(fieldId);
Object max = summary.getMaxValues().get(fieldId);
if (min instanceof Number && max instanceof Number) {
columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
}
result.setColumnStatistics(columnHandle, columnBuilder.build());
}
return result.build();
}
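The statistics above are built entirely from per-file metadata carried by each DataFile. A stripped-down sketch of the raw metrics this builder consumes, assuming the same icebergTable and a snapshotId long (variable names are illustrative):

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.io.CloseableIterable;

try (CloseableIterable<FileScanTask> tasks =
    icebergTable.newScan().useSnapshot(snapshotId).includeColumnStats().planFiles()) {
  for (FileScanTask task : tasks) {
    DataFile dataFile = task.file();
    // per-file metrics are keyed by field id; lowerBounds()/upperBounds() are serialized
    // byte buffers and need decoding (e.g. Conversions.fromByteBuffer) before comparison
    System.out.printf("%s: %d rows, %d bytes, null counts=%s%n",
        dataFile.path(), dataFile.recordCount(), dataFile.fileSizeInBytes(), dataFile.nullValueCounts());
  }
} catch (IOException e) {
  throw new UncheckedIOException(e);
}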
Use of org.apache.iceberg.DataFile in project drill by apache.
The class IcebergQueriesTest, method writeAndCommitDataFile.
private static void writeAndCommitDataFile(Table table, String name, FileFormat fileFormat, Iterable<Record> records) throws IOException {
OutputFile outputFile = table.io().newOutputFile(new Path(table.location(), fileFormat.addExtension(name)).toUri().getPath());
FileAppender<Record> fileAppender = new GenericAppenderFactory(table.schema()).newAppender(outputFile, fileFormat);
fileAppender.addAll(records);
fileAppender.close();
DataFile dataFile = DataFiles.builder(table.spec()).withInputFile(outputFile.toInputFile()).withMetrics(fileAppender.metrics()).build();
Transaction transaction = table.newTransaction();
transaction.newAppend().appendFile(dataFile).commit();
transaction.commitTransaction();
}
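A short usage sketch for the helper above, from inside the same test class. The field names ("id", "name"), the file name, and the choice of Parquet are assumptions; any records that match the target table's schema would work:

import java.util.Collections;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;

Record template = GenericRecord.create(table.schema());
// copy(...) takes field name/value pairs; the names here are hypothetical
Record row = template.copy("id", 1, "name", "first");
writeAndCommitDataFile(table, "extra_data_file", FileFormat.PARQUET, Collections.singletonList(row));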