
Example 21 with DataFile

use of org.apache.iceberg.DataFile in project hive by apache.

the class TestHiveIcebergV2 method testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add some more data to the same partition
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
    tbl.refresh();
    // delete the first and third rows from the newly-added data file - without supplying the deleted rows
    DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
        .filter(file -> file.partition().get(0, Long.class) == 0L)
        .filter(file -> file.recordCount() == 3)
        .findAny()
        .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
    List<PositionDelete<Record>> deletes = ImmutableList.of(
        positionDelete(dataFile.path(), 0L, null),
        positionDelete(dataFile.path(), 2L, null));
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
        ImmutableMap.of("customer_id", 0L), deletes);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
    Assert.assertEquals(4, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
    Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
Also used : DataFile(org.apache.iceberg.DataFile) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) StreamSupport(java.util.stream.StreamSupport) DeleteFile(org.apache.iceberg.DeleteFile) Assume(org.junit.Assume) Assert(org.junit.Assert) PositionDelete(org.apache.iceberg.deletes.PositionDelete)
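The positionDelete(...) and HiveIcebergTestUtils.createPositionalDeleteFile(...) calls above are helpers from the Hive test code. As a minimal sketch of what each delete entry carries, the same deletes could be expressed directly with Iceberg's PositionDelete API (imported above); the dataFile handle is the one resolved in the test, and the null third argument is the optional deleted-row payload that this "RowNotSupplied" variant deliberately omits:

// Sketch only: a positional delete names the target data file and the 0-based row
// ordinal to remove; the deleted-row payload is optional and left null here.
PositionDelete<Record> first = PositionDelete.create();
first.set(dataFile.path(), 0L, null);  // first row of the newly added file
PositionDelete<Record> third = PositionDelete.create();
third.set(dataFile.path(), 2L, null);  // third row of the newly added file

Because the row payload is optional for position deletes, readers only need the file path and ordinal to filter the rows out, which is exactly what the query at the end of the test verifies.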

Example 22 with DataFile

use of org.apache.iceberg.DataFile in project hive by apache.

the class TestIcebergInputFormats method testResiduals.

@Test
public void testResiduals() throws Exception {
    helper.createTable();
    List<Record> writeRecords = helper.generateRandomRecords(2, 0L);
    writeRecords.get(0).set(1, 123L);
    writeRecords.get(0).set(2, "2020-03-20");
    writeRecords.get(1).set(1, 456L);
    writeRecords.get(1).set(2, "2020-03-20");
    List<Record> expectedRecords = new ArrayList<>();
    expectedRecords.add(writeRecords.get(0));
    DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), writeRecords);
    DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L));
    helper.appendToTable(dataFile1, dataFile2);
    builder.filter(Expressions.and(Expressions.equal("date", "2020-03-20"), Expressions.equal("id", 123)));
    testInputFormat.create(builder.conf()).validate(expectedRecords);
    // skip residual filtering
    builder.skipResidualFiltering();
    testInputFormat.create(builder.conf()).validate(writeRecords);
}
Also used : DataFile(org.apache.iceberg.DataFile) ArrayList(java.util.ArrayList) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
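Residuals are the part of a scan filter that cannot be answered from partition values alone and must still be evaluated against individual rows. A hedged sketch of inspecting them on a plain Iceberg table scan (assuming a Table handle named table for the same test table, with Expression/Expressions from org.apache.iceberg.expressions):

// Sketch only: plan a scan with the same filter as the test and print each matching
// file's residual. For the 2020-03-20 file the date predicate is satisfied by the
// partition itself, so only id == 123 remains to be applied per row; skipping
// residual filtering therefore returns both records written to that file.
Expression filter = Expressions.and(
    Expressions.equal("date", "2020-03-20"),
    Expressions.equal("id", 123));
try (CloseableIterable<FileScanTask> tasks = table.newScan().filter(filter).planFiles()) {
    for (FileScanTask task : tasks) {
        System.out.println(task.file().path() + " residual: " + task.residual());
    }
}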

Example 23 with DataFile

use of org.apache.iceberg.DataFile in project hive by apache.

the class DeleteReadTests method testEqualityDeleteByNull.

@Test
public void testEqualityDeleteByNull() throws IOException {
    // data is required in the test table; make it optional for this test
    table.updateSchema().makeColumnOptional("data").commit();
    // add a new data file with a record where data is null
    Record record = GenericRecord.create(table.schema());
    DataFile dataFileWithNull = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0),
        Lists.newArrayList(record.copy("id", 131, "data", null)));
    table.newAppend().appendFile(dataFileWithNull).commit();
    // delete where data is null
    Schema dataSchema = table.schema().select("data");
    Record dataDelete = GenericRecord.create(dataSchema);
    List<Record> dataDeletes = Lists.newArrayList(
        dataDelete.copy("data", null)); // id = 131
    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema);
    table.newRowDelta().addDeletes(eqDeletes).commit();
    StructLikeSet expected = rowSetWithoutIds(131);
    StructLikeSet actual = rowSet(tableName, table, "*");
    Assert.assertEquals("Table should contain expected rows", expected, actual);
}
Also used : DataFile(org.apache.iceberg.DataFile) Schema(org.apache.iceberg.Schema) StructLikeSet(org.apache.iceberg.util.StructLikeSet) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
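Equality deletes remove every row whose values match the delete record on the columns of the delete schema, which is why keying the delete on a null data value works. Purely as an illustrative variant of the code above (same table, FileHelpers, and temp folder), the same row could instead be removed by equality on its id:

// Sketch only: an equality delete keyed on "id" rather than on the null "data" value.
Schema idSchema = table.schema().select("id");
Record idDelete = GenericRecord.create(idSchema);
DeleteFile deleteById = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0),
    Lists.newArrayList(idDelete.copy("id", 131)), idSchema);
table.newRowDelta().addDeletes(deleteById).commit();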

Example 24 with DataFile

use of org.apache.iceberg.DataFile in project presto by prestodb.

the class TableStatisticsMaker method makeTableStatistics.

private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    if (!tableHandle.getSnapshotId().isPresent() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transform(IcebergColumnHandle.class::cast).intersect(tableHandle.getPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    List<Types.NestedField> columns = icebergTable.schema().columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream()
        .filter(column -> column.type().isPrimitiveType())
        .collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream()
        .map(PartitionField::sourceId)
        .collect(toSet());
    List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream()
        .filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType())
        .collect(toImmutableList());
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toImmutableMap(IcebergColumnHandle::getId, identity()));
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(
            field,
            idToColumnHandle.get(field.sourceId()),
            type,
            toPrestoType(type, typeManager),
            type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
    TableScan tableScan = icebergTable.newScan()
        .filter(toIcebergExpression(intersection))
        .useSnapshot(tableHandle.getSnapshotId().get())
        .includeColumnStats();
    Partition summary = null;
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
                continue;
            }
            if (summary == null) {
                summary = new Partition(
                    idToTypeMapping,
                    nonPartitionPrimitiveColumns,
                    dataFile.partition(),
                    dataFile.recordCount(),
                    dataFile.fileSizeInBytes(),
                    toMap(idToTypeMapping, dataFile.lowerBounds()),
                    toMap(idToTypeMapping, dataFile.upperBounds()),
                    dataFile.nullValueCounts(),
                    dataFile.columnSizes());
            } else {
                summary.incrementFileCount();
                summary.incrementRecordCount(dataFile.recordCount());
                summary.incrementSize(dataFile.fileSizeInBytes());
                updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                summary.updateNullCount(dataFile.nullValueCounts());
                updateColumnSizes(summary, dataFile.columnSizes());
            }
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    if (summary == null) {
        return TableStatistics.empty();
    }
    double recordCount = summary.getRecordCount();
    TableStatistics.Builder result = TableStatistics.builder();
    result.setRowCount(Estimate.of(recordCount));
    result.setTotalSize(Estimate.of(summary.getSize()));
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min instanceof Number && max instanceof Number) {
            columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
        }
        result.setColumnStatistics(columnHandle, columnBuilder.build());
    }
    return result.build();
}
Also used : Types(org.apache.iceberg.types.Types) ColumnStatistics(com.facebook.presto.spi.statistics.ColumnStatistics) TableStatistics(com.facebook.presto.spi.statistics.TableStatistics) PartitionField(org.apache.iceberg.PartitionField) DoubleRange(com.facebook.presto.spi.statistics.DoubleRange) ImmutableList(com.google.common.collect.ImmutableList) Partition.toMap(com.facebook.presto.iceberg.Partition.toMap) TypeManager(com.facebook.presto.common.type.TypeManager) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) IcebergUtil.getIdentityPartitions(com.facebook.presto.iceberg.IcebergUtil.getIdentityPartitions) FileScanTask(org.apache.iceberg.FileScanTask) DataFile(org.apache.iceberg.DataFile) ExpressionConverter.toIcebergExpression(com.facebook.presto.iceberg.ExpressionConverter.toIcebergExpression) IcebergUtil.getColumns(com.facebook.presto.iceberg.IcebergUtil.getColumns) Collectors.toSet(java.util.stream.Collectors.toSet) Comparators(org.apache.iceberg.types.Comparators) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) CloseableIterable(org.apache.iceberg.io.CloseableIterable) NullableValue(com.facebook.presto.common.predicate.NullableValue) ImmutableMap(com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) Predicate(java.util.function.Predicate) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Constraint(com.facebook.presto.spi.Constraint) TableScan(org.apache.iceberg.TableScan) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) Type(org.apache.iceberg.types.Type) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Estimate(com.facebook.presto.spi.statistics.Estimate) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) Comparator(java.util.Comparator)
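The per-file statistics consumed above (lowerBounds, upperBounds, nullValueCounts, columnSizes) are DataFile metrics keyed by Iceberg field id, with the bounds serialized as ByteBuffers. A minimal sketch of decoding one of them, assuming the dataFile and idToTypeMapping from the method above (Conversions is org.apache.iceberg.types.Conversions, and this is roughly what the Partition.toMap helper is presumed to wrap):

// Sketch only: lower bounds come back as Map<Integer, ByteBuffer>; decoding a value
// needs the column's Iceberg type, which idToTypeMapping provides.
for (Map.Entry<Integer, ByteBuffer> entry : dataFile.lowerBounds().entrySet()) {
    Type.PrimitiveType type = idToTypeMapping.get(entry.getKey());
    if (type != null) {
        Object lowerBound = Conversions.fromByteBuffer(type, entry.getValue());
        System.out.println("field " + entry.getKey() + " lower bound: " + lowerBound);
    }
}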

Example 25 with DataFile

use of org.apache.iceberg.DataFile in project drill by apache.

the class IcebergQueriesTest method writeAndCommitDataFile.

private static void writeAndCommitDataFile(Table table, String name, FileFormat fileFormat, Iterable<Record> records) throws IOException {
    OutputFile outputFile = table.io().newOutputFile(new Path(table.location(), fileFormat.addExtension(name)).toUri().getPath());
    FileAppender<Record> fileAppender = new GenericAppenderFactory(table.schema()).newAppender(outputFile, fileFormat);
    fileAppender.addAll(records);
    fileAppender.close();
    DataFile dataFile = DataFiles.builder(table.spec())
        .withInputFile(outputFile.toInputFile())
        .withMetrics(fileAppender.metrics())
        .build();
    Transaction transaction = table.newTransaction();
    transaction.newAppend().appendFile(dataFile).commit();
    transaction.commitTransaction();
}
Also used : OutputFile(org.apache.iceberg.io.OutputFile) Path(org.apache.hadoop.fs.Path) DataFile(org.apache.iceberg.DataFile) Transaction(org.apache.iceberg.Transaction) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory)
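A hedged usage sketch for the helper above; the table is assumed to have an integer id and a string name column, and the record values and file name are placeholders:

// Sketch only: build two generic records and hand them to writeAndCommitDataFile.
Schema schema = table.schema();
GenericRecord template = GenericRecord.create(schema);
List<Record> records = Arrays.asList(
    template.copy("id", 1, "name", "first"),
    template.copy("id", 2, "name", "second"));
writeAndCommitDataFile(table, "data-00001", FileFormat.PARQUET, records);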

Aggregations

DataFile (org.apache.iceberg.DataFile) 25
Table (org.apache.iceberg.Table) 14
Test (org.junit.Test) 12
IOException (java.io.IOException) 7
ExecutorService (java.util.concurrent.ExecutorService) 6
AppendFiles (org.apache.iceberg.AppendFiles) 5
Record (org.apache.iceberg.data.Record) 5
List (java.util.List) 4
Map (java.util.Map) 4
Path (org.apache.hadoop.fs.Path) 4
ImmutableMap (org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) 4
Types (org.apache.iceberg.types.Types) 4
File (java.io.File) 3
ArrayList (java.util.ArrayList) 3
Optional (java.util.Optional) 3
Set (java.util.Set) 3
Collectors (java.util.stream.Collectors) 3
JobConf (org.apache.hadoop.mapred.JobConf) 3
DeleteFile (org.apache.iceberg.DeleteFile) 3
Transaction (org.apache.iceberg.Transaction) 3
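Putting the most frequently co-occurring pieces together (DataFile, Table, AppendFiles), a minimal hedged sketch of registering an already-written file with an unpartitioned table might look like the following; the path, size, and record count are illustrative placeholders:

// Sketch only: describe an existing Parquet file with DataFiles.builder and commit it
// with an append. For a partitioned spec the partition data would also be required.
DataFile dataFile = DataFiles.builder(table.spec())
    .withPath("/tmp/warehouse/db/tbl/data/file-00000.parquet")
    .withFormat(FileFormat.PARQUET)
    .withFileSizeInBytes(1024L)
    .withRecordCount(10L)
    .build();
AppendFiles append = table.newAppend();
append.appendFile(dataFile);
append.commit();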