
Example 1 with PartitionKey

Use of org.apache.iceberg.PartitionKey in the Apache Hive project.

From class TestHiveIcebergPartitions, method testPartitionPruning:

@Test
public void testPartitionPruning() throws IOException {
    Schema salesSchema = new Schema(required(1, "ss_item_sk", Types.IntegerType.get()), required(2, "ss_sold_date_sk", Types.IntegerType.get()));
    PartitionSpec salesSpec = PartitionSpec.builderFor(salesSchema).identity("ss_sold_date_sk").build();
    Schema dimSchema = new Schema(required(1, "d_date_sk", Types.IntegerType.get()), required(2, "d_moy", Types.IntegerType.get()));
    List<Record> salesRecords = TestHelper.RecordsBuilder.newInstance(salesSchema).add(51, 5).add(61, 6).add(71, 7).add(81, 8).add(91, 9).build();
    List<Record> dimRecords = TestHelper.RecordsBuilder.newInstance(dimSchema).add(1, 10).add(2, 20).add(3, 30).add(4, 40).add(5, 50).build();
    Table salesTable = testTables.createTable(shell, "x1_store_sales", salesSchema, salesSpec, fileFormat, null);
    PartitionKey partitionKey = new PartitionKey(salesSpec, salesSchema);
    for (Record r : salesRecords) {
        partitionKey.partition(r);
        testTables.appendIcebergTable(shell.getHiveConf(), salesTable, fileFormat, partitionKey, ImmutableList.of(r));
    }
    testTables.createTable(shell, "x1_date_dim", dimSchema, fileFormat, dimRecords);
    String query = "select s.ss_item_sk from x1_store_sales s, x1_date_dim d " + "where s.ss_sold_date_sk=d.d_date_sk*2 and d.d_moy=30";
    // Check the query results
    List<Object[]> rows = shell.executeStatement(query);
    Assert.assertEquals(1, rows.size());
    Assert.assertArrayEquals(new Object[] { 61 }, rows.get(0));
    // Check if Dynamic Partitioning is used
    Assert.assertTrue(shell.executeStatement("explain " + query).stream().anyMatch(a -> ((String) a[0]).contains("Dynamic Partitioning Event Operator")));
}
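To make the loop above easier to follow in isolation, here is a minimal, self-contained sketch of how PartitionKey derives a partition tuple from a record. The schema, field names, and values are hypothetical, not taken from the test:

import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.types.Types;

public class PartitionKeySketch {
    public static void main(String[] args) {
        // Hypothetical two-column schema, identity-partitioned on "day".
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.IntegerType.get()),
                Types.NestedField.required(2, "day", Types.IntegerType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("day").build();

        Record record = GenericRecord.create(schema);
        record.setField("id", 42);
        record.setField("day", 7);

        // partition() recomputes the key for the given row; a single
        // PartitionKey instance is reused across rows, as the test loop does.
        PartitionKey key = new PartitionKey(spec, schema);
        key.partition(record);
        System.out.println(key.get(0, Integer.class)); // prints 7
    }
}

This is why the test can pass the same partitionKey object to appendIcebergTable on every iteration: each partition(r) call overwrites the key's values with the current record's partition tuple.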

Example 2 with PartitionKey

Use of org.apache.iceberg.PartitionKey in the Apache Hive project.

From class HiveIcebergTestUtils, method createEqualityDeleteFile:

/**
 * @param table The table to create the delete file for
 * @param deleteFilePath The path where the delete file should be created, relative to the table location root
 * @param equalityFields List of field names that should play a role in the equality check
 * @param fileFormat The file format that should be used for writing out the delete file
 * @param rowsToDelete The rows that should be deleted. It's enough to fill out the fields that are relevant for the
 *                     equality check, as listed in equalityFields; the rest of the fields are ignored
 * @return The DeleteFile created
 * @throws IOException If there is an error during DeleteFile write
 */
public static DeleteFile createEqualityDeleteFile(Table table, String deleteFilePath, List<String> equalityFields, FileFormat fileFormat, List<Record> rowsToDelete) throws IOException {
    List<Integer> equalityFieldIds = equalityFields.stream().map(name -> table.schema().findField(name).fieldId()).collect(Collectors.toList());
    Schema eqDeleteRowSchema = table.schema().select(equalityFields.toArray(new String[] {}));
    FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), ArrayUtil.toIntArray(equalityFieldIds), eqDeleteRowSchema, null);
    EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
    PartitionKey part = new PartitionKey(table.spec(), eqDeleteRowSchema);
    part.partition(rowsToDelete.get(0));
    EqualityDeleteWriter<Record> eqWriter = appenderFactory.newEqDeleteWriter(outputFile, fileFormat, part);
    try (EqualityDeleteWriter<Record> writer = eqWriter) {
        writer.deleteAll(rowsToDelete);
    }
    return eqWriter.toDeleteFile();
}
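Note that createEqualityDeleteFile only writes the delete file; it does not attach it to the table. A hedged usage sketch, assuming an existing Table named table with a customer_id column and a prepared rowsToDelete list (the column name and file path are illustrative, not from the source):

// Write an equality delete file keyed on "customer_id", then commit it.
DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(
        table, "data/eq-delete-0001.parquet", ImmutableList.of("customer_id"),
        FileFormat.PARQUET, rowsToDelete);
// RowDelta is the standard Iceberg API for committing delete files.
table.newRowDelta().addDeletes(deleteFile).commit();

After the commit, readers such as IcebergGenerics.read(table) should no longer return rows whose customer_id matches any record in rowsToDelete.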

Example 3 with PartitionKey

Use of org.apache.iceberg.PartitionKey in the Apache Hive project.

From class HiveIcebergTestUtils, method createPositionalDeleteFile:

/**
 * @param table The table to create the delete file for
 * @param deleteFilePath The path where the delete file should be created, relative to the table location root
 * @param fileFormat The file format that should be used for writing out the delete file
 * @param partitionValues A map of partition values (partitionKey=partitionVal, ...) to be used for the delete file
 * @param deletes The list of position deletes, each containing the data file path, the position of the row in the
 *                data file, and optionally the row itself that should be deleted
 * @return The DeleteFile created
 * @throws IOException If there is an error during DeleteFile write
 */
public static DeleteFile createPositionalDeleteFile(Table table, String deleteFilePath, FileFormat fileFormat, Map<String, Object> partitionValues, List<PositionDelete<Record>> deletes) throws IOException {
    Schema posDeleteRowSchema = deletes.get(0).row() == null ? null : table.schema();
    FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), null, null, posDeleteRowSchema);
    EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
    PartitionKey partitionKey = null;
    if (partitionValues != null) {
        Record record = GenericRecord.create(table.schema()).copy(partitionValues);
        partitionKey = new PartitionKey(table.spec(), table.schema());
        partitionKey.partition(record);
    }
    PositionDeleteWriter<Record> posWriter = appenderFactory.newPosDeleteWriter(outputFile, fileFormat, partitionKey);
    try (PositionDeleteWriter<Record> writer = posWriter) {
        deletes.forEach(del -> writer.delete(del.path(), del.pos(), del.row()));
    }
    return posWriter.toDeleteFile();
}
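The positional variant is driven the same way. A minimal sketch, again with a hypothetical data file path and an unpartitioned table (hence the null partition-value map):

// Mark row 0 of a data file as deleted; the row payload may be null, in
// which case the helper above writes no row schema into the delete file.
PositionDelete<Record> delete = PositionDelete.create();
delete.set("/warehouse/db/tbl/data/00000-0-data.parquet", 0L, null);
DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(
        table, "data/pos-delete-0001.parquet", FileFormat.PARQUET, null,
        ImmutableList.of(delete));
table.newRowDelta().addDeletes(deleteFile).commit();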

Aggregations

PartitionKey (org.apache.iceberg.PartitionKey): 3
Schema (org.apache.iceberg.Schema): 3
Record (org.apache.iceberg.data.Record): 3
IOException (java.io.IOException): 2
Path (java.nio.file.Path): 2
LocalDate (java.time.LocalDate): 2
LocalDateTime (java.time.LocalDateTime): 2
OffsetDateTime (java.time.OffsetDateTime): 2
ZoneOffset (java.time.ZoneOffset): 2
List (java.util.List): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
GenericAppenderFactory (org.apache.iceberg.data.GenericAppenderFactory): 2
GenericRecord (org.apache.iceberg.data.GenericRecord): 2
EncryptedOutputFile (org.apache.iceberg.encryption.EncryptedOutputFile): 2
File (java.io.File): 1
BigDecimal (java.math.BigDecimal): 1
ByteBuffer (java.nio.ByteBuffer): 1
Files (java.nio.file.Files): 1
Paths (java.nio.file.Paths): 1
Timestamp (java.sql.Timestamp): 1