
Example 91 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class TestHiveIcebergPartitions method testMonthTransform.

@Test
public void testMonthTransform() throws IOException {
    Assume.assumeTrue("ORC/TIMESTAMP_INSTANT is not a supported vectorized type for Hive", !(isVectorized && fileFormat == FileFormat.ORC));
    Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "part_field", Types.TimestampType.withZone()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).month("part_field").build();
    List<Record> records = TestHelper.RecordsBuilder.newInstance(schema)
        .add(1L, OffsetDateTime.of(2017, 11, 22, 11, 30, 7, 0, ZoneOffset.ofHours(1)))
        .add(2L, OffsetDateTime.of(2017, 11, 22, 11, 30, 7, 0, ZoneOffset.ofHours(2)))
        .add(3L, OffsetDateTime.of(2017, 11, 23, 11, 30, 7, 0, ZoneOffset.ofHours(3)))
        .build();
    Table table = testTables.createTable(shell, "part_test", schema, spec, fileFormat, records);
    HiveIcebergTestUtils.validateData(table, records, 0);
    HiveIcebergTestUtils.validateDataWithSQL(shell, "part_test", records, "id");
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
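
For context, Iceberg's month transform maps a timestamp with zone to the number of whole months elapsed since 1970-01-01, evaluated in UTC. The helper below is a hypothetical sketch of that mapping (it is not part of the Hive test code); under it, all three records above fall into the same partition, month 574 (2017-11).

import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.temporal.ChronoUnit;

// Hypothetical helper: the partition value Iceberg's month transform assigns to a
// timestamptz is the count of whole months between 1970-01-01T00:00Z and the value in UTC.
static int monthPartitionValue(OffsetDateTime ts) {
    OffsetDateTime epoch = OffsetDateTime.of(1970, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC);
    return (int) ChronoUnit.MONTHS.between(epoch, ts.withOffsetSameInstant(ZoneOffset.UTC));
}
// monthPartitionValue(OffsetDateTime.of(2017, 11, 22, 11, 30, 7, 0, ZoneOffset.ofHours(1))) == 574

Because every record above falls in November 2017 in UTC, the test writes three rows into a single month partition.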

Example 92 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class TestHiveIcebergPartitions method testTruncateTransform.

@Test
public void testTruncateTransform() throws IOException {
    Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "part_field", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("part_field", 2).build();
    List<Record> records = TestHelper.RecordsBuilder.newInstance(schema).add(1L, "Part1").add(2L, "Part2").add(3L, "Art3").build();
    Table table = testTables.createTable(shell, "part_test", schema, spec, fileFormat, records);
    HiveIcebergTestUtils.validateData(table, records, 0);
    HiveIcebergTestUtils.validateDataWithSQL(shell, "part_test", records, "id");
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
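
For reference (not part of the test), Iceberg's truncate[W] transform keeps the first W characters of a string value, so the records above land in exactly two partitions: "Pa" for Part1 and Part2, and "Ar" for Art3. A minimal sketch of the string case, ignoring the code-point subtleties of the real transform:

// Hypothetical illustration of truncate[2] on strings; shorter values are kept as-is.
static String truncateWidth2(String value) {
    return value.length() <= 2 ? value : value.substring(0, 2);
}
// truncateWidth2("Part1") -> "Pa", truncateWidth2("Part2") -> "Pa", truncateWidth2("Art3") -> "Ar"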

Example 93 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class VectorizedReadUtils method handleIcebergProjection.

/**
 * Adjusts the jobConf so that column reorders and renames that might have happened since this ORC file was written
 * are properly mapped to the schema of the original file.
 * @param task - Iceberg task - provides the table's partition spec, current schema and residual filter expression
 * @param job - JobConf instance to adjust
 * @param fileSchema - ORC file schema of the input file
 * @throws IOException - errors relating to accessing the ORC file
 */
public static void handleIcebergProjection(FileScanTask task, JobConf job, TypeDescription fileSchema) throws IOException {
    // We need to map with the current (i.e. current Hive table columns) full schema (without projections),
    // as OrcInputFormat will take care of the projections by the use of an include boolean array
    PartitionSpec spec = task.spec();
    Schema currentSchema = spec.schema();
    TypeDescription readOrcSchema;
    if (ORCSchemaUtil.hasIds(fileSchema)) {
        readOrcSchema = ORCSchemaUtil.buildOrcProjection(currentSchema, fileSchema);
    } else {
        Schema readSchemaForOriginalFile = currentSchema;
        // In case of migrated, originally partitioned tables, partition values are not present in the file
        if (spec.isPartitioned()) {
            readSchemaForOriginalFile = currentSchema.select(currentSchema.columns().stream()
                .filter(c -> !spec.identitySourceIds().contains(c.fieldId()))
                .map(c -> c.name())
                .collect(Collectors.toList()));
        }
        TypeDescription typeWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, MappingUtil.create(currentSchema));
        readOrcSchema = ORCSchemaUtil.buildOrcProjection(readSchemaForOriginalFile, typeWithIds);
    }
    job.set(ColumnProjectionUtils.ORC_SCHEMA_STRING, readOrcSchema.toString());
    // Predicate pushdown needs to be adjusted too in case of column renames; we let Iceberg generate it into the job
    if (task.residual() != null) {
        Expression boundFilter = Binder.bind(currentSchema.asStruct(), task.residual(), false);
        // Note the use of the unshaded version of this class here (required for SARG deserialization later)
        org.apache.hadoop.hive.ql.io.sarg.SearchArgument sarg = ExpressionToOrcSearchArgument.convert(boundFilter, readOrcSchema);
        if (sarg != null) {
            job.unset(TableScanDesc.FILTER_EXPR_CONF_STR);
            job.unset(ConvertAstToSearchArg.SARG_PUSHDOWN);
            job.set(ConvertAstToSearchArg.SARG_PUSHDOWN, ConvertAstToSearchArg.sargToKryo(sarg));
        }
    }
}
Also used : ConvertAstToSearchArg(org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg) TypeDescription(org.apache.hive.iceberg.org.apache.orc.TypeDescription) OrcTail(org.apache.hive.iceberg.org.apache.orc.impl.OrcTail) ColumnProjectionUtils(org.apache.hadoop.hive.serde2.ColumnProjectionUtils) LoggerFactory(org.slf4j.LoggerFactory) LlapProxy(org.apache.hadoop.hive.llap.io.api.LlapProxy) ByteBuffer(java.nio.ByteBuffer) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReaderImpl(org.apache.hive.iceberg.org.apache.orc.impl.ReaderImpl) MappingUtil(org.apache.iceberg.mapping.MappingUtil) Expression(org.apache.iceberg.expressions.Expression) Path(org.apache.hadoop.fs.Path) SyntheticFileId(org.apache.hadoop.hive.ql.io.SyntheticFileId) FileScanTask(org.apache.iceberg.FileScanTask) BufferChunk(org.apache.orc.impl.BufferChunk) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Logger(org.slf4j.Logger) Binder(org.apache.iceberg.expressions.Binder) HiveConf(org.apache.hadoop.hive.conf.HiveConf) IOException(java.io.IOException) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) LlapHiveUtils(org.apache.hadoop.hive.llap.LlapHiveUtils) JobConf(org.apache.hadoop.mapred.JobConf) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionSpec(org.apache.iceberg.PartitionSpec) InputFile(org.apache.iceberg.io.InputFile)
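
The else branch above covers ORC files written without Iceberg field IDs (for example, files of a migrated Hive table): applyNameMapping assigns IDs to the ORC columns by name before the projection is built. The snippet below is an illustrative sketch (not taken from the Hive sources) of the name mapping that MappingUtil.create produces; the JSON in the comment is only indicative.

// Illustrative only; additionally requires org.apache.iceberg.types.Types,
// org.apache.iceberg.mapping.NameMapping, org.apache.iceberg.mapping.NameMappingParser
// and the static import of org.apache.iceberg.types.Types.NestedField.optional.
Schema currentSchema = new Schema(
    optional(1, "id", Types.LongType.get()),
    optional(2, "data", Types.StringType.get()));
NameMapping mapping = MappingUtil.create(currentSchema);
// Roughly: [{"field-id":1,"names":["id"]},{"field-id":2,"names":["data"]}]
System.out.println(NameMappingParser.toJson(mapping));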

Example 94 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class HiveIcebergTestUtils method createPositionalDeleteFile.

/**
 * @param table The table to create the delete file for
 * @param deleteFilePath The path where the delete file should be created, relative to the table location root
 * @param fileFormat The file format that should be used for writing out the delete file
 * @param partitionValues A map of partition values (partitionKey=partitionVal, ...) to be used for the delete file
 * @param deletes The list of position deletes, each containing the data file path, the position of the row in the
 *                data file and the row itself that should be deleted
 * @return The DeleteFile created
 * @throws IOException If there is an error during DeleteFile write
 */
public static DeleteFile createPositionalDeleteFile(Table table, String deleteFilePath, FileFormat fileFormat, Map<String, Object> partitionValues, List<PositionDelete<Record>> deletes) throws IOException {
    Schema posDeleteRowSchema = deletes.get(0).row() == null ? null : table.schema();
    FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), null, null, posDeleteRowSchema);
    EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(
        new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
    PartitionKey partitionKey = null;
    if (partitionValues != null) {
        Record record = GenericRecord.create(table.schema()).copy(partitionValues);
        partitionKey = new PartitionKey(table.spec(), table.schema());
        partitionKey.partition(record);
    }
    PositionDeleteWriter<Record> posWriter = appenderFactory.newPosDeleteWriter(outputFile, fileFormat, partitionKey);
    try (PositionDeleteWriter<Record> writer = posWriter) {
        deletes.forEach(del -> writer.delete(del.path(), del.pos(), del.row()));
    }
    return posWriter.toDeleteFile();
}
Also used : Path(java.nio.file.Path) Configuration(org.apache.hadoop.conf.Configuration) EncryptedOutputFile(org.apache.iceberg.encryption.EncryptedOutputFile) Schema(org.apache.iceberg.Schema) PartitionKey(org.apache.iceberg.PartitionKey) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory)
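
A hedged usage sketch for the helper above (variable and path names are illustrative, not from the Hive sources): write a single positional delete for row 0 of an existing data file of an unpartitioned, format-v2 table and commit it.

// Illustrative only; PositionDelete is org.apache.iceberg.deletes.PositionDelete,
// Collections is java.util.Collections.
static void deleteFirstRow(Table table, String dataFilePath) throws IOException {
    PositionDelete<Record> positionDelete = PositionDelete.create();
    // no row payload is attached, so only the data file path and the row position are written
    positionDelete.set(dataFilePath, 0L, null);
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(table,
        "data/pos-deletes-00001.parquet", FileFormat.PARQUET, null,
        Collections.singletonList(positionDelete));
    table.newRowDelta().addDeletes(deleteFile).commit();
}

After the commit, scans of the table should no longer return the row at position 0 of that data file.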

Example 95 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class TestHiveIcebergSchemaEvolution method testRemoveAndAddBackColumnFromIcebergTable.

@Test
public void testRemoveAndAddBackColumnFromIcebergTable() throws IOException {
    // Create an Iceberg table with the columns customer_id, first_name and last_name with some initial data.
    Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // Remove the first_name column
    icebergTable.updateSchema().deleteColumn("first_name").commit();
    // Add a new column with the name first_name
    icebergTable.updateSchema().addColumn("first_name", Types.StringType.get(), "This is new first name").commit();
    // Add new data to the table with the new first_name column filled.
    icebergTable = testTables.loadTable(TableIdentifier.of("default", "customers"));
    Schema customerSchemaWithNewFirstName = new Schema(
        optional(1, "customer_id", Types.LongType.get()),
        optional(2, "last_name", Types.StringType.get(), "This is last name"),
        optional(3, "first_name", Types.StringType.get(), "This is the newly added first name"));
    List<Record> newCustomersWithNewFirstName = TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName).add(3L, "Red", "James").build();
    testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, fileFormat, null, newCustomersWithNewFirstName);
    TestHelper.RecordsBuilder customersWithNewFirstNameBuilder = TestHelper.RecordsBuilder
        .newInstance(customerSchemaWithNewFirstName)
        .add(0L, "Brown", null).add(1L, "Green", null).add(2L, "Pink", null).add(3L, "Red", "James");
    List<Record> customersWithNewFirstName = customersWithNewFirstNameBuilder.build();
    // Run a 'select *' from Hive and check if the first_name column is returned.
    // It should be null for the old data and should be filled in the entry added after the column addition.
    List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstName, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstName, rows), 0);
    Schema customerSchemaWithNewFirstNameOnly = new Schema(
        optional(1, "customer_id", Types.LongType.get()),
        optional(3, "first_name", Types.StringType.get(), "This is the newly added first name"));
    TestHelper.RecordsBuilder customersWithNewFirstNameOnlyBuilder = TestHelper.RecordsBuilder
        .newInstance(customerSchemaWithNewFirstNameOnly)
        .add(0L, null).add(1L, null).add(2L, null).add(3L, "James");
    List<Record> customersWithNewFirstNameOnly = customersWithNewFirstNameOnlyBuilder.build();
    // Run a 'select first_name' from Hive to check if the new first-name column can be queried.
    rows = shell.executeStatement("SELECT customer_id, first_name FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstNameOnly, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstNameOnly, rows), 0);
    // Insert data from Hive with first_name filled and with null first_name value.
    shell.executeStatement("INSERT INTO default.customers values (4L, 'Magenta', 'Lily'), (5L, 'Purple', NULL)");
    // Check if the newly inserted data is returned correctly by select statements.
    customersWithNewFirstNameBuilder.add(4L, "Magenta", "Lily").add(5L, "Purple", null);
    customersWithNewFirstName = customersWithNewFirstNameBuilder.build();
    rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstName, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstName, rows), 0);
    customersWithNewFirstNameOnlyBuilder.add(4L, "Lily").add(5L, null);
    customersWithNewFirstNameOnly = customersWithNewFirstNameOnlyBuilder.build();
    rows = shell.executeStatement("SELECT customer_id, first_name FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstNameOnly, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstNameOnly, rows), 0);
}
Also used : TestHelper(org.apache.iceberg.mr.TestHelper) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
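
The behaviour verified above (null first_name for the pre-existing rows) follows from Iceberg's field IDs: dropping and re-adding first_name gives the column a fresh id, so older data files contain no column with that id and the reader fills in null. A small, hypothetical check along those lines:

// Illustrative only: the re-added column carries a new field id, different from the id the
// dropped first_name column had, which is why old data files resolve it to null.
Table customers = testTables.loadTable(TableIdentifier.of("default", "customers"));
int firstNameId = customers.schema().findField("first_name").fieldId();
System.out.println("first_name is now field id " + firstNameId);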

Aggregations

Schema (org.apache.iceberg.Schema): 126
Test (org.junit.Test): 93
Record (org.apache.iceberg.data.Record): 68
Table (org.apache.iceberg.Table): 55
PartitionSpec (org.apache.iceberg.PartitionSpec): 39
GenericRecord (org.apache.iceberg.data.GenericRecord): 36
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 30
List (java.util.List): 21
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 20
IOException (java.io.IOException): 16
Types (org.apache.iceberg.types.Types): 16
ArrayList (java.util.ArrayList): 15
Map (java.util.Map): 14
HashMap (java.util.HashMap): 13
FileFormat (org.apache.iceberg.FileFormat): 13
UpdateSchema (org.apache.iceberg.UpdateSchema): 12
Path (org.apache.hadoop.fs.Path): 11
Collectors (java.util.stream.Collectors): 10
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
TestHelper (org.apache.iceberg.mr.TestHelper): 9