
Example 56 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

From the class TestHiveIcebergPartitions, method testIdentityPartitionedWrite.

@Test
public void testIdentityPartitionedWrite() throws IOException {
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .identity("customer_id")
        .build();
    List<Record> records = TestHelper.generateRandomRecords(
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L);
    Table table = testTables.createTable(shell, "partitioned_customers",
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, records);
    HiveIcebergTestUtils.validateData(table, records, 0);
}
Also used: Table (org.apache.iceberg.Table), Record (org.apache.iceberg.data.Record), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
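
Beyond identity(), Iceberg's PartitionSpec.Builder also offers bucket, truncate and the time-based year/month/day/hour transforms. A minimal, self-contained sketch of combining them (the schema and field names here are illustrative, not taken from the Hive test suite):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class PartitionSpecBuilderSketch {
    public static void main(String[] args) {
        // Illustrative schema; field ids must be unique within a schema
        Schema schema = new Schema(
            Types.NestedField.required(1, "customer_id", Types.LongType.get()),
            Types.NestedField.required(2, "order_ts", Types.TimestampType.withZone()),
            Types.NestedField.optional(3, "last_name", Types.StringType.get()));

        // day() partitions by the day of order_ts; bucket() hashes
        // customer_id into 16 buckets; both can coexist in one spec
        PartitionSpec spec = PartitionSpec.builderFor(schema)
            .day("order_ts")
            .bucket("customer_id", 16)
            .build();

        System.out.println(spec);
    }
}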

Example 57 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

From the class VectorizedReadUtils, method handleIcebergProjection.

/**
 * Adjusts the jobConf so that column reorders and renames that might have happened since this ORC file was written
 * are properly mapped to the schema of the original file.
 * @param task - Iceberg task - used to obtain the current schema, partition spec and residual filter expression
 * @param job - JobConf instance to adjust
 * @param fileSchema - ORC file schema of the input file
 * @throws IOException - errors relating to accessing the ORC file
 */
public static void handleIcebergProjection(FileScanTask task, JobConf job, TypeDescription fileSchema) throws IOException {
    // We need to map with the current (i.e. current Hive table columns) full schema (without projections),
    // as OrcInputFormat will take care of the projections by the use of an include boolean array
    PartitionSpec spec = task.spec();
    Schema currentSchema = spec.schema();
    TypeDescription readOrcSchema;
    if (ORCSchemaUtil.hasIds(fileSchema)) {
        readOrcSchema = ORCSchemaUtil.buildOrcProjection(currentSchema, fileSchema);
    } else {
        Schema readSchemaForOriginalFile = currentSchema;
        // In case of migrated, originally partitioned tables, partition values are not present in the file
        if (spec.isPartitioned()) {
            readSchemaForOriginalFile = currentSchema.select(currentSchema.columns().stream()
                .filter(c -> !spec.identitySourceIds().contains(c.fieldId()))
                .map(c -> c.name())
                .collect(Collectors.toList()));
        }
        TypeDescription typeWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, MappingUtil.create(currentSchema));
        readOrcSchema = ORCSchemaUtil.buildOrcProjection(readSchemaForOriginalFile, typeWithIds);
    }
    job.set(ColumnProjectionUtils.ORC_SCHEMA_STRING, readOrcSchema.toString());
    // Predicate pushdown needs to be adjusted too in case of column renames; we let Iceberg generate the search argument and set it on the job
    if (task.residual() != null) {
        Expression boundFilter = Binder.bind(currentSchema.asStruct(), task.residual(), false);
        // Note the use of the unshaded version of this class here (required for SARG deserialization later)
        org.apache.hadoop.hive.ql.io.sarg.SearchArgument sarg = ExpressionToOrcSearchArgument.convert(boundFilter, readOrcSchema);
        if (sarg != null) {
            job.unset(TableScanDesc.FILTER_EXPR_CONF_STR);
            job.unset(ConvertAstToSearchArg.SARG_PUSHDOWN);
            job.set(ConvertAstToSearchArg.SARG_PUSHDOWN, ConvertAstToSearchArg.sargToKryo(sarg));
        }
    }
}
Also used: ConvertAstToSearchArg (org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg), TypeDescription (org.apache.hive.iceberg.org.apache.orc.TypeDescription), OrcTail (org.apache.hive.iceberg.org.apache.orc.impl.OrcTail), ColumnProjectionUtils (org.apache.hadoop.hive.serde2.ColumnProjectionUtils), LoggerFactory (org.slf4j.LoggerFactory), LlapProxy (org.apache.hadoop.hive.llap.io.api.LlapProxy), ByteBuffer (java.nio.ByteBuffer), TableScanDesc (org.apache.hadoop.hive.ql.plan.TableScanDesc), ReaderImpl (org.apache.hive.iceberg.org.apache.orc.impl.ReaderImpl), MappingUtil (org.apache.iceberg.mapping.MappingUtil), Expression (org.apache.iceberg.expressions.Expression), Path (org.apache.hadoop.fs.Path), SyntheticFileId (org.apache.hadoop.hive.ql.io.SyntheticFileId), FileScanTask (org.apache.iceberg.FileScanTask), BufferChunk (org.apache.orc.impl.BufferChunk), CacheTag (org.apache.hadoop.hive.common.io.CacheTag), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), Logger (org.slf4j.Logger), Binder (org.apache.iceberg.expressions.Binder), HiveConf (org.apache.hadoop.hive.conf.HiveConf), IOException (java.io.IOException), Schema (org.apache.iceberg.Schema), Collectors (java.util.stream.Collectors), LlapHiveUtils (org.apache.hadoop.hive.llap.LlapHiveUtils), JobConf (org.apache.hadoop.mapred.JobConf), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), PartitionSpec (org.apache.iceberg.PartitionSpec), InputFile (org.apache.iceberg.io.InputFile)
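
The predicate pushdown step above depends on Iceberg expression binding: Binder.bind resolves column names in an unbound expression to field ids in the given struct. A minimal sketch of that mechanism in isolation (the schema and predicate are illustrative):

import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.types.Types;

public class ResidualBindingSketch {
    public static void main(String[] args) {
        // Illustrative schema standing in for the current Hive table schema
        Schema schema = new Schema(
            Types.NestedField.required(1, "customer_id", Types.LongType.get()),
            Types.NestedField.optional(2, "last_name", Types.StringType.get()));

        // An unbound predicate references columns by name only
        Expression unbound = Expressions.equal("customer_id", 42L);

        // Binding resolves the name to a field id against the schema's struct,
        // case-insensitively (the 'false' flag), as handleIcebergProjection does
        Expression bound = Binder.bind(schema.asStruct(), unbound, false);
        System.out.println(bound);
    }
}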

Example 58 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

From the class TestHiveIcebergStorageHandlerLocalScan, method testCreatePartitionedTableByProperty.

@Test
public void testCreatePartitionedTableByProperty() throws IOException {
    TableIdentifier identifier = TableIdentifier.of("default", "customers");
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .identity("last_name")
        .build();
    Map<StructLike, List<Record>> data = ImmutableMap.of(
        Row.of("Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)),
        Row.of("Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)),
        Row.of("Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
    String createSql = "CREATE EXTERNAL TABLE " + identifier +
        " STORED BY ICEBERG " + testTables.locationForCreateTableSQL(identifier) +
        "TBLPROPERTIES ('" + InputFormatConfig.PARTITION_SPEC + "'='" + PartitionSpecParser.toJson(spec) + "', " +
        "'" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', " +
        "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')";
    runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data);
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), ArrayList (java.util.ArrayList), ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList), List (java.util.List), StructLike (org.apache.iceberg.StructLike), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
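
The CREATE TABLE statement above carries the spec and schema as JSON via PartitionSpecParser.toJson and SchemaParser.toJson. A small round-trip sketch of those parsers (illustrative schema; fromJson needs the schema to resolve the spec's source column ids):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.types.Types;

public class SpecJsonRoundTrip {
    public static void main(String[] args) {
        Schema schema = new Schema(
            Types.NestedField.required(1, "customer_id", Types.LongType.get()),
            Types.NestedField.optional(2, "last_name", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("last_name").build();

        // Serialize both to JSON, as the TBLPROPERTIES in the test do
        String schemaJson = SchemaParser.toJson(schema);
        String specJson = PartitionSpecParser.toJson(spec);

        // Parsing the spec back requires the schema to resolve source column ids
        Schema parsedSchema = SchemaParser.fromJson(schemaJson);
        PartitionSpec parsedSpec = PartitionSpecParser.fromJson(parsedSchema, specJson);
        System.out.println(parsedSpec.equals(spec)); // true
    }
}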

Example 59 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

From the class TestHiveIcebergStorageHandlerLocalScan, method testCreateTableWithColumnSpecificationMultilevelPartitioned.

@Test
public void testCreateTableWithColumnSpecificationMultilevelPartitioned() throws IOException {
    TableIdentifier identifier = TableIdentifier.of("default", "customers");
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .identity("first_name")
        .identity("last_name")
        .build();
    Map<StructLike, List<Record>> data = ImmutableMap.of(
        Row.of("Alice", "Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)),
        Row.of("Bob", "Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)),
        Row.of("Trudy", "Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
    String createSql = "CREATE EXTERNAL TABLE " + identifier + " (customer_id BIGINT) " +
        "PARTITIONED BY (first_name STRING COMMENT 'This is first name', " +
        "last_name STRING COMMENT 'This is last name') " +
        "STORED BY ICEBERG " + testTables.locationForCreateTableSQL(identifier) +
        testTables.propertiesForCreateTableSQL(ImmutableMap.of());
    runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data);
}
Also used: TableIdentifier (org.apache.iceberg.catalog.TableIdentifier), ArrayList (java.util.ArrayList), ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList), List (java.util.List), StructLike (org.apache.iceberg.StructLike), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
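
Each PARTITIONED BY column in the DDL above corresponds to an identity transform in the resulting spec, one PartitionField per column. A short sketch that inspects the fields of a spec built the same way (illustrative schema; PartitionSpec.fields and PartitionField are the Iceberg APIs involved):

import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class InspectSpecFields {
    public static void main(String[] args) {
        Schema schema = new Schema(
            Types.NestedField.required(1, "customer_id", Types.LongType.get()),
            Types.NestedField.optional(2, "first_name", Types.StringType.get()),
            Types.NestedField.optional(3, "last_name", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema)
            .identity("first_name")
            .identity("last_name")
            .build();

        // Each identity() call contributes one PartitionField; transform()
        // reports "identity" and sourceId() points back at the schema field
        for (PartitionField field : spec.fields()) {
            System.out.println(field.name() + " <- field id " + field.sourceId()
                + " via " + field.transform());
        }
    }
}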

Aggregations

PartitionSpec (org.apache.iceberg.PartitionSpec): 59
Test (org.junit.Test): 39
Table (org.apache.iceberg.Table): 38
Schema (org.apache.iceberg.Schema): 37
Record (org.apache.iceberg.data.Record): 19
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 18
List (java.util.List): 10
FileFormat (org.apache.iceberg.FileFormat): 9
ArrayList (java.util.ArrayList): 8
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 8
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 7
IOException (java.io.IOException): 6
UpdateSchema (org.apache.iceberg.UpdateSchema): 6
BaseTable (org.apache.iceberg.BaseTable): 5
Path (org.apache.hadoop.fs.Path): 4
PartitionField (org.apache.iceberg.PartitionField): 4
Types (org.apache.iceberg.types.Types): 4
HdfsContext (com.facebook.presto.hive.HdfsContext): 3
PrestoException (com.facebook.presto.spi.PrestoException): 3
Map (java.util.Map): 3