Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class TestHiveIcebergPartitions, method testIdentityPartitionedWrite.
@Test
public void testIdentityPartitionedWrite() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("customer_id")
      .build();
  List<Record> records = TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L);
  Table table = testTables.createTable(shell, "partitioned_customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      spec, fileFormat, records);
  HiveIcebergTestUtils.validateData(table, records, 0);
}
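For orientation, here is a minimal, self-contained sketch of the PartitionSpec builder API the test relies on, composing an identity transform with a time-derived one. The schema, class name, and column names are illustrative assumptions rather than the test's CUSTOMER_SCHEMA.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class PartitionSpecSketch {
  public static void main(String[] args) {
    // Illustrative schema; field ids and names are assumptions for this sketch
    Schema schema = new Schema(
        Types.NestedField.required(1, "customer_id", Types.LongType.get()),
        Types.NestedField.required(2, "created_at", Types.TimestampType.withZone()));
    PartitionSpec spec = PartitionSpec.builderFor(schema)
        .identity("customer_id") // partition by the raw column value, as in the test above
        .day("created_at")       // derive a date partition from a timestamp column
        .build();
    System.out.println(spec);    // lists the partition fields and their transforms
  }
}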
Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class VectorizedReadUtils, method handleIcebergProjection.
/**
* Adjusts the jobConf so that column reorders and renames that might have happened since this ORC file was written
* are properly mapped to the schema of the original file.
* @param task - Iceberg task - provides the partition spec and the residual filter expression
* @param job - JobConf instance to adjust
* @param fileSchema - ORC file schema of the input file
* @throws IOException - errors relating to accessing the ORC file
*/
public static void handleIcebergProjection(FileScanTask task, JobConf job, TypeDescription fileSchema)
    throws IOException {
  // We need to map with the current (i.e. current Hive table columns) full schema (without projections),
  // as OrcInputFormat will take care of the projections by the use of an include boolean array
  PartitionSpec spec = task.spec();
  Schema currentSchema = spec.schema();
  TypeDescription readOrcSchema;
  if (ORCSchemaUtil.hasIds(fileSchema)) {
    readOrcSchema = ORCSchemaUtil.buildOrcProjection(currentSchema, fileSchema);
  } else {
    Schema readSchemaForOriginalFile = currentSchema;
    // In case of migrated, originally partitioned tables, partition values are not present in the file
    if (spec.isPartitioned()) {
      readSchemaForOriginalFile = currentSchema.select(currentSchema.columns().stream()
          .filter(c -> !spec.identitySourceIds().contains(c.fieldId()))
          .map(c -> c.name())
          .collect(Collectors.toList()));
    }
    TypeDescription typeWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, MappingUtil.create(currentSchema));
    readOrcSchema = ORCSchemaUtil.buildOrcProjection(readSchemaForOriginalFile, typeWithIds);
  }
  job.set(ColumnProjectionUtils.ORC_SCHEMA_STRING, readOrcSchema.toString());
  // Predicate pushdown needs to be adjusted too in case of column renames; we let Iceberg generate it into the job
  if (task.residual() != null) {
    Expression boundFilter = Binder.bind(currentSchema.asStruct(), task.residual(), false);
    // Note the use of the unshaded version of this class here (required for SARG deserialization later)
    org.apache.hadoop.hive.ql.io.sarg.SearchArgument sarg =
        ExpressionToOrcSearchArgument.convert(boundFilter, readOrcSchema);
    if (sarg != null) {
      job.unset(TableScanDesc.FILTER_EXPR_CONF_STR);
      job.unset(ConvertAstToSearchArg.SARG_PUSHDOWN);
      job.set(ConvertAstToSearchArg.SARG_PUSHDOWN, ConvertAstToSearchArg.sargToKryo(sarg));
    }
  }
}
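The subtlest step above is the else-branch for files written without Iceberg column ids: migrated, identity-partitioned tables do not store the partition source columns in the data files, so those columns must be dropped from the read schema. Below is a minimal, self-contained sketch of just that filtering step; the schema, class name, and field ids are assumptions for illustration.

import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class ReadSchemaSketch {
  public static void main(String[] args) {
    // Illustrative schema: last_name is the identity partition source column
    Schema schema = new Schema(
        Types.NestedField.required(1, "customer_id", Types.LongType.get()),
        Types.NestedField.required(2, "last_name", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("last_name").build();
    // Keep only columns whose field id is not an identity partition source id
    List<String> dataColumns = schema.columns().stream()
        .filter(c -> !spec.identitySourceIds().contains(c.fieldId()))
        .map(c -> c.name())
        .collect(Collectors.toList());
    Schema readSchema = schema.select(dataColumns);
    System.out.println(readSchema); // only customer_id remains
  }
}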
Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class TestHiveIcebergStorageHandlerLocalScan, method testCreatePartitionedTableByProperty.
@Test
public void testCreatePartitionedTableByProperty() throws IOException {
  TableIdentifier identifier = TableIdentifier.of("default", "customers");
  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("last_name")
      .build();
  Map<StructLike, List<Record>> data = ImmutableMap.of(
      Row.of("Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)),
      Row.of("Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)),
      Row.of("Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
  String createSql = "CREATE EXTERNAL TABLE " + identifier + " STORED BY ICEBERG " +
      testTables.locationForCreateTableSQL(identifier) +
      "TBLPROPERTIES ('" + InputFormatConfig.PARTITION_SPEC + "'='" + PartitionSpecParser.toJson(spec) + "', " +
      "'" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', " +
      "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')";
  runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data);
}
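Since this test passes the spec through the InputFormatConfig.PARTITION_SPEC table property, it may help to see what PartitionSpecParser.toJson actually emits. A small sketch with an assumed schema; the exact source-id and field-id values depend on the schema and the JSON shown in the comment is indicative only.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class SpecJsonSketch {
  public static void main(String[] args) {
    // Illustrative schema standing in for CUSTOMER_SCHEMA
    Schema schema = new Schema(
        Types.NestedField.required(1, "customer_id", Types.LongType.get()),
        Types.NestedField.required(2, "last_name", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("last_name").build();
    // Emits JSON along the lines of:
    // {"spec-id":0,"fields":[{"name":"last_name","transform":"identity","source-id":2,"field-id":1000}]}
    System.out.println(PartitionSpecParser.toJson(spec));
  }
}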
Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class TestHiveIcebergStorageHandlerLocalScan, method testCreateTableWithColumnSpecificationMultilevelPartitioned.
@Test
public void testCreateTableWithColumnSpecificationMultilevelPartitioned() throws IOException {
  TableIdentifier identifier = TableIdentifier.of("default", "customers");
  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("first_name")
      .identity("last_name")
      .build();
  Map<StructLike, List<Record>> data = ImmutableMap.of(
      Row.of("Alice", "Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)),
      Row.of("Bob", "Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)),
      Row.of("Trudy", "Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
  String createSql = "CREATE EXTERNAL TABLE " + identifier + " (customer_id BIGINT) " +
      "PARTITIONED BY (first_name STRING COMMENT 'This is first name', " +
      "last_name STRING COMMENT 'This is last name') " +
      "STORED BY ICEBERG " +
      testTables.locationForCreateTableSQL(identifier) +
      testTables.propertiesForCreateTableSQL(ImmutableMap.of());
  runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data);
}
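This test expects each PARTITIONED BY column to surface as an identity transform in the derived spec, in declaration order. A minimal sketch of inspecting such a spec's fields, using an assumed schema and class name:

import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class SpecFieldsSketch {
  public static void main(String[] args) {
    // Illustrative schema; field ids are assumptions for this sketch
    Schema schema = new Schema(
        Types.NestedField.required(1, "customer_id", Types.LongType.get()),
        Types.NestedField.required(2, "first_name", Types.StringType.get()),
        Types.NestedField.required(3, "last_name", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema)
        .identity("first_name")
        .identity("last_name")
        .build();
    for (PartitionField field : spec.fields()) {
      // Prints e.g. "first_name: identity (source id 2)" then "last_name: identity (source id 3)"
      System.out.println(field.name() + ": " + field.transform() + " (source id " + field.sourceId() + ")");
    }
  }
}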