Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class HiveIcebergMetaHook, method preCreateTable.
@Override
public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
  this.catalogProperties = getCatalogProperties(hmsTable);
  // Set the table type even for non HiveCatalog based tables
  hmsTable.getParameters().put(BaseMetastoreTableOperations.TABLE_TYPE_PROP,
      BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase());
  if (!Catalogs.hiveCatalog(conf, catalogProperties)) {
    // For non-HiveCatalog tables too, we should set the input and output format
    // so that the table can be read by other engines like Impala
    hmsTable.getSd().setInputFormat(HiveIcebergInputFormat.class.getCanonicalName());
    hmsTable.getSd().setOutputFormat(HiveIcebergOutputFormat.class.getCanonicalName());
    // If not using HiveCatalog check for existing table
    try {
      this.icebergTable = IcebergTableUtil.getTable(conf, catalogProperties);
      Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.TABLE_SCHEMA) == null,
          "Iceberg table already created - can not use provided schema");
      Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.PARTITION_SPEC) == null,
          "Iceberg table already created - can not use provided partition specification");
      LOG.info("Iceberg table already exists {}", icebergTable);
      return;
    } catch (NoSuchTableException nte) {
      // If the table does not exist we will create it below
    }
  }
  // If the table does not exist collect data for table creation
  // - InputFormatConfig.TABLE_SCHEMA, InputFormatConfig.PARTITION_SPEC take precedence so the user can override the
  // Iceberg schema and specification generated by the code
  Schema schema = schema(catalogProperties, hmsTable);
  PartitionSpec spec = spec(conf, schema, hmsTable);
  // If there are partition keys specified remove them from the HMS table and add them to the column list
  if (hmsTable.isSetPartitionKeys()) {
    hmsTable.getSd().getCols().addAll(hmsTable.getPartitionKeys());
    hmsTable.setPartitionKeysIsSet(false);
  }
  catalogProperties.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
  catalogProperties.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
  setCommonHmsTablePropertiesForIceberg(hmsTable);
}
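The part of preCreateTable that touches PartitionSpec is the final step: the generated (or user-supplied) schema and spec are serialized to JSON and stored in the catalog properties that Catalogs.createTable later consumes. Below is a minimal, self-contained sketch of that serialization; the schema, column ids and partition layout are made up for illustration and are not anything Hive produces.

import java.util.Properties;

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.types.Types;

public class CatalogPropertiesSketch {
  public static Properties creationProperties() {
    // Illustrative schema; in the hook it comes from schema(catalogProperties, hmsTable)
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "event_ts", Types.TimestampType.withZone()),
        Types.NestedField.optional(3, "category", Types.StringType.get()));
    // Illustrative spec; in the hook it comes from spec(conf, schema, hmsTable)
    PartitionSpec spec = PartitionSpec.builderFor(schema)
        .identity("category")
        .day("event_ts")
        .build();
    Properties props = new Properties();
    // Both values are stored as JSON strings, mirroring catalogProperties above
    props.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
    props.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
    return props;
  }
}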
Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class HiveIcebergMetaHook, method spec.
private static PartitionSpec spec(Configuration configuration, Schema schema,
    org.apache.hadoop.hive.metastore.api.Table hmsTable) {
  Preconditions.checkArgument(!hmsTable.isSetPartitionKeys() || hmsTable.getPartitionKeys().isEmpty(),
      "We can only handle non-partitioned Hive tables. The Iceberg schema should be in " +
          InputFormatConfig.PARTITION_SPEC + " or already converted to a partition transform ");
  PartitionSpec spec = IcebergTableUtil.spec(configuration, schema);
  if (spec != null) {
    Preconditions.checkArgument(hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) == null,
        "Provide only one of the following: Hive partition transform specification, or the " +
            InputFormatConfig.PARTITION_SPEC + " property");
    return spec;
  }
  if (hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) != null) {
    return PartitionSpecParser.fromJson(schema, hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC));
  } else {
    return PartitionSpec.unpartitioned();
  }
}
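The method establishes a precedence order: a Hive partition transform specification (resolved by IcebergTableUtil.spec) wins, then a user-provided InputFormatConfig.PARTITION_SPEC JSON property, and finally the table falls back to unpartitioned. A small sketch of the JSON round-trip behind the second branch, using an illustrative schema:

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class PartitionSpecJsonRoundTrip {
  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "country", Types.StringType.get()));
    PartitionSpec original = PartitionSpec.builderFor(schema).identity("country").build();

    // The kind of JSON a user could set as the InputFormatConfig.PARTITION_SPEC table property...
    String json = PartitionSpecParser.toJson(original);
    // ...and what spec() reconstructs from it when no Hive partition transform is present
    PartitionSpec parsed = PartitionSpecParser.fromJson(schema, json);
    System.out.println(parsed);

    // With neither source present, spec() returns an unpartitioned spec
    System.out.println(PartitionSpec.unpartitioned().isUnpartitioned()); // true
  }
}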
Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class HiveIcebergOutputFormat, method writer.
private static HiveIcebergRecordWriter writer(JobConf jc) {
  TaskAttemptID taskAttemptID = TezUtil.taskAttemptWrapper(jc);
  // It gets the config from the FileSinkOperator which has its own config for every target table
  Table table = HiveIcebergStorageHandler.table(jc, jc.get(hive_metastoreConstants.META_TABLE_NAME));
  Schema schema = HiveIcebergStorageHandler.schema(jc);
  PartitionSpec spec = table.spec();
  FileFormat fileFormat = FileFormat.valueOf(PropertyUtil.propertyAsString(table.properties(),
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT).toUpperCase(Locale.ENGLISH));
  long targetFileSize = PropertyUtil.propertyAsLong(table.properties(),
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  FileIO io = table.io();
  int partitionId = taskAttemptID.getTaskID().getId();
  int taskId = taskAttemptID.getId();
  String operationId = jc.get(HiveConf.ConfVars.HIVEQUERYID.varname) + "-" + taskAttemptID.getJobID();
  OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
      .format(fileFormat)
      .operationId(operationId)
      .build();
  String tableName = jc.get(Catalogs.NAME);
  HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat,
      null, null, null, null);
  return new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf, outputFileFactory, io,
      targetFileSize, taskAttemptID, tableName);
}
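Here the PartitionSpec is only fetched (table.spec()) and handed to HiveIcebergRecordWriter; the actual routing of rows to partitions happens inside the writer via Iceberg's PartitionKey. The following stand-alone sketch shows that core mechanism with a made-up, identity-partitioned schema; it is not HiveIcebergRecordWriter's code, just the Iceberg API it builds on.

import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.types.Types;

public class PartitionKeySketch {
  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.required(2, "country", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("country").build();

    GenericRecord record = GenericRecord.create(schema);
    record.setField("id", 42L);
    record.setField("country", "HU");

    // Evaluate the partition transforms against the row
    PartitionKey key = new PartitionKey(spec, schema);
    key.partition(record);

    // Render the partition tuple as a path fragment, e.g. "country=HU"
    System.out.println(spec.partitionToPath(key));
  }
}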
Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class HiveIcebergSerDe, method createTableForCTAS.
private void createTableForCTAS(Configuration configuration, Properties serDeProperties) {
  serDeProperties.setProperty(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(tableSchema));
  // build partition spec, if any
  if (!getPartitionColumnNames().isEmpty()) {
    List<FieldSchema> partitionFields = IntStream.range(0, getPartitionColumnNames().size())
        .mapToObj(i -> new FieldSchema(getPartitionColumnNames().get(i),
            getPartitionColumnTypes().get(i).getTypeName(), null))
        .collect(Collectors.toList());
    PartitionSpec spec = HiveSchemaUtil.spec(tableSchema, partitionFields);
    serDeProperties.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
  }
  // clean up the properties for table creation (so that internal serde props don't become table props)
  Properties createProps = getCTASTableCreationProperties(serDeProperties);
  // create CTAS table
  LOG.info("Creating table {} for CTAS with schema: {}, and spec: {}",
      serDeProperties.get(Catalogs.NAME), tableSchema, serDeProperties.get(InputFormatConfig.PARTITION_SPEC));
  Catalogs.createTable(configuration, createProps);
  // set this in the query state so that we can rollback the table in the lifecycle hook in case of failures
  SessionStateUtil.addResource(configuration, InputFormatConfig.CTAS_TABLE_NAME,
      serDeProperties.getProperty(Catalogs.NAME));
}
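HiveSchemaUtil.spec converts the CTAS partition columns into an Iceberg PartitionSpec before it is serialized into the serde properties. For the common case, where each plain PARTITIONED BY column maps to an identity transform, the result should be equivalent to the sketch below; the schema and column names are illustrative, and transform-based partition specifications are not covered here.

import java.util.Arrays;
import java.util.List;

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class CtasSpecSketch {
  public static void main(String[] args) {
    Schema tableSchema = new Schema(
        Types.NestedField.optional(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "country", Types.StringType.get()));
    List<String> partitionColumnNames = Arrays.asList("country");

    // Map each Hive partition column to an identity transform on the same column
    PartitionSpec.Builder builder = PartitionSpec.builderFor(tableSchema);
    partitionColumnNames.forEach(builder::identity);
    PartitionSpec spec = builder.build();

    // This JSON is what ends up under InputFormatConfig.PARTITION_SPEC in the serde properties
    System.out.println(PartitionSpecParser.toJson(spec));
  }
}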
Use of org.apache.iceberg.PartitionSpec in project hive by apache.
The class HiveVectorizedReader, method reader.
public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task,
    Map<Integer, ?> idToConstant, TaskAttemptContext context) {
  // Tweaks on jobConf here are relevant for this task only, so we need to copy it first as context's conf is reused.
  JobConf job = new JobConf((JobConf) context.getConfiguration());
  Path path = new Path(inputFile.location());
  FileFormat format = task.file().format();
  Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();
  // Hive by default requires partition columns to be read too. This is not required for identity partition
  // columns, as we will add these as constants later.
  int[] partitionColIndices = null;
  Object[] partitionValues = null;
  PartitionSpec partitionSpec = task.spec();
  List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);
  if (!partitionSpec.isUnpartitioned()) {
    List<PartitionField> fields = partitionSpec.fields();
    List<Integer> partitionColIndicesList = Lists.newLinkedList();
    List<Object> partitionValuesList = Lists.newLinkedList();
    for (PartitionField partitionField : fields) {
      if (partitionField.transform().isIdentity()) {
        // Get columns in read schema order (which matches those of readColumnIds) to find partition column indices
        List<Types.NestedField> columns = task.spec().schema().columns();
        for (int colIdx = 0; colIdx < columns.size(); ++colIdx) {
          if (columns.get(colIdx).fieldId() == partitionField.sourceId()) {
            // Skip reading identity partition columns from source file...
            readColumnIds.remove((Integer) colIdx);
            // ...and use the corresponding constant value instead
            partitionColIndicesList.add(colIdx);
            partitionValuesList.add(idToConstant.get(partitionField.sourceId()));
            break;
          }
        }
      }
    }
    partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
    partitionValues = partitionValuesList.toArray(new Object[0]);
    ColumnProjectionUtils.setReadColumns(job, readColumnIds);
  }
  try {
    long start = task.start();
    long length = task.length();
    // TODO: Iceberg currently does not track the last modification time of a file. Until that's added,
    // we need to set Long.MIN_VALUE as last modification time in the fileId triplet.
    SyntheticFileId fileId = new SyntheticFileId(path, task.file().fileSizeInBytes(), Long.MIN_VALUE);
    RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
    switch (format) {
      case ORC:
        recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId);
        break;
      case PARQUET:
        recordReader = parquetRecordReader(job, reporter, task, path, start, length);
        break;
      default:
        throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
    }
    return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
  } catch (IOException ioe) {
    throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
  }
}
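The loop over partitionSpec.fields() is the interesting PartitionSpec usage here: for every identity partition field the reader drops the source column from the read column ids and remembers a (column index, constant value) pair instead. Below is a condensed sketch of just that bookkeeping, with an illustrative schema, spec and constants map standing in for what Hive actually passes.

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class IdentityPartitionConstantsSketch {
  /** Maps read-schema column index -> constant value for identity partition columns. */
  static Map<Integer, Object> identityConstants(PartitionSpec spec, Map<Integer, ?> idToConstant) {
    Map<Integer, Object> result = new LinkedHashMap<>();
    List<Types.NestedField> columns = spec.schema().columns();
    for (PartitionField field : spec.fields()) {
      if (!field.transform().isIdentity()) {
        continue; // non-identity transforms still require reading the source column
      }
      for (int colIdx = 0; colIdx < columns.size(); colIdx++) {
        if (columns.get(colIdx).fieldId() == field.sourceId()) {
          result.put(colIdx, idToConstant.get(field.sourceId()));
          break;
        }
      }
    }
    return result;
  }

  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.required(2, "country", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("country").build();
    System.out.println(identityConstants(spec, Collections.singletonMap(2, "HU"))); // {1=HU}
  }
}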