
Example 31 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class HiveIcebergMetaHook method preCreateTable.

@Override
public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
    this.catalogProperties = getCatalogProperties(hmsTable);
    // Set the table type even for non HiveCatalog based tables
    hmsTable.getParameters().put(BaseMetastoreTableOperations.TABLE_TYPE_PROP,
        BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase());
    if (!Catalogs.hiveCatalog(conf, catalogProperties)) {
        // For non-HiveCatalog tables too, we should set the input and output format
        // so that the table can be read by other engines like Impala
        hmsTable.getSd().setInputFormat(HiveIcebergInputFormat.class.getCanonicalName());
        hmsTable.getSd().setOutputFormat(HiveIcebergOutputFormat.class.getCanonicalName());
        // If not using HiveCatalog check for existing table
        try {
            this.icebergTable = IcebergTableUtil.getTable(conf, catalogProperties);
            Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.TABLE_SCHEMA) == null,
                "Iceberg table already created - can not use provided schema");
            Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.PARTITION_SPEC) == null,
                "Iceberg table already created - can not use provided partition specification");
            LOG.info("Iceberg table already exists {}", icebergTable);
            return;
        } catch (NoSuchTableException nte) {
            // If the table does not exist we will create it below
        }
    }
    // If the table does not exist collect data for table creation
    // - InputFormatConfig.TABLE_SCHEMA and InputFormatConfig.PARTITION_SPEC take precedence, so the user can
    //   override the Iceberg schema and specification generated by the code
    Schema schema = schema(catalogProperties, hmsTable);
    PartitionSpec spec = spec(conf, schema, hmsTable);
    // If there are partition keys specified remove them from the HMS table and add them to the column list
    if (hmsTable.isSetPartitionKeys()) {
        hmsTable.getSd().getCols().addAll(hmsTable.getPartitionKeys());
        hmsTable.setPartitionKeysIsSet(false);
    }
    catalogProperties.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
    catalogProperties.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
    setCommonHmsTablePropertiesForIceberg(hmsTable);
}
Also used : NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) UpdateSchema(org.apache.iceberg.UpdateSchema) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) PartitionSpec(org.apache.iceberg.PartitionSpec) UpdatePartitionSpec(org.apache.iceberg.UpdatePartitionSpec)
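
Note: preCreateTable serializes the resolved Schema and PartitionSpec to JSON before storing them in the catalog properties. The following is a minimal, self-contained sketch of that round-trip; it is not part of the Hive source, and the two-column schema and the "category" partition column are made-up examples.

// Sketch only: the JSON round-trip used to carry the schema and spec through catalogProperties.
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.types.Types;

public class SpecJsonRoundTrip {
    public static void main(String[] args) {
        Schema schema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "category", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("category").build();

        // Serialize, as preCreateTable does before handing the properties to Catalogs
        String schemaJson = SchemaParser.toJson(schema);
        String specJson = PartitionSpecParser.toJson(spec);

        // Later consumers rebuild the objects from the JSON strings
        Schema parsedSchema = SchemaParser.fromJson(schemaJson);
        PartitionSpec parsedSpec = PartitionSpecParser.fromJson(parsedSchema, specJson);
        System.out.println(parsedSpec);
    }
}

Carrying the schema and spec as JSON strings lets them travel through plain string properties such as the HMS table parameters and the job configuration.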

Example 32 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class HiveIcebergMetaHook method spec.

private static PartitionSpec spec(Configuration configuration, Schema schema, org.apache.hadoop.hive.metastore.api.Table hmsTable) {
    Preconditions.checkArgument(!hmsTable.isSetPartitionKeys() || hmsTable.getPartitionKeys().isEmpty(),
        "We can only handle non-partitioned Hive tables. The Iceberg schema should be in " +
            InputFormatConfig.PARTITION_SPEC + " or already converted to a partition transform ");
    PartitionSpec spec = IcebergTableUtil.spec(configuration, schema);
    if (spec != null) {
        Preconditions.checkArgument(hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) == null,
            "Provide only one of the following: Hive partition transform specification, or the " +
                InputFormatConfig.PARTITION_SPEC + " property");
        return spec;
    }
    if (hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) != null) {
        return PartitionSpecParser.fromJson(schema, hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC));
    } else {
        return PartitionSpec.unpartitioned();
    }
}
Also used : PartitionSpec(org.apache.iceberg.PartitionSpec) UpdatePartitionSpec(org.apache.iceberg.UpdatePartitionSpec)
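
Note: spec() resolves the partition spec from one of three sources. Below is a sketch of the three possible outcomes, built directly against a made-up schema; the column names and the day transform are assumptions for illustration, not taken from the Hive source.

// Sketch only: the three shapes spec() can return.
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class SpecVariants {
    public static void main(String[] args) {
        Schema schema = new Schema(
            Types.NestedField.required(1, "customer_id", Types.LongType.get()),
            Types.NestedField.required(2, "order_ts", Types.TimestampType.withZone()));

        // 1) A spec equivalent to one derived from a Hive partition transform specification
        PartitionSpec fromTransform = PartitionSpec.builderFor(schema).day("order_ts").build();

        // 2) A spec supplied explicitly through the InputFormatConfig.PARTITION_SPEC property
        //    (see the JSON round-trip sketch after Example 31)

        // 3) The fallback when neither is present
        PartitionSpec unpartitioned = PartitionSpec.unpartitioned();

        System.out.println(fromTransform + " / " + unpartitioned);
    }
}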

Example 33 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class HiveIcebergOutputFormat method writer.

private static HiveIcebergRecordWriter writer(JobConf jc) {
    TaskAttemptID taskAttemptID = TezUtil.taskAttemptWrapper(jc);
    // It gets the config from the FileSinkOperator which has its own config for every target table
    Table table = HiveIcebergStorageHandler.table(jc, jc.get(hive_metastoreConstants.META_TABLE_NAME));
    Schema schema = HiveIcebergStorageHandler.schema(jc);
    PartitionSpec spec = table.spec();
    FileFormat fileFormat = FileFormat.valueOf(PropertyUtil.propertyAsString(table.properties(),
        TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT)
        .toUpperCase(Locale.ENGLISH));
    long targetFileSize = PropertyUtil.propertyAsLong(table.properties(),
        TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
    FileIO io = table.io();
    int partitionId = taskAttemptID.getTaskID().getId();
    int taskId = taskAttemptID.getId();
    String operationId = jc.get(HiveConf.ConfVars.HIVEQUERYID.varname) + "-" + taskAttemptID.getJobID();
    OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
        .format(fileFormat)
        .operationId(operationId)
        .build();
    String tableName = jc.get(Catalogs.NAME);
    HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat, null, null, null, null);
    return new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf, outputFileFactory, io, targetFileSize, taskAttemptID, tableName);
}
Also used : OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) Table(org.apache.iceberg.Table) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) PartitionSpec(org.apache.iceberg.PartitionSpec) FileIO(org.apache.iceberg.io.FileIO)
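
Note: the record writer obtains data file locations from the OutputFileFactory configured above. The sketch below assumes an already-loaded Table; builderFor/format/operationId/build mirror the call in the method, while newOutputFile() and encryptingOutputFile() are the plain (unpartitioned) factory calls and are an assumption about the Iceberg io API rather than something shown in the Hive source.

// Sketch only: asking the factory for the next data file location.
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.io.OutputFileFactory;

public class OutputFileFactoryUsage {
    static String nextDataFileLocation(Table table, int partitionId, int taskId, String operationId) {
        OutputFileFactory factory = OutputFileFactory.builderFor(table, partitionId, taskId)
            .format(FileFormat.PARQUET)
            .operationId(operationId)
            .build();
        // The factory encodes partition, task and operation ids into the file name
        EncryptedOutputFile file = factory.newOutputFile();
        return file.encryptingOutputFile().location();
    }
}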

Example 34 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class HiveIcebergSerDe method createTableForCTAS.

private void createTableForCTAS(Configuration configuration, Properties serDeProperties) {
    serDeProperties.setProperty(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(tableSchema));
    // build partition spec, if any
    if (!getPartitionColumnNames().isEmpty()) {
        List<FieldSchema> partitionFields = IntStream.range(0, getPartitionColumnNames().size())
            .mapToObj(i -> new FieldSchema(getPartitionColumnNames().get(i),
                getPartitionColumnTypes().get(i).getTypeName(), null))
            .collect(Collectors.toList());
        PartitionSpec spec = HiveSchemaUtil.spec(tableSchema, partitionFields);
        serDeProperties.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
    }
    // clean up the properties for table creation (so that internal serde props don't become table props)
    Properties createProps = getCTASTableCreationProperties(serDeProperties);
    // create CTAS table
    LOG.info("Creating table {} for CTAS with schema: {}, and spec: {}", serDeProperties.get(Catalogs.NAME), tableSchema, serDeProperties.get(InputFormatConfig.PARTITION_SPEC));
    Catalogs.createTable(configuration, createProps);
    // set this in the query state so that we can rollback the table in the lifecycle hook in case of failures
    SessionStateUtil.addResource(configuration, InputFormatConfig.CTAS_TABLE_NAME, serDeProperties.getProperty(Catalogs.NAME));
}
Also used : IntStream(java.util.stream.IntStream) Arrays(java.util.Arrays) HiveSchemaUtil(org.apache.iceberg.hive.HiveSchemaUtil) Catalogs(org.apache.iceberg.mr.Catalogs) ColumnProjectionUtils(org.apache.hadoop.hive.serde2.ColumnProjectionUtils) LoggerFactory(org.slf4j.LoggerFactory) Writable(org.apache.hadoop.io.Writable) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) PartitionField(org.apache.iceberg.PartitionField) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) IcebergObjectInspector(org.apache.iceberg.mr.hive.serde.objectinspector.IcebergObjectInspector) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) Container(org.apache.iceberg.mr.mapred.Container) Nullable(javax.annotation.Nullable) Properties(java.util.Properties) Logger(org.slf4j.Logger) Table(org.apache.iceberg.Table) Collection(java.util.Collection) org.apache.hadoop.hive.serde.serdeConstants(org.apache.hadoop.hive.serde.serdeConstants) InputFormatConfig(org.apache.iceberg.mr.InputFormatConfig) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) PartitionSpecParser(org.apache.iceberg.PartitionSpecParser) SchemaParser(org.apache.iceberg.SchemaParser) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) List(java.util.List) Record(org.apache.iceberg.data.Record) SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) SessionStateUtil(org.apache.hadoop.hive.ql.session.SessionStateUtil) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) org.apache.hadoop.hive.metastore.api.hive_metastoreConstants(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Properties(java.util.Properties) TableProperties(org.apache.iceberg.TableProperties) PartitionSpec(org.apache.iceberg.PartitionSpec)
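
Note: for CTAS, the Hive partition columns are converted to an Iceberg spec via HiveSchemaUtil.spec, as in the method above. Below is a minimal sketch with a made-up "dept" partition column; it assumes HiveSchemaUtil.spec yields identity partitions for the listed columns, which is how it is used here.

// Sketch only: turning Hive partition columns into an Iceberg spec the way createTableForCTAS does.
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.hive.HiveSchemaUtil;
import org.apache.iceberg.types.Types;

public class CtasSpecSketch {
    public static void main(String[] args) {
        Schema tableSchema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.required(2, "dept", Types.StringType.get()));
        List<FieldSchema> partitionFields =
            Collections.singletonList(new FieldSchema("dept", "string", null));
        PartitionSpec spec = HiveSchemaUtil.spec(tableSchema, partitionFields);
        System.out.println(spec);
    }
}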

Example 35 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project hive by apache.

the class HiveVectorizedReader method reader.

public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant, TaskAttemptContext context) {
    // Tweaks on jobConf here are relevant for this task only, so we need to copy it first,
    // as the context's conf is reused.
    JobConf job = new JobConf((JobConf) context.getConfiguration());
    Path path = new Path(inputFile.location());
    FileFormat format = task.file().format();
    Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();
    // Hive by default requires partition columns to be read too. This is not required for identity partition
    // columns, as we will add this as constants later.
    int[] partitionColIndices = null;
    Object[] partitionValues = null;
    PartitionSpec partitionSpec = task.spec();
    List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);
    if (!partitionSpec.isUnpartitioned()) {
        List<PartitionField> fields = partitionSpec.fields();
        List<Integer> partitionColIndicesList = Lists.newLinkedList();
        List<Object> partitionValuesList = Lists.newLinkedList();
        for (PartitionField partitionField : fields) {
            if (partitionField.transform().isIdentity()) {
                // Get columns in read schema order (which matches those of readColumnIds) to find partition column indices
                List<Types.NestedField> columns = task.spec().schema().columns();
                for (int colIdx = 0; colIdx < columns.size(); ++colIdx) {
                    if (columns.get(colIdx).fieldId() == partitionField.sourceId()) {
                        // Skip reading identity partition columns from source file...
                        readColumnIds.remove((Integer) colIdx);
                        // ...and use the corresponding constant value instead
                        partitionColIndicesList.add(colIdx);
                        partitionValuesList.add(idToConstant.get(partitionField.sourceId()));
                        break;
                    }
                }
            }
        }
        partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
        partitionValues = partitionValuesList.toArray(new Object[0]);
        ColumnProjectionUtils.setReadColumns(job, readColumnIds);
    }
    try {
        long start = task.start();
        long length = task.length();
        // TODO: Iceberg currently does not track the last modification time of a file. Until that's added,
        // we need to set Long.MIN_VALUE as last modification time in the fileId triplet.
        SyntheticFileId fileId = new SyntheticFileId(path, task.file().fileSizeInBytes(), Long.MIN_VALUE);
        RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
        switch(format) {
            case ORC:
                recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId);
                break;
            case PARQUET:
                recordReader = parquetRecordReader(job, reporter, task, path, start, length);
                break;
            default:
                throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
        }
        return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
    } catch (IOException ioe) {
        throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
    }
}
Also used : SyntheticFileId(org.apache.hadoop.hive.ql.io.SyntheticFileId) FileFormat(org.apache.iceberg.FileFormat) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) PartitionField(org.apache.iceberg.PartitionField) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) Reporter(org.apache.hadoop.mapred.Reporter) IOException(java.io.IOException) PartitionSpec(org.apache.iceberg.PartitionSpec) NullWritable(org.apache.hadoop.io.NullWritable)
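
Note: the sketch below illustrates why identity partition columns can be skipped when reading the data file and filled in from constants instead, the same check the method above performs on task.spec(). The schema and spec here are illustrative, not taken from the Hive source.

// Sketch only: finding the read-schema indices of identity-partitioned columns.
import java.util.List;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class IdentityPartitionColumns {
    public static void main(String[] args) {
        Schema schema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.required(2, "dept", Types.StringType.get()),
            Types.NestedField.required(3, "ts", Types.TimestampType.withZone()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dept").day("ts").build();

        List<Types.NestedField> columns = spec.schema().columns();
        for (PartitionField field : spec.fields()) {
            if (field.transform().isIdentity()) {
                // Identity transforms keep the source value as-is, so the reader can take the value
                // from the partition data (idToConstant) instead of decoding the column from the file.
                for (int colIdx = 0; colIdx < columns.size(); ++colIdx) {
                    if (columns.get(colIdx).fieldId() == field.sourceId()) {
                        System.out.println("constant column: " + columns.get(colIdx).name() + " at index " + colIdx);
                    }
                }
            }
        }
    }
}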

Aggregations

PartitionSpec (org.apache.iceberg.PartitionSpec): 63
Table (org.apache.iceberg.Table): 40
Test (org.junit.Test): 39
Schema (org.apache.iceberg.Schema): 38
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 19
Record (org.apache.iceberg.data.Record): 19
List (java.util.List): 10
ArrayList (java.util.ArrayList): 9
FileFormat (org.apache.iceberg.FileFormat): 9
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 8
IOException (java.io.IOException): 7
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 7
UpdateSchema (org.apache.iceberg.UpdateSchema): 6
Path (org.apache.hadoop.fs.Path): 5
BaseTable (org.apache.iceberg.BaseTable): 5
DataFile (org.apache.iceberg.DataFile): 5
PartitionField (org.apache.iceberg.PartitionField): 4
Types (org.apache.iceberg.types.Types): 4
HdfsContext (com.facebook.presto.hive.HdfsContext): 3
PrestoException (com.facebook.presto.spi.PrestoException): 3