Search in sources :

Example 61 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project incubator-gobblin by apache.

The method createTable of the class IcebergMetadataWriter.

/**
 * Creates the Iceberg table matching the Hive table described by {@code spec}.
 *
 * <p>The Iceberg table schema and partition spec are derived from the schema string carried by
 * the GMCE together with the Hive table. When {@code useDataLocationAsTableLocation} is set, the
 * table location is the dataset's native name followed by the formatted
 * {@code TABLE_LOCATION_SUFFIX}, and that directory is created with the configured
 * {@code permission} before the catalog is asked to create the table. Otherwise a {@code null}
 * location is handed to the catalog.
 *
 * @param gmce the change event supplying the table schema and the dataset identifier
 * @param spec the Hive spec whose table is converted into the Iceberg table definition
 * @return the newly created table, or {@code null} when the catalog reports that the table
 *         already exists
 * @throws IOException if one of the underlying helper calls fails with an I/O error
 * @throws IllegalStateException if no table schema could be derived
 */
protected Table createTable(GobblinMetadataChangeEvent gmce, HiveSpec spec) throws IOException {
    final String schemaLiteral = gmce.getTableSchema();
    final org.apache.hadoop.hive.metastore.api.Table hiveTable = HiveMetaStoreUtils.getTable(spec.getTable());
    final IcebergUtils.IcebergDataAndPartitionSchema icebergSchemas = IcebergUtils.getIcebergSchema(schemaLiteral, hiveTable);
    final TableIdentifier tid = TableIdentifier.of(hiveTable.getDbName(), hiveTable.getTableName());

    final Schema icebergSchema = icebergSchemas.tableSchema;
    Preconditions.checkState(icebergSchema != null, "Table schema cannot be null when creating a table");
    final PartitionSpec spec0 = IcebergUtils.getPartitionSpec(icebergSchema, icebergSchemas.partitionSchema);

    // Stays null unless the data location doubles as the table location.
    String location = null;
    if (useDataLocationAsTableLocation) {
        location = gmce.getDatasetIdentifier().getNativeName() + String.format(TABLE_LOCATION_SUFFIX, hiveTable.getDbName());
        // Make sure the directory exists with the expected permission before the table is created.
        final Path locationPath = new Path(location);
        WriterUtils.mkdirsWithRecursivePermission(locationPath.getFileSystem(conf), locationPath, permission);
    }

    // The timer context is closed by try-with-resources, so the elapsed time is recorded
    // whether or not the creation succeeds.
    try (Timer.Context context = metricContext.timer(CREATE_TABLE_TIME).time()) {
        final Table created = catalog.createTable(tid, icebergSchema, spec0, location, IcebergUtils.getTableProperties(hiveTable));
        log.info("Created table {}, schema: {} partition spec: {}", tid, icebergSchema, spec0);
        return created;
    } catch (AlreadyExistsException e) {
        // Not treated as a failure: the table is simply not returned from this call.
        log.warn("table {} already exist, there may be some other process try to create table concurrently", tid);
        return null;
    }
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Path(org.apache.hadoop.fs.Path) Table(org.apache.iceberg.Table) AlreadyExistsException(org.apache.iceberg.exceptions.AlreadyExistsException) IcebergUtils(org.apache.gobblin.iceberg.Utils.IcebergUtils) Schema(org.apache.iceberg.Schema) PartitionSpec(org.apache.iceberg.PartitionSpec) Timer(com.codahale.metrics.Timer)

Example 62 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project incubator-gobblin by apache.

The method dropFiles of the class IcebergMetadataWriter.

/**
 * Handles the two kinds of removal triggered by a GMCE:
 * <ol>
 *   <li>regular file deletions, which are only accumulated into the table metadata's
 *       {@link DeleteFiles} here (aggregation, no commit);</li>
 *   <li>expiration of older snapshots, which is submitted to {@code parallelRunner} and
 *       committed from within the submitted task.</li>
 * </ol>
 *
 * @param gmce          the change event describing the files to drop
 * @param oldSpecsMap   Hive specs passed through to the lookup of the data files to delete
 * @param table         the Iceberg table being modified
 * @param tableMetadata holder of the pending {@link DeleteFiles} operation for the table
 * @param tid           identifier of the table, used for logging and as the task's metadata
 * @throws IOException declared by the signature; presumably raised while resolving the data
 *                     files to delete — confirm against getIcebergDataFilesToBeDeleted
 */
protected void dropFiles(GobblinMetadataChangeEvent gmce, Map<String, Collection<HiveSpec>> oldSpecsMap, Table table, TableMetadata tableMetadata, TableIdentifier tid) throws IOException {
    final PartitionSpec spec = table.spec();

    // Regular file deletion: queue every resolved data file on the pending DeleteFiles operation.
    final DeleteFiles pendingDeletes = tableMetadata.getOrInitDeleteFiles();
    final Set<DataFile> staleFiles = getIcebergDataFilesToBeDeleted(gmce, table, new HashMap<>(), oldSpecsMap, spec);
    staleFiles.forEach(pendingDeletes::deleteFile);

    // Snapshot expiration: drop snapshots that fall outside the look-back allowance for
    // time-travel, and commit that update in one go from the submitted task.
    parallelRunner.submitCallable(() -> {
        try {
            final long olderThan = getExpireSnapshotTime();
            final long startMillis = System.currentTimeMillis();
            final ExpireSnapshots expiration = table.expireSnapshots();
            expiration
                .deleteWith(file -> {
                    // Only files whose path starts with the table location are physically removed.
                    if (file.startsWith(table.location())) {
                        table.io().deleteFile(file);
                    }
                })
                .expireOlderThan(olderThan)
                .commit();
            // TODO: emit these metrics to Ingraphs, in addition to metrics for publishing new snapshots and other Iceberg metadata operations.
            log.info("Spent {} ms to expire snapshots older than {} ({}) in table {}", System.currentTimeMillis() - startMillis, new DateTime(olderThan).toString(), olderThan, tid.toString());
        } catch (Exception e) {
            // Best effort: a failed expiration is logged and does not fail the submitted task.
            log.error(String.format("Fail to expire snapshots for table %s due to exception ", tid.toString()), e);
        }
        return null;
    }, tid.toString());
}
Also used : Table(org.apache.iceberg.Table) DeleteFiles(org.apache.iceberg.DeleteFiles) PartitionSpec(org.apache.iceberg.PartitionSpec) AlreadyExistsException(org.apache.iceberg.exceptions.AlreadyExistsException) SchemaRegistryException(org.apache.gobblin.metrics.kafka.SchemaRegistryException) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) IOException(java.io.IOException) ZonedDateTime(java.time.ZonedDateTime) DateTime(org.joda.time.DateTime) DataFile(org.apache.iceberg.DataFile) Consumer(java.util.function.Consumer) ExpireSnapshots(org.apache.iceberg.ExpireSnapshots)

Example 63 with PartitionSpec

use of org.apache.iceberg.PartitionSpec in project incubator-gobblin by apache.

The method addLatePartitionValueToIcebergTable of the class IcebergMetadataWriter.

/**
 * Builds the Iceberg partition value for a file, extended with the "late" partition column.
 *
 * <ol>
 *   <li>Adds the "late" partition column ({@code newPartitionColumn}) to the Iceberg table if it
 *       does not exist yet.</li>
 *   <li>Computes the "late" value from {@code datepartition} and the previous completion
 *       watermark.</li>
 *   <li>Falls back to {@code late=0} when the completion watermark check is disabled.</li>
 * </ol>
 *
 * @param table         the Iceberg table to which the partition column is added
 * @param tableMetadata source of the completeness flag and the previous completeness watermark
 * @param hivePartition Hive partition whose values form the leading partition values
 * @param datepartition date partition used to decide whether the data is late
 * @return new iceberg partition value for file
 */
private StructLike addLatePartitionValueToIcebergTable(Table table, TableMetadata tableMetadata, HivePartition hivePartition, String datepartition) {
    // Work with the table returned by the helper so the partition type below comes from its spec.
    final Table tableWithLateColumn = addPartitionToIcebergTable(table, newPartitionColumn, newPartitionColumnType);
    final PartitionSpec spec = tableWithLateColumn.spec();

    final long previousWatermark = tableMetadata.prevCompletenessWatermark;
    final int lateValue;
    if (tableMetadata.completenessEnabled) {
        lateValue = isLate(datepartition, previousWatermark);
    } else {
        lateValue = 0;
    }

    // The "late" value is appended after the Hive partition values.
    final List<String> values = new ArrayList<>(hivePartition.getValues());
    values.add(String.valueOf(lateValue));
    return IcebergUtils.getPartition(spec.partitionType(), values);
}
Also used : ArrayList(java.util.ArrayList) PartitionSpec(org.apache.iceberg.PartitionSpec)

Aggregations

PartitionSpec (org.apache.iceberg.PartitionSpec)63 Table (org.apache.iceberg.Table)40 Test (org.junit.Test)39 Schema (org.apache.iceberg.Schema)38 TableIdentifier (org.apache.iceberg.catalog.TableIdentifier)19 Record (org.apache.iceberg.data.Record)19 List (java.util.List)10 ArrayList (java.util.ArrayList)9 FileFormat (org.apache.iceberg.FileFormat)9 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)8 IOException (java.io.IOException)7 ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList)7 UpdateSchema (org.apache.iceberg.UpdateSchema)6 Path (org.apache.hadoop.fs.Path)5 BaseTable (org.apache.iceberg.BaseTable)5 DataFile (org.apache.iceberg.DataFile)5 PartitionField (org.apache.iceberg.PartitionField)4 Types (org.apache.iceberg.types.Types)4 HdfsContext (com.facebook.presto.hive.HdfsContext)3 PrestoException (com.facebook.presto.spi.PrestoException)3