
Example 6 with Table

Use of org.apache.spark.sql.connector.catalog.Table in project OpenLineage by OpenLineage.

From the class AlterTableDatasetBuilder, method apply:

@Override
public List<OpenLineage.OutputDataset> apply(AlterTable alterTable) {
    TableCatalog tableCatalog = alterTable.catalog();
    Table table;
    try {
        table = tableCatalog.loadTable(alterTable.ident());
    } catch (Exception e) {
        // the table may have been dropped concurrently; report no output datasets rather than fail
        return Collections.emptyList();
    }
    Optional<DatasetIdentifier> di =
        PlanUtils3.getDatasetIdentifier(context, tableCatalog, alterTable.ident(), table.properties());
    if (di.isPresent()) {
        OpenLineage openLineage = context.getOpenLineage();
        OpenLineage.DatasetFacetsBuilder builder =
            openLineage
                .newDatasetFacetsBuilder()
                .schema(PlanUtils.schemaFacet(openLineage, table.schema()))
                .dataSource(PlanUtils.datasourceFacet(openLineage, di.get().getNamespace()));
        // attach a dataset version facet when the catalog exposes one (e.g. an Iceberg snapshot)
        Optional<String> datasetVersion =
            CatalogUtils3.getDatasetVersion(tableCatalog, alterTable.ident(), table.properties());
        datasetVersion.ifPresent(
            version -> builder.version(openLineage.newDatasetVersionDatasetFacet(version)));
        return Collections.singletonList(
            outputDataset().getDataset(di.get().getName(), di.get().getNamespace(), builder.build()));
    } else {
        return Collections.emptyList();
    }
}
Also used: AlterTable(org.apache.spark.sql.catalyst.plans.logical.AlterTable) Table(org.apache.spark.sql.connector.catalog.Table) DatasetIdentifier(io.openlineage.spark.agent.util.DatasetIdentifier) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) OpenLineage(io.openlineage.client.OpenLineage)
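
The snippet leans on the OpenLineage client's generated builder API. Below is a minimal sketch of the same facet-building calls used standalone, assuming the client's single-argument URI constructor; the producer URI and the version string "42" are made up for illustration:

import io.openlineage.client.OpenLineage;
import java.net.URI;

// hypothetical producer URI identifying the integration that emits events
OpenLineage openLineage = new OpenLineage(URI.create("https://example.com/openlineage-demo"));
OpenLineage.DatasetFacetsBuilder builder = openLineage.newDatasetFacetsBuilder();
// same call the builder above makes when the catalog reports a dataset version
builder.version(openLineage.newDatasetVersionDatasetFacet("42")); // "42" is a made-up version
OpenLineage.DatasetFacets facets = builder.build();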

Example 7 with Table

Use of org.apache.spark.sql.connector.catalog.Table in project iceberg by apache.

From the class Spark3Util, method loadIcebergTable:

/**
 * Returns an Iceberg Table by its name from a Spark V2 Catalog. If caching is enabled in {@link SparkCatalog},
 * the {@link TableOperations} of the table may be stale; refresh the table to get the latest state.
 *
 * @param spark SparkSession used for looking up catalog references and tables
 * @param name  the multipart identifier of the Iceberg table
 * @return an Iceberg table
 */
public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, String name) throws ParseException, NoSuchTableException {
    CatalogAndIdentifier catalogAndIdentifier = catalogAndIdentifier(spark, name);
    TableCatalog catalog = asTableCatalog(catalogAndIdentifier.catalog);
    Table sparkTable = catalog.loadTable(catalogAndIdentifier.identifier);
    return toIcebergTable(sparkTable);
}
Also used: SparkTable(org.apache.iceberg.spark.source.SparkTable) Table(org.apache.spark.sql.connector.catalog.Table) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog)
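
A short call-site sketch; the "my_catalog.db.events" identifier is made up, and SparkSession.active() stands in for however the session is obtained:

import org.apache.iceberg.spark.Spark3Util;
import org.apache.spark.sql.SparkSession;

SparkSession spark = SparkSession.active();
// hypothetical multipart identifier: catalog.namespace.table
org.apache.iceberg.Table table = Spark3Util.loadIcebergTable(spark, "my_catalog.db.events");
// per the javadoc, refresh when SparkCatalog caching may hand back stale TableOperations
table.refresh();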

Example 8 with Table

Use of org.apache.spark.sql.connector.catalog.Table in project iceberg by apache.

From the class Spark3Util, method getPartitions:

/**
 * Uses Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the root path of the table
 * @param format format of the table's data files
 * @param partitionFilter a map of partition column names to values, used to filter the listed partitions
 * @return all of the table's partitions that match the filter
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format,
                                                 Map<String, String> partitionFilter) {
    FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
    InMemoryFileIndex fileIndex = new InMemoryFileIndex(
        spark,
        JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
        scala.collection.immutable.Map$.MODULE$.<String, String>empty(),
        Option.empty(),
        fileStatusCache,
        Option.empty(),
        Option.empty());
    org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
    StructType schema = spec.partitionColumns();
    if (schema.isEmpty()) {
        // unpartitioned table: nothing to list
        return Lists.newArrayList();
    }
    // translate the partition filter map into Catalyst expressions
    List<org.apache.spark.sql.catalyst.expressions.Expression> filterExpressions =
        SparkUtil.partitionMapToExpression(schema, partitionFilter);
    Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaPartitionFilters =
        JavaConverters.asScalaBufferConverter(filterExpressions).asScala().toIndexedSeq();
    List<org.apache.spark.sql.catalyst.expressions.Expression> dataFilters = Lists.newArrayList();
    Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaDataFilters =
        JavaConverters.asScalaBufferConverter(dataFilters).asScala().toIndexedSeq();
    Seq<PartitionDirectory> filteredPartitions =
        fileIndex.listFiles(scalaPartitionFilters, scalaDataFilters).toIndexedSeq();
    return JavaConverters.seqAsJavaListConverter(filteredPartitions).asJava().stream()
        .map(partition -> {
            // convert the Catalyst partition values back to Java strings keyed by column name
            Map<String, String> values = Maps.newHashMap();
            JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
                int fieldIndex = schema.fieldIndex(field.name());
                Object catalystValue = partition.values().get(fieldIndex, field.dataType());
                Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
                values.put(field.name(), String.valueOf(value));
            });
            // the partition location is the parent directory of the partition's files
            FileStatus fileStatus = JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0);
            return new SparkPartition(values, fileStatus.getPath().getParent().toString(), format);
        })
        .collect(Collectors.toList());
}
Also used: FileStatusCache(org.apache.spark.sql.execution.datasources.FileStatusCache) Arrays(java.util.Arrays) DataSourceV2Relation(org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation) TypeUtil(org.apache.iceberg.types.TypeUtil) Types(org.apache.iceberg.types.Types) MetadataTableUtils(org.apache.iceberg.MetadataTableUtils) UpdateSchema(org.apache.iceberg.UpdateSchema) FileStatus(org.apache.hadoop.fs.FileStatus) PartitionSpecVisitor(org.apache.iceberg.transforms.PartitionSpecVisitor) ByteBuffer(java.nio.ByteBuffer) TableOperations(org.apache.iceberg.TableOperations) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) ByteBuffers(org.apache.iceberg.util.ByteBuffers) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) StructType(org.apache.spark.sql.types.StructType) Some(scala.Some) Term(org.apache.iceberg.expressions.Term) IntegerType(org.apache.spark.sql.types.IntegerType) SortOrderVisitor(org.apache.iceberg.transforms.SortOrderVisitor) Set(java.util.Set) LongType(org.apache.spark.sql.types.LongType) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) Type(org.apache.iceberg.types.Type) List(java.util.List) UpdateProperties(org.apache.iceberg.UpdateProperties) ExpressionVisitors(org.apache.iceberg.expressions.ExpressionVisitors) Expressions(org.apache.spark.sql.connector.expressions.Expressions) PartitionSpec(org.apache.iceberg.PartitionSpec) JavaConverters(scala.collection.JavaConverters) TableProperties(org.apache.iceberg.TableProperties) Transform(org.apache.spark.sql.connector.expressions.Transform) Seq(scala.collection.immutable.Seq) ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Dataset(org.apache.spark.sql.Dataset) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) PartitionDirectory(org.apache.spark.sql.execution.datasources.PartitionDirectory) TableChange(org.apache.spark.sql.connector.catalog.TableChange) Pair(org.apache.iceberg.util.Pair) BaseEncoding(org.apache.iceberg.relocated.com.google.common.io.BaseEncoding) ParseException(org.apache.spark.sql.catalyst.parser.ParseException) BoundPredicate(org.apache.iceberg.expressions.BoundPredicate) InMemoryFileIndex(org.apache.spark.sql.execution.datasources.InMemoryFileIndex) SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException) NullOrder(org.apache.iceberg.NullOrder) Namespace(org.apache.iceberg.catalog.Namespace) SparkSession(org.apache.spark.sql.SparkSession) CatalystTypeConverters(org.apache.spark.sql.catalyst.CatalystTypeConverters) NamedReference(org.apache.spark.sql.connector.expressions.NamedReference) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Literal(org.apache.spark.sql.connector.expressions.Literal) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) MetadataTableType(org.apache.iceberg.MetadataTableType) Row(org.apache.spark.sql.Row) Option(scala.Option) Joiner(org.apache.iceberg.relocated.com.google.common.base.Joiner) Expression(org.apache.spark.sql.connector.expressions.Expression) CatalogPlugin(org.apache.spark.sql.connector.catalog.CatalogPlugin) Preconditions(org.apache.iceberg.relocated.com.google.common.base.Preconditions) UnboundPredicate(org.apache.iceberg.expressions.UnboundPredicate) Identifier(org.apache.spark.sql.connector.catalog.Identifier) ParserInterface(org.apache.spark.sql.catalyst.parser.ParserInterface) SparkTable(org.apache.iceberg.spark.source.SparkTable) CaseInsensitiveStringMap(org.apache.spark.sql.util.CaseInsensitiveStringMap) CatalogManager(org.apache.spark.sql.connector.catalog.CatalogManager) Table(org.apache.spark.sql.connector.catalog.Table)
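
A hedged call-site sketch for getPartitions; the warehouse path and the "dt" partition column are made up for illustration:

import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;

// hypothetical table root and partition filter
Path rootPath = new Path("hdfs://namenode/warehouse/db/events");
Map<String, String> partitionFilter = ImmutableMap.of("dt", "2021-01-01");
List<SparkPartition> partitions = Spark3Util.getPartitions(spark, rootPath, "parquet", partitionFilter);
// each SparkPartition carries the partition values, its location, and the file format
partitions.forEach(System.out::println);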

Example 9 with Table

Use of org.apache.spark.sql.connector.catalog.Table in project iceberg by apache.

From the class SparkSessionCatalog, method stageReplace:

@Override
public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] partitions, Map<String, String> properties) throws NoSuchNamespaceException, NoSuchTableException {
    String provider = properties.get("provider");
    TableCatalog catalog;
    if (useIceberg(provider)) {
        if (asStagingCatalog != null) {
            return asStagingCatalog.stageReplace(ident, schema, partitions, properties);
        }
        catalog = icebergCatalog;
    } else {
        catalog = getSessionCatalog();
    }
    // attempt to drop the table and fail if it doesn't exist
    if (!catalog.dropTable(ident)) {
        throw new NoSuchTableException(ident);
    }
    try {
        // create the table with the selected catalog, then wrap it in a staged table that drops it to roll back on abort
        Table table = catalog.createTable(ident, schema, partitions, properties);
        return new RollbackStagedTable(catalog, ident, table);
    } catch (TableAlreadyExistsException e) {
        // the table was dropped above but already exists again (recreated concurrently); retry the replace
        return stageReplace(ident, schema, partitions, properties);
    }
}
Also used: TableAlreadyExistsException(org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException) StagedTable(org.apache.spark.sql.connector.catalog.StagedTable) Table(org.apache.spark.sql.connector.catalog.Table) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) StagingTableCatalog(org.apache.spark.sql.connector.catalog.StagingTableCatalog) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException)
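
The staged table returned here only takes effect when the caller commits it; on failure Spark aborts, and RollbackStagedTable drops the freshly created table. A minimal sketch of that lifecycle, with the write step elided and sessionCatalog standing in for a configured SparkSessionCatalog instance:

import org.apache.spark.sql.connector.catalog.StagedTable;

StagedTable staged = sessionCatalog.stageReplace(ident, schema, partitions, properties);
try {
    // ... Spark writes the replacement data through the staged table ...
    staged.commitStagedChanges(); // finalize the replace
} catch (RuntimeException e) {
    staged.abortStagedChanges(); // RollbackStagedTable drops the table created above
    throw e;
}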

Example 10 with Table

Use of org.apache.spark.sql.connector.catalog.Table in project iceberg by apache.

From the class TestSparkCatalogOperations, method testAlterTable:

@Test
public void testAlterTable() throws NoSuchTableException {
    BaseCatalog catalog = (BaseCatalog) spark.sessionState().catalogManager().catalog(catalogName);
    Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name());
    String fieldName = "location";
    String propsKey = "note";
    String propsValue = "jazz";
    // add a column and set a table property in a single alterTable call
    Table table = catalog.alterTable(
        identifier,
        TableChange.addColumn(new String[] { fieldName }, DataTypes.StringType, true),
        TableChange.setProperty(propsKey, propsValue));
    Assert.assertNotNull("Should return updated table", table);
    StructField expectedField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    // JUnit's assertEquals takes the expected value before the actual value
    Assert.assertEquals("Adding a column to a table should return the updated table with the new column",
        expectedField, table.schema().fields()[2]);
    Assert.assertTrue("Adding a property to a table should return the updated table with the new property",
        table.properties().containsKey(propsKey));
    Assert.assertEquals("Altering a table to add a new property should add the correct value",
        propsValue, table.properties().get(propsKey));
}
Also used: Identifier(org.apache.spark.sql.connector.catalog.Identifier) Table(org.apache.spark.sql.connector.catalog.Table) StructField(org.apache.spark.sql.types.StructField) Test(org.junit.Test)
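
TableChange exposes a static factory method per alteration kind, and alterTable accepts any number of them in one call. A hedged sketch of a few more changes against the same table; the column names carry over from the test above, and the comment text is made up:

import org.apache.spark.sql.connector.catalog.TableChange;

Table updated = catalog.alterTable(
    identifier,
    TableChange.renameColumn(new String[] { "location" }, "loc"), // rename the column added above
    TableChange.updateColumnComment(new String[] { "loc" }, "free-form location"), // hypothetical comment
    TableChange.removeProperty("note")); // drop the property set above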

Aggregations

Table (org.apache.spark.sql.connector.catalog.Table) 10
TableCatalog (org.apache.spark.sql.connector.catalog.TableCatalog) 7
SparkTable (org.apache.iceberg.spark.source.SparkTable) 4
NoSuchTableException (org.apache.spark.sql.catalyst.analysis.NoSuchTableException) 4
ByteBuffer (java.nio.ByteBuffer) 2
Arrays (java.util.Arrays) 2
List (java.util.List) 2
Map (java.util.Map) 2
Objects (java.util.Objects) 2
Set (java.util.Set) 2
Collectors (java.util.stream.Collectors) 2
Path (org.apache.hadoop.fs.Path) 2
MetadataTableType (org.apache.iceberg.MetadataTableType) 2
MetadataTableUtils (org.apache.iceberg.MetadataTableUtils) 2
NullOrder (org.apache.iceberg.NullOrder) 2
PartitionSpec (org.apache.iceberg.PartitionSpec) 2
Schema (org.apache.iceberg.Schema) 2
TableOperations (org.apache.iceberg.TableOperations) 2
TableProperties (org.apache.iceberg.TableProperties) 2
UpdateProperties (org.apache.iceberg.UpdateProperties) 2