Example 1 with CatalogTable

Use of org.apache.spark.sql.catalyst.catalog.CatalogTable in the apache/iceberg project.

From the class SparkTableUtil, the method getPartitions:

/**
 * Returns all partitions in the table.
 *
 * @param spark a Spark session
 * @param tableIdent a table identifier
 * @param partitionFilter partition filter, or null if no filter
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, TableIdentifier tableIdent, Map<String, String> partitionFilter) {
    try {
        SessionCatalog catalog = spark.sessionState().catalog();
        CatalogTable catalogTable = catalog.getTableMetadata(tableIdent);
        Option<scala.collection.immutable.Map<String, String>> scalaPartitionFilter;
        if (partitionFilter != null && !partitionFilter.isEmpty()) {
            Builder<Tuple2<String, String>, scala.collection.immutable.Map<String, String>> builder = Map$.MODULE$.<String, String>newBuilder();
            partitionFilter.forEach((key, value) -> builder.$plus$eq(Tuple2.apply(key, value)));
            scalaPartitionFilter = Option.apply(builder.result());
        } else {
            scalaPartitionFilter = Option.empty();
        }
        Seq<CatalogTablePartition> partitions = catalog.listPartitions(tableIdent, scalaPartitionFilter).toIndexedSeq();
        return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream()
            .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable))
            .collect(Collectors.toList());
    } catch (NoSuchDatabaseException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent);
    } catch (NoSuchTableException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent);
    }
}
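
Below is a minimal usage sketch. The table name ("db.logs"), partition column ("ds"), and master URL are hypothetical, and it assumes a Hive-backed session catalog so that partitions are tracked in the metastore:

import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.TableIdentifier;
import scala.Option;

// Hypothetical table "db.logs" partitioned by "ds".
SparkSession spark = SparkSession.builder()
    .master("local[2]")
    .enableHiveSupport()
    .getOrCreate();
TableIdentifier ident = new TableIdentifier("logs", Option.apply("db"));
Map<String, String> filter = Collections.singletonMap("ds", "2021-01-01");
List<SparkTableUtil.SparkPartition> partitions =
    SparkTableUtil.getPartitions(spark, ident, filter);
// Each SparkPartition describes one partition (values, location, format).
partitions.forEach(System.out::println);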

Example 2 with CatalogTable

Use of org.apache.spark.sql.catalyst.catalog.CatalogTable in the apache/iceberg project.

From the class SparkTableUtil, the method getPartitionsByFilter:

/**
 * Returns partitions that match the specified predicate expression.
 *
 * @param spark a Spark session
 * @param tableIdent a table identifier
 * @param predicateExpr a predicate expression on partition columns
 * @return the table's partitions that match the predicate
 */
public static List<SparkPartition> getPartitionsByFilter(SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) {
    try {
        SessionCatalog catalog = spark.sessionState().catalog();
        CatalogTable catalogTable = catalog.getTableMetadata(tableIdent);
        Expression resolvedPredicateExpr;
        if (!predicateExpr.resolved()) {
            resolvedPredicateExpr = resolveAttrs(spark, tableIdent.quotedString(), predicateExpr);
        } else {
            resolvedPredicateExpr = predicateExpr;
        }
        Seq<Expression> predicates = JavaConverters
            .collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr))
            .asScala().toIndexedSeq();
        Seq<CatalogTablePartition> partitions = catalog.listPartitionsByFilter(tableIdent, predicates).toIndexedSeq();
        return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream()
            .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable))
            .collect(Collectors.toList());
    } catch (NoSuchDatabaseException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent);
    } catch (NoSuchTableException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent);
    }
}
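
A hedged sketch of calling this variant, reusing the SparkSession and TableIdentifier from the previous sketch; the predicate is parsed from a SQL string with the session's parser, and the partition column name is again hypothetical:

import org.apache.spark.sql.catalyst.expressions.Expression;

// parseExpression may throw ParseException; the resulting expression is
// unresolved, so getPartitionsByFilter resolves it against the table first.
Expression predicate = spark.sessionState().sqlParser().parseExpression("ds >= '2021-01-01'");
List<SparkTableUtil.SparkPartition> matched =
    SparkTableUtil.getPartitionsByFilter(spark, ident, predicate);
matched.forEach(System.out::println);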

Example 3 with CatalogTable

Use of org.apache.spark.sql.catalyst.catalog.CatalogTable in the apache/iceberg project.

From the test class TestCreateActions, the method expectedFilesCount:

private long expectedFilesCount(String source) throws NoSuchDatabaseException, NoSuchTableException, ParseException {
    CatalogTable sourceTable = loadSessionTable(source);
    List<URI> uris;
    if (sourceTable.partitionColumnNames().size() == 0) {
        uris = Lists.newArrayList();
        uris.add(sourceTable.location());
    } else {
        Seq<CatalogTablePartition> catalogTablePartitionSeq = spark.sessionState().catalog().listPartitions(sourceTable.identifier(), Option.apply(null));
        uris = JavaConverters.seqAsJavaList(catalogTablePartitionSeq).stream().map(CatalogTablePartition::location).collect(Collectors.toList());
    }
    return uris.stream()
        .flatMap(uri -> FileUtils.listFiles(Paths.get(uri).toFile(), TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).stream())
        .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS"))
        .count();
}
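
The loadSessionTable helper is not shown in this excerpt. A minimal stand-in, assuming a plain "db.table" (or bare table) name resolved through the session catalog, could look like the sketch below; it is not necessarily the test's actual helper:

// Hypothetical stand-in for loadSessionTable: split an assumed "db.table"
// name and fetch its metadata from the session catalog.
private CatalogTable loadSessionTable(String name) throws NoSuchDatabaseException, NoSuchTableException {
    String[] parts = name.split("\\.", 2);
    TableIdentifier ident = parts.length == 2
        ? new TableIdentifier(parts[1], Option.apply(parts[0]))
        : new TableIdentifier(parts[0], Option.<String>empty());
    return spark.sessionState().catalog().getTableMetadata(ident);
}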

Example 4 with CatalogTable

Use of org.apache.spark.sql.catalyst.catalog.CatalogTable in the apache/iceberg project.

From the test class TestCreateActions, the method createSourceTable:

private void createSourceTable(String createStatement, String tableName) throws IOException, NoSuchTableException, NoSuchDatabaseException, ParseException {
    File location = temp.newFolder();
    spark.sql(String.format(createStatement, tableName, location));
    CatalogTable table = loadSessionTable(tableName);
    Seq<String> partitionColumns = table.partitionColumnNames();
    String format = table.provider().get();
    spark.table(baseTableName).write().mode(SaveMode.Append).format(format).partitionBy(partitionColumns.toSeq()).saveAsTable(tableName);
}
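
A hypothetical invocation is shown below. Note the placeholder order in String.format: the table name fills the first %s and the folder location the second; the DDL and names are illustrative:

// Creates a partitioned parquet table at a temp location, then appends the
// baseTableName data into it with the same partitioning.
createSourceTable(
    "CREATE TABLE %s (id BIGINT, data STRING) USING parquet PARTITIONED BY (data) LOCATION '%s'",
    "default.src_table");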

Example 5 with CatalogTable

Use of org.apache.spark.sql.catalyst.catalog.CatalogTable in the OpenLineage/OpenLineage project.

From the class AlterTableAddColumnsCommandVisitor, the method apply:

@Override
public List<OpenLineage.OutputDataset> apply(LogicalPlan x) {
    Optional<CatalogTable> tableOption = catalogTableFor(((AlterTableAddColumnsCommand) x).table());
    if (!tableOption.isPresent()) {
        return Collections.emptyList();
    }
    CatalogTable catalogTable = tableOption.get();
    List<StructField> tableColumns = Arrays.asList(catalogTable.schema().fields());
    List<StructField> addedColumns = JavaConversions.seqAsJavaList(((AlterTableAddColumnsCommand) x).colsToAdd());
    if (tableColumns.containsAll(addedColumns)) {
        return Collections.singletonList(outputDataset().getDataset(PathUtils.fromCatalogTable(catalogTable), catalogTable.schema()));
    } else {
        // apply triggered before applying the change - do not send an event
        return Collections.emptyList();
    }
}
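
The containsAll check works because StructField is a Scala case class, so equality compares name, data type, nullability, and metadata. A small illustrative check, with a hypothetical column name:

import java.util.Arrays;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

// True only once Spark has applied the ALTER TABLE ... ADD COLUMNS change to
// the catalog schema; until then the visitor emits no event.
StructField added = new StructField("new_col", DataTypes.StringType, true, Metadata.empty());
boolean applied = Arrays.asList(catalogTable.schema().fields()).contains(added);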

Aggregations

Usage counts for CatalogTable and the classes that most often appear alongside it in these examples:

CatalogTable (org.apache.spark.sql.catalyst.catalog.CatalogTable): 20
URI (java.net.URI): 6
OpenLineage (io.openlineage.client.OpenLineage): 4
DatasetIdentifier (io.openlineage.spark.agent.util.DatasetIdentifier): 4
Row (org.apache.spark.sql.Row): 4
NoSuchDatabaseException (org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException): 4
NoSuchTableException (org.apache.spark.sql.catalyst.analysis.NoSuchTableException): 4
IOException (java.io.IOException): 3
Collections (java.util.Collections): 3
List (java.util.List): 3
Map (java.util.Map): 3
Collectors (java.util.stream.Collectors): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
Path (org.apache.hadoop.fs.Path): 3
AppendFiles (org.apache.iceberg.AppendFiles): 3
DataFile (org.apache.iceberg.DataFile): 3
MetricsConfig (org.apache.iceberg.MetricsConfig): 3
PartitionSpec (org.apache.iceberg.PartitionSpec): 3
Table (org.apache.iceberg.Table): 3
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists): 3