Example 1 with CaseInsensitiveStringMap

Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in the apache/iceberg project.

From the class IcebergSource, method getTable:

@Override
public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
    Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options));
    CatalogPlugin catalog = catalogIdentifier.catalog();
    Identifier ident = catalogIdentifier.identifier();
    try {
        if (catalog instanceof TableCatalog) {
            return ((TableCatalog) catalog).loadTable(ident);
        }
    } catch (NoSuchTableException e) {
        // throwing an Iceberg NoSuchTableException because the Spark one is checked and can't be thrown from this interface
        throw new org.apache.iceberg.exceptions.NoSuchTableException(e, "Cannot find table for %s.", ident);
    }
    // throwing an Iceberg NoSuchTableException because the Spark one is checked and can't be thrown from this interface
    throw new org.apache.iceberg.exceptions.NoSuchTableException("Cannot find table for %s.", ident);
}
Also used: CatalogPlugin (org.apache.spark.sql.connector.catalog.CatalogPlugin), PathIdentifier (org.apache.iceberg.spark.PathIdentifier), Identifier (org.apache.spark.sql.connector.catalog.Identifier), TableCatalog (org.apache.spark.sql.connector.catalog.TableCatalog), NoSuchTableException (org.apache.spark.sql.catalyst.analysis.NoSuchTableException), CaseInsensitiveStringMap (org.apache.spark.sql.util.CaseInsensitiveStringMap), Spark3Util (org.apache.iceberg.spark.Spark3Util)
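
The wrapped map is what makes option lookup forgiving here: CaseInsensitiveStringMap normalizes keys on construction, so callers may pass options in any casing. A minimal standalone sketch of that behavior (not taken from the Iceberg source; the class name and option key are illustrative):

import java.util.Map;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

public class CaseInsensitiveStringMapDemo {
    public static void main(String[] args) {
        // Keys are normalized on construction, so lookups succeed
        // regardless of the caller's casing.
        CaseInsensitiveStringMap options =
                new CaseInsensitiveStringMap(Map.of("Snapshot-ID", "12345"));

        System.out.println(options.get("snapshot-id"));   // 12345
        System.out.println(options.get("SNAPSHOT-ID"));   // 12345

        // The view is read-only: mutators throw UnsupportedOperationException.
        try {
            options.put("other", "value");
        } catch (UnsupportedOperationException e) {
            System.out.println("map is read-only");
        }
    }
}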

Example 2 with CaseInsensitiveStringMap

Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in the apache/iceberg project.

From the class SparkScanBuilder, method buildMergeOnReadScan:

public Scan buildMergeOnReadScan() {
    Preconditions.checkArgument(
        readConf.snapshotId() == null && readConf.asOfTimestamp() == null,
        "Cannot set time travel options %s and %s for row-level command scans",
        SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP);
    Preconditions.checkArgument(
        readConf.startSnapshotId() == null && readConf.endSnapshotId() == null,
        "Cannot set incremental scan options %s and %s for row-level command scans",
        SparkReadOptions.START_SNAPSHOT_ID, SparkReadOptions.END_SNAPSHOT_ID);
    Snapshot snapshot = table.currentSnapshot();
    if (snapshot == null) {
        return new SparkBatchQueryScan(spark, table, null, readConf, schemaWithMetadataColumns(), filterExpressions);
    }
    // remember the current snapshot ID for commit validation
    long snapshotId = snapshot.snapshotId();
    CaseInsensitiveStringMap adjustedOptions =
        Spark3Util.setOption(SparkReadOptions.SNAPSHOT_ID, Long.toString(snapshotId), options);
    SparkReadConf adjustedReadConf = new SparkReadConf(spark, table, adjustedOptions);
    Schema expectedSchema = schemaWithMetadataColumns();
    TableScan scan = table.newScan()
        .useSnapshot(snapshotId)
        .caseSensitive(caseSensitive)
        .filter(filterExpression())
        .project(expectedSchema);
    scan = configureSplitPlanning(scan);
    return new SparkBatchQueryScan(spark, table, scan, adjustedReadConf, expectedSchema, filterExpressions);
}
Also used: SparkReadConf (org.apache.iceberg.spark.SparkReadConf), Snapshot (org.apache.iceberg.Snapshot), TableScan (org.apache.iceberg.TableScan), Schema (org.apache.iceberg.Schema), CaseInsensitiveStringMap (org.apache.spark.sql.util.CaseInsensitiveStringMap)
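
Because CaseInsensitiveStringMap is read-only, Spark3Util.setOption cannot modify the incoming options in place; it has to build a new map. A plausible sketch of such a helper (the name withOption is hypothetical, not the Iceberg method itself):

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

public class OptionsUtil {
    // Copy-on-write: copy the entries, override one key, wrap the result.
    // Copying via the case-insensitive view yields normalized keys, so the
    // put below cannot leave a stale duplicate under a different casing.
    static CaseInsensitiveStringMap withOption(
            String key, String value, CaseInsensitiveStringMap options) {
        Map<String, String> copy = new HashMap<>(options);
        copy.put(key, value);
        return new CaseInsensitiveStringMap(copy);
    }
}

In buildMergeOnReadScan this means adjustedOptions carries the pinned snapshot ID while the caller's original options remain untouched.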

Example 3 with CaseInsensitiveStringMap

Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in the apache/iceberg project.

From the class TestFilteredScan, method testUnpartitionedCaseInsensitiveIDFilters:

@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
    CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));
    // set spark.sql.caseSensitive to false
    String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");
    try {
        for (int i = 0; i < 10; i += 1) {
            SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).caseSensitive(false);
            // note lower(ID) == lower(id), so there must be a match
            pushFilters(builder, EqualTo.apply("ID", i));
            Batch scan = builder.build().toBatch();
            InputPartition[] tasks = scan.planInputPartitions();
            Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);
            // validate row filtering
            assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i));
        }
    } finally {
        // return global conf to previous state
        TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
    }
}
Also used: Batch (org.apache.spark.sql.connector.read.Batch), InputPartition (org.apache.spark.sql.connector.read.InputPartition), CaseInsensitiveStringMap (org.apache.spark.sql.util.CaseInsensitiveStringMap), Test (org.junit.Test)
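
The try/finally around the global spark.sql.caseSensitive flag is the part worth reusing: the session conf is shared state, so the test must restore it even when an assertion fails. A small sketch of that save/toggle/restore pattern factored into a helper (the helper name is hypothetical):

import org.apache.spark.sql.SparkSession;

public class ConfGuard {
    static void withCaseInsensitiveAnalysis(SparkSession spark, Runnable body) {
        String previous = spark.conf().get("spark.sql.caseSensitive");
        spark.conf().set("spark.sql.caseSensitive", "false");
        try {
            body.run();
        } finally {
            // Restore the shared session conf even if the body throws.
            spark.conf().set("spark.sql.caseSensitive", previous);
        }
    }
}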

Example 4 with CaseInsensitiveStringMap

Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in the apache/iceberg project.

From the class TestFilteredScan, method testUnpartitionedTimestampFilter:

@Test
public void testUnpartitionedTimestampFilter() {
    CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));
    SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
    pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
    Batch scan = builder.build().toBatch();
    InputPartition[] tasks = scan.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);
    assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Also used: Batch (org.apache.spark.sql.connector.read.Batch), InputPartition (org.apache.spark.sql.connector.read.InputPartition), CaseInsensitiveStringMap (org.apache.spark.sql.util.CaseInsensitiveStringMap), Test (org.junit.Test)
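
For comparison, the same predicate expressed through the public DataFrame API should plan an equivalent pushed-down LessThan("ts", ...) filter; the test's read(...) helper essentially does this. A sketch, assuming the Iceberg runtime is on the classpath and with the table location left as a parameter:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class TimestampFilterRead {
    // Rows with ts strictly before 2017-12-22 UTC; Spark pushes the
    // comparison down to the source instead of filtering after the scan.
    static Dataset<Row> readBefore(SparkSession spark, String tableLocation) {
        return spark.read()
                .format("iceberg")
                .load(tableLocation)
                .filter("ts < cast('2017-12-22 00:00:00+00:00' as timestamp)");
    }
}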

Example 5 with CaseInsensitiveStringMap

Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in the apache/iceberg project.

From the class TestFilteredScan, method testBucketPartitionedIDFilters:

@Test
public void testBucketPartitionedIDFilters() {
    Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");
    CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));
    Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch();
    Assert.assertEquals("Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().length);
    for (int i = 0; i < 10; i += 1) {
        SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
        pushFilters(builder, EqualTo.apply("id", i));
        Batch scan = builder.build().toBatch();
        InputPartition[] tasks = scan.planInputPartitions();
        // validate predicate push-down
        Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length);
        // validate row filtering
        assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i));
    }
}
Also used: Table (org.apache.iceberg.Table), Batch (org.apache.spark.sql.connector.read.Batch), InputPartition (org.apache.spark.sql.connector.read.InputPartition), CaseInsensitiveStringMap (org.apache.spark.sql.util.CaseInsensitiveStringMap), Test (org.junit.Test)
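
The one-task-per-filter result follows from the table's partition spec: with four buckets over id, an id equality predicate maps to exactly one bucket. A sketch of the kind of spec BUCKET_BY_ID presumably defines (field IDs and the second column are illustrative, not taken from the test fixture):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class BucketSpecSketch {
    static PartitionSpec bucketById(Schema schema) {
        // Four buckets over "id": each id value hashes into exactly one bucket.
        return PartitionSpec.builderFor(schema).bucket("id", 4).build();
    }

    public static void main(String[] args) {
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()));
        System.out.println(bucketById(schema));
    }
}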

Aggregations

CaseInsensitiveStringMap (org.apache.spark.sql.util.CaseInsensitiveStringMap): 14
Test (org.junit.Test): 11
Batch (org.apache.spark.sql.connector.read.Batch): 10
Table (org.apache.iceberg.Table): 8
InputPartition (org.apache.spark.sql.connector.read.InputPartition): 4
StringStartsWith (org.apache.spark.sql.sources.StringStartsWith): 4
PathIdentifier (org.apache.iceberg.spark.PathIdentifier): 2
Not (org.apache.spark.sql.sources.Not): 2
Schema (org.apache.iceberg.Schema): 1
Snapshot (org.apache.iceberg.Snapshot): 1
TableScan (org.apache.iceberg.TableScan): 1
Spark3Util (org.apache.iceberg.spark.Spark3Util): 1
SparkCatalog (org.apache.iceberg.spark.SparkCatalog): 1
SparkReadConf (org.apache.iceberg.spark.SparkReadConf): 1
NoSuchTableException (org.apache.spark.sql.catalyst.analysis.NoSuchTableException): 1
CatalogPlugin (org.apache.spark.sql.connector.catalog.CatalogPlugin): 1
Identifier (org.apache.spark.sql.connector.catalog.Identifier): 1
TableCatalog (org.apache.spark.sql.connector.catalog.TableCatalog): 1
Before (org.junit.Before): 1