Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in project iceberg by apache.
Class IcebergSource, method getTable.
@Override
public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
  Spark3Util.CatalogAndIdentifier catalogIdentifier =
      catalogAndIdentifier(new CaseInsensitiveStringMap(options));
  CatalogPlugin catalog = catalogIdentifier.catalog();
  Identifier ident = catalogIdentifier.identifier();

  try {
    if (catalog instanceof TableCatalog) {
      return ((TableCatalog) catalog).loadTable(ident);
    }
  } catch (NoSuchTableException e) {
    // throwing an Iceberg NoSuchTableException because the Spark one is typed and can't be thrown from this interface
    throw new org.apache.iceberg.exceptions.NoSuchTableException(e, "Cannot find table for %s.", ident);
  }

  // throwing an Iceberg NoSuchTableException because the Spark one is typed and can't be thrown from this interface
  throw new org.apache.iceberg.exceptions.NoSuchTableException("Cannot find table for %s.", ident);
}
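The point of wrapping the raw options map here is that CaseInsensitiveStringMap normalizes keys, so option lookups ignore how the caller spelled them. A minimal standalone sketch of that behavior (the option keys and class name below are made up for illustration):

import java.util.Map;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

public class CaseInsensitiveOptionsDemo {
  public static void main(String[] args) {
    // keys arrive from the caller with arbitrary casing
    CaseInsensitiveStringMap options =
        new CaseInsensitiveStringMap(Map.of("PATH", "/tmp/table", "Snapshot-ID", "123"));

    // lookups ignore case, so either spelling resolves to the same entry
    System.out.println(options.get("path"));        // /tmp/table
    System.out.println(options.get("snapshot-id")); // 123
  }
}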
Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in project iceberg by apache.
Class SparkScanBuilder, method buildMergeOnReadScan.
public Scan buildMergeOnReadScan() {
  Preconditions.checkArgument(
      readConf.snapshotId() == null && readConf.asOfTimestamp() == null,
      "Cannot set time travel options %s and %s for row-level command scans",
      SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP);

  Preconditions.checkArgument(
      readConf.startSnapshotId() == null && readConf.endSnapshotId() == null,
      "Cannot set incremental scan options %s and %s for row-level command scans",
      SparkReadOptions.START_SNAPSHOT_ID, SparkReadOptions.END_SNAPSHOT_ID);

  Snapshot snapshot = table.currentSnapshot();

  if (snapshot == null) {
    return new SparkBatchQueryScan(spark, table, null, readConf, schemaWithMetadataColumns(), filterExpressions);
  }

  // remember the current snapshot ID for commit validation
  long snapshotId = snapshot.snapshotId();

  CaseInsensitiveStringMap adjustedOptions =
      Spark3Util.setOption(SparkReadOptions.SNAPSHOT_ID, Long.toString(snapshotId), options);
  SparkReadConf adjustedReadConf = new SparkReadConf(spark, table, adjustedOptions);

  Schema expectedSchema = schemaWithMetadataColumns();

  TableScan scan = table.newScan()
      .useSnapshot(snapshotId)
      .caseSensitive(caseSensitive)
      .filter(filterExpression())
      .project(expectedSchema);

  scan = configureSplitPlanning(scan);

  return new SparkBatchQueryScan(spark, table, scan, adjustedReadConf, expectedSchema, filterExpressions);
}
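From its use above, Spark3Util.setOption returns the read options with SparkReadOptions.SNAPSHOT_ID pinned to the current snapshot. A rough sketch of that effect using plain JDK collections (the helper below is an illustration, not Iceberg's implementation, and "snapshot-id" is assumed to be the value of SparkReadOptions.SNAPSHOT_ID):

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

// Hypothetical helper: copy the caller's options, pin the snapshot ID, and re-wrap the copy.
static CaseInsensitiveStringMap withSnapshotId(CaseInsensitiveStringMap options, long snapshotId) {
  Map<String, String> copy = new HashMap<>(options.asCaseSensitiveMap());
  copy.put("snapshot-id", Long.toString(snapshotId)); // assumed key for SparkReadOptions.SNAPSHOT_ID
  return new CaseInsensitiveStringMap(copy);
}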
Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in project iceberg by apache.
Class TestFilteredScan, method testUnpartitionedCaseInsensitiveIDFilters.
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  CaseInsensitiveStringMap options =
      new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    for (int i = 0; i < 10; i += 1) {
      SparkScanBuilder builder =
          new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).caseSensitive(false);

      // note lower(ID) == lower(id), so there must be a match
      pushFilters(builder, EqualTo.apply("ID", i));

      Batch scan = builder.build().toBatch();
      InputPartition[] tasks = scan.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
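The test flips spark.sql.caseSensitive for its body and restores the previous value in a finally block. A small hypothetical helper capturing that save/restore pattern (the helper name is an assumption, not part of TestFilteredScan):

import org.apache.spark.sql.SparkSession;

// Hypothetical helper: run a block with spark.sql.caseSensitive temporarily overridden,
// then restore the previous value even if the block throws.
static void withCaseSensitivity(SparkSession spark, boolean caseSensitive, Runnable body) {
  String previous = spark.conf().get("spark.sql.caseSensitive");
  spark.conf().set("spark.sql.caseSensitive", Boolean.toString(caseSensitive));
  try {
    body.run();
  } finally {
    spark.conf().set("spark.sql.caseSensitive", previous);
  }
}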
Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in project iceberg by apache.
Class TestFilteredScan, method testUnpartitionedTimestampFilter.
@Test
public void testUnpartitionedTimestampFilter() {
  CaseInsensitiveStringMap options =
      new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));

  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
  Batch scan = builder.build().toBatch();

  InputPartition[] tasks = scan.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Use of org.apache.spark.sql.util.CaseInsensitiveStringMap in project iceberg by apache.
Class TestFilteredScan, method testBucketPartitionedIDFilters.
@Test
public void testBucketPartitionedIDFilters() {
  Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");
  CaseInsensitiveStringMap options =
      new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));

  Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch();
  Assert.assertEquals("Unfiltered table should create 4 read tasks", 4, unfiltered.planInputPartitions().length);

  for (int i = 0; i < 10; i += 1) {
    SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

    pushFilters(builder, EqualTo.apply("id", i));
    Batch scan = builder.build().toBatch();

    InputPartition[] tasks = scan.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length);

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i));
  }
}
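pushFilters(...) is a helper inside TestFilteredScan. A hedged sketch of what handing a filter to a DataSource V2 scan builder looks like, assuming the builder implements Spark's SupportsPushDownFilters (whether the test helper does exactly this is an assumption):

import org.apache.spark.sql.connector.read.SupportsPushDownFilters;
import org.apache.spark.sql.sources.EqualTo;
import org.apache.spark.sql.sources.Filter;

// Sketch: give the DataSource V2 builder an array of source filters to push down.
// pushFilters(...) returns the filters Spark must still evaluate after the scan.
static Filter[] pushEqualTo(SupportsPushDownFilters builder, String column, Object value) {
  return builder.pushFilters(new Filter[] { EqualTo.apply(column, value) });
}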