Use of org.apache.spark.sql.connector.read.InputPartition in project iceberg by apache.
From the class TestFilteredScan, method testUnpartitionedCaseInsensitiveIDFilters.
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  CaseInsensitiveStringMap options =
      new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    for (int i = 0; i < 10; i += 1) {
      SparkScanBuilder builder =
          new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).caseSensitive(false);

      // note lower(ID) == lower(id), so there must be a match
      pushFilters(builder, EqualTo.apply("ID", i));
      Batch scan = builder.build().toBatch();

      InputPartition[] tasks = scan.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), vectorized, "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
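The pushFilters helper used above is defined elsewhere in the test class. A minimal sketch of how such a helper can be written, assuming the builder implements Spark's org.apache.spark.sql.connector.read.SupportsPushDownFilters mix-in (as Iceberg's SparkScanBuilder does in the Spark 3 module versions that use V1 filters), could look like this; the exact helper in TestFilteredScan may differ.

// Sketch of a pushFilters test helper (assumption, not the actual Iceberg source).
private void pushFilters(ScanBuilder builder, Filter... filters) {
  Assert.assertTrue("Builder should support filter push-down",
      builder instanceof SupportsPushDownFilters);
  SupportsPushDownFilters filterable = (SupportsPushDownFilters) builder;
  // pushFilters returns the residual filters Spark must still evaluate after the scan
  filterable.pushFilters(filters);
}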
Use of org.apache.spark.sql.connector.read.InputPartition in project iceberg by apache.
From the class TestFilteredScan, method testUnpartitionedTimestampFilter.
@Test
public void testUnpartitionedTimestampFilter() {
  CaseInsensitiveStringMap options =
      new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));
  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
  Batch scan = builder.build().toBatch();

  InputPartition[] tasks = scan.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Use of org.apache.spark.sql.connector.read.InputPartition in project iceberg by apache.
From the class TestFilteredScan, method testBucketPartitionedIDFilters.
@Test
public void testBucketPartitionedIDFilters() {
  Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");
  CaseInsensitiveStringMap options =
      new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));

  Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch();
  Assert.assertEquals("Unfiltered table should create 4 read tasks",
      4, unfiltered.planInputPartitions().length);

  for (int i = 0; i < 10; i += 1) {
    SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

    pushFilters(builder, EqualTo.apply("id", i));
    Batch scan = builder.build().toBatch();

    InputPartition[] tasks = scan.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length);

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i));
  }
}
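The BUCKET_BY_ID spec passed to buildPartitionedTable is defined elsewhere in the test class. A hedged sketch of how a four-bucket spec over the id column can be declared with Iceberg's PartitionSpec builder follows; the constant name and bucket width are assumptions chosen to match the "bucket4" naming and the single-bucket assertion above.

// Hypothetical definition of a bucket-by-id partition spec (the real constant may differ).
// PartitionSpec comes from org.apache.iceberg.
private static final PartitionSpec BUCKET_BY_ID =
    PartitionSpec.builderFor(SCHEMA)
        .bucket("id", 4)   // hash the id column into 4 buckets
        .build();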
Use of org.apache.spark.sql.connector.read.InputPartition in project iceberg by apache.
From the class TestFilteredScan, method testUnpartitionedIDFilters.
@Test
public void testUnpartitionedIDFilters() {
  CaseInsensitiveStringMap options =
      new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));
  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  for (int i = 0; i < 10; i += 1) {
    pushFilters(builder, EqualTo.apply("id", i));
    Batch scan = builder.build().toBatch();

    InputPartition[] partitions = scan.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, partitions.length);

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), vectorized, "id = " + i));
  }
}
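For context, this is roughly how the planned partitions are consumed on the executor side. The sketch below is not part of the Iceberg test; it only uses the standard org.apache.spark.sql.connector.read API (Batch, PartitionReaderFactory, PartitionReader) to show what a single InputPartition turns into.

// Sketch of the consumer side of planInputPartitions(): Batch hands out a
// PartitionReaderFactory, and each InputPartition becomes a PartitionReader.
private static void readPartition(Batch scan, InputPartition partition) throws IOException {
  PartitionReaderFactory readerFactory = scan.createReaderFactory();
  try (PartitionReader<InternalRow> reader = readerFactory.createReader(partition)) {
    while (reader.next()) {
      InternalRow row = reader.get();
      // process the row, e.g. row.getInt(0)
    }
  }
}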
Use of org.apache.spark.sql.connector.read.InputPartition in project phoenix-connectors by apache.
From the class PhoenixScan, method planInputPartitions.
@Override
public InputPartition[] planInputPartitions() {
  populateOverriddenProperties();
  try (Connection conn = DriverManager.getConnection(
      JDBC_PROTOCOL + JDBC_PROTOCOL_SEPARATOR + zkUrl, overriddenProps)) {
    List<ColumnInfo> columnInfos = PhoenixRuntime.generateColumnInfo(
        conn, tableName, new ArrayList<>(Arrays.asList(schema.names())));
    final Statement statement = conn.createStatement();
    final String selectStatement = QueryUtil.constructSelectStatement(tableName, columnInfos, whereClause);
    if (selectStatement == null) {
      throw new NullPointerException();
    }

    final PhoenixStatement pstmt = statement.unwrap(PhoenixStatement.class);
    // Optimize the query plan so that we potentially use secondary indexes
    final QueryPlan queryPlan = pstmt.optimizeQuery(selectStatement);
    final org.apache.hadoop.hbase.client.Scan scan = queryPlan.getContext().getScan();

    // Initialize the query plan so it sets up the parallel scans
    queryPlan.iterator(MapReduceParallelScanGrouper.getInstance());
    List<KeyRange> allSplits = queryPlan.getSplits();

    // Get the RegionSizeCalculator
    PhoenixConnection phxConn = conn.unwrap(PhoenixConnection.class);
    org.apache.hadoop.hbase.client.Connection connection =
        phxConn.getQueryServices().getAdmin().getConnection();
    RegionLocator regionLocator = connection.getRegionLocator(
        TableName.valueOf(queryPlan.getTableRef().getTable().getPhysicalName().toString()));

    final InputPartition[] partitions = new PhoenixInputPartition[allSplits.size()];
    int partitionCount = 0;

    for (List<org.apache.hadoop.hbase.client.Scan> scans : queryPlan.getScans()) {
      // Get the region location
      HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
      String regionLocation = location.getHostname();

      // Get the region size
      long regionSize = CompatUtil.getSize(regionLocator, connection.getAdmin(), location);

      phoenixDataSourceOptions = new PhoenixDataSourceReadOptions(
          zkUrl, currentScnValue, tenantId, selectStatement, overriddenProps);

      if (splitByStats) {
        for (org.apache.hadoop.hbase.client.Scan aScan : scans) {
          partitions[partitionCount++] = new PhoenixInputPartition(
              new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation));
        }
      } else {
        partitions[partitionCount++] = new PhoenixInputPartition(
            new PhoenixInputSplit(scans, regionSize, regionLocation));
      }
    }
    return partitions;
  } catch (Exception e) {
    throw new RuntimeException("Unable to plan query", e);
  }
}
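For comparison, the InputPartition contract that PhoenixInputPartition satisfies is small: the interface extends java.io.Serializable and offers an optional preferredLocations() locality hint. The sketch below is a hypothetical implementation in the same style; the class and field names are illustrative, not the actual Phoenix sources.

import org.apache.spark.sql.connector.read.InputPartition;

// Hypothetical InputPartition implementation for illustration only.
public class ExamplePhoenixPartition implements InputPartition {
  private final byte[] serializedSplit;  // serialized scan boundaries for this split
  private final String regionLocation;   // host name of the region server holding the data

  public ExamplePhoenixPartition(byte[] serializedSplit, String regionLocation) {
    this.serializedSplit = serializedSplit;
    this.regionLocation = regionLocation;
  }

  public byte[] serializedSplit() {
    return serializedSplit;
  }

  @Override
  public String[] preferredLocations() {
    // Spark will try to schedule the PartitionReader for this split on this host
    return new String[] { regionLocation };
  }
}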