use of org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner in project geowave by locationtech.
the class SqlQueryRunner method run.
public Dataset<Row> run() throws IOException, InterruptedException, ExecutionException, ParseException {
initContext();
// Load stores and create views.
loadStoresAndViews();
// Create a version of the SQL without string literals to check for
// subquery syntax in the SQL statement.
final Pattern stringLit = Pattern.compile("(?:\\'|\\\").*?(?:\\'|\\\")");
final Matcher m = stringLit.matcher(sql);
final String cleanedSql = m.replaceAll("");
LOGGER.debug("cleaned SQL statement: " + cleanedSql);
// Injecting an optimized join into the process
if (!cleanedSql.matches("(?i)^(?=(?:.*(?:\\b(?:INSERT INTO|UPDATE|SELECT|WITH|DELETE|CREATE TABLE|ALTER TABLE|DROP TABLE)\\b)){2})")) {
// Parse Spark's logical plan for the query and determine if a spatial
// join is present
final LogicalPlan plan = session.sessionState().sqlParser().parsePlan(sql);
final JsonParser gsonParser = new JsonParser();
final JsonElement jElement = gsonParser.parse(plan.prettyJson());
if (jElement.isJsonArray()) {
final JsonArray jArray = jElement.getAsJsonArray();
final int size = jArray.size();
for (int iObj = 0; iObj < size; iObj++) {
final JsonElement childElement = jArray.get(iObj);
if (childElement.isJsonObject()) {
final JsonObject jObj = childElement.getAsJsonObject();
final String objClass = jObj.get("class").getAsString();
if (Objects.equals(objClass, "org.apache.spark.sql.catalyst.plans.logical.Filter")) {
// Search through the Filter object to determine if a
// GeomPredicate function is present in the condition.
final JsonElement conditionElements = jObj.get("condition");
if (conditionElements.isJsonArray()) {
final JsonArray conditionArray = conditionElements.getAsJsonArray();
final int condSize = conditionArray.size();
for (int iCond = 0; iCond < condSize; iCond++) {
final JsonElement childCond = conditionArray.get(iCond);
if (childCond.isJsonObject()) {
final JsonObject condObj = childCond.getAsJsonObject();
final String condClass = condObj.get("class").getAsString();
if (Objects.equals(condClass, "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction")) {
final String udfName = condObj.get("name").getAsJsonObject().get("funcName").getAsString();
final UDFNameAndConstructor geomUDF = UDFRegistrySPI.findFunctionByName(udfName);
if (geomUDF != null) {
final ExtractedGeomPredicate relevantPredicate = new ExtractedGeomPredicate();
relevantPredicate.predicate = geomUDF.getPredicateConstructor().get();
relevantPredicate.predicateName = udfName;
extractedPredicates.add(relevantPredicate);
}
}
}
}
}
}
}
}
}
}
// A single extracted geometry predicate would indicate a spatial join
if (extractedPredicates.size() == 1) {
// These patterns detect the keywords WHERE, AND/OR, and ORDER BY/GROUP BY
// outside of quoted areas and capture them in group 2
final Pattern whereDetect = Pattern.compile("(?i)(\"[^\"]*\"|'[^']*')|(\\bWHERE\\b)");
final Pattern andOrDetect = Pattern.compile("(?i)(\"[^\"]*\"|'[^']*')|(\\b(?:AND|OR)\\b)");
final Pattern orderGroupDetect = Pattern.compile("(?i)(\"[^\"]*\"|'[^']*')|(\\b(?:ORDER BY|GROUP BY)\\b)");
final Matcher filterStart = getFirstPositiveMatcher(whereDetect, sql);
if (filterStart == null) {
LOGGER.error("There should be a where clause matching the pattern. Running default SQL");
return runDefaultSQL();
}
final int whereStart = filterStart.start(2);
int whereEnd = sql.length();
final Matcher filterEnd = getFirstPositiveMatcher(orderGroupDetect, sql.substring(whereStart));
if (filterEnd != null) {
whereEnd = filterEnd.start(2);
}
final String filterClause = sql.substring(whereStart, whereEnd);
LOGGER.warn("Extracted Filter Clause: " + filterClause);
final Matcher compoundFilter = getFirstPositiveMatcher(andOrDetect, filterClause);
if (compoundFilter != null) {
LOGGER.warn("Compound conditional detected can result in multiple joins. Too complex to plan in current context. Running default sql");
return runDefaultSQL();
}
final ExtractedGeomPredicate pred = extractedPredicates.get(0);
// Parse filter string for predicate location
final int functionPos = filterClause.indexOf(pred.predicateName);
final int funcArgStart = filterClause.indexOf("(", functionPos);
final int funcArgEnd = filterClause.indexOf(")", funcArgStart);
String funcArgs = filterClause.substring(funcArgStart + 1, funcArgEnd);
funcArgs = funcArgs.replaceAll("\\s", "");
LOGGER.warn("Function Args: " + funcArgs);
final String[] args = funcArgs.split(Pattern.quote(","));
if (args.length == 2) {
// Determine valid table relations that map to input stores
final String[] tableRelations = getTableRelations(args);
pred.leftTableRelation = tableRelations[0];
pred.rightTableRelation = tableRelations[1];
}
if ((pred.leftTableRelation == null) || (pred.rightTableRelation == null)) {
LOGGER.warn("Cannot translate table identifier to geowave rdd for join.");
return runDefaultSQL();
}
// Extract radius for distance join from condition
boolean negativePredicate = false;
if (Objects.equals(pred.predicateName, "GeomDistance")) {
// Look ahead two tokens for logical operand and scalar|boolean
final String afterFunc = filterClause.substring(funcArgEnd + 1);
final String[] tokens = afterFunc.split(" ");
double radius = 0.0;
if (tokens.length < 2) {
LOGGER.warn("Could not extract radius for distance join. Running default SQL");
return runDefaultSQL();
} else {
final String logicalOperand = tokens[0].trim();
if ((logicalOperand.equals(">")) || (logicalOperand.equals(">="))) {
negativePredicate = true;
}
final String radiusStr = tokens[1].trim();
if (!org.apache.commons.lang3.math.NumberUtils.isNumber(radiusStr)) {
LOGGER.warn("Could not extract radius for distance join. Running default SQL");
return runDefaultSQL();
} else {
final Double r = org.apache.commons.lang3.math.NumberUtils.createDouble(radiusStr);
if (r == null) {
LOGGER.warn("Could not extract radius for distance join. Running default SQL");
return runDefaultSQL();
}
radius = r.doubleValue();
}
}
((GeomWithinDistance) pred.predicate).setRadius(radius);
}
// At this point we are performing a join
final SpatialJoinRunner joinRunner = new SpatialJoinRunner(session);
// Collect input store info for join
final InputStoreInfo leftStore = inputStores.get(pred.leftTableRelation);
final InputStoreInfo rightStore = inputStores.get(pred.rightTableRelation);
joinRunner.setNegativeTest(negativePredicate);
// Set up store info for the runner
final AdapterToIndexMapping[] leftMappings = leftStore.getOrCreateAdapterIndexMappingStore().getIndicesForAdapter(leftStore.getOrCreateInternalAdapterStore().getAdapterId(leftStore.typeName));
final AdapterToIndexMapping[] rightMappings = rightStore.getOrCreateAdapterIndexMappingStore().getIndicesForAdapter(rightStore.getOrCreateInternalAdapterStore().getAdapterId(rightStore.typeName));
NumericIndexStrategy leftStrat = null;
if (leftMappings.length > 0) {
leftStrat = leftMappings[0].getIndex(leftStore.getOrCreateIndexStore()).getIndexStrategy();
}
NumericIndexStrategy rightStrat = null;
if (rightMappings.length > 0) {
rightStrat = rightMappings[0].getIndex(rightStore.getOrCreateIndexStore()).getIndexStrategy();
}
joinRunner.setLeftRDD(GeoWaveRDDLoader.loadIndexedRDD(session.sparkContext(), leftStore.rdd, leftStrat));
joinRunner.setRightRDD(GeoWaveRDDLoader.loadIndexedRDD(session.sparkContext(), rightStore.rdd, rightStrat));
joinRunner.setPredicate(pred.predicate);
joinRunner.setLeftStore(leftStore.storeOptions);
joinRunner.setRightStore(rightStore.storeOptions);
// Execute the join
joinRunner.run();
// Load results into dataframes and replace original views with
// joined views
final SimpleFeatureDataFrame leftResultFrame = new SimpleFeatureDataFrame(session);
final SimpleFeatureDataFrame rightResultFrame = new SimpleFeatureDataFrame(session);
leftResultFrame.init(leftStore.storeOptions, leftStore.typeName);
rightResultFrame.init(rightStore.storeOptions, rightStore.typeName);
final Dataset<Row> leftFrame = leftResultFrame.getDataFrame(joinRunner.getLeftResults());
final Dataset<Row> rightFrame = rightResultFrame.getDataFrame(joinRunner.getRightResults());
leftFrame.createOrReplaceTempView(leftStore.viewName);
rightFrame.createOrReplaceTempView(rightStore.viewName);
}
// Run the remaining query through the session's SQL runner.
// This will likely attempt to regenerate the join, but should reuse the
// pairs generated from the optimized join beforehand.
final Dataset<Row> results = session.sql(sql);
return results;
}
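The method above boils down to a small pattern: when exactly one geometry predicate with two resolvable table relations is found, it builds a SpatialJoinRunner from that predicate, feeds it the indexed RDDs for both relations, and republishes the joined results under the original view names before handing the SQL back to Spark. The condensed sketch below illustrates that pattern; session, leftStore, rightStore, leftStrat, and rightStrat stand in for the SparkSession, the two resolved InputStoreInfo entries, and their index strategies, and the radius value is illustrative rather than taken from a real query.
// Condensed sketch of the optimized-join path (assumed inputs as described above).
final SpatialJoinRunner joinRunner = new SpatialJoinRunner(session);
joinRunner.setPredicate(new GeomWithinDistance(0.01)); // e.g. GeomDistance(a.geom, b.geom) <= 0.01
joinRunner.setNegativeTest(false); // true when the operand was ">" or ">="
joinRunner.setLeftRDD(GeoWaveRDDLoader.loadIndexedRDD(session.sparkContext(), leftStore.rdd, leftStrat));
joinRunner.setRightRDD(GeoWaveRDDLoader.loadIndexedRDD(session.sparkContext(), rightStore.rdd, rightStrat));
joinRunner.setLeftStore(leftStore.storeOptions);
joinRunner.setRightStore(rightStore.storeOptions);
joinRunner.run();
// Replace the original temp views with the joined results so the unchanged SQL
// now runs against the pre-joined (and typically much smaller) relations.
final SimpleFeatureDataFrame leftResultFrame = new SimpleFeatureDataFrame(session);
leftResultFrame.init(leftStore.storeOptions, leftStore.typeName);
leftResultFrame.getDataFrame(joinRunner.getLeftResults()).createOrReplaceTempView(leftStore.viewName);
final SimpleFeatureDataFrame rightResultFrame = new SimpleFeatureDataFrame(session);
rightResultFrame.init(rightStore.storeOptions, rightStore.typeName);
rightResultFrame.getDataFrame(joinRunner.getRightResults()).createOrReplaceTempView(rightStore.viewName);
final Dataset<Row> results = session.sql(sql);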
use of org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner in project geowave by locationtech.
the class GeoWaveSparkSpatialJoinIT method testHailTornadoDistanceJoin.
@Test
public void testHailTornadoDistanceJoin() throws Exception {
session = SparkTestEnvironment.getInstance().getDefaultSession();
context = session.sparkContext();
GeomFunctionRegistry.registerGeometryFunctions(session);
LOGGER.debug("Testing DataStore Type: " + hailStore.getType());
long mark = System.currentTimeMillis();
ingestHailandTornado();
long dur = (System.currentTimeMillis() - mark);
final String hail_adapter = "hail";
final String tornado_adapter = "tornado_tracks";
final GeomWithinDistance distancePredicate = new GeomWithinDistance(0.01);
final String sqlHail = "select hail.* from hail, tornado where GeomDistance(hail.geom,tornado.geom) <= 0.01";
final String sqlTornado = "select tornado.* from hail, tornado where GeomDistance(hail.geom,tornado.geom) <= 0.01";
final SpatialJoinRunner runner = new SpatialJoinRunner(session);
runner.setLeftStore(hailStore);
runner.setLeftAdapterTypeName(hail_adapter);
runner.setRightStore(tornadoStore);
runner.setRightAdapterTypeName(tornado_adapter);
runner.setPredicate(distancePredicate);
loadRDDs(hail_adapter, tornado_adapter);
long tornadoIndexedCount = 0;
long hailIndexedCount = 0;
LOGGER.warn("------------ Running indexed spatial join. ----------");
mark = System.currentTimeMillis();
try {
runner.run();
} catch (InterruptedException | ExecutionException e) {
LOGGER.error("Async error in join");
e.printStackTrace();
} catch (final IOException e) {
LOGGER.error("IO error in join");
e.printStackTrace();
}
hailIndexedCount = runner.getLeftResults().getRawRDD().count();
tornadoIndexedCount = runner.getRightResults().getRawRDD().count();
final long indexJoinDur = (System.currentTimeMillis() - mark);
LOGGER.warn("Indexed Result Count: " + (hailIndexedCount + tornadoIndexedCount));
final SimpleFeatureDataFrame indexHailFrame = new SimpleFeatureDataFrame(session);
final SimpleFeatureDataFrame indexTornadoFrame = new SimpleFeatureDataFrame(session);
indexTornadoFrame.init(tornadoStore, tornado_adapter);
final Dataset<Row> indexedTornado = indexTornadoFrame.getDataFrame(runner.getRightResults());
indexHailFrame.init(hailStore, hail_adapter);
final Dataset<Row> indexedHail = indexHailFrame.getDataFrame(runner.getLeftResults());
LOGGER.warn("------------ Running Brute force spatial join. ----------");
dur = runBruteForceJoin(hail_adapter, tornado_adapter, sqlHail, sqlTornado);
LOGGER.warn("Indexed join duration = " + indexJoinDur + " ms.");
LOGGER.warn("Brute join duration = " + dur + " ms.");
// Verify each row matches
Assert.assertTrue((hailIndexedCount == hailBruteCount));
Assert.assertTrue((tornadoIndexedCount == tornadoBruteCount));
Dataset<Row> subtractedFrame = indexedHail.except(hailBruteResults);
subtractedFrame = subtractedFrame.cache();
Assert.assertTrue("Subtraction between brute force join and indexed Hail should result in count of 0", (subtractedFrame.count() == 0));
subtractedFrame.unpersist();
subtractedFrame = indexedTornado.except(tornadoBruteResults);
subtractedFrame = subtractedFrame.cache();
Assert.assertTrue("Subtraction between brute force join and indexed Tornado should result in count of 0", (subtractedFrame.count() == 0));
TestUtils.deleteAll(hailStore);
TestUtils.deleteAll(tornadoStore);
}
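For comparison, the runBruteForceJoin helper used above is not reproduced on this page; conceptually, the brute-force side registers the GeoWave geometry UDFs, exposes both stores as temp views, and lets Spark evaluate GeomDistance row by row. The sketch below is an assumption based on the fields referenced in the test (hailBruteResults, tornadoBruteResults, hailBruteCount, tornadoBruteCount); hailRDD and tornadoRDD stand in for the RDDs loaded by loadRDDs.
// Assumed sketch of the brute-force comparison path.
GeomFunctionRegistry.registerGeometryFunctions(session);
final SimpleFeatureDataFrame hailFrame = new SimpleFeatureDataFrame(session);
hailFrame.init(hailStore, hail_adapter);
hailFrame.getDataFrame(hailRDD).createOrReplaceTempView("hail");
final SimpleFeatureDataFrame tornadoFrame = new SimpleFeatureDataFrame(session);
tornadoFrame.init(tornadoStore, tornado_adapter);
tornadoFrame.getDataFrame(tornadoRDD).createOrReplaceTempView("tornado");
hailBruteResults = session.sql(sqlHail); // full cross product filtered by GeomDistance(...) <= 0.01
hailBruteCount = hailBruteResults.count();
tornadoBruteResults = session.sql(sqlTornado);
tornadoBruteCount = tornadoBruteResults.count();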
use of org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner in project geowave by locationtech.
the class SpatialJoinCommand method computeResults.
@Override
public Void computeResults(final OperationParams params) throws Exception {
final String leftStoreName = parameters.get(0);
final String rightStoreName = parameters.get(1);
final String outputStoreName = parameters.get(2);
// Config file
final File configFile = getGeoWaveConfigFile(params);
// Attempt to load stores.
if (leftDataStore == null) {
leftDataStore = CLIUtils.loadStore(leftStoreName, configFile, params.getConsole());
}
if (rightDataStore == null) {
rightDataStore = CLIUtils.loadStore(rightStoreName, configFile, params.getConsole());
}
if (outputDataStore == null) {
outputDataStore = CLIUtils.loadStore(outputStoreName, configFile, params.getConsole());
}
// Save a reference to the output store in the property management.
final PersistableStore persistedStore = new PersistableStore(outputDataStore);
final PropertyManagement properties = new PropertyManagement();
properties.store(StoreParameters.StoreParam.OUTPUT_STORE, persistedStore);
// Convert properties from the spatial join options
final PropertyManagementConverter converter = new PropertyManagementConverter(properties);
converter.readProperties(spatialJoinOptions);
// TODO: Create GeomPredicate function from name
final UDFNameAndConstructor udfFunc = UDFRegistrySPI.findFunctionByName(spatialJoinOptions.getPredicate());
if (udfFunc == null) {
throw new ParameterException("UDF function matching " + spatialJoinOptions.getPredicate() + " not found.");
}
final GeomFunction predicate = udfFunc.getPredicateConstructor().get();
// Special case for distance function since it takes a scalar radius.
if (predicate instanceof GeomWithinDistance) {
((GeomWithinDistance) predicate).setRadius(spatialJoinOptions.getRadius());
}
final SpatialJoinRunner runner = new SpatialJoinRunner();
runner.setAppName(spatialJoinOptions.getAppName());
runner.setMaster(spatialJoinOptions.getMaster());
runner.setHost(spatialJoinOptions.getHost());
runner.setPartCount(spatialJoinOptions.getPartCount());
runner.setPredicate(predicate);
// set DataStore options for runner
runner.setLeftStore(leftDataStore);
if (spatialJoinOptions.getLeftAdapterTypeName() != null) {
runner.setLeftAdapterTypeName(spatialJoinOptions.getLeftAdapterTypeName());
}
runner.setRightStore(rightDataStore);
if (spatialJoinOptions.getRightAdapterTypeName() != null) {
runner.setRightAdapterTypeName(spatialJoinOptions.getRightAdapterTypeName());
}
runner.setOutputStore(outputDataStore);
if (spatialJoinOptions.getOutputLeftAdapterTypeName() != null) {
runner.setOutputLeftAdapterTypeName(spatialJoinOptions.getOutputLeftAdapterTypeName());
}
if (spatialJoinOptions.getOutputRightAdapterTypeName() != null) {
runner.setOutputRightAdapterTypeName(spatialJoinOptions.getOutputRightAdapterTypeName());
}
runner.setNegativeTest(spatialJoinOptions.isNegativeTest());
// Finally call run to execute the join
runner.run();
runner.close();
return null;
}
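The command above simply translates CLI options into runner configuration. A minimal programmatic equivalent might look like the following sketch; leftStore, rightStore, and outputStore are assumed to be already-loaded store options (as produced by CLIUtils.loadStore above), and the type names, output type names, radius, and Spark settings are illustrative.
// Assumed minimal invocation mirroring SpatialJoinCommand: a distance join between
// two configured stores, writing both sides of the join to an output store.
final GeomWithinDistance predicate = new GeomWithinDistance(0.01); // radius value is illustrative
final SpatialJoinRunner runner = new SpatialJoinRunner();
runner.setAppName("example-spatial-join"); // illustrative Spark settings
runner.setMaster("local[*]");
runner.setPredicate(predicate);
runner.setLeftStore(leftStore);
runner.setLeftAdapterTypeName("hail");
runner.setRightStore(rightStore);
runner.setRightAdapterTypeName("tornado_tracks");
runner.setOutputStore(outputStore);
runner.setOutputLeftAdapterTypeName("hail_join"); // hypothetical output type names
runner.setOutputRightAdapterTypeName("tornado_join");
runner.run();
runner.close();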