
Example 1 with SpatialJoinRunner

Use of org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner in project geowave by locationtech.

The class SqlQueryRunner, method run:

public Dataset<Row> run() throws IOException, InterruptedException, ExecutionException, ParseException {
    initContext();
    // Load stores and create views.
    loadStoresAndViews();
    // Create a version of the sql without string literals to check for
    // subquery syntax in sql statement.
    final Pattern stringLit = Pattern.compile("(?:\\'|\\\").*?(?:\\'|\\\")");
    final Matcher m = stringLit.matcher(sql);
    final String cleanedSql = m.replaceAll("");
    LOGGER.debug("cleaned SQL statement: " + cleanedSql);
    // Only attempt to inject an optimized join when the statement does not contain subquery syntax
    if (!cleanedSql.matches("(?i)^(?=(?:.*(?:\\b(?:INSERT INTO|UPDATE|SELECT|WITH|DELETE|CREATE TABLE|ALTER TABLE|DROP TABLE)\\b)){2})")) {
        // Parse Spark's logical plan for the query and determine if a spatial
        // join is present
        LogicalPlan plan = null;
        plan = session.sessionState().sqlParser().parsePlan(sql);
        final JsonParser gsonParser = new JsonParser();
        final JsonElement jElement = gsonParser.parse(plan.prettyJson());
        if (jElement.isJsonArray()) {
            final JsonArray jArray = jElement.getAsJsonArray();
            final int size = jArray.size();
            for (int iObj = 0; iObj < size; iObj++) {
                final JsonElement childElement = jArray.get(iObj);
                if (childElement.isJsonObject()) {
                    final JsonObject jObj = childElement.getAsJsonObject();
                    final String objClass = jObj.get("class").getAsString();
                    if (Objects.equals(objClass, "org.apache.spark.sql.catalyst.plans.logical.Filter")) {
                        // Search through the Filter object to determine if a
                        // geometry predicate function is present in the condition.
                        final JsonElement conditionElements = jObj.get("condition");
                        if (conditionElements.isJsonArray()) {
                            final JsonArray conditionArray = conditionElements.getAsJsonArray();
                            final int condSize = conditionArray.size();
                            for (int iCond = 0; iCond < condSize; iCond++) {
                                final JsonElement childCond = conditionArray.get(iCond);
                                if (childCond.isJsonObject()) {
                                    final JsonObject condObj = childCond.getAsJsonObject();
                                    final String condClass = condObj.get("class").getAsString();
                                    if (Objects.equals(condClass, "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction")) {
                                        final String udfName = condObj.get("name").getAsJsonObject().get("funcName").getAsString();
                                        final UDFNameAndConstructor geomUDF = UDFRegistrySPI.findFunctionByName(udfName);
                                        if (geomUDF != null) {
                                            final ExtractedGeomPredicate relevantPredicate = new ExtractedGeomPredicate();
                                            relevantPredicate.predicate = geomUDF.getPredicateConstructor().get();
                                            relevantPredicate.predicateName = udfName;
                                            extractedPredicates.add(relevantPredicate);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    // Exactly one extracted geometry predicate would indicate a spatial join
    if (extractedPredicates.size() == 1) {
        // This pattern detects the keyword WHERE outside of quoted regions and
        // captures it in group 2
        final Pattern whereDetect = Pattern.compile("(?i)(\"[^\"]*\"|'[^']*')|(\\bWHERE\\b)");
        final Pattern andOrDetect = Pattern.compile("(?i)(\"[^\"]*\"|'[^']*')|(\\bAND|OR\\b)");
        final Pattern orderGroupDetect = Pattern.compile("(?i)(\"[^\"]*\"|'[^']*')|(\\bORDER BY|GROUP BY\\b)");
        final Matcher filterStart = getFirstPositiveMatcher(whereDetect, sql);
        if (filterStart == null) {
            LOGGER.error("There should be a where clause matching the pattern. Running default SQL");
            return runDefaultSQL();
        }
        final int whereStart = filterStart.start(2);
        int whereEnd = sql.length();
        final Matcher filterEnd = getFirstPositiveMatcher(orderGroupDetect, sql.substring(whereStart));
        if (filterEnd != null) {
            whereEnd = filterEnd.start(2);
        }
        final String filterClause = sql.substring(whereStart, whereEnd);
        LOGGER.warn("Extracted Filter Clause: " + filterClause);
        final Matcher compoundFilter = getFirstPositiveMatcher(andOrDetect, filterClause);
        if (compoundFilter != null) {
            LOGGER.warn("Compound conditional detected can result in multiple joins. Too complex to plan in current context. Running default sql");
            return runDefaultSQL();
        }
        final ExtractedGeomPredicate pred = extractedPredicates.get(0);
        // Parse filter string for predicate location
        final int functionPos = filterClause.indexOf(pred.predicateName);
        final int funcArgStart = filterClause.indexOf("(", functionPos);
        final int funcArgEnd = filterClause.indexOf(")", funcArgStart);
        String funcArgs = filterClause.substring(funcArgStart + 1, funcArgEnd);
        funcArgs = funcArgs.replaceAll("\\s", "");
        LOGGER.warn("Function Args: " + funcArgs);
        final String[] args = funcArgs.split(Pattern.quote(","));
        if (args.length == 2) {
            // Determine valid table relations that map to input stores
            final String[] tableRelations = getTableRelations(args);
            pred.leftTableRelation = tableRelations[0];
            pred.rightTableRelation = tableRelations[1];
        }
        if ((pred.leftTableRelation == null) || (pred.rightTableRelation == null)) {
            LOGGER.warn("Cannot translate table identifier to geowave rdd for join.");
            return runDefaultSQL();
        }
        // Extract radius for distance join from condition
        boolean negativePredicate = false;
        if (Objects.equals(pred.predicateName, "GeomDistance")) {
            // Look ahead two tokens for logical operand and scalar|boolean
            final String afterFunc = filterClause.substring(funcArgEnd + 1);
            final String[] tokens = afterFunc.split(" ");
            double radius = 0.0;
            if (tokens.length < 2) {
                LOGGER.warn("Could not extract radius for distance join. Running default SQL");
                return runDefaultSQL();
            } else {
                final String logicalOperand = tokens[0].trim();
                if ((logicalOperand.equals(">")) || (logicalOperand.equals(">="))) {
                    negativePredicate = true;
                }
                final String radiusStr = tokens[1].trim();
                if (!org.apache.commons.lang3.math.NumberUtils.isNumber(radiusStr)) {
                    LOGGER.warn("Could not extract radius for distance join. Running default SQL");
                    return runDefaultSQL();
                } else {
                    final Double r = org.apache.commons.lang3.math.NumberUtils.createDouble(radiusStr);
                    if (r == null) {
                        LOGGER.warn("Could not extract radius for distance join. Running default SQL");
                        return runDefaultSQL();
                    }
                    radius = r.doubleValue();
                }
            }
            ((GeomWithinDistance) pred.predicate).setRadius(radius);
        }
        // At this point we are performing a join
        final SpatialJoinRunner joinRunner = new SpatialJoinRunner(session);
        // Collect input store info for join
        final InputStoreInfo leftStore = inputStores.get(pred.leftTableRelation);
        final InputStoreInfo rightStore = inputStores.get(pred.rightTableRelation);
        joinRunner.setNegativeTest(negativePredicate);
        // Setup store info for runner
        final AdapterToIndexMapping[] leftMappings = leftStore.getOrCreateAdapterIndexMappingStore().getIndicesForAdapter(leftStore.getOrCreateInternalAdapterStore().getAdapterId(leftStore.typeName));
        final AdapterToIndexMapping[] rightMappings = rightStore.getOrCreateAdapterIndexMappingStore().getIndicesForAdapter(rightStore.getOrCreateInternalAdapterStore().getAdapterId(rightStore.typeName));
        NumericIndexStrategy leftStrat = null;
        if (leftMappings.length > 0) {
            leftStrat = leftMappings[0].getIndex(leftStore.getOrCreateIndexStore()).getIndexStrategy();
        }
        NumericIndexStrategy rightStrat = null;
        if (rightMappings.length > 0) {
            rightStrat = rightMappings[0].getIndex(rightStore.getOrCreateIndexStore()).getIndexStrategy();
        }
        joinRunner.setLeftRDD(GeoWaveRDDLoader.loadIndexedRDD(session.sparkContext(), leftStore.rdd, leftStrat));
        joinRunner.setRightRDD(GeoWaveRDDLoader.loadIndexedRDD(session.sparkContext(), rightStore.rdd, rightStrat));
        joinRunner.setPredicate(pred.predicate);
        joinRunner.setLeftStore(leftStore.storeOptions);
        joinRunner.setRightStore(rightStore.storeOptions);
        // Execute the join
        joinRunner.run();
        // Load results into dataframes and replace original views with
        // joined views
        final SimpleFeatureDataFrame leftResultFrame = new SimpleFeatureDataFrame(session);
        final SimpleFeatureDataFrame rightResultFrame = new SimpleFeatureDataFrame(session);
        leftResultFrame.init(leftStore.storeOptions, leftStore.typeName);
        rightResultFrame.init(rightStore.storeOptions, rightStore.typeName);
        final Dataset<Row> leftFrame = leftResultFrame.getDataFrame(joinRunner.getLeftResults());
        final Dataset<Row> rightFrame = rightResultFrame.getDataFrame(joinRunner.getRightResults());
        leftFrame.createOrReplaceTempView(leftStore.viewName);
        rightFrame.createOrReplaceTempView(rightStore.viewName);
    }
    // Run the remaining query through the session SQL runner.
    // This will likely attempt to regenerate the join, but should reuse the
    // pairs generated by the optimized join above.
    final Dataset<Row> results = session.sql(sql);
    return results;
}
Also used : Pattern(java.util.regex.Pattern) GeomWithinDistance(org.locationtech.geowave.analytic.spark.sparksql.udf.GeomWithinDistance) Matcher(java.util.regex.Matcher) JsonObject(com.google.gson.JsonObject) AdapterToIndexMapping(org.locationtech.geowave.core.store.AdapterToIndexMapping) SpatialJoinRunner(org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner) JsonArray(com.google.gson.JsonArray) UDFNameAndConstructor(org.locationtech.geowave.analytic.spark.sparksql.udf.UDFRegistrySPI.UDFNameAndConstructor) JsonElement(com.google.gson.JsonElement) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) Row(org.apache.spark.sql.Row) JsonParser(com.google.gson.JsonParser) NumericIndexStrategy(org.locationtech.geowave.core.index.NumericIndexStrategy)
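
For orientation, a minimal sketch of how this run() method might be driven from caller code. The no-argument constructor, addInputStore(storeOptions, typeName, viewName), and setSql(sql) are assumptions about the surrounding SqlQueryRunner API rather than part of the excerpt, and the DataStorePluginOptions package varies between GeoWave releases; the query itself is the kind of single GeomDistance predicate that the code above turns into an indexed SpatialJoinRunner join.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.locationtech.geowave.analytic.spark.sparksql.SqlQueryRunner;
// Package assumed; DataStorePluginOptions has moved between GeoWave releases.
import org.locationtech.geowave.core.store.cli.store.DataStorePluginOptions;

public class OptimizedJoinQuerySketch {
    // Runs a distance join through SqlQueryRunner: run() above parses the single
    // GeomDistance predicate and replaces the "hail" and "tornado" views with the
    // joined results before handing the SQL back to the Spark session.
    public static Dataset<Row> joinHailWithTornado(
            final DataStorePluginOptions hailStore,
            final DataStorePluginOptions tornadoStore) throws Exception {
        final SqlQueryRunner runner = new SqlQueryRunner();
        // Assumed signature: (store options, type name, view name).
        runner.addInputStore(hailStore, "hail", "hail");
        runner.addInputStore(tornadoStore, "tornado_tracks", "tornado");
        runner.setSql(
            "SELECT hail.* FROM hail, tornado "
                + "WHERE GeomDistance(hail.geom, tornado.geom) <= 0.01");
        return runner.run();
    }
}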

Example 2 with SpatialJoinRunner

Use of org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner in project geowave by locationtech.

The class GeoWaveSparkSpatialJoinIT, method testHailTornadoDistanceJoin:

@Test
public void testHailTornadoDistanceJoin() throws Exception {
    session = SparkTestEnvironment.getInstance().getDefaultSession();
    context = session.sparkContext();
    GeomFunctionRegistry.registerGeometryFunctions(session);
    LOGGER.debug("Testing DataStore Type: " + hailStore.getType());
    long mark = System.currentTimeMillis();
    ingestHailandTornado();
    long dur = (System.currentTimeMillis() - mark);
    final String hail_adapter = "hail";
    final String tornado_adapter = "tornado_tracks";
    final GeomWithinDistance distancePredicate = new GeomWithinDistance(0.01);
    final String sqlHail = "select hail.* from hail, tornado where GeomDistance(hail.geom,tornado.geom) <= 0.01";
    final String sqlTornado = "select tornado.* from hail, tornado where GeomDistance(hail.geom,tornado.geom) <= 0.01";
    final SpatialJoinRunner runner = new SpatialJoinRunner(session);
    runner.setLeftStore(hailStore);
    runner.setLeftAdapterTypeName(hail_adapter);
    runner.setRightStore(tornadoStore);
    runner.setRightAdapterTypeName(tornado_adapter);
    runner.setPredicate(distancePredicate);
    loadRDDs(hail_adapter, tornado_adapter);
    long tornadoIndexedCount = 0;
    long hailIndexedCount = 0;
    LOGGER.warn("------------ Running indexed spatial join. ----------");
    mark = System.currentTimeMillis();
    try {
        runner.run();
    } catch (InterruptedException | ExecutionException e) {
        LOGGER.error("Async error in join");
        e.printStackTrace();
    } catch (final IOException e) {
        LOGGER.error("IO error in join");
        e.printStackTrace();
    }
    hailIndexedCount = runner.getLeftResults().getRawRDD().count();
    tornadoIndexedCount = runner.getRightResults().getRawRDD().count();
    final long indexJoinDur = (System.currentTimeMillis() - mark);
    LOGGER.warn("Indexed Result Count: " + (hailIndexedCount + tornadoIndexedCount));
    final SimpleFeatureDataFrame indexHailFrame = new SimpleFeatureDataFrame(session);
    final SimpleFeatureDataFrame indexTornadoFrame = new SimpleFeatureDataFrame(session);
    indexTornadoFrame.init(tornadoStore, tornado_adapter);
    final Dataset<Row> indexedTornado = indexTornadoFrame.getDataFrame(runner.getRightResults());
    indexHailFrame.init(hailStore, hail_adapter);
    final Dataset<Row> indexedHail = indexHailFrame.getDataFrame(runner.getLeftResults());
    LOGGER.warn("------------ Running Brute force spatial join. ----------");
    dur = runBruteForceJoin(hail_adapter, tornado_adapter, sqlHail, sqlTornado);
    LOGGER.warn("Indexed join duration = " + indexJoinDur + " ms.");
    LOGGER.warn("Brute join duration = " + dur + " ms.");
    // Verify each row matches
    Assert.assertTrue((hailIndexedCount == hailBruteCount));
    Assert.assertTrue((tornadoIndexedCount == tornadoBruteCount));
    Dataset<Row> subtractedFrame = indexedHail.except(hailBruteResults);
    subtractedFrame = subtractedFrame.cache();
    Assert.assertTrue("Subtraction between brute force join and indexed Hail should result in count of 0", (subtractedFrame.count() == 0));
    subtractedFrame.unpersist();
    subtractedFrame = indexedTornado.except(tornadoBruteResults);
    subtractedFrame = subtractedFrame.cache();
    Assert.assertTrue("Subtraction between brute force join and indexed Tornado should result in count of 0", (subtractedFrame.count() == 0));
    TestUtils.deleteAll(hailStore);
    TestUtils.deleteAll(tornadoStore);
}
Also used : GeomWithinDistance(org.locationtech.geowave.analytic.spark.sparksql.udf.GeomWithinDistance) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) ExecutionException(java.util.concurrent.ExecutionException) SpatialJoinRunner(org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner) SimpleFeatureDataFrame(org.locationtech.geowave.analytic.spark.sparksql.SimpleFeatureDataFrame) Test(org.junit.Test)
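
The runBruteForceJoin helper and the hailBruteResults, tornadoBruteResults, hailBruteCount, and tornadoBruteCount fields it populates are not shown in this excerpt. A plausible minimal sketch of that comparison path, assuming the full "hail" and "tornado" views for the unjoined data sets have already been registered with the session:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class BruteForceJoinSketch {
    // Times the non-indexed path: the same GeomDistance SQL is handed directly to
    // Spark with no SpatialJoinRunner involved. Registering the views is treated
    // as a precondition because the excerpt does not show how loadRDDs does it.
    static long runBruteForceJoin(final SparkSession session, final String sqlHail, final String sqlTornado) {
        final long mark = System.currentTimeMillis();
        final Dataset<Row> hailBruteResults = session.sql(sqlHail).cache();
        final Dataset<Row> tornadoBruteResults = session.sql(sqlTornado).cache();
        // Force evaluation of both cached result sets so the timing reflects the join work.
        hailBruteResults.count();
        tornadoBruteResults.count();
        return System.currentTimeMillis() - mark;
    }
}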

Example 3 with SpatialJoinRunner

Use of org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner in project geowave by locationtech.

The class SpatialJoinCommand, method computeResults:

@Override
public Void computeResults(final OperationParams params) throws Exception {
    final String leftStoreName = parameters.get(0);
    final String rightStoreName = parameters.get(1);
    final String outputStoreName = parameters.get(2);
    // Config file
    final File configFile = getGeoWaveConfigFile(params);
    // Attempt to load stores.
    if (leftDataStore == null) {
        leftDataStore = CLIUtils.loadStore(leftStoreName, configFile, params.getConsole());
    }
    if (rightDataStore == null) {
        rightDataStore = CLIUtils.loadStore(rightStoreName, configFile, params.getConsole());
    }
    if (outputDataStore == null) {
        outputDataStore = CLIUtils.loadStore(outputStoreName, configFile, params.getConsole());
    }
    // Save a reference to the output store in the property management.
    final PersistableStore persistedStore = new PersistableStore(outputDataStore);
    final PropertyManagement properties = new PropertyManagement();
    properties.store(StoreParameters.StoreParam.OUTPUT_STORE, persistedStore);
    // Convert properties from the spatial join options
    final PropertyManagementConverter converter = new PropertyManagementConverter(properties);
    converter.readProperties(spatialJoinOptions);
    // TODO: Create GeomPredicate function from name
    final UDFNameAndConstructor udfFunc = UDFRegistrySPI.findFunctionByName(spatialJoinOptions.getPredicate());
    if (udfFunc == null) {
        throw new ParameterException("UDF function matching " + spatialJoinOptions.getPredicate() + " not found.");
    }
    final GeomFunction predicate = udfFunc.getPredicateConstructor().get();
    // Special case for distance function since it takes a scalar radius.
    if (predicate instanceof GeomWithinDistance) {
        ((GeomWithinDistance) predicate).setRadius(spatialJoinOptions.getRadius());
    }
    final SpatialJoinRunner runner = new SpatialJoinRunner();
    runner.setAppName(spatialJoinOptions.getAppName());
    runner.setMaster(spatialJoinOptions.getMaster());
    runner.setHost(spatialJoinOptions.getHost());
    runner.setPartCount(spatialJoinOptions.getPartCount());
    runner.setPredicate(predicate);
    // set DataStore options for runner
    runner.setLeftStore(leftDataStore);
    if (spatialJoinOptions.getLeftAdapterTypeName() != null) {
        runner.setLeftAdapterTypeName(spatialJoinOptions.getLeftAdapterTypeName());
    }
    runner.setRightStore(rightDataStore);
    if (spatialJoinOptions.getRightAdapterTypeName() != null) {
        runner.setRightAdapterTypeName(spatialJoinOptions.getRightAdapterTypeName());
    }
    runner.setOutputStore(outputDataStore);
    if (spatialJoinOptions.getOutputLeftAdapterTypeName() != null) {
        runner.setOutputLeftAdapterTypeName(spatialJoinOptions.getOutputLeftAdapterTypeName());
    }
    if (spatialJoinOptions.getOutputRightAdapterTypeName() != null) {
        runner.setOutputRightAdapterTypeName(spatialJoinOptions.getOutputRightAdapterTypeName());
    }
    runner.setNegativeTest(spatialJoinOptions.isNegativeTest());
    // Finally call run to execute the join
    runner.run();
    runner.close();
    return null;
}
Also used : GeomFunction(org.locationtech.geowave.analytic.spark.sparksql.udf.GeomFunction) GeomWithinDistance(org.locationtech.geowave.analytic.spark.sparksql.udf.GeomWithinDistance) UDFNameAndConstructor(org.locationtech.geowave.analytic.spark.sparksql.udf.UDFRegistrySPI.UDFNameAndConstructor) PropertyManagementConverter(org.locationtech.geowave.analytic.mapreduce.operations.options.PropertyManagementConverter) PersistableStore(org.locationtech.geowave.analytic.store.PersistableStore) PropertyManagement(org.locationtech.geowave.analytic.PropertyManagement) ParameterException(com.beust.jcommander.ParameterException) File(java.io.File) SpatialJoinRunner(org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner)
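
Only the distance predicate needs extra configuration (its radius); any other geometry predicate registered with UDFRegistrySPI can be looked up and applied the same way. A short sketch reusing only calls that appear in computeResults above; the predicate name "GeomIntersects" is assumed to be one of the registered UDF names:

import org.locationtech.geowave.analytic.spark.sparksql.udf.GeomFunction;
import org.locationtech.geowave.analytic.spark.sparksql.udf.UDFRegistrySPI;
import org.locationtech.geowave.analytic.spark.sparksql.udf.UDFRegistrySPI.UDFNameAndConstructor;
import org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner;

public class IntersectionJoinSketch {
    // Configures a runner for an intersection join instead of a distance join.
    // "GeomIntersects" is an assumed registry name; the left, right, and output
    // stores would still be set exactly as in computeResults above.
    static SpatialJoinRunner intersectionJoinRunner() {
        final UDFNameAndConstructor udfFunc = UDFRegistrySPI.findFunctionByName("GeomIntersects");
        if (udfFunc == null) {
            throw new IllegalArgumentException("No registered geometry predicate named GeomIntersects");
        }
        final GeomFunction predicate = udfFunc.getPredicateConstructor().get();
        final SpatialJoinRunner runner = new SpatialJoinRunner();
        runner.setPredicate(predicate);
        return runner;
    }
}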

Aggregations

GeomWithinDistance (org.locationtech.geowave.analytic.spark.sparksql.udf.GeomWithinDistance)3 SpatialJoinRunner (org.locationtech.geowave.analytic.spark.spatial.SpatialJoinRunner)3 Row (org.apache.spark.sql.Row)2 UDFNameAndConstructor (org.locationtech.geowave.analytic.spark.sparksql.udf.UDFRegistrySPI.UDFNameAndConstructor)2 ParameterException (com.beust.jcommander.ParameterException)1 JsonArray (com.google.gson.JsonArray)1 JsonElement (com.google.gson.JsonElement)1 JsonObject (com.google.gson.JsonObject)1 JsonParser (com.google.gson.JsonParser)1 File (java.io.File)1 IOException (java.io.IOException)1 ExecutionException (java.util.concurrent.ExecutionException)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 LogicalPlan (org.apache.spark.sql.catalyst.plans.logical.LogicalPlan)1 Test (org.junit.Test)1 PropertyManagement (org.locationtech.geowave.analytic.PropertyManagement)1 PropertyManagementConverter (org.locationtech.geowave.analytic.mapreduce.operations.options.PropertyManagementConverter)1 SimpleFeatureDataFrame (org.locationtech.geowave.analytic.spark.sparksql.SimpleFeatureDataFrame)1 GeomFunction (org.locationtech.geowave.analytic.spark.sparksql.udf.GeomFunction)1