Usage of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition in the hopsworks project by logicalclocks.
Class PitJoinController, method generateSQL.
/**
 * Generates the point-in-time join statement for {@code query} as a single {@code WITH} expression.
 *
 * <p>Every sub-query produced by {@code generateSubQueries} is wrapped and registered in the {@code WITH} list
 * under the alias {@code FG_SUBQUERY + i}. The body of the statement selects the collected features from those
 * aliases and joins the sub-queries with equality conditions built from the primary key features of the base
 * query plus whatever {@code addEventTimeOn} adds to them.
 *
 * @param query the query to translate; its joins are read by index, one per generated sub-query
 * @param isTrainingDataset if {@code true}, the features already collected from the base query are re-aliased
 *     per join (matched on their feature group) and the final select list is sorted by
 *     {@code Feature::getIdx}; if {@code false}, the features of every right-hand query are collected and
 *     appended to the select list in join order
 * @return a {@link SqlWith} node holding the aliased sub-queries and the joining select as its body
 */
public SqlNode generateSQL(Query query, boolean isTrainingDataset) {
  // make a copy of base query to replace joins
  Query baseQuery = new Query(query.getFeatureStore(), query.getProject(), query.getFeaturegroup(), query.getAs(),
      new ArrayList<>(query.getFeatures()), query.getAvailableFeatures(), query.getHiveEngine(), query.getFilter());

  // collect left outer most features
  List<Feature> finalSelectList = constructorController.collectFeatures(baseQuery);

  // generate subqueries for WITH
  List<SqlSelect> withSelects = wrapSubQueries(generateSubQueries(baseQuery, query, isTrainingDataset));

  // by default every feature is read from the first sub-query
  finalSelectList.forEach(f -> f.setPitFgAlias(FG_SUBQUERY + "0"));

  // list for "x0 as ..."
  SqlNodeList selectAsses = new SqlNodeList(SqlParserPos.ZERO);
  // joins for the body of the WITH statement, bringing together the final result
  List<Join> newJoins = new ArrayList<>();

  // each sqlSelect represents one subquery corresponding to one join in the final WITH body
  for (int i = 0; i < withSelects.size(); i++) {
    String pitAlias = FG_SUBQUERY + i;

    // AS is mandatory when using "WITH xyz AS ()", therefore it is appended to the identifier as a plain string
    selectAsses.add(SqlStdOperatorTable.AS.createCall(
        SqlNodeList.of(new SqlIdentifier(FG_SUBQUERY + i + HIVE_AS, SqlParserPos.ZERO), withSelects.get(i))));

    // each select corresponds to one join: resolve the right-hand side once per iteration
    Query rightQuery = query.getJoins().get(i).getRightQuery();
    if (isTrainingDataset) {
      // for training datasets all features are contained in the final select list from the beginning, set the
      // correct alias only for the features corresponding to the feature group in the current join.
      // Feature groups are compared with equals() instead of reference identity so that equal entities that
      // were loaded as distinct instances are still matched.
      finalSelectList.stream()
          .filter(f -> java.util.Objects.equals(f.getFeatureGroup(), rightQuery.getFeaturegroup()))
          .forEach(f -> f.setPitFgAlias(pitAlias));
    } else {
      List<Feature> features = constructorController.collectFeatures(rightQuery);
      features.forEach(f -> f.setPitFgAlias(pitAlias));
      finalSelectList.addAll(features);
    }

    // build the join condition between the first sub-query and the current one
    List<Feature> primaryKey = baseQuery.getAvailableFeatures().stream()
        .filter(Feature::isPrimary)
        .collect(Collectors.toList());
    List<Feature> newLeftOn = addEventTimeOn(primaryKey, baseQuery.getFeaturegroup(), baseQuery.getAs());
    renameJoinFeatures(newLeftOn);
    // equivalent copy, but needed to be able to set different alias
    List<Feature> newRightOn = addEventTimeOn(primaryKey, baseQuery.getFeaturegroup(), baseQuery.getAs());
    renameJoinFeatures(newRightOn);

    // one EQUALS operator per join feature
    List<SqlCondition> newJoinOperator = newLeftOn.stream()
        .map(f -> SqlCondition.EQUALS)
        .collect(Collectors.toList());
    newLeftOn.forEach(f -> f.setPitFgAlias(FG_SUBQUERY + "0"));
    newRightOn.forEach(f -> f.setPitFgAlias(pitAlias));
    newJoins.add(new Join(null, null, newLeftOn, newRightOn, JoinType.INNER, null, newJoinOperator));
  }

  // sort features in last select
  if (isTrainingDataset) {
    finalSelectList = finalSelectList.stream()
        .sorted(Comparator.comparing(Feature::getIdx))
        .collect(Collectors.toList());
  }

  // build "`alias`.`[prefix]name`" identifiers for the final select
  SqlNodeList selectList = new SqlNodeList(SqlParserPos.ZERO);
  for (Feature f : finalSelectList) {
    String featurePrefixed;
    if (!Strings.isNullOrEmpty(f.getPrefix())) {
      featurePrefixed = f.getPrefix() + f.getName();
    } else {
      featurePrefixed = f.getName();
    }
    selectList.add(new SqlIdentifier(
        Arrays.asList("`" + f.getFgAlias(true) + "`", "`" + featurePrefixed + "`"), SqlParserPos.ZERO));
  }

  SqlSelect body = new SqlSelect(SqlParserPos.ZERO, null, selectList,
      buildWithJoin(newJoins, newJoins.size() - 1), null, null, null, null, null, null, null, null);
  return new SqlWith(SqlParserPos.ZERO, selectAsses, body);
}
Usage of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition in the hopsworks project by logicalclocks.
Class TestPitJoinController, method testWrapSubQueries.
/**
 * Checks that a single join is turned into one wrapped sub-query which selects everything from the ranked
 * inner query and keeps only the rows with {@code pit_rank_hopsworks = 1}.
 */
@Test
public void testWrapSubQueries() {
  // features of the left (label) feature group, pk1 is flagged as primary key
  List<Feature> fg0Features = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg0", fgLeft, true),
      new Feature("pk2", "fg0", fgLeft),
      new Feature("ts", "fg0", fgLeft),
      new Feature("label", "fg0", fgLeft)));
  // features of the joined feature group
  List<Feature> fg1Features = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg1", fgRight),
      new Feature("pk2", "fg1", fgRight),
      new Feature("ts", "fg1", fgRight),
      new Feature("ft1", "fg1", fgRight)));

  // join fg0 with fg1 on both key columns
  List<Feature> onLeft = Arrays.asList(new Feature("pk1", "fg0", fgLeft), new Feature("pk2", "fg0", fgLeft));
  List<Feature> onRight = Arrays.asList(new Feature("pk1", "fg1", fgRight), new Feature("pk2", "fg1", fgRight));
  List<SqlCondition> operators = Arrays.asList(SqlCondition.EQUALS, SqlCondition.EQUALS);

  Query leftQuery = new Query("fs", "project", fgLeft, "fg0", fg0Features, fg0Features, false, null);
  Query rightQuery = new Query("fs", "project", fgRight, "fg1", fg1Features, fg1Features, false, null);
  leftQuery.setJoins(Collections.singletonList(
      new Join(leftQuery, rightQuery, onLeft, onRight, JoinType.INNER, null, operators)));

  // copy of the left query without joins
  Query baseQuery = new Query(leftQuery.getFeatureStore(), leftQuery.getProject(), leftQuery.getFeaturegroup(),
      leftQuery.getAs(), leftQuery.getFeatures(), leftQuery.getAvailableFeatures(), leftQuery.getHiveEngine(),
      leftQuery.getFilter());

  List<SqlCall> subQueries = pitJoinController.generateSubQueries(baseQuery, leftQuery, false);
  List<SqlSelect> wrapped = pitJoinController.wrapSubQueries(subQueries);

  String expectedSql = "SELECT *\n"
      + "FROM (SELECT `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts`, `fg0`.`label`, `fg0`.`pk1` `join_pk_pk1`, `fg0`.`ts` `join_evt_ts`, `fg1`.`pk1`, `fg1`.`pk2`, `fg1`.`ts`, `fg1`.`ft1`, "
      + "RANK() OVER (PARTITION BY `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts` ORDER BY `fg1`.`ts` DESC) pit_rank_hopsworks\n"
      + "FROM `fs`.`fg0_1` `fg0`\n"
      + "INNER JOIN `fs`.`fg1_1` `fg1` ON `fg0`.`pk1` = `fg1`.`pk1` AND `fg0`.`pk2` = `fg1`.`pk2` AND `fg0`.`ts` >= `fg1`.`ts`) NA\n"
      + "WHERE `pit_rank_hopsworks` = 1";

  Assert.assertEquals(1, wrapped.size());
  String actualSql = wrapped.get(0).toSqlString(new SparkSqlDialect(SqlDialect.EMPTY_CONTEXT)).getSql();
  Assert.assertEquals(expectedSql, actualSql);
}
Usage of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition in the hopsworks project by logicalclocks.
Class TestPitJoinController, method testGenerateSql.
/**
 * Checks the complete {@code WITH} statement generated for a query with two joins (on different key sets)
 * when the query is not a training dataset query.
 */
@Test
public void testGenerateSql() {
  // left feature group, pk1 is flagged as primary key
  List<Feature> fg0Features = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg0", fgLeft, true),
      new Feature("pk2", "fg0", fgLeft),
      new Feature("ts", "fg0", fgLeft),
      new Feature("label", "fg0", fgLeft)));
  // first joined feature group
  List<Feature> fg1Features = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg1", fgRight),
      new Feature("pk2", "fg1", fgRight),
      new Feature("ts", "fg1", fgRight),
      new Feature("ft1", "fg1", fgRight)));
  // second joined feature group
  List<Feature> fg2Features = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg2", fgRight1),
      new Feature("ts", "fg2", fgRight1),
      new Feature("ft1", "fg2", fgRight1)));

  // first join uses pk1 and pk2
  List<Feature> firstLeftOn = Arrays.asList(new Feature("pk1", "fg0", fgLeft), new Feature("pk2", "fg0", fgLeft));
  List<Feature> firstRightOn =
      Arrays.asList(new Feature("pk1", "fg1", fgRight), new Feature("pk2", "fg1", fgRight));
  // join on different pks
  List<Feature> secondLeftOn = Collections.singletonList(new Feature("pk1", "fg0", fgLeft));
  List<Feature> secondRightOn = Collections.singletonList(new Feature("pk1", "fg2", fgRight1));
  List<SqlCondition> firstOperators = Arrays.asList(SqlCondition.EQUALS, SqlCondition.EQUALS);
  List<SqlCondition> secondOperators = Collections.singletonList(SqlCondition.EQUALS);

  Query leftQuery = new Query("fs", "project", fgLeft, "fg0", fg0Features, fg0Features, false, null);
  Query firstRight = new Query("fs", "project", fgRight, "fg1", fg1Features, fg1Features, false, null);
  Query secondRight = new Query("fs", "project", fgRight1, "fg2", fg2Features, fg2Features, false, null);
  Join firstJoin = new Join(leftQuery, firstRight, firstLeftOn, firstRightOn, JoinType.INNER, null, firstOperators);
  Join secondJoin =
      new Join(leftQuery, secondRight, secondLeftOn, secondRightOn, JoinType.INNER, null, secondOperators);
  leftQuery.setJoins(Arrays.asList(firstJoin, secondJoin));

  SqlNode generated = pitJoinController.generateSQL(leftQuery, false);
  String actualSql = generated.toSqlString(new SparkSqlDialect(SqlDialect.EMPTY_CONTEXT)).getSql();

  String expectedSql = "WITH right_fg0 AS (SELECT *\n"
      + "FROM (SELECT `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts`, `fg0`.`label`, `fg0`.`pk1` `join_pk_pk1`, `fg0`.`ts` `join_evt_ts`, `fg1`.`pk1`, `fg1`.`pk2`, `fg1`.`ts`, `fg1`.`ft1`, "
      + "RANK() OVER (PARTITION BY `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts` ORDER BY `fg1`.`ts` DESC) pit_rank_hopsworks\n"
      + "FROM `fs`.`fg0_1` `fg0`\n"
      + "INNER JOIN `fs`.`fg1_1` `fg1` ON `fg0`.`pk1` = `fg1`.`pk1` AND `fg0`.`pk2` = `fg1`.`pk2` AND `fg0`.`ts` >= `fg1`.`ts`) NA\n"
      + "WHERE `pit_rank_hopsworks` = 1), "
      + "right_fg1 AS (SELECT *\n"
      + "FROM (SELECT `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts`, `fg0`.`label`, `fg0`.`pk1` `join_pk_pk1`, `fg0`.`ts` `join_evt_ts`, `fg2`.`pk1`, `fg2`.`ts`, `fg2`.`ft1`, "
      + "RANK() OVER (PARTITION BY `fg0`.`pk1`, `fg0`.`ts` ORDER BY `fg2`.`ts` DESC) pit_rank_hopsworks\n"
      + "FROM `fs`.`fg0_1` `fg0`\n"
      + "INNER JOIN `fs`.`fg2_1` `fg2` ON `fg0`.`pk1` = `fg2`.`pk1` AND `fg0`.`ts` >= `fg2`.`ts`) NA\n"
      + "WHERE `pit_rank_hopsworks` = 1) ("
      + "SELECT `right_fg0`.`pk1`, `right_fg0`.`pk2`, `right_fg0`.`ts`, `right_fg0`.`label`, `right_fg0`.`pk1`, `right_fg0`.`pk2`, `right_fg0`.`ts`, `right_fg0`.`ft1`, `right_fg1`.`pk1`, `right_fg1`.`ts`, `right_fg1`.`ft1`\n"
      + "FROM right_fg0\n"
      + "INNER JOIN right_fg1 ON `right_fg0`.`join_pk_pk1` = `right_fg1`.`join_pk_pk1` AND `right_fg0`.`join_evt_ts` = `right_fg1`.`join_evt_ts`)";

  Assert.assertEquals(expectedSql, actualSql);
}
Usage of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition in the hopsworks project by logicalclocks.
Class TestPitJoinController, method testGenerateSqlTrainingDatasetWrongFeatureOrder.
/**
 * Checks that for a training dataset query the final select list is ordered by the feature index, even
 * though the features of the joined feature groups were added to the query in the wrong order.
 */
@Test
public void testGenerateSqlTrainingDatasetWrongFeatureOrder() {
  // all training dataset features live on the left query; the last constructor argument is the feature index
  List<Feature> tdFeatures = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg0", fgLeft, true, 1),
      new Feature("pk2", "fg0", fgLeft, false, 2),
      new Feature("ts", "fg0", fgLeft, false, 3),
      new Feature("label", "fg0", fgLeft, false, 4),
      // note wrong order
      new Feature("ft1", "fg1", fgRight, false, 6),
      new Feature("ft2", "fg2", fgRight1, false, 5)));

  // first join uses pk1 and pk2
  List<Feature> firstLeftOn = Arrays.asList(new Feature("pk1", "fg0", fgLeft), new Feature("pk2", "fg0", fgLeft));
  List<Feature> firstRightOn =
      Arrays.asList(new Feature("pk1", "fg1", fgRight), new Feature("pk2", "fg1", fgRight));
  // join on different pks
  List<Feature> secondLeftOn = Collections.singletonList(new Feature("pk1", "fg0", fgLeft));
  List<Feature> secondRightOn = Collections.singletonList(new Feature("pk1", "fg2", fgRight1));
  List<SqlCondition> firstOperators = Arrays.asList(SqlCondition.EQUALS, SqlCondition.EQUALS);
  List<SqlCondition> secondOperators = Collections.singletonList(SqlCondition.EQUALS);

  Query leftQuery = new Query("fs", "project", fgLeft, "fg0", tdFeatures, tdFeatures, false, null);
  // the right-hand queries carry no features of their own
  Query firstRight =
      new Query("fs", "project", fgRight, "fg1", new ArrayList<>(), new ArrayList<>(), false, null);
  Query secondRight =
      new Query("fs", "project", fgRight1, "fg2", new ArrayList<>(), new ArrayList<>(), false, null);
  Join firstJoin = new Join(leftQuery, firstRight, firstLeftOn, firstRightOn, JoinType.INNER, null, firstOperators);
  Join secondJoin =
      new Join(leftQuery, secondRight, secondLeftOn, secondRightOn, JoinType.INNER, null, secondOperators);
  leftQuery.setJoins(Arrays.asList(firstJoin, secondJoin));

  SqlNode generated = pitJoinController.generateSQL(leftQuery, true);
  String actualSql = generated.toSqlString(new SparkSqlDialect(SqlDialect.EMPTY_CONTEXT)).getSql();

  String expectedSql = "WITH right_fg0 AS (SELECT *\n"
      + "FROM (SELECT `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts`, `fg0`.`label`, `fg1`.`ft1`, `fg0`.`pk1` `join_pk_pk1`, `fg0`.`ts` `join_evt_ts`, "
      + "RANK() OVER (PARTITION BY `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts` ORDER BY `fg1`.`ts` DESC) pit_rank_hopsworks\n"
      + "FROM `fs`.`fg0_1` `fg0`\n"
      + "INNER JOIN `fs`.`fg1_1` `fg1` ON `fg0`.`pk1` = `fg1`.`pk1` AND `fg0`.`pk2` = `fg1`.`pk2` AND `fg0`.`ts` >= `fg1`.`ts`) NA\n"
      + "WHERE `pit_rank_hopsworks` = 1), "
      + "right_fg1 AS (SELECT *\n"
      + "FROM (SELECT `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts`, `fg0`.`label`, `fg2`.`ft2`, `fg0`.`pk1` `join_pk_pk1`, `fg0`.`ts` `join_evt_ts`, "
      + "RANK() OVER (PARTITION BY `fg0`.`pk1`, `fg0`.`ts` ORDER BY `fg2`.`ts` DESC) pit_rank_hopsworks\n"
      + "FROM `fs`.`fg0_1` `fg0`\n"
      + "INNER JOIN `fs`.`fg2_1` `fg2` ON `fg0`.`pk1` = `fg2`.`pk1` AND `fg0`.`ts` >= `fg2`.`ts`) NA\n"
      + "WHERE `pit_rank_hopsworks` = 1) ("
      + "SELECT `right_fg0`.`pk1`, `right_fg0`.`pk2`, `right_fg0`.`ts`, `right_fg0`.`label`, `right_fg1`.`ft2`, `right_fg0`.`ft1`\n"
      + "FROM right_fg0\n"
      + "INNER JOIN right_fg1 ON `right_fg0`.`join_pk_pk1` = `right_fg1`.`join_pk_pk1` AND `right_fg0`.`join_evt_ts` = `right_fg1`.`join_evt_ts`)";

  Assert.assertEquals(expectedSql, actualSql);
}
Usage of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition in the hopsworks project by logicalclocks.
Class TestPitJoinController, method testGenerateSubQueriesSingle.
/**
 * Checks that a query with a single join yields exactly one ranked sub-query containing the features of
 * both feature groups, the renamed join columns and the event time inequality in the join condition.
 */
@Test
public void testGenerateSubQueriesSingle() {
  // left feature group, pk1 is flagged as primary key
  List<Feature> fg0Features = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg0", fgLeft, true),
      new Feature("pk2", "fg0", fgLeft),
      new Feature("ts", "fg0", fgLeft),
      new Feature("label", "fg0", fgLeft)));
  // joined feature group
  List<Feature> fg1Features = new ArrayList<>(Arrays.asList(
      new Feature("pk1", "fg1", fgRight),
      new Feature("pk2", "fg1", fgRight),
      new Feature("ts", "fg1", fgRight),
      new Feature("ft1", "fg1", fgRight)));

  // join fg0 with fg1 on both key columns
  List<Feature> onLeft = Arrays.asList(new Feature("pk1", "fg0", fgLeft), new Feature("pk2", "fg0", fgLeft));
  List<Feature> onRight = Arrays.asList(new Feature("pk1", "fg1", fgRight), new Feature("pk2", "fg1", fgRight));
  List<SqlCondition> operators = Arrays.asList(SqlCondition.EQUALS, SqlCondition.EQUALS);

  Query leftQuery = new Query("fs", "project", fgLeft, "fg0", fg0Features, fg0Features, false, null);
  Query rightQuery = new Query("fs", "project", fgRight, "fg1", fg1Features, fg1Features, false, null);
  leftQuery.setJoins(Collections.singletonList(
      new Join(leftQuery, rightQuery, onLeft, onRight, JoinType.INNER, null, operators)));

  // copy of the left query without joins
  Query baseQuery = new Query(leftQuery.getFeatureStore(), leftQuery.getProject(), leftQuery.getFeaturegroup(),
      leftQuery.getAs(), leftQuery.getFeatures(), leftQuery.getAvailableFeatures(), leftQuery.getHiveEngine(),
      leftQuery.getFilter());

  List<SqlCall> subQueries = pitJoinController.generateSubQueries(baseQuery, leftQuery, false);

  String expectedSql =
      "(SELECT `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts`, `fg0`.`label`, `fg0`.`pk1` `join_pk_pk1`, `fg0`.`ts` `join_evt_ts`, `fg1`.`pk1`, `fg1`.`pk2`, `fg1`.`ts`, `fg1`.`ft1`, "
      + "RANK() OVER (PARTITION BY `fg0`.`pk1`, `fg0`.`pk2`, `fg0`.`ts` ORDER BY `fg1`.`ts` DESC) pit_rank_hopsworks\n"
      + "FROM `fs`.`fg0_1` `fg0`\n"
      + "INNER JOIN `fs`.`fg1_1` `fg1` ON `fg0`.`pk1` = `fg1`.`pk1` AND `fg0`.`pk2` = `fg1`.`pk2` AND `fg0`.`ts` >= `fg1`.`ts`) NA";

  Assert.assertEquals(1, subQueries.size());
  String actualSql = subQueries.get(0).toSqlString(new SparkSqlDialect(SqlDialect.EMPTY_CONTEXT)).getSql();
  Assert.assertEquals(expectedSql, actualSql);
}
Aggregations