use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.
the class TrainingDatasetController method collectFeatures.
// We need to pass the list of training dataset joins so that we can rebuild the aliases
// and correctly handle the case in which a feature group is joined with itself.
/**
 * Collects the {@link TrainingDatasetFeature} entities selected by a query and, recursively,
 * by all of its joins.
 * <p>
 * The full list of training dataset joins is passed along (rather than being resolved from the
 * query itself) so that aliases can be rebuilt later and the case of a feature group joined
 * with itself is handled correctly.
 *
 * @param featureDTOs requested feature metadata used to flag labels and look up transformation
 *                    functions; may be null or empty, in which case no feature is a label and
 *                    no transformation function is attached
 * @param trainingDataset owning training dataset, or null when collecting for a feature view
 * @param featureView owning feature view, used only when trainingDataset is null
 * @param featureIndex index assigned to the first collected feature; incremented per feature
 * @param tdJoins all training dataset joins, indexed by joinIndex
 * @param joinIndex index into tdJoins of the join this (sub-)query belongs to
 * @return the collected features, in select order
 */
public List<TrainingDatasetFeature> collectFeatures(Query query, List<TrainingDatasetFeatureDTO> featureDTOs, TrainingDataset trainingDataset, FeatureView featureView, int featureIndex, List<TrainingDatasetJoin> tdJoins, int joinIndex) throws FeaturestoreException {
  List<TrainingDatasetFeature> features = new ArrayList<>();
  for (Feature feature : query.getFeatures()) {
    boolean label = false;
    TransformationFunction transformation = null;
    if (featureDTOs != null && !featureDTOs.isEmpty()) {
      // A feature is a label when its matching DTO is flagged as such.
      label = featureDTOs.stream().anyMatch(dto -> feature.getName().equals(dto.getName()) && dto.getLabel());
      transformation = getTransformationFunction(feature, featureDTOs);
    }
    TrainingDatasetJoin tdJoin = tdJoins.get(joinIndex);
    if (trainingDataset != null) {
      features.add(new TrainingDatasetFeature(trainingDataset, tdJoin, query.getFeaturegroup(), feature.getName(), feature.getType(), featureIndex, label, transformation));
    } else {
      features.add(new TrainingDatasetFeature(featureView, tdJoin, query.getFeaturegroup(), feature.getName(), feature.getType(), featureIndex, label, transformation));
    }
    featureIndex++;
  }
  if (query.getJoins() != null) {
    for (Join join : query.getJoins()) {
      joinIndex++;
      // Recurse into the right side of each join, continuing the running feature index.
      List<TrainingDatasetFeature> joinedFeatures = collectFeatures(join.getRightQuery(), featureDTOs, trainingDataset, featureView, featureIndex, tdJoins, joinIndex);
      features.addAll(joinedFeatures);
      featureIndex += joinedFeatures.size();
    }
  }
  return features;
}
use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.
the class TrainingDatasetController method getQueryJoin.
// Rebuild the query object so that the query constructor can build the query string.
/**
 * Rebuilds a {@link Join} object from a persisted training dataset join so that the query
 * constructor can generate the join SQL.
 *
 * @param leftQuery the already-built left side of the join
 * @param rightTdJoin persisted join metadata describing the right side
 * @param fgAliasLookup feature group alias keyed by training dataset join id
 * @param fsLookup feature store name keyed by feature store id
 * @param availableFeaturesLookup available features keyed by feature group id
 * @param isHiveEngine whether the Hive engine is targeted
 * @return the reconstructed join
 */
public Join getQueryJoin(Query leftQuery, TrainingDatasetJoin rightTdJoin, Map<Integer, String> fgAliasLookup, Map<Integer, String> fsLookup, Map<Integer, List<Feature>> availableFeaturesLookup, Boolean isHiveEngine) throws FeaturestoreException {
  String rightAlias = fgAliasLookup.get(rightTdJoin.getId());
  String featureStoreName = fsLookup.get(rightTdJoin.getFeatureGroup().getFeaturestore().getId());
  String onlineDbName = onlineFeaturestoreController.getOnlineFeaturestoreDbName(rightTdJoin.getFeatureGroup().getFeaturestore().getProject());
  // The right side requests no features of its own: they are all carried by the left base query.
  Query rightQuery = new Query(featureStoreName, onlineDbName, rightTdJoin.getFeatureGroup(), rightAlias, new ArrayList<>(), availableFeaturesLookup.get(rightTdJoin.getFeatureGroup().getId()), isHiveEngine);
  List<Feature> leftOn = rightTdJoin.getConditions().stream().map(condition -> new Feature(condition.getLeftFeature())).collect(Collectors.toList());
  List<Feature> rightOn = rightTdJoin.getConditions().stream().map(condition -> new Feature(condition.getRightFeature())).collect(Collectors.toList());
  // The join type is persisted as an ordinal; map it back onto the enum.
  JoinType joinType = JoinType.values()[rightTdJoin.getType()];
  return queryController.extractLeftRightOn(leftQuery, rightQuery, leftOn, rightOn, joinType, rightTdJoin.getPrefix());
}
use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.
the class PitJoinController method generateSQL.
/**
 * Generates the SQL for a point-in-time (PIT) enabled query as a WITH statement: one subquery
 * per join (aliased "x0", "x1", ... per the FG_SUBQUERY prefix), followed by a body that
 * INNER-joins the subqueries back together on the base feature group's primary key plus event
 * time columns.
 *
 * @param query the query to generate SQL for; its joins drive the number of subqueries
 * @param isTrainingDataset when true, all features are already present in the base query's
 *                          select list, so only their aliases are adjusted per join and the
 *                          final select list is re-sorted by feature index
 * @return the assembled SqlWith node
 */
public SqlNode generateSQL(Query query, boolean isTrainingDataset) {
// make a copy of base query to replace joins
Query baseQuery = new Query(query.getFeatureStore(), query.getProject(), query.getFeaturegroup(), query.getAs(), new ArrayList<>(query.getFeatures()), query.getAvailableFeatures(), query.getHiveEngine(), query.getFilter());
// collect left outer most features
List<Feature> finalSelectList = constructorController.collectFeatures(baseQuery);
// generate subqueries for WITH
List<SqlSelect> withSelects = wrapSubQueries(generateSubQueries(baseQuery, query, isTrainingDataset));
// the left-most features are always read from the first subquery ("x0")
finalSelectList.forEach(f -> f.setPitFgAlias(FG_SUBQUERY + "0"));
// list for "x0 as ..."
SqlNodeList selectAsses = new SqlNodeList(SqlParserPos.ZERO);
// joins for the body of the WITH statement, bringing together the final result
List<Join> newJoins = new ArrayList<>();
// each sqlSelect represents one subquery corresponding to one join in the final WITH body
for (int i = 0; i < withSelects.size(); i++) {
selectAsses.add(SqlStdOperatorTable.AS.createCall(// mandatory when using "WITH xyz AS ()" therefore we need to add it manually as string here
SqlNodeList.of(new SqlIdentifier(FG_SUBQUERY + i + HIVE_AS, SqlParserPos.ZERO), withSelects.get(i))));
// each select corresponds to one join, collect features and update alias, drop event time features from "right"
// feature groups
String pitAlias = FG_SUBQUERY + i;
if (isTrainingDataset) {
// for training datasets all features are contained in final select list from beginning, set the correct
// alias only for the features corresponding to the feature group in the current join
int finalI = i;
finalSelectList.stream().filter(f -> f.getFeatureGroup() == query.getJoins().get(finalI).getRightQuery().getFeaturegroup()).forEach(f -> f.setPitFgAlias(pitAlias));
} else {
// otherwise the right-side features still need to be added to the final select list
List<Feature> features = constructorController.collectFeatures(query.getJoins().get(i).getRightQuery());
features.forEach(f -> f.setPitFgAlias(pitAlias));
finalSelectList.addAll(features);
}
// add event time inequality join condition
// NOTE(review): primaryKey and newLeftOn are loop-invariant in value, but fresh Feature
// instances are built each iteration because their PIT aliases are mutated below and the
// lists are stored inside each Join — confirm before hoisting anything out of the loop.
List<Feature> primaryKey = baseQuery.getAvailableFeatures().stream().filter(Feature::isPrimary).collect(Collectors.toList());
List<Feature> newLeftOn = addEventTimeOn(primaryKey, baseQuery.getFeaturegroup(), baseQuery.getAs());
renameJoinFeatures(newLeftOn);
// equivalent copy, but needed to be able to set different alias
List<Feature> newRightOn = addEventTimeOn(primaryKey, baseQuery.getFeaturegroup(), baseQuery.getAs());
renameJoinFeatures(newRightOn);
// NOTE(review): all operators here are EQUALS despite the "inequality" wording above —
// presumably the PIT "<=" event-time condition lives inside the generated subqueries; confirm.
List<SqlCondition> newJoinOperator = newLeftOn.stream().map(f -> SqlCondition.EQUALS).collect(Collectors.toList());
newLeftOn.forEach(f -> f.setPitFgAlias(FG_SUBQUERY + "0"));
newRightOn.forEach(f -> f.setPitFgAlias(pitAlias));
newJoins.add(new Join(null, null, newLeftOn, newRightOn, JoinType.INNER, null, newJoinOperator));
}
// sort features in last select
if (isTrainingDataset) {
finalSelectList = finalSelectList.stream().sorted(Comparator.comparing(Feature::getIdx)).collect(Collectors.toList());
}
SqlNodeList selectList = new SqlNodeList(SqlParserPos.ZERO);
for (Feature f : finalSelectList) {
// apply the join prefix, if any, to disambiguate duplicated feature names
String featurePrefixed;
if (!Strings.isNullOrEmpty(f.getPrefix())) {
featurePrefixed = f.getPrefix() + f.getName();
} else {
featurePrefixed = f.getName();
}
// backtick-quote both alias and feature name to survive reserved words / odd identifiers
selectList.add(new SqlIdentifier(Arrays.asList("`" + f.getFgAlias(true) + "`", "`" + featurePrefixed + "`"), SqlParserPos.ZERO));
}
// body SELECTs the final feature list FROM the chain of subquery joins built above
SqlSelect body = new SqlSelect(SqlParserPos.ZERO, null, selectList, buildWithJoin(newJoins, newJoins.size() - 1), null, null, null, null, null, null, null, null);
return new SqlWith(SqlParserPos.ZERO, selectAsses, body);
}
use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.
the class TrainingDatasetInputValidation method validateFeatures.
/**
 * Validates the features requested for a training dataset against the features that the
 * provided query actually selects.
 * <p>
 * Checks that:
 * <ul>
 *   <li>every feature flagged as a label exists in the query's select list,</li>
 *   <li>every feature with an attached transformation function exists in the query's select
 *       list (matched by its feature group feature name),</li>
 *   <li>every join prefix matches the feature store naming regex.</li>
 * </ul>
 *
 * @param query the query the training dataset will be built from; may be null
 * @param featuresDTOs the requested training dataset features; may be null
 * @throws FeaturestoreException if a label or transformed feature is missing from the query,
 *         or a join prefix is illegal
 */
public void validateFeatures(Query query, List<TrainingDatasetFeatureDTO> featuresDTOs) throws FeaturestoreException {
  if (query == null || featuresDTOs == null) {
    // Nothing to validate unless both the query and the feature metadata are provided.
    return;
  }
  List<TrainingDatasetFeatureDTO> labels = featuresDTOs.stream().filter(TrainingDatasetFeatureDTO::getLabel).collect(Collectors.toList());
  List<TrainingDatasetFeatureDTO> featuresWithTransformation = featuresDTOs.stream().filter(f -> f.getTransformationFunction() != null).collect(Collectors.toList());
  List<Feature> features = collectFeatures(query);
  // Every label must be present among the query's selected features.
  for (TrainingDatasetFeatureDTO label : labels) {
    if (features.stream().noneMatch(f -> f.getName().equals(label.getName()))) {
      throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.LABEL_NOT_FOUND, Level.FINE, "Label: " + label.getName() + " is missing");
    }
  }
  // Transformed features are matched by their feature group feature name rather than the
  // (possibly prefixed) training dataset feature name.
  for (TrainingDatasetFeatureDTO featureWithTransformation : featuresWithTransformation) {
    if (features.stream().noneMatch(f -> f.getName().equals(featureWithTransformation.getFeatureGroupFeatureName()))) {
      throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.FEATURE_WITH_TRANSFORMATION_NOT_FOUND, Level.FINE, "feature: " + featureWithTransformation.getName() + " is missing and transformation function can't be attached");
    }
  }
  // Verify join prefixes, if any. query is already known to be non-null here (early return
  // above), so only the joins themselves need a null guard.
  if (query.getJoins() != null) {
    for (Join join : query.getJoins()) {
      if (join.getPrefix() != null) {
        Pattern namePattern = FeaturestoreConstants.FEATURESTORE_REGEX;
        if (!namePattern.matcher(join.getPrefix()).matches()) {
          throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.ILLEGAL_PREFIX_NAME, Level.FINE, ", the provided prefix name " + join.getPrefix() + " is invalid. Prefix names can only contain lower" + " case characters, numbers and underscores and cannot be longer than " + FeaturestoreConstants.FEATURESTORE_ENTITY_NAME_MAX_LENGTH + " characters or empty.");
        }
      }
    }
  }
}
use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.
the class QueryController method removeDuplicateColumns.
/**
 * For joins on primary keys or an explicit ON condition, drops duplicated (same name) columns
 * from the right side of each join, because Spark refuses to write dataframes with duplicated
 * column names.
 *
 * @param query query whose joins are inspected; right-side feature lists are replaced in place
 * @param pitEnabled when true (point-in-time join) the right side's event time column is
 *                   dropped as well
 */
void removeDuplicateColumns(Query query, boolean pitEnabled) {
  for (Join join : query.getJoins()) {
    // Feature names participating in the join condition on the left side.
    List<String> leftOnNames = join.getLeftOn().stream().map(Feature::getName).collect(Collectors.toList());
    List<Feature> keptFeatures = join.getRightQuery().getFeatures().stream()
        .filter(right -> {
          if (!leftOnNames.contains(right.getName())) {
            // Not part of the join condition: always kept.
            return true;
          }
          // Drop the right-side copy only when the left side already selects the same
          // (possibly prefixed) column; otherwise keep it.
          String prefixedName = Strings.isNullOrEmpty(right.getPrefix()) ? right.getName() : right.getPrefix() + right.getName();
          return join.getLeftQuery().getFeatures().stream().noneMatch(left -> left.getName().equals(prefixedName));
        })
        .collect(Collectors.toList());
    // PIT joins additionally drop the right side's event time column.
    if (pitEnabled) {
      keptFeatures = keptFeatures.stream().filter(f -> !f.getName().equals(join.getRightQuery().getFeaturegroup().getEventTime())).collect(Collectors.toList());
    }
    // Replace the features for the right query.
    join.getRightQuery().setFeatures(keptFeatures);
  }
}
Aggregations