Search in sources :

Example 1 with Join

use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.

the class TrainingDatasetController method collectFeatures.

/**
 * Flattens a query and, recursively, the right-hand queries of its joins into a list of
 * {@link TrainingDatasetFeature} entities.
 *
 * <p>The list of training dataset joins is passed in so that the aliases can be rebuilt and so that
 * a feature group joined with itself is handled correctly.
 *
 * <p>NOTE(review): {@code joinIndex} is passed by value, so increments performed while recursing
 * into nested joins are not visible to the caller. Confirm this matches the ordering of
 * {@code tdJoins} for queries whose joined queries have joins of their own.
 *
 * @param query           query whose features (and joined features) are collected
 * @param featureDTOs     optional feature DTOs used to resolve the label flag and the transformation
 *                        function; ignored when {@code null} or empty
 * @param trainingDataset owner of the created features; when {@code null} the features are created
 *                        for {@code featureView} instead
 * @param featureView     owner of the created features when {@code trainingDataset} is {@code null}
 * @param featureIndex    index assigned to the first feature created by this call
 * @param tdJoins         training dataset joins, looked up by {@code joinIndex}
 * @param joinIndex       position in {@code tdJoins} that belongs to {@code query}
 * @return the features of {@code query} followed by the features of its joins
 * @throws FeaturestoreException declared by the signature; propagated from the recursive call
 */
public List<TrainingDatasetFeature> collectFeatures(Query query, List<TrainingDatasetFeatureDTO> featureDTOs, TrainingDataset trainingDataset, FeatureView featureView, int featureIndex, List<TrainingDatasetJoin> tdJoins, int joinIndex) throws FeaturestoreException {
    List<TrainingDatasetFeature> collected = new ArrayList<>();
    // Both values keep their defaults for every feature when no DTOs are supplied.
    boolean label = false;
    TransformationFunction function = null;
    for (Feature feature : query.getFeatures()) {
        if (featureDTOs != null && !featureDTOs.isEmpty()) {
            // a feature is a label when a DTO with the same name is flagged as label
            label = false;
            for (TrainingDatasetFeatureDTO dto : featureDTOs) {
                if (feature.getName().equals(dto.getName()) && dto.getLabel()) {
                    label = true;
                    break;
                }
            }
            // transformation function attached to this feature, if any
            function = getTransformationFunction(feature, featureDTOs);
        }
        TrainingDatasetFeature entity;
        if (trainingDataset != null) {
            entity = new TrainingDatasetFeature(trainingDataset, tdJoins.get(joinIndex), query.getFeaturegroup(), feature.getName(), feature.getType(), featureIndex, label, function);
        } else {
            entity = new TrainingDatasetFeature(featureView, tdJoins.get(joinIndex), query.getFeaturegroup(), feature.getName(), feature.getType(), featureIndex, label, function);
        }
        featureIndex++;
        collected.add(entity);
    }
    if (query.getJoins() == null) {
        return collected;
    }
    for (Join join : query.getJoins()) {
        // every join consumes the next entry of tdJoins
        joinIndex += 1;
        List<TrainingDatasetFeature> fromJoin = collectFeatures(join.getRightQuery(), featureDTOs, trainingDataset, featureView, featureIndex, tdJoins, joinIndex);
        collected.addAll(fromJoin);
        // continue numbering after the features contributed by the join
        featureIndex += fromJoin.size();
    }
    return collected;
}
Also used : TrainingDatasetFilter(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFilter) FeaturegroupType(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.FeaturegroupType) Date(java.util.Date) Feature(io.hops.hopsworks.common.featurestore.query.Feature) HopsfsTrainingDatasetController(io.hops.hopsworks.common.featurestore.trainingdatasets.hopsfs.HopsfsTrainingDatasetController) HopsFSProvenanceController(io.hops.hopsworks.common.provenance.core.HopsFSProvenanceController) Settings(io.hops.hopsworks.common.util.Settings) TransactionAttributeType(javax.ejb.TransactionAttributeType) Map(java.util.Map) FilterValue(io.hops.hopsworks.common.featurestore.query.filter.FilterValue) FeatureView(io.hops.hopsworks.persistence.entity.featurestore.featureview.FeatureView) TrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDataset) TrainingDatasetSplit(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.split.TrainingDatasetSplit) Utils(io.hops.hopsworks.common.hdfs.Utils) StatisticsConfig(io.hops.hopsworks.persistence.entity.featurestore.statistics.StatisticsConfig) JoinType(org.apache.calcite.sql.JoinType) Stateless(javax.ejb.Stateless) TransformationFunctionFacade(io.hops.hopsworks.common.featurestore.transformationFunction.TransformationFunctionFacade) HopsfsTrainingDatasetFacade(io.hops.hopsworks.common.featurestore.trainingdatasets.hopsfs.HopsfsTrainingDatasetFacade) Collection(java.util.Collection) TrainingDatasetFeatureDTO(io.hops.hopsworks.common.featurestore.feature.TrainingDatasetFeatureDTO) Featuregroup(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Featuregroup) RESTCodes(io.hops.hopsworks.restutils.RESTCodes) Join(io.hops.hopsworks.common.featurestore.query.join.Join) StatisticColumnController(io.hops.hopsworks.common.featurestore.statistics.columns.StatisticColumnController) 
FeaturestoreConnectorFacade(io.hops.hopsworks.common.featurestore.storageconnectors.FeaturestoreConnectorFacade) Collectors(java.util.stream.Collectors) QueryController(io.hops.hopsworks.common.featurestore.query.QueryController) SqlFilterLogic(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlFilterLogic) TransformationFunction(io.hops.hopsworks.persistence.entity.featurestore.transformationFunction.TransformationFunction) List(java.util.List) FeaturestoreFacade(io.hops.hopsworks.common.featurestore.FeaturestoreFacade) ExternalTrainingDatasetController(io.hops.hopsworks.common.featurestore.trainingdatasets.external.ExternalTrainingDatasetController) FeaturestoreUtils(io.hops.hopsworks.common.featurestore.utils.FeaturestoreUtils) Optional(java.util.Optional) FeaturestoreConnector(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnector) DistributedFsService(io.hops.hopsworks.common.hdfs.DistributedFsService) InodeController(io.hops.hopsworks.common.hdfs.inode.InodeController) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) TrainingDatasetJoinCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoinCondition) FeaturestoreConnectorType(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnectorType) HashMap(java.util.HashMap) FeaturestoreActivityMeta(io.hops.hopsworks.persistence.entity.featurestore.activity.FeaturestoreActivityMeta) Streams(com.logicalclocks.shaded.com.google.common.collect.Streams) OnlineFeaturestoreController(io.hops.hopsworks.common.featurestore.online.OnlineFeaturestoreController) Project(io.hops.hopsworks.persistence.entity.project.Project) TrainingDatasetFilterCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFilterCondition) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) 
HopsfsTrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.hopsfs.HopsfsTrainingDataset) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException) FeaturestoreActivityFacade(io.hops.hopsworks.common.featurestore.activity.FeaturestoreActivityFacade) PitJoinController(io.hops.hopsworks.common.featurestore.query.pit.PitJoinController) TransactionAttribute(javax.ejb.TransactionAttribute) HdfsUsersController(io.hops.hopsworks.common.hdfs.HdfsUsersController) OnlineFeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.online.OnlineFeaturegroupController) Query(io.hops.hopsworks.common.featurestore.query.Query) Filter(io.hops.hopsworks.common.featurestore.query.filter.Filter) TrainingDatasetType(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetType) Inode(io.hops.hopsworks.persistence.entity.hdfs.inode.Inode) ProvenanceException(io.hops.hopsworks.exceptions.ProvenanceException) EJB(javax.ejb.EJB) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) StatisticsController(io.hops.hopsworks.common.featurestore.statistics.StatisticsController) ExternalTrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.external.ExternalTrainingDataset) IOException(java.io.IOException) Featurestore(io.hops.hopsworks.persistence.entity.featurestore.Featurestore) FeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupController) ServiceException(io.hops.hopsworks.exceptions.ServiceException) TimeTravelFormat(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.cached.TimeTravelFormat) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) QueryDTO(io.hops.hopsworks.common.featurestore.query.QueryDTO) StatisticColumn(io.hops.hopsworks.persistence.entity.featurestore.statistics.StatisticColumn) 
Dataset(io.hops.hopsworks.persistence.entity.dataset.Dataset) FilterLogic(io.hops.hopsworks.common.featurestore.query.filter.FilterLogic) Users(io.hops.hopsworks.persistence.entity.user.Users) Comparator(java.util.Comparator) FeaturegroupDTO(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupDTO) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) ArrayList(java.util.ArrayList) Join(io.hops.hopsworks.common.featurestore.query.join.Join) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) TransformationFunction(io.hops.hopsworks.persistence.entity.featurestore.transformationFunction.TransformationFunction) Feature(io.hops.hopsworks.common.featurestore.query.Feature) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature)

Example 2 with Join

use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.

the class TrainingDatasetController method getQueryJoin.

/**
 * Rebuilds the {@link Join} described by a stored training dataset join, so that the query
 * constructor can build the query string from it.
 *
 * @param leftQuery               query on the left side of the join
 * @param rightTdJoin             stored join describing the right side, its conditions, type and prefix
 * @param fgAliasLookup           alias lookup, keyed here by the id of {@code rightTdJoin}
 * @param fsLookup                feature store name lookup, keyed by feature store id
 * @param availableFeaturesLookup available features lookup, keyed by feature group id
 * @param isHiveEngine            forwarded unchanged to the right-hand {@link Query}
 * @return the join produced by {@code queryController.extractLeftRightOn}
 * @throws FeaturestoreException declared by the signature
 */
public Join getQueryJoin(Query leftQuery, TrainingDatasetJoin rightTdJoin, Map<Integer, String> fgAliasLookup, Map<Integer, String> fsLookup, Map<Integer, List<Feature>> availableFeaturesLookup, Boolean isHiveEngine) throws FeaturestoreException {
    String rightAlias = fgAliasLookup.get(rightTdJoin.getId());
    String rightFeatureStore = fsLookup.get(rightTdJoin.getFeatureGroup().getFeaturestore().getId());
    Query rightSide = new Query(
        rightFeatureStore,
        onlineFeaturestoreController.getOnlineFeaturestoreDbName(rightTdJoin.getFeatureGroup().getFeaturestore().getProject()),
        rightTdJoin.getFeatureGroup(),
        rightAlias,
        // the right side requests no features of its own: they are all in the left base query
        new ArrayList<>(),
        availableFeaturesLookup.get(rightTdJoin.getFeatureGroup().getId()),
        isHiveEngine);

    // one left/right feature pair per stored join condition
    List<Feature> leftKeys = rightTdJoin.getConditions().stream()
        .map(condition -> new Feature(condition.getLeftFeature()))
        .collect(Collectors.toList());
    List<Feature> rightKeys = rightTdJoin.getConditions().stream()
        .map(condition -> new Feature(condition.getRightFeature()))
        .collect(Collectors.toList());

    // the stored type is used as an index into JoinType.values()
    JoinType resolvedType = JoinType.values()[rightTdJoin.getType()];
    return queryController.extractLeftRightOn(leftQuery, rightSide, leftKeys, rightKeys, resolvedType, rightTdJoin.getPrefix());
}
Also used : TrainingDatasetFilter(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFilter) FeaturegroupType(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.FeaturegroupType) Date(java.util.Date) Feature(io.hops.hopsworks.common.featurestore.query.Feature) HopsfsTrainingDatasetController(io.hops.hopsworks.common.featurestore.trainingdatasets.hopsfs.HopsfsTrainingDatasetController) HopsFSProvenanceController(io.hops.hopsworks.common.provenance.core.HopsFSProvenanceController) Settings(io.hops.hopsworks.common.util.Settings) TransactionAttributeType(javax.ejb.TransactionAttributeType) Map(java.util.Map) FilterValue(io.hops.hopsworks.common.featurestore.query.filter.FilterValue) FeatureView(io.hops.hopsworks.persistence.entity.featurestore.featureview.FeatureView) TrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDataset) TrainingDatasetSplit(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.split.TrainingDatasetSplit) Utils(io.hops.hopsworks.common.hdfs.Utils) StatisticsConfig(io.hops.hopsworks.persistence.entity.featurestore.statistics.StatisticsConfig) JoinType(org.apache.calcite.sql.JoinType) Stateless(javax.ejb.Stateless) TransformationFunctionFacade(io.hops.hopsworks.common.featurestore.transformationFunction.TransformationFunctionFacade) HopsfsTrainingDatasetFacade(io.hops.hopsworks.common.featurestore.trainingdatasets.hopsfs.HopsfsTrainingDatasetFacade) Collection(java.util.Collection) TrainingDatasetFeatureDTO(io.hops.hopsworks.common.featurestore.feature.TrainingDatasetFeatureDTO) Featuregroup(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Featuregroup) RESTCodes(io.hops.hopsworks.restutils.RESTCodes) Join(io.hops.hopsworks.common.featurestore.query.join.Join) StatisticColumnController(io.hops.hopsworks.common.featurestore.statistics.columns.StatisticColumnController) 
FeaturestoreConnectorFacade(io.hops.hopsworks.common.featurestore.storageconnectors.FeaturestoreConnectorFacade) Collectors(java.util.stream.Collectors) QueryController(io.hops.hopsworks.common.featurestore.query.QueryController) SqlFilterLogic(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlFilterLogic) TransformationFunction(io.hops.hopsworks.persistence.entity.featurestore.transformationFunction.TransformationFunction) List(java.util.List) FeaturestoreFacade(io.hops.hopsworks.common.featurestore.FeaturestoreFacade) ExternalTrainingDatasetController(io.hops.hopsworks.common.featurestore.trainingdatasets.external.ExternalTrainingDatasetController) FeaturestoreUtils(io.hops.hopsworks.common.featurestore.utils.FeaturestoreUtils) Optional(java.util.Optional) FeaturestoreConnector(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnector) DistributedFsService(io.hops.hopsworks.common.hdfs.DistributedFsService) InodeController(io.hops.hopsworks.common.hdfs.inode.InodeController) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) TrainingDatasetJoinCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoinCondition) FeaturestoreConnectorType(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnectorType) HashMap(java.util.HashMap) FeaturestoreActivityMeta(io.hops.hopsworks.persistence.entity.featurestore.activity.FeaturestoreActivityMeta) Streams(com.logicalclocks.shaded.com.google.common.collect.Streams) OnlineFeaturestoreController(io.hops.hopsworks.common.featurestore.online.OnlineFeaturestoreController) Project(io.hops.hopsworks.persistence.entity.project.Project) TrainingDatasetFilterCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFilterCondition) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) 
HopsfsTrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.hopsfs.HopsfsTrainingDataset) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException) FeaturestoreActivityFacade(io.hops.hopsworks.common.featurestore.activity.FeaturestoreActivityFacade) PitJoinController(io.hops.hopsworks.common.featurestore.query.pit.PitJoinController) TransactionAttribute(javax.ejb.TransactionAttribute) HdfsUsersController(io.hops.hopsworks.common.hdfs.HdfsUsersController) OnlineFeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.online.OnlineFeaturegroupController) Query(io.hops.hopsworks.common.featurestore.query.Query) Filter(io.hops.hopsworks.common.featurestore.query.filter.Filter) TrainingDatasetType(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetType) Inode(io.hops.hopsworks.persistence.entity.hdfs.inode.Inode) ProvenanceException(io.hops.hopsworks.exceptions.ProvenanceException) EJB(javax.ejb.EJB) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) StatisticsController(io.hops.hopsworks.common.featurestore.statistics.StatisticsController) ExternalTrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.external.ExternalTrainingDataset) IOException(java.io.IOException) Featurestore(io.hops.hopsworks.persistence.entity.featurestore.Featurestore) FeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupController) ServiceException(io.hops.hopsworks.exceptions.ServiceException) TimeTravelFormat(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.cached.TimeTravelFormat) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) QueryDTO(io.hops.hopsworks.common.featurestore.query.QueryDTO) StatisticColumn(io.hops.hopsworks.persistence.entity.featurestore.statistics.StatisticColumn) 
Dataset(io.hops.hopsworks.persistence.entity.dataset.Dataset) FilterLogic(io.hops.hopsworks.common.featurestore.query.filter.FilterLogic) Users(io.hops.hopsworks.persistence.entity.user.Users) Comparator(java.util.Comparator) FeaturegroupDTO(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupDTO) Query(io.hops.hopsworks.common.featurestore.query.Query) JoinType(org.apache.calcite.sql.JoinType) Feature(io.hops.hopsworks.common.featurestore.query.Feature) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature)

Example 3 with Join

use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.

the class PitJoinController method generateSQL.

/**
 * Builds a {@link SqlWith} tree for the given query: the WITH items are the wrapped subqueries
 * produced by {@code generateSubQueries}, and the body is a SELECT over the collected features
 * joined through {@code buildWithJoin}.
 *
 * @param query             query to translate; its joins are read by index while iterating over
 *                          the generated subqueries
 * @param isTrainingDataset when {@code true} the final select list is taken from the base query
 *                          only, features are re-aliased per joined feature group and the list is
 *                          sorted by {@code Feature::getIdx}; when {@code false} the features of each
 *                          join's right query are appended to the select list
 * @return the {@link SqlWith} node combining the subqueries and the final SELECT
 */
public SqlNode generateSQL(Query query, boolean isTrainingDataset) {
    // make a copy of base query to replace joins
    Query baseQuery = new Query(query.getFeatureStore(), query.getProject(), query.getFeaturegroup(), query.getAs(), new ArrayList<>(query.getFeatures()), query.getAvailableFeatures(), query.getHiveEngine(), query.getFilter());
    // collect left outer most features
    List<Feature> finalSelectList = constructorController.collectFeatures(baseQuery);
    // generate subqueries for WITH
    List<SqlSelect> withSelects = wrapSubQueries(generateSubQueries(baseQuery, query, isTrainingDataset));
    // every feature collected so far starts out with the alias of the first subquery
    finalSelectList.forEach(f -> f.setPitFgAlias(FG_SUBQUERY + "0"));
    // list for "x0 as ..."
    SqlNodeList selectAsses = new SqlNodeList(SqlParserPos.ZERO);
    // joins for the body of the WITH statement, bringing together the final result
    List<Join> newJoins = new ArrayList<>();
    // each sqlSelect represents one subquery corresponding to one join in the final WITH body
    for (int i = 0; i < withSelects.size(); i++) {
        selectAsses.add(SqlStdOperatorTable.AS.createCall(// mandatory when using "WITH xyz AS ()" therefore we need to add it manually as string here
        SqlNodeList.of(new SqlIdentifier(FG_SUBQUERY + i + HIVE_AS, SqlParserPos.ZERO), withSelects.get(i))));
        // each select corresponds to one join, collect features and update alias, drop event time features from "right"
        // feature groups
        String pitAlias = FG_SUBQUERY + i;
        if (isTrainingDataset) {
            // for training datasets all features are contained in final select list from beginning, set the correct
            // alias only for the features corresponding to the feature group in the current join
            // (finalI is an effectively-final copy of i so it can be captured by the lambda)
            int finalI = i;
            // NOTE(review): feature groups are compared by reference (==), not equals() - confirm identity is intended
            finalSelectList.stream().filter(f -> f.getFeatureGroup() == query.getJoins().get(finalI).getRightQuery().getFeaturegroup()).forEach(f -> f.setPitFgAlias(pitAlias));
        } else {
            // otherwise the features of the join's right query are collected, aliased and appended
            List<Feature> features = constructorController.collectFeatures(query.getJoins().get(i).getRightQuery());
            features.forEach(f -> f.setPitFgAlias(pitAlias));
            finalSelectList.addAll(features);
        }
        // add event time inequality join condition
        // NOTE(review): every operator generated below is SqlCondition.EQUALS; presumably the inequality itself
        // is applied inside the subqueries - confirm
        List<Feature> primaryKey = baseQuery.getAvailableFeatures().stream().filter(Feature::isPrimary).collect(Collectors.toList());
        List<Feature> newLeftOn = addEventTimeOn(primaryKey, baseQuery.getFeaturegroup(), baseQuery.getAs());
        renameJoinFeatures(newLeftOn);
        // equivalent copy, but needed to be able to set different alias
        List<Feature> newRightOn = addEventTimeOn(primaryKey, baseQuery.getFeaturegroup(), baseQuery.getAs());
        renameJoinFeatures(newRightOn);
        // one EQUALS operator per left-on feature
        List<SqlCondition> newJoinOperator = newLeftOn.stream().map(f -> SqlCondition.EQUALS).collect(Collectors.toList());
        // left side always refers to the first subquery, right side to the subquery of this iteration
        newLeftOn.forEach(f -> f.setPitFgAlias(FG_SUBQUERY + "0"));
        newRightOn.forEach(f -> f.setPitFgAlias(pitAlias));
        newJoins.add(new Join(null, null, newLeftOn, newRightOn, JoinType.INNER, null, newJoinOperator));
    }
    // sort features in last select
    if (isTrainingDataset) {
        finalSelectList = finalSelectList.stream().sorted(Comparator.comparing(Feature::getIdx)).collect(Collectors.toList());
    }
    SqlNodeList selectList = new SqlNodeList(SqlParserPos.ZERO);
    for (Feature f : finalSelectList) {
        // the selected column name is the feature name, preceded by the prefix when one is set
        String featurePrefixed;
        if (!Strings.isNullOrEmpty(f.getPrefix())) {
            featurePrefixed = f.getPrefix() + f.getName();
        } else {
            featurePrefixed = f.getName();
        }
        // identifier is `alias`.`column`, both parts wrapped in backticks
        // presumably getFgAlias(true) returns the PIT alias set above - TODO confirm
        selectList.add(new SqlIdentifier(Arrays.asList("`" + f.getFgAlias(true) + "`", "`" + featurePrefixed + "`"), SqlParserPos.ZERO));
    }
    // NOTE(review): when withSelects is empty, newJoins is empty and -1 is passed as the index - confirm
    // buildWithJoin handles that case
    SqlSelect body = new SqlSelect(SqlParserPos.ZERO, null, selectList, buildWithJoin(newJoins, newJoins.size() - 1), null, null, null, null, null, null, null, null);
    return new SqlWith(SqlParserPos.ZERO, selectAsses, body);
}
Also used : Arrays(java.util.Arrays) JoinConditionType(org.apache.calcite.sql.JoinConditionType) Feature(io.hops.hopsworks.common.featurestore.query.Feature) JoinController(io.hops.hopsworks.common.featurestore.query.join.JoinController) ArrayList(java.util.ArrayList) Strings(com.google.common.base.Strings) SqlCall(org.apache.calcite.sql.SqlCall) SqlLiteral(org.apache.calcite.sql.SqlLiteral) SqlNode(org.apache.calcite.sql.SqlNode) SqlWith(org.apache.calcite.sql.SqlWith) TransactionAttributeType(javax.ejb.TransactionAttributeType) TransactionAttribute(javax.ejb.TransactionAttribute) SqlIdentifier(org.apache.calcite.sql.SqlIdentifier) Query(io.hops.hopsworks.common.featurestore.query.Query) Filter(io.hops.hopsworks.common.featurestore.query.filter.Filter) SqlWindow(org.apache.calcite.sql.SqlWindow) SqlSelect(org.apache.calcite.sql.SqlSelect) EJB(javax.ejb.EJB) JoinType(org.apache.calcite.sql.JoinType) SqlParserPos(org.apache.calcite.sql.parser.SqlParserPos) Stateless(javax.ejb.Stateless) SqlCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition) Featuregroup(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Featuregroup) Join(io.hops.hopsworks.common.featurestore.query.join.Join) ConstructorController(io.hops.hopsworks.common.featurestore.query.ConstructorController) Collectors(java.util.stream.Collectors) QueryDTO(io.hops.hopsworks.common.featurestore.query.QueryDTO) List(java.util.List) FilterController(io.hops.hopsworks.common.featurestore.query.filter.FilterController) SqlStdOperatorTable(org.apache.calcite.sql.fun.SqlStdOperatorTable) SqlJoin(org.apache.calcite.sql.SqlJoin) Comparator(java.util.Comparator) SqlNodeList(org.apache.calcite.sql.SqlNodeList) Collections(java.util.Collections) Query(io.hops.hopsworks.common.featurestore.query.Query) SqlWith(org.apache.calcite.sql.SqlWith) ArrayList(java.util.ArrayList) Join(io.hops.hopsworks.common.featurestore.query.join.Join) SqlJoin(org.apache.calcite.sql.SqlJoin) 
SqlIdentifier(org.apache.calcite.sql.SqlIdentifier) Feature(io.hops.hopsworks.common.featurestore.query.Feature) SqlCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition) SqlSelect(org.apache.calcite.sql.SqlSelect) SqlNodeList(org.apache.calcite.sql.SqlNodeList)

Example 4 with Join

use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.

the class TrainingDatasetInputValidation method validateFeatures.

/**
 * Validates the feature DTOs of a training dataset against the features of its query.
 *
 * <p>Three checks run in order: every DTO flagged as label must match a query feature by name,
 * every DTO carrying a transformation function must match a query feature by its feature group
 * feature name, and every non-null join prefix must match the feature store name pattern.
 *
 * @param query        query the training dataset is built from; nothing is validated when {@code null}
 * @param featuresDTOs feature DTOs to validate; nothing is validated when {@code null}
 * @throws FeaturestoreException when a label or a feature with a transformation function is not part
 *                               of the query, or when a join prefix is not a legal name
 */
public void validateFeatures(Query query, List<TrainingDatasetFeatureDTO> featuresDTOs) throws FeaturestoreException {
    if (query == null || featuresDTOs == null) {
        // without both a query and feature DTOs there is nothing to cross-check
        return;
    }
    List<TrainingDatasetFeatureDTO> labelDtos = featuresDTOs.stream()
        .filter(dto -> dto.getLabel())
        .collect(Collectors.toList());
    List<TrainingDatasetFeatureDTO> transformedDtos = featuresDTOs.stream()
        .filter(dto -> dto.getTransformationFunction() != null)
        .collect(Collectors.toList());
    List<Feature> queryFeatures = collectFeatures(query);

    // every label has to be selected by the query
    for (TrainingDatasetFeatureDTO labelDto : labelDtos) {
        boolean labelSelected = queryFeatures.stream().anyMatch(f -> f.getName().equals(labelDto.getName()));
        if (!labelSelected) {
            throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.LABEL_NOT_FOUND, Level.FINE, "Label: " + labelDto.getName() + " is missing");
        }
    }

    // a transformation function can only be attached to a feature selected by the query
    for (TrainingDatasetFeatureDTO transformedDto : transformedDtos) {
        boolean featureSelected = queryFeatures.stream().anyMatch(f -> f.getName().equals(transformedDto.getFeatureGroupFeatureName()));
        if (!featureSelected) {
            throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.FEATURE_WITH_TRANSFORMATION_NOT_FOUND, Level.FINE, "feature: " + transformedDto.getName() + " is missing and transformation function can't be attached");
        }
    }

    // verify join prefix if any; query is known to be non-null at this point
    if (query.getJoins() == null) {
        return;
    }
    for (Join join : query.getJoins()) {
        if (join.getPrefix() == null) {
            continue;
        }
        Pattern prefixPattern = FeaturestoreConstants.FEATURESTORE_REGEX;
        if (prefixPattern.matcher(join.getPrefix()).matches()) {
            continue;
        }
        throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.ILLEGAL_PREFIX_NAME, Level.FINE, ", the provided prefix name " + join.getPrefix() + " is invalid. Prefix names can only contain lower" + " case characters, numbers and underscores and cannot be longer than " + FeaturestoreConstants.FEATURESTORE_ENTITY_NAME_MAX_LENGTH + " characters or empty.");
    }
}
Also used : Feature(io.hops.hopsworks.common.featurestore.query.Feature) Strings(joptsimple.internal.Strings) FeaturestoreConnectorType(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnectorType) StringUtils(org.apache.commons.lang3.StringUtils) TrainingDatasetSplitDTO(io.hops.hopsworks.common.featurestore.trainingdatasets.split.TrainingDatasetSplitDTO) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) HashSet(java.util.HashSet) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException) TransactionAttributeType(javax.ejb.TransactionAttributeType) FeaturestoreInputValidation(io.hops.hopsworks.common.featurestore.utils.FeaturestoreInputValidation) TransactionAttribute(javax.ejb.TransactionAttribute) Query(io.hops.hopsworks.common.featurestore.query.Query) TrainingDatasetType(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetType) EJB(javax.ejb.EJB) Stateless(javax.ejb.Stateless) FeaturestoreConstants(io.hops.hopsworks.common.featurestore.FeaturestoreConstants) TrainingDatasetFeatureDTO(io.hops.hopsworks.common.featurestore.feature.TrainingDatasetFeatureDTO) Set(java.util.Set) RESTCodes(io.hops.hopsworks.restutils.RESTCodes) Join(io.hops.hopsworks.common.featurestore.query.join.Join) StatisticColumnController(io.hops.hopsworks.common.featurestore.statistics.columns.StatisticColumnController) FeaturestoreConnectorFacade(io.hops.hopsworks.common.featurestore.storageconnectors.FeaturestoreConnectorFacade) Collectors(java.util.stream.Collectors) FeaturestoreStorageConnectorDTO(io.hops.hopsworks.common.featurestore.storageconnectors.FeaturestoreStorageConnectorDTO) List(java.util.List) Pattern(java.util.regex.Pattern) FeaturestoreConnector(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnector) Pattern(java.util.regex.Pattern) TrainingDatasetFeatureDTO(io.hops.hopsworks.common.featurestore.feature.TrainingDatasetFeatureDTO) 
Join(io.hops.hopsworks.common.featurestore.query.join.Join) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException) Feature(io.hops.hopsworks.common.featurestore.query.Feature)

Example 5 with Join

use of io.hops.hopsworks.common.featurestore.query.join.Join in project hopsworks by logicalclocks.

the class QueryController method removeDuplicateColumns.

/**
 * For Join on primary keys or On condition we should remove duplicated (same name) columns.
 * Spark refuses to write dataframes with duplicated column names.
 *
 * <p>For every join of the query, the feature list of the right query is replaced by a filtered
 * copy. A query without joins is left untouched.
 *
 * @param query      query whose joins are updated in place
 * @param pitEnabled when {@code true}, features named like the event time of the right feature group
 *                   are dropped from the right side as well
 */
void removeDuplicateColumns(Query query, boolean pitEnabled) {
    // other callers in this code base treat a null join list as "no joins"; do the same here
    if (query.getJoins() == null) {
        return;
    }
    for (Join join : query.getJoins()) {
        // Extract left join feature names and drop all features on right side with same name
        List<String> leftJoinFeatureNames = join.getLeftOn().stream().map(Feature::getName).collect(Collectors.toList());
        // Drop every right feature that is part of the join condition and whose (prefixed) name is
        // already present among the features of the left side of the join
        List<Feature> filteredRightFeatures = new ArrayList<>();
        for (Feature rightFeature : join.getRightQuery().getFeatures()) {
            if (leftJoinFeatureNames.contains(rightFeature.getName())) {
                // name the right feature would have in the result: prefix + name when a prefix is set
                String rightOutputName = Strings.isNullOrEmpty(rightFeature.getPrefix()) ? rightFeature.getName() : rightFeature.getPrefix() + rightFeature.getName();
                if (join.getLeftQuery().getFeatures().stream().anyMatch(lf -> lf.getName().equals(rightOutputName))) {
                    // the left side already provides this column, keeping it would duplicate the name
                    continue;
                }
            }
            filteredRightFeatures.add(rightFeature);
        }
        // drop event time from right side if PIT join
        if (pitEnabled) {
            filteredRightFeatures.removeIf(f -> f.getName().equals(join.getRightQuery().getFeaturegroup().getEventTime()));
        }
        // replace the features for the right query
        join.getRightQuery().setFeatures(filteredRightFeatures);
    }
}
Also used : FeaturegroupFacade(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupFacade) HashMap(java.util.HashMap) OnlineFeaturestoreController(io.hops.hopsworks.common.featurestore.online.OnlineFeaturestoreController) Project(io.hops.hopsworks.persistence.entity.project.Project) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) Strings(com.google.common.base.Strings) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException) FeatureGroupCommitController(io.hops.hopsworks.common.featurestore.featuregroup.cached.FeatureGroupCommitController) TransactionAttributeType(javax.ejb.TransactionAttributeType) TransactionAttribute(javax.ejb.TransactionAttribute) Map(java.util.Map) FeatureGroupFeatureDTO(io.hops.hopsworks.common.featurestore.feature.FeatureGroupFeatureDTO) EJB(javax.ejb.EJB) JoinDTO(io.hops.hopsworks.common.featurestore.query.join.JoinDTO) JoinType(org.apache.calcite.sql.JoinType) Stateless(javax.ejb.Stateless) SqlCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition) FeatureGroupCommit(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.cached.FeatureGroupCommit) Featuregroup(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Featuregroup) RESTCodes(io.hops.hopsworks.restutils.RESTCodes) Join(io.hops.hopsworks.common.featurestore.query.join.Join) Collectors(java.util.stream.Collectors) FeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupController) TimeTravelFormat(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.cached.TimeTravelFormat) List(java.util.List) FeaturestoreFacade(io.hops.hopsworks.common.featurestore.FeaturestoreFacade) FilterController(io.hops.hopsworks.common.featurestore.query.filter.FilterController) Optional(java.util.Optional) Users(io.hops.hopsworks.persistence.entity.user.Users) FeaturegroupDTO(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupDTO) ArrayList(java.util.ArrayList) 
Join(io.hops.hopsworks.common.featurestore.query.join.Join)

Aggregations

Join (io.hops.hopsworks.common.featurestore.query.join.Join)45 ArrayList (java.util.ArrayList)39 Test (org.junit.Test)31 Query (io.hops.hopsworks.common.featurestore.query.Query)21 SparkSqlDialect (org.apache.calcite.sql.dialect.SparkSqlDialect)21 SqlCondition (io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlCondition)19 Feature (io.hops.hopsworks.common.featurestore.query.Feature)17 Featuregroup (io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Featuregroup)12 List (java.util.List)11 Collectors (java.util.stream.Collectors)11 EJB (javax.ejb.EJB)11 Stateless (javax.ejb.Stateless)11 TransactionAttribute (javax.ejb.TransactionAttribute)11 TransactionAttributeType (javax.ejb.TransactionAttributeType)11 FeaturestoreException (io.hops.hopsworks.exceptions.FeaturestoreException)9 RESTCodes (io.hops.hopsworks.restutils.RESTCodes)9 Level (java.util.logging.Level)9 JoinType (org.apache.calcite.sql.JoinType)9 FeaturegroupController (io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupController)8 OnlineFeaturestoreController (io.hops.hopsworks.common.featurestore.online.OnlineFeaturestoreController)8