Search in sources :

Example 6 with TrainingDatasetJoin

use of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin in project hopsworks by logicalclocks.

the class TrainingDatasetController method getQuery.

// TODO feature view: remove
/**
 * Reconstruct the query used to generate the training datset, fetching the features and the joins
 * in the proper order from the database.
 * @param trainingDataset
 * @return
 * @throws FeaturestoreException
 */
public Query getQuery(TrainingDataset trainingDataset, boolean withLabel, Project project, Users user, Boolean isHiveEngine) throws FeaturestoreException {
    if (!trainingDataset.isQuery()) {
        throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.TRAINING_DATASET_NO_QUERY, Level.FINE, "Inference vector is only available for datasets generated by queries");
    }
    List<TrainingDatasetJoin> joins = getJoinsSorted(trainingDataset);
    // Convert all the TrainingDatasetFeatures to QueryFeatures
    Map<Integer, String> fgAliasLookup = getAliasLookupTable(joins);
    // These features are for the select part and are from different feature groups
    // to respect the ordering, all selected features are added to the left most Query instead of splitting them
    // over the querys for their respective origin feature group
    List<TrainingDatasetFeature> tdFeatures = getFeaturesSorted(trainingDataset, withLabel);
    // Check that all the feature groups still exists, if not throw a reasonable error
    if (tdFeatures.stream().anyMatch(j -> j.getFeatureGroup() == null)) {
        throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.TRAINING_DATASET_QUERY_FG_DELETED, Level.FINE);
    }
    // Get available features for all involved feature groups once, and save in map fgId -> availableFeatures
    Map<Integer, List<Feature>> availableFeaturesLookup = new HashMap<>();
    for (TrainingDatasetJoin join : joins) {
        if (!availableFeaturesLookup.containsKey(join.getFeatureGroup().getId())) {
            List<Feature> availableFeatures = featuregroupController.getFeatures(join.getFeatureGroup(), project, user).stream().map(f -> new Feature(f.getName(), fgAliasLookup.get(join.getId()), f.getType(), f.getDefaultValue(), f.getPrimary(), join.getFeatureGroup(), join.getPrefix())).collect(Collectors.toList());
            availableFeaturesLookup.put(join.getFeatureGroup().getId(), availableFeatures);
        }
    }
    Map<String, Feature> featureLookup = availableFeaturesLookup.values().stream().flatMap(List::stream).collect(Collectors.toMap(f -> makeFeatureLookupKey(f.getFeatureGroup().getId(), f.getName()), f -> f, (f1, f2) -> f1));
    List<Feature> features = new ArrayList<>();
    for (TrainingDatasetFeature requestedFeature : tdFeatures) {
        Feature tdFeature = featureLookup.get(makeFeatureLookupKey(requestedFeature.getFeatureGroup().getId(), requestedFeature.getName()));
        if (tdFeature == null) {
            throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.FEATURE_DOES_NOT_EXIST, Level.FINE, "Feature: " + requestedFeature.getName() + " not found in feature group: " + requestedFeature.getFeatureGroup().getName());
        }
        // instantiate new feature since alias in available feature is not correct if fg is joined with itself
        Feature featureWithCorrectAlias = new Feature(tdFeature.getName(), fgAliasLookup.get(requestedFeature.getTrainingDatasetJoin().getId()), tdFeature.getType(), tdFeature.getDefaultValue(), tdFeature.getPrefix(), requestedFeature.getFeatureGroup(), requestedFeature.getIndex());
        features.add(featureWithCorrectAlias);
    }
    // Keep a map feature store id -> feature store name
    Map<Integer, String> fsLookup = getFsLookupTableJoins(joins);
    Query query = new Query(fsLookup.get(joins.get(0).getFeatureGroup().getFeaturestore().getId()), onlineFeaturestoreController.getOnlineFeaturestoreDbName(joins.get(0).getFeatureGroup().getFeaturestore().getProject()), joins.get(0).getFeatureGroup(), fgAliasLookup.get(joins.get(0).getId()), features, availableFeaturesLookup.get(joins.get(0).getFeatureGroup().getId()), isHiveEngine);
    // Set the remaining feature groups as join
    List<Join> queryJoins = new ArrayList<>();
    for (int i = 1; i < joins.size(); i++) {
        // left side of the join stays fixed, the counter starts at 1
        queryJoins.add(getQueryJoin(query, joins.get(i), fgAliasLookup, fsLookup, availableFeaturesLookup, isHiveEngine));
    }
    query.setJoins(queryJoins);
    FilterLogic filterLogic = convertToFilterLogic(trainingDataset.getFilters(), featureLookup, "L");
    query.setFilter(filterLogic);
    return query;
}
Also used : TrainingDatasetFilter(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFilter) FeaturegroupType(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.FeaturegroupType) Date(java.util.Date) Feature(io.hops.hopsworks.common.featurestore.query.Feature) HopsfsTrainingDatasetController(io.hops.hopsworks.common.featurestore.trainingdatasets.hopsfs.HopsfsTrainingDatasetController) HopsFSProvenanceController(io.hops.hopsworks.common.provenance.core.HopsFSProvenanceController) Settings(io.hops.hopsworks.common.util.Settings) TransactionAttributeType(javax.ejb.TransactionAttributeType) Map(java.util.Map) FilterValue(io.hops.hopsworks.common.featurestore.query.filter.FilterValue) FeatureView(io.hops.hopsworks.persistence.entity.featurestore.featureview.FeatureView) TrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDataset) TrainingDatasetSplit(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.split.TrainingDatasetSplit) Utils(io.hops.hopsworks.common.hdfs.Utils) StatisticsConfig(io.hops.hopsworks.persistence.entity.featurestore.statistics.StatisticsConfig) JoinType(org.apache.calcite.sql.JoinType) Stateless(javax.ejb.Stateless) TransformationFunctionFacade(io.hops.hopsworks.common.featurestore.transformationFunction.TransformationFunctionFacade) HopsfsTrainingDatasetFacade(io.hops.hopsworks.common.featurestore.trainingdatasets.hopsfs.HopsfsTrainingDatasetFacade) Collection(java.util.Collection) TrainingDatasetFeatureDTO(io.hops.hopsworks.common.featurestore.feature.TrainingDatasetFeatureDTO) Featuregroup(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Featuregroup) RESTCodes(io.hops.hopsworks.restutils.RESTCodes) Join(io.hops.hopsworks.common.featurestore.query.join.Join) StatisticColumnController(io.hops.hopsworks.common.featurestore.statistics.columns.StatisticColumnController) FeaturestoreConnectorFacade(io.hops.hopsworks.common.featurestore.storageconnectors.FeaturestoreConnectorFacade) Collectors(java.util.stream.Collectors) QueryController(io.hops.hopsworks.common.featurestore.query.QueryController) SqlFilterLogic(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlFilterLogic) TransformationFunction(io.hops.hopsworks.persistence.entity.featurestore.transformationFunction.TransformationFunction) List(java.util.List) FeaturestoreFacade(io.hops.hopsworks.common.featurestore.FeaturestoreFacade) ExternalTrainingDatasetController(io.hops.hopsworks.common.featurestore.trainingdatasets.external.ExternalTrainingDatasetController) FeaturestoreUtils(io.hops.hopsworks.common.featurestore.utils.FeaturestoreUtils) Optional(java.util.Optional) FeaturestoreConnector(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnector) DistributedFsService(io.hops.hopsworks.common.hdfs.DistributedFsService) InodeController(io.hops.hopsworks.common.hdfs.inode.InodeController) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) TrainingDatasetJoinCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoinCondition) FeaturestoreConnectorType(io.hops.hopsworks.persistence.entity.featurestore.storageconnector.FeaturestoreConnectorType) HashMap(java.util.HashMap) FeaturestoreActivityMeta(io.hops.hopsworks.persistence.entity.featurestore.activity.FeaturestoreActivityMeta) Streams(com.logicalclocks.shaded.com.google.common.collect.Streams) OnlineFeaturestoreController(io.hops.hopsworks.common.featurestore.online.OnlineFeaturestoreController) Project(io.hops.hopsworks.persistence.entity.project.Project) TrainingDatasetFilterCondition(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFilterCondition) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) HopsfsTrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.hopsfs.HopsfsTrainingDataset) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException) FeaturestoreActivityFacade(io.hops.hopsworks.common.featurestore.activity.FeaturestoreActivityFacade) PitJoinController(io.hops.hopsworks.common.featurestore.query.pit.PitJoinController) TransactionAttribute(javax.ejb.TransactionAttribute) HdfsUsersController(io.hops.hopsworks.common.hdfs.HdfsUsersController) OnlineFeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.online.OnlineFeaturegroupController) Query(io.hops.hopsworks.common.featurestore.query.Query) Filter(io.hops.hopsworks.common.featurestore.query.filter.Filter) TrainingDatasetType(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetType) Inode(io.hops.hopsworks.persistence.entity.hdfs.inode.Inode) ProvenanceException(io.hops.hopsworks.exceptions.ProvenanceException) EJB(javax.ejb.EJB) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) StatisticsController(io.hops.hopsworks.common.featurestore.statistics.StatisticsController) ExternalTrainingDataset(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.external.ExternalTrainingDataset) IOException(java.io.IOException) Featurestore(io.hops.hopsworks.persistence.entity.featurestore.Featurestore) FeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupController) ServiceException(io.hops.hopsworks.exceptions.ServiceException) TimeTravelFormat(io.hops.hopsworks.persistence.entity.featurestore.featuregroup.cached.TimeTravelFormat) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) QueryDTO(io.hops.hopsworks.common.featurestore.query.QueryDTO) StatisticColumn(io.hops.hopsworks.persistence.entity.featurestore.statistics.StatisticColumn) Dataset(io.hops.hopsworks.persistence.entity.dataset.Dataset) FilterLogic(io.hops.hopsworks.common.featurestore.query.filter.FilterLogic) Users(io.hops.hopsworks.persistence.entity.user.Users) Comparator(java.util.Comparator) FeaturegroupDTO(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupDTO) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) Query(io.hops.hopsworks.common.featurestore.query.Query) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Join(io.hops.hopsworks.common.featurestore.query.join.Join) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) Feature(io.hops.hopsworks.common.featurestore.query.Feature) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) SqlFilterLogic(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.SqlFilterLogic) FilterLogic(io.hops.hopsworks.common.featurestore.query.filter.FilterLogic) List(java.util.List) ArrayList(java.util.ArrayList) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException)

Example 7 with TrainingDatasetJoin

use of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin in project hopsworks by logicalclocks.

the class TrainingDatasetController method collectJoins.

public List<TrainingDatasetJoin> collectJoins(Query query, TrainingDataset trainingDataset, FeatureView featureView) {
    List<TrainingDatasetJoin> joins = new ArrayList<>();
    // add the first feature group
    int index = 0;
    if (query.getFeaturegroup().getFeaturegroupType() == FeaturegroupType.CACHED_FEATURE_GROUP && query.getFeaturegroup().getCachedFeaturegroup().getTimeTravelFormat() == TimeTravelFormat.HUDI) {
        joins.add(makeTrainingDatasetJoin(trainingDataset, featureView, query.getFeaturegroup(), query.getLeftFeatureGroupEndCommitId(), (short) 0, index++, null));
    } else {
        joins.add(makeTrainingDatasetJoin(trainingDataset, featureView, query.getFeaturegroup(), null, (short) 0, index++, null));
    }
    if (query.getJoins() != null && !query.getJoins().isEmpty()) {
        for (Join join : query.getJoins()) {
            TrainingDatasetJoin tdJoin;
            if (query.getFeaturegroup().getFeaturegroupType() == FeaturegroupType.CACHED_FEATURE_GROUP && query.getFeaturegroup().getCachedFeaturegroup().getTimeTravelFormat() == TimeTravelFormat.HUDI) {
                tdJoin = makeTrainingDatasetJoin(trainingDataset, featureView, join.getRightQuery().getFeaturegroup(), join.getRightQuery().getLeftFeatureGroupEndCommitId(), (short) join.getJoinType().ordinal(), index++, join.getPrefix());
            } else {
                tdJoin = makeTrainingDatasetJoin(trainingDataset, featureView, join.getRightQuery().getFeaturegroup(), null, (short) join.getJoinType().ordinal(), index++, join.getPrefix());
            }
            tdJoin.setConditions(collectJoinConditions(join, tdJoin));
            joins.add(tdJoin);
        }
    }
    return joins;
}
Also used : TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) ArrayList(java.util.ArrayList) Join(io.hops.hopsworks.common.featurestore.query.join.Join) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin)

Example 8 with TrainingDatasetJoin

use of io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin in project hopsworks by logicalclocks.

the class FeatureViewController method makeQuery.

public Query makeQuery(FeatureView featureView, Project project, Users user) throws FeaturestoreException {
    List<TrainingDatasetJoin> joins = featureView.getJoins().stream().sorted(Comparator.comparing(TrainingDatasetJoin::getIndex)).collect(Collectors.toList());
    Map<Integer, String> fgAliasLookup = trainingDatasetController.getAliasLookupTable(joins);
    List<TrainingDatasetFeature> tdFeatures = featureView.getFeatures().stream().sorted((t1, t2) -> {
        if (t1.getIndex() != null) {
            // compare based on index
            return t1.getIndex().compareTo(t2.getIndex());
        } else {
            // Old training dataset with no index. compare based on name
            return t1.getName().compareTo(t2.getName());
        }
    }).collect(Collectors.toList());
    // Check that all the feature groups still exists, if not throw a reasonable error
    if (tdFeatures.stream().anyMatch(j -> j.getFeatureGroup() == null)) {
        throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.TRAINING_DATASET_QUERY_FG_DELETED, Level.FINE);
    }
    // Get available features for all involved feature groups once, and save in map fgId -> availableFeatures
    Map<Integer, List<Feature>> availableFeaturesLookup = new HashMap<>();
    for (TrainingDatasetJoin join : joins) {
        if (!availableFeaturesLookup.containsKey(join.getFeatureGroup().getId())) {
            List<Feature> availableFeatures = featuregroupController.getFeatures(join.getFeatureGroup(), project, user).stream().map(f -> new Feature(f.getName(), fgAliasLookup.get(join.getId()), f.getType(), f.getPrimary(), f.getDefaultValue(), join.getPrefix())).collect(Collectors.toList());
            availableFeaturesLookup.put(join.getFeatureGroup().getId(), availableFeatures);
        }
    }
    List<Feature> features = new ArrayList<>();
    for (TrainingDatasetFeature requestedFeature : tdFeatures) {
        features.add(availableFeaturesLookup.get(requestedFeature.getFeatureGroup().getId()).stream().filter(af -> af.getName().equals(requestedFeature.getName())).map(af -> new Feature(af.getName(), fgAliasLookup.get(requestedFeature.getTrainingDatasetJoin().getId()), af.getType(), af.getDefaultValue(), af.getPrefix(), requestedFeature.getFeatureGroup(), requestedFeature.getIndex())).findFirst().orElseThrow(() -> new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.FEATURE_DOES_NOT_EXIST, Level.FINE, "Feature: " + requestedFeature.getName() + " not found in feature group: " + requestedFeature.getFeatureGroup().getName())));
    }
    // Keep a map feature store id -> feature store name
    Map<Integer, String> fsLookup = trainingDatasetController.getFsLookupTableJoins(joins);
    Query query = new Query(fsLookup.get(joins.get(0).getFeatureGroup().getFeaturestore().getId()), onlineFeaturestoreController.getOnlineFeaturestoreDbName(joins.get(0).getFeatureGroup().getFeaturestore().getProject()), joins.get(0).getFeatureGroup(), fgAliasLookup.get(joins.get(0).getId()), features, availableFeaturesLookup.get(joins.get(0).getFeatureGroup().getId()), false);
    // Set the remaining feature groups as join
    List<Join> queryJoins = new ArrayList<>();
    for (int i = 1; i < joins.size(); i++) {
        // left side of the join stays fixed, the counter starts at 1
        queryJoins.add(trainingDatasetController.getQueryJoin(query, joins.get(i), fgAliasLookup, fsLookup, availableFeaturesLookup, false));
    }
    query.setJoins(queryJoins);
    return query;
}
Also used : AbstractFacade(io.hops.hopsworks.common.dao.AbstractFacade) Date(java.util.Date) Feature(io.hops.hopsworks.common.featurestore.query.Feature) TrainingDatasetController(io.hops.hopsworks.common.featurestore.trainingdatasets.TrainingDatasetController) HashMap(java.util.HashMap) OnlineFeaturestoreController(io.hops.hopsworks.common.featurestore.online.OnlineFeaturestoreController) Project(io.hops.hopsworks.persistence.entity.project.Project) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) Inject(javax.inject.Inject) FeatureViewFacade(io.hops.hopsworks.common.featurestore.featureview.FeatureViewFacade) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException) TransactionAttributeType(javax.ejb.TransactionAttributeType) PitJoinController(io.hops.hopsworks.common.featurestore.query.pit.PitJoinController) TransactionAttribute(javax.ejb.TransactionAttribute) Map(java.util.Map) ResourceRequest(io.hops.hopsworks.common.api.ResourceRequest) FeatureView(io.hops.hopsworks.persistence.entity.featurestore.featureview.FeatureView) Query(io.hops.hopsworks.common.featurestore.query.Query) EJB(javax.ejb.EJB) FeatureViewDTO(io.hops.hopsworks.common.featurestore.featureview.FeatureViewDTO) Stateless(javax.ejb.Stateless) TransformationFunctionFacade(io.hops.hopsworks.common.featurestore.transformationFunction.TransformationFunctionFacade) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) Collection(java.util.Collection) TrainingDatasetFeatureDTO(io.hops.hopsworks.common.featurestore.feature.TrainingDatasetFeatureDTO) Set(java.util.Set) RESTCodes(io.hops.hopsworks.restutils.RESTCodes) Join(io.hops.hopsworks.common.featurestore.query.join.Join) Featurestore(io.hops.hopsworks.persistence.entity.featurestore.Featurestore) Collectors(java.util.stream.Collectors) FeaturegroupController(io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupController) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) QueryController(io.hops.hopsworks.common.featurestore.query.QueryController) QueryDTO(io.hops.hopsworks.common.featurestore.query.QueryDTO) List(java.util.List) FEATURE_VIEW_NOT_FOUND(io.hops.hopsworks.restutils.RESTCodes.FeaturestoreErrorCode.FEATURE_VIEW_NOT_FOUND) QueryParam(io.hops.hopsworks.common.dao.QueryParam) FeaturestoreUtils(io.hops.hopsworks.common.featurestore.utils.FeaturestoreUtils) Users(io.hops.hopsworks.persistence.entity.user.Users) Comparator(java.util.Comparator) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) Query(io.hops.hopsworks.common.featurestore.query.Query) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Join(io.hops.hopsworks.common.featurestore.query.join.Join) TrainingDatasetJoin(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin) Feature(io.hops.hopsworks.common.featurestore.query.Feature) TrainingDatasetFeature(io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature) ArrayList(java.util.ArrayList) List(java.util.List) FeaturestoreException(io.hops.hopsworks.exceptions.FeaturestoreException)

Aggregations

TrainingDatasetJoin (io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetJoin)8 Query (io.hops.hopsworks.common.featurestore.query.Query)6 TrainingDatasetFeature (io.hops.hopsworks.persistence.entity.featurestore.trainingdataset.TrainingDatasetFeature)6 ArrayList (java.util.ArrayList)6 FeaturegroupController (io.hops.hopsworks.common.featurestore.featuregroup.FeaturegroupController)5 OnlineFeaturestoreController (io.hops.hopsworks.common.featurestore.online.OnlineFeaturestoreController)5 Feature (io.hops.hopsworks.common.featurestore.query.Feature)5 Join (io.hops.hopsworks.common.featurestore.query.join.Join)5 FeaturestoreException (io.hops.hopsworks.exceptions.FeaturestoreException)5 Featurestore (io.hops.hopsworks.persistence.entity.featurestore.Featurestore)5 Project (io.hops.hopsworks.persistence.entity.project.Project)5 Users (io.hops.hopsworks.persistence.entity.user.Users)5 RESTCodes (io.hops.hopsworks.restutils.RESTCodes)5 Comparator (java.util.Comparator)5 List (java.util.List)5 Map (java.util.Map)5 Level (java.util.logging.Level)5 Collectors (java.util.stream.Collectors)5 EJB (javax.ejb.EJB)5 Stateless (javax.ejb.Stateless)5