Search in sources :

Example 1 with HighScoreNumericFeatureZones

use of org.kie.kogito.explainability.local.lime.HighScoreNumericFeatureZones in project kogito-apps by kiegroup.

The method boostrapFeatureDistributions of the class DataUtils.

/**
 * Generate feature distributions from an existing (possibly small) {@link DataDistribution} for each numeric
 * {@link Feature}; features whose type is not {@code Type.NUMBER} are skipped and do not appear in the result.
 * Each feature's interval (min, max) and density information (mean, stdDev) are estimated using bootstrap, then
 * data points are sampled from a normal distribution (see {@link #generateData(double, double, int, Random)})
 * and clamped to the estimated interval.
 *
 * @param dataDistribution data distribution to take feature values from
 * @param perturbationContext perturbation context, provides the source of randomness
 * @param featureDistributionSize desired size of generated feature distributions
 * @param draws number of times sampling from feature values is performed
 * @param sampleSize size of each sample draw
 * @param numericFeatureZonesMap high feature score zones, keyed by feature name; a feature without an entry
 *        (or a {@code null} map) is not filtered
 * @return a map feature name -> generated feature distribution
 */
public static Map<String, FeatureDistribution> boostrapFeatureDistributions(DataDistribution dataDistribution, PerturbationContext perturbationContext, int featureDistributionSize, int draws, int sampleSize, Map<String, HighScoreNumericFeatureZones> numericFeatureZonesMap) {
    Map<String, FeatureDistribution> featureDistributions = new HashMap<>();
    for (FeatureDistribution featureDistribution : dataDistribution.asFeatureDistributions()) {
        Feature feature = featureDistribution.getFeature();
        if (Type.NUMBER.equals(feature.getType())) {
            List<Value> values = featureDistribution.getAllSamples();
            double[] means = new double[draws];
            double[] variances = new double[draws];
            double[] mins = new double[draws];
            double[] maxs = new double[draws];
            for (int i = 0; i < draws; i++) {
                List<Value> sampledValues = DataUtils.sampleWithReplacement(values, sampleSize, perturbationContext.getRandom());
                double[] data = sampledValues.stream().mapToDouble(Value::asNumber).toArray();
                double mean = DataUtils.getMean(data);
                // store the squared standard deviation so that the draws can be averaged before taking the root
                double variance = Math.pow(DataUtils.getStdDev(data, mean), 2);
                // An empty draw must not constrain the interval. Double.MIN_VALUE is the smallest POSITIVE
                // double, so it cannot be used as a "lowest possible" fallback; use infinities instead.
                double min = Arrays.stream(data).min().orElse(Double.NEGATIVE_INFINITY);
                double max = Arrays.stream(data).max().orElse(Double.POSITIVE_INFINITY);
                means[i] = mean;
                variances[i] = variance;
                mins[i] = min;
                maxs[i] = max;
            }
            double finalMean = DataUtils.getMean(means);
            // standard deviation obtained from the average of the per-draw squared deviations
            double finalStdDev = Math.sqrt(DataUtils.getMean(variances));
            double finalMin = DataUtils.getMean(mins);
            double finalMax = DataUtils.getMean(maxs);
            double[] doubles = DataUtils.generateData(finalMean, finalStdDev, featureDistributionSize, perturbationContext.getRandom());
            // clamp every generated point into the bootstrapped [finalMin, finalMax] interval
            double[] boundedData = Arrays.stream(doubles).map(d -> Math.min(Math.max(d, finalMin), finalMax)).toArray();
            // a missing map is treated like a missing entry: no zone filtering for this feature
            HighScoreNumericFeatureZones highScoreNumericFeatureZones = numericFeatureZonesMap == null ? null : numericFeatureZonesMap.get(feature.getName());
            double[] finaldata;
            if (highScoreNumericFeatureZones != null) {
                double[] filteredData = DoubleStream.of(boundedData).filter(highScoreNumericFeatureZones::test).toArray();
                // only use the filtered data if strictly more than featureDistributionSize / 2 (integer division)
                // points survive the filter, otherwise fall back to the unfiltered bounded data
                if (filteredData.length > featureDistributionSize / 2) {
                    finaldata = filteredData;
                } else {
                    finaldata = boundedData;
                }
            } else {
                finaldata = boundedData;
            }
            NumericFeatureDistribution numericFeatureDistribution = new NumericFeatureDistribution(feature, finaldata);
            featureDistributions.put(feature.getName(), numericFeatureDistribution);
        }
    }
    return featureDistributions;
}
Also used : IntStream(java.util.stream.IntStream) FeatureFactory(org.kie.kogito.explainability.model.FeatureFactory) Arrays(java.util.Arrays) MalformedInputException(java.nio.charset.MalformedInputException) PredictionInputsDataDistribution(org.kie.kogito.explainability.model.PredictionInputsDataDistribution) PerturbationContext(org.kie.kogito.explainability.model.PerturbationContext) Feature(org.kie.kogito.explainability.model.Feature) Prediction(org.kie.kogito.explainability.model.Prediction) CSVRecord(org.apache.commons.csv.CSVRecord) TimeoutException(java.util.concurrent.TimeoutException) HashMap(java.util.HashMap) Random(java.util.Random) Value(org.kie.kogito.explainability.model.Value) DataDistribution(org.kie.kogito.explainability.model.DataDistribution) ArrayList(java.util.ArrayList) CSVFormat(org.apache.commons.csv.CSVFormat) NumericFeatureDistribution(org.kie.kogito.explainability.model.NumericFeatureDistribution) PartialDependenceGraph(org.kie.kogito.explainability.model.PartialDependenceGraph) Map(java.util.Map) FeatureDistribution(org.kie.kogito.explainability.model.FeatureDistribution) LinkedList(java.util.LinkedList) Path(java.nio.file.Path) PredictionOutput(org.kie.kogito.explainability.model.PredictionOutput) IndependentFeaturesDataDistribution(org.kie.kogito.explainability.model.IndependentFeaturesDataDistribution) SimplePrediction(org.kie.kogito.explainability.model.SimplePrediction) Files(java.nio.file.Files) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Type(org.kie.kogito.explainability.model.Type) PredictionProvider(org.kie.kogito.explainability.model.PredictionProvider) DoubleStream(java.util.stream.DoubleStream) ExecutionException(java.util.concurrent.ExecutionException) PredictionInput(org.kie.kogito.explainability.model.PredictionInput) List(java.util.List) Output(org.kie.kogito.explainability.model.Output) Writer(java.io.Writer) Optional(java.util.Optional) 
HighScoreNumericFeatureZones(org.kie.kogito.explainability.local.lime.HighScoreNumericFeatureZones) BufferedReader(java.io.BufferedReader) Config(org.kie.kogito.explainability.Config) Collections(java.util.Collections) CSVPrinter(org.apache.commons.csv.CSVPrinter) HashMap(java.util.HashMap) Feature(org.kie.kogito.explainability.model.Feature) HighScoreNumericFeatureZones(org.kie.kogito.explainability.local.lime.HighScoreNumericFeatureZones) NumericFeatureDistribution(org.kie.kogito.explainability.model.NumericFeatureDistribution) FeatureDistribution(org.kie.kogito.explainability.model.FeatureDistribution) Value(org.kie.kogito.explainability.model.Value) NumericFeatureDistribution(org.kie.kogito.explainability.model.NumericFeatureDistribution)

Aggregations

BufferedReader (java.io.BufferedReader)1 IOException (java.io.IOException)1 Writer (java.io.Writer)1 MalformedInputException (java.nio.charset.MalformedInputException)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Collections (java.util.Collections)1 HashMap (java.util.HashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 Optional (java.util.Optional)1 Random (java.util.Random)1 ExecutionException (java.util.concurrent.ExecutionException)1 TimeoutException (java.util.concurrent.TimeoutException)1 Collectors (java.util.stream.Collectors)1 DoubleStream (java.util.stream.DoubleStream)1 IntStream (java.util.stream.IntStream)1