use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
the class RandomCutForest method getAnomalyAttribution.
/**
 * Computes the per-dimension attribution of the anomaly score for a point.
 *
 * Each tree is visited with an AnomalyAttributionVisitor built from the point
 * projected into that tree and the tree's mass; each per-tree result is lifted
 * back with the tree's liftFromTree. The per-tree vectors are combined with
 * DiVector.addToLeft and the total is scaled by 1 / numberOfTrees.
 *
 * @param point the point to attribute; it is passed through
 *              transformToShingledPoint before the forest is traversed
 * @return the scaled attribution, or a newly constructed DiVector of size
 *         {@code dimensions} when the forest reports that output is not ready
 */
public DiVector getAnomalyAttribution(float[] point) {
    if (isOutputReady()) {
        // divide the accumulated vector by the number of trees
        Function<DiVector, DiVector> averageOverTrees = total -> total.scale(1.0 / numberOfTrees);
        BinaryOperator<DiVector> sumAttributions = DiVector::addToLeft;
        IVisitorFactory<DiVector> attributionVisitors = new VisitorFactory<>(
                (tree, treePoint) -> new AnomalyAttributionVisitor(tree.projectToTree(treePoint), tree.getMass()),
                (tree, treeResult) -> treeResult.lift(tree::liftFromTree));
        return traverseForest(transformToShingledPoint(point), attributionVisitors, sumAttributions,
                averageOverTrees);
    }
    // no output yet: hand back a fresh vector of the forest's dimensions
    return new DiVector(dimensions);
}
use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
the class RandomCutForest method getApproximateDynamicAttribution.
/**
 * Attribution for dynamic sequential scoring with early stopping. The original
 * documentation states that getL1Norm() of the result should agree with
 * getDynamicScoringSequential.
 *
 * Each tree is visited with a DynamicAttributionVisitor built from the point
 * projected into that tree (tree.projectToTree), matching getAnomalyAttribution
 * and getDynamicAttribution; each per-tree result is lifted back with the
 * tree's liftFromTree. Results are fed to a OneSidedConvergingDiVectorAccumulator
 * and the accumulated vector is scaled by 1 / (number of values the accumulator
 * accepted).
 *
 * @param point                   input point; it is passed through
 *                                transformToShingledPoint before traversal
 * @param precision               precision handed to the converging accumulator,
 *                                which governs early stopping
 * @param highIsCritical          are high values anomalous (otherwise low
 *                                values are anomalous)
 * @param ignoreLeafMassThreshold we ignore leaves with mass equal to or below
 *                                this threshold
 * @param seen                    function for scoring points that have been
 *                                seen before
 * @param unseen                  function for scoring points not seen in tree
 * @param newDamp                 dampening function based on duplicates
 * @return attribution DiVector of the score, or a newly constructed DiVector of
 *         size {@code dimensions} when the forest reports that output is not
 *         ready
 */
public DiVector getApproximateDynamicAttribution(float[] point, double precision, boolean highIsCritical, int ignoreLeafMassThreshold, BiFunction<Double, Double, Double> seen, BiFunction<Double, Double, Double> unseen, BiFunction<Double, Double, Double> newDamp) {
    if (!isOutputReady()) {
        return new DiVector(dimensions);
    }
    // Project the point into each tree before visiting, so that the input side
    // mirrors the liftFromTree applied to the result (as the sibling attribution
    // methods do).
    VisitorFactory<DiVector> visitorFactory = new VisitorFactory<>(
            (tree, y) -> new DynamicAttributionVisitor(tree.projectToTree(y), tree.getMass(),
                    ignoreLeafMassThreshold, seen, unseen, newDamp),
            (tree, x) -> x.lift(tree::liftFromTree));
    ConvergingAccumulator<DiVector> accumulator = new OneSidedConvergingDiVectorAccumulator(dimensions,
            highIsCritical, precision, DEFAULT_APPROXIMATE_DYNAMIC_SCORE_MIN_VALUES_ACCEPTED, numberOfTrees);
    // Early stopping means fewer than numberOfTrees results may be accepted, so
    // normalize by the accepted count rather than the tree count.
    Function<DiVector, DiVector> finisher = vector -> vector.scale(1.0 / accumulator.getValuesAccepted());
    return traverseForest(transformToShingledPoint(point), visitorFactory, accumulator, finisher);
}
use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
the class RandomCutForest method getDynamicAttribution.
/**
 * Computes the attribution DiVector under dynamic scoring. The parameters
 * mirror those of getDynamicScoreParallel.
 *
 * Each tree is visited with a DynamicAttributionVisitor built from the point
 * projected into that tree, the tree's mass, the leaf-mass threshold and the
 * three supplied functions; each per-tree result is lifted back with the tree's
 * liftFromTree. The per-tree vectors are combined with DiVector.addToLeft and
 * the total is scaled by 1 / numberOfTrees.
 *
 * @param point                   point to be scored; it is passed through
 *                                transformToShingledPoint before traversal
 * @param ignoreLeafMassThreshold leaf-mass threshold handed to the visitor
 * @param seen                    score function for seen points
 * @param unseen                  score function for unseen points
 * @param newDamp                 dampening function for duplicates in the seen
 *                                function
 * @return dynamic scoring attribution DiVector, or a newly constructed DiVector
 *         of size {@code dimensions} when the forest reports that output is not
 *         ready
 */
public DiVector getDynamicAttribution(float[] point, int ignoreLeafMassThreshold, BiFunction<Double, Double, Double> seen, BiFunction<Double, Double, Double> unseen, BiFunction<Double, Double, Double> newDamp) {
    if (isOutputReady()) {
        // divide the accumulated vector by the number of trees
        Function<DiVector, DiVector> averageOverTrees = total -> total.scale(1.0 / numberOfTrees);
        BinaryOperator<DiVector> sumAttributions = DiVector::addToLeft;
        VisitorFactory<DiVector> dynamicVisitors = new VisitorFactory<>(
                (tree, treePoint) -> new DynamicAttributionVisitor(tree.projectToTree(treePoint), tree.getMass(),
                        ignoreLeafMassThreshold, seen, unseen, newDamp),
                (tree, treeResult) -> treeResult.lift(tree::liftFromTree));
        return traverseForest(transformToShingledPoint(point), dynamicVisitors, sumAttributions,
                averageOverTrees);
    }
    // no output yet: hand back a fresh vector of the forest's dimensions
    return new DiVector(dimensions);
}
use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
the class RandomCutForestBenchmark method attributionAndUpdate.
/**
 * Benchmark that, for every data row from INITIAL_DATA_SIZE onward, first
 * requests the anomaly attribution of the row and then updates the forest with
 * that same row. Only the attribution of the last row is handed to the
 * blackhole.
 *
 * @param state     supplies the data rows and the forest under test
 * @param blackhole sink for the final attribution vector
 * @return the forest after all updates
 */
@Benchmark
@OperationsPerInvocation(DATA_SIZE)
public RandomCutForest attributionAndUpdate(BenchmarkState state, Blackhole blackhole) {
    forest = state.forest;
    double[][] rows = state.data;
    // placeholder value, consumed as-is when no rows remain past the initial segment
    DiVector lastAttribution = new DiVector(forest.getDimensions());
    int position = INITIAL_DATA_SIZE;
    while (position < rows.length) {
        double[] current = rows[position];
        lastAttribution = forest.getAnomalyAttribution(current);
        forest.update(current);
        position++;
    }
    blackhole.consume(lastAttribution);
    return forest;
}
use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
the class PredictorCorrector method detect.
/**
* The core of the predictor-corrector thresholding for shingled data points. It
* uses a simple threshold provided by the basic thresholder. It first checks
* whether the score is explained by obvious effects (an earlier anomaly still
* inside the shingle); absent such an explanation, for repeated breaches, it
* checks how critical the new current information is.
*
* The method scores the RCF point held by result, records the score and the
* current threshold on result, and then either returns early with grade 0 or
* sets the anomaly grade, attribution, relative index and expected point on
* result. The thresholder is updated on every path that passes the score == 0
* check.
*
* @param result returns the augmented description
* @param lastAnomalyDescriptor state of the computation for the last anomaly
* @param forest the forest queried for anomaly scores and attributions
* @return the anomaly descriptor result (which has plausibly mutated); it is
* returned unchanged when it carries no RCF point
*/
protected AnomalyDescriptor detect(AnomalyDescriptor result, IRCFComputeDescriptor lastAnomalyDescriptor, RandomCutForest forest) {
double[] point = result.getRCFPoint();
// nothing to score
if (point == null) {
return result;
}
double score = forest.getAnomalyScore(point);
result.setRCFScore(score);
result.setRCFPoint(point);
long internalTimeStamp = result.getInternalTimeStamp();
// a zero score ends processing; the thresholder is not updated on this path
if (score == 0) {
return result;
}
int shingleSize = result.getShingleSize();
int baseDimensions = result.getDimension() / shingleSize;
// offset of the last block of baseDimensions values in the shingled point
int startPosition = (shingleSize - 1) * baseDimensions;
result.setThreshold(thresholder.threshold());
// captured before any thresholder.update call below
boolean previousIsPotentialAnomaly = thresholder.isInPotentialAnomaly();
/*
* We first check if the score is high enough to be considered as a candidate
* anomaly. If not, which is hopefully 99% of the data, the computation is short
*/
if (thresholder.getAnomalyGrade(score, previousIsPotentialAnomaly) == 0) {
result.setAnomalyGrade(0);
// not a candidate: record that we are outside the high score region
result.setInHighScoreRegion(false);
thresholder.update(score, score, 0, false);
return result;
}
// the score is now high enough to be considered an anomaly, so record that we
// are inside the high score region
result.setInHighScoreRegion(true);
/*
* We now check if (1) we have another anomaly in the current shingle (2) have
* predictions about what the values should have been and (3) replacing by those
* "should have been" makes the anomaly score of the new shingled point low
* enough to not be an anomaly. In this case we can "explain" the high score is
* due to the past and do not need to vend anomaly -- because the most recent
* point, on their own would not produce an anomalous shingle.
*
* However, the strategy is only executable if there are (A) sufficiently many
* observations and (B) enough data in each time point such that the forecast is
* reasonable. While forecasts can be corrected for very low shingle sizes and
* say 1d input, the allure of RCF is in the multivariate case. Even for 1d, a
* shingleSize of 4 or larger would produce reasonable forecast for the purposes
* of anomaly detection.
*/
// number of internal timestamps elapsed since the last anomaly descriptor
int gap = (int) (internalTimeStamp - lastAnomalyDescriptor.getInternalTimeStamp());
// the forecast may not be reasonable with less data
boolean reasonableForecast = result.isReasonableForecast();
// the correction is attempted only when the last anomaly has both a point and
// an expected point, and gap lies in (0, shingleSize]
if (reasonableForecast && lastAnomalyDescriptor.getRCFPoint() != null && lastAnomalyDescriptor.getExpectedRCFPoint() != null && gap > 0 && gap <= shingleSize) {
double[] correctedPoint = applyBasicCorrector(point, gap, shingleSize, baseDimensions, lastAnomalyDescriptor);
double correctedScore = forest.getAnomalyScore(correctedPoint);
// we know we are looking at previous anomalies
if (thresholder.getAnomalyGrade(correctedScore, true) == 0) {
// fixing the past makes this anomaly go away; nothing to do but process the
// score. We will not change inHighScoreRegion however, because the score has
// been larger
thresholder.update(score, correctedScore, 0, false);
result.setExpectedRCFPoint(correctedPoint);
result.setAnomalyGrade(0);
return result;
}
}
/*
* We now check the most egregious values seen in the current timestamp, as
* determined by attribution. Those locations provide information about (a)
* which attributes and (b) what the values should have been. However, those
* calculations of imputation only make sense when sufficient observations are
* available.
*/
DiVector attribution = forest.getAnomalyAttribution(point);
// newPoint and newAttribution stay null, and newScore stays equal to score,
// unless an expected point is computed in the else branch below
double[] newPoint = null;
double newScore = score;
DiVector newAttribution = null;
/*
* we now find the time slice, relative to the current time, which is indicative
* of the high score. relativeIndex = 0 is current time. It is negative if the
* most egregious attribution was due to the past values in the shingle
*/
int index = maxContribution(attribution, baseDimensions, -shingleSize) + 1;
if (!previousIsPotentialAnomaly && trigger(attribution, gap, baseDimensions, null, false, lastAnomalyDescriptor)) {
// first breach (not previously in a potential anomaly) and the trigger fires:
// report the grade and mark the start of an anomaly
result.setAnomalyGrade(thresholder.getAnomalyGrade(score, false));
result.setStartOfAnomaly(true);
thresholder.update(score, score, 0, true);
} else {
/*
* we again check if the new input produces an anomaly/not on its own
*/
if (reasonableForecast) {
newPoint = getExpectedPoint(attribution, startPosition, baseDimensions, point, forest);
if (newPoint != null) {
newAttribution = forest.getAnomalyAttribution(newPoint);
newScore = forest.getAnomalyScore(newPoint);
result.setExpectedRCFPoint(newPoint);
}
}
// requires both the trigger and a strict drop in score for the expected point;
// when no expected point was computed, newScore == score and this branch is
// not taken
if (trigger(attribution, gap, baseDimensions, newAttribution, previousIsPotentialAnomaly, lastAnomalyDescriptor) && score > newScore) {
result.setAnomalyGrade(thresholder.getAnomalyGrade(score, previousIsPotentialAnomaly));
// current point
index = 0;
thresholder.update(score, newScore, 0, true);
} else {
// previousIsPotentialAnomaly is true now, but not calling it anomaly either
thresholder.update(score, newScore, 0, true);
result.setAnomalyGrade(0);
return result;
}
}
// only paths that set an anomaly grade from the thresholder reach this point
result.setAttribution(attribution);
result.setRelativeIndex(index);
if (reasonableForecast) {
// anomaly in the past and detected late; repositioning the computation
// index 0 is current time, for which this expression equals
// (shingleSize - 1) * baseDimensions, the initial startPosition
startPosition = shingleSize * baseDimensions + (result.getRelativeIndex() - 1) * baseDimensions;
newPoint = getExpectedPoint(result.getAttribution(), startPosition, baseDimensions, point, forest);
}
// without a reasonable forecast newPoint is still null here, so this sets the
// expected point to null
result.setExpectedRCFPoint(newPoint);
return result;
}
Aggregations