Use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
The class RandomCutForestShingledBenchmark, method attributionAndUpdate.
@Benchmark
@OperationsPerInvocation(DATA_SIZE)
public RandomCutForest attributionAndUpdate(BenchmarkState state, Blackhole blackhole) {
    double[][] data = state.data;
    forest = state.forest;
    // initialized up front so the variable is definitely assigned even if the loop never runs
    DiVector vector = new DiVector(forest.getDimensions());
    for (int i = INITIAL_DATA_SIZE; i < data.length; i++) {
        // attribute first, then update, so the point does not score against itself
        vector = forest.getAnomalyAttribution(data[i]);
        forest.update(data[i]);
    }
    blackhole.consume(vector);
    return forest;
}
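Outside of JMH, the same attribute-then-update loop looks like the sketch below. This is a minimal, hypothetical example: the class name, forest parameters, score threshold, and the Gaussian input stream are illustrative stand-ins for the benchmark's BenchmarkState, not values taken from it.

import java.util.Random;

import com.amazon.randomcutforest.RandomCutForest;
import com.amazon.randomcutforest.returntypes.DiVector;

public class AttributionLoopSketch {
    public static void main(String[] args) {
        int dimensions = 3;
        // illustrative parameters; the benchmark reads its forest from BenchmarkState
        RandomCutForest forest = RandomCutForest.builder().numberOfTrees(50).sampleSize(256)
                .dimensions(dimensions).randomSeed(42).build();
        Random random = new Random(0);
        for (int i = 0; i < 1000; i++) {
            double[] point = new double[dimensions];
            for (int j = 0; j < dimensions; j++) {
                point[j] = random.nextGaussian();
            }
            // attribute before updating so the point does not score against itself
            DiVector attribution = forest.getAnomalyAttribution(point);
            if (attribution.getHighLowSum() > 2.0) {
                System.out.println("possible anomaly at index " + i);
            }
            forest.update(point);
        }
    }
}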
Use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
The class AbstractAttributionVisitor, method getResult.
/**
 * Take the normalization function applied by the corresponding scoring visitor
 * and apply it to each coordinate of the DiVector, modifying the data in
 * place. The function has to be additive in its first parameter; that is,
 * fn(x1, y) + fn(x2, y) = fn(x1 + x2, y).
 *
 * @return The modified data.
 */
@Override
public DiVector getResult() {
    DiVector result = new DiVector(directionalAttribution);
    result.componentwiseTransform(x -> CommonUtils.defaultScalarNormalizerFunction(x, treeMass));
    return result;
}
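The additivity requirement in the Javadoc can be checked concretely. The sketch below uses a hypothetical normalizer fn(x, m) = x / log(1 + m), which is linear in x; the actual formula in CommonUtils.defaultScalarNormalizerFunction may differ, but any function linear in its first argument satisfies the identity.

import java.util.function.DoubleBinaryOperator;

public class AdditivityCheck {
    public static void main(String[] args) {
        // hypothetical normalizer, linear in its first argument; the real
        // CommonUtils.defaultScalarNormalizerFunction may use a different formula
        DoubleBinaryOperator fn = (x, mass) -> x / Math.log(1 + mass);
        double x1 = 0.7, x2 = 1.3, treeMass = 256;
        double lhs = fn.applyAsDouble(x1, treeMass) + fn.applyAsDouble(x2, treeMass);
        double rhs = fn.applyAsDouble(x1 + x2, treeMass);
        // additivity means transforming each DiVector coordinate separately
        // still yields components that sum to the transformed total score
        System.out.printf("lhs = %.6f, rhs = %.6f%n", lhs, rhs);
    }
}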
Use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
The class AttributionExamplesFunctionalTest, method RRCFattributionTest.
@Test
public void RRCFattributionTest() {
    // starts with the same setup as rrcfTest; data corresponds to two small
    // clusters at x=+/-5.0
    // queries q_1=(0,0,0, ..., 0)
    // inserts updates near (0,0,0, ..., 0) a few times
    // queries q_2=(0,1,0, ..., 0)
    // the attribution of q_2 (which remains an anomaly) is now affected by
    // the sampled neighborhood of q_1
    int newDimensions = 30;
    randomSeed = 101;
    sampleSize = 256;
    RandomCutForest newForest = RandomCutForest.builder().numberOfTrees(100).sampleSize(sampleSize)
            .dimensions(newDimensions).randomSeed(randomSeed).compact(true)
            .boundingBoxCacheFraction(0.0).build();
    dataSize = 2000 + 5;
    baseMu = 0.0;
    baseSigma = 1.0;
    anomalyMu = 0.0;
    anomalySigma = 1.0;
    transitionToAnomalyProbability = 0.0;
    // ignoring anomaly cluster for now
    transitionToBaseProbability = 1.0;
    Random prg = new Random(0);
    NormalMixtureTestData generator = new NormalMixtureTestData(baseMu, baseSigma, anomalyMu,
            anomalySigma, transitionToAnomalyProbability, transitionToBaseProbability);
    double[][] data = generator.generateTestData(dataSize, newDimensions, 100);
    for (int i = 0; i < 2000; i++) {
        // shrink, shift at random
        for (int j = 0; j < newDimensions; j++) {
            data[i][j] *= 0.01;
        }
        if (prg.nextDouble() < 0.5) {
            data[i][0] += 5.0;
        } else {
            data[i][0] -= 5.0;
        }
        newForest.update(data[i]);
    }
    double[] queryOne = new double[newDimensions];
    double[] queryTwo = new double[newDimensions];
    queryTwo[1] = 1;
    double originalScoreTwo = newForest.getAnomalyScore(queryTwo);
    DiVector originalAttrTwo = newForest.getAnomalyAttribution(queryTwo);
    assertTrue(originalScoreTwo > 3.0);
    assertEquals(originalScoreTwo, originalAttrTwo.getHighLowSum(), 1E-5);
    // due to -5 cluster
    assertTrue(originalAttrTwo.high[0] > 1.0);
    // due to +5 cluster
    assertTrue(originalAttrTwo.low[0] > 1.0);
    // due to +1 in query
    assertTrue(originalAttrTwo.high[1] > 1);
    assertTrue(originalAttrTwo.getHighLowSum(0) > 1.1 * originalAttrTwo.getHighLowSum(1));
    // insert points near queryOne (small perturbations of the origin) a few
    // times to make sure that region is sampled
    for (int i = 2000; i < 2000 + 5; i++) {
        double score = newForest.getAnomalyScore(queryOne);
        double score2 = newForest.getAnomalyScore(queryTwo);
        DiVector attr2 = newForest.getAnomalyAttribution(queryTwo);
        // verify
        assertTrue(score > 2.0);
        assertTrue(score2 > 2.0);
        assertEquals(attr2.getHighLowSum(), score2, 1E-5);
        for (int j = 0; j < newDimensions; j++) {
            data[i][j] *= 0.01;
        }
        newForest.update(data[i]);
        // 5 different anomalous points
    }
    double midScoreTwo = newForest.getAnomalyScore(queryTwo);
    DiVector midAttrTwo = newForest.getAnomalyAttribution(queryTwo);
    assertTrue(midScoreTwo > 2.4);
    assertEquals(midScoreTwo, midAttrTwo.getHighLowSum(), 1E-5);
    // due to -5 cluster !!!
    assertTrue(midAttrTwo.high[0] < 1);
    // due to +5 cluster !!!
    assertTrue(midAttrTwo.low[0] < 1);
    // due to +1 in query
    assertTrue(midAttrTwo.high[1] > 1);
    assertTrue(midAttrTwo.getHighLowSum(0) < 1.1 * midAttrTwo.high[1]);
    // a few more updates, which are identical
    for (int i = 2005; i < 2010; i++) {
        newForest.update(queryOne);
    }
    double finalScoreTwo = newForest.getAnomalyScore(queryTwo);
    DiVector finalAttrTwo = newForest.getAnomalyAttribution(queryTwo);
    assertTrue(finalScoreTwo > 2.4);
    assertEquals(finalScoreTwo, finalAttrTwo.getHighLowSum(), 1E-5);
    // due to -5 cluster !!!
    assertTrue(finalAttrTwo.high[0] < 0.5);
    // due to +5 cluster !!!
    assertTrue(finalAttrTwo.low[0] < 0.5);
    // due to +1 in query
    assertTrue(finalAttrTwo.high[1] > 1);
    assertTrue(2.5 * finalAttrTwo.getHighLowSum(0) < finalAttrTwo.high[1]);
    // the drop in high[0] and low[0] is steep and the attribution has shifted
}
Use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
The class RandomCutForestFunctionalTest, method testGetAnomalyAttribution.
@ParameterizedTest
@ArgumentsSource(TestForestProvider.class)
public void testGetAnomalyAttribution(RandomCutForest forest) {
    /* This method checks that the scores and attributions are consistent */
    double[] point = { 0.0, 0.0, 0.0 };
    DiVector seenResult = forest.getAnomalyAttribution(point);
    double seenScore = forest.getAnomalyScore(point);
    assertTrue(seenResult.getHighLowSum(0) < 0.5);
    assertTrue(seenResult.getHighLowSum(1) < 0.5);
    assertTrue(seenResult.getHighLowSum(2) < 0.5);
    assertTrue(seenScore < 1.0);
    assertEquals(seenScore, seenResult.getHighLowSum(), 1E-10);
    DiVector likelyResult = forest.getApproximateAnomalyAttribution(point);
    double score = forest.getApproximateAnomalyScore(point);
    assertTrue(likelyResult.getHighLowSum(0) < 0.5);
    assertTrue(likelyResult.getHighLowSum(1) < 0.5);
    assertTrue(likelyResult.getHighLowSum(2) < 0.5);
    assertEquals(score, likelyResult.getHighLowSum(), 0.1);
    assertEquals(seenResult.getHighLowSum(), likelyResult.getHighLowSum(), 0.1);
}
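The test relies on the invariant that getHighLowSum(i) equals high[i] + low[i] and that getHighLowSum() totals these over all dimensions, which is what makes the attribution directly comparable to the scalar anomaly score. Below is a minimal sketch of that invariant, using the public high/low arrays seen in the tests on this page; the specific values are arbitrary.

import com.amazon.randomcutforest.returntypes.DiVector;

public class DiVectorSumSketch {
    public static void main(String[] args) {
        DiVector vector = new DiVector(3);
        // arbitrary directional contributions for illustration
        vector.high[0] = 0.4;
        vector.low[0] = 0.1;
        vector.high[1] = 0.2;
        double manualTotal = 0;
        for (int i = 0; i < 3; i++) {
            manualTotal += vector.high[i] + vector.low[i];
        }
        // expected to match getHighLowSum() up to floating-point rounding,
        // mirroring the score-versus-attribution checks in the test above
        System.out.println(Math.abs(manualTotal - vector.getHighLowSum()) < 1E-12);
    }
}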
Use of com.amazon.randomcutforest.returntypes.DiVector in project random-cut-forest-by-aws by aws.
The class RandomCutForestFunctionalTest, method testMultipleAttributions.
@ParameterizedTest
@ArgumentsSource(TestForestProvider.class)
public void testMultipleAttributions(RandomCutForest forest) {
    /**
     * We test the attribution over random runs. Any single narrow test can
     * fail occasionally, so we count the narrow tests that pass and assert
     * only on aggregate counts, which are far less likely to be misleading
     * on failure.
     */
    int hardPass = 0;
    int causal = 0;
    double[] point = { 6.0, 0.0, 0.0 };
    DiVector result = forest.getAnomalyAttribution(point);
    assertTrue(result.low[0] < 0.2);
    if (result.getHighLowSum(1) < 0.5)
        ++hardPass;
    if (result.getHighLowSum(2) < 0.5)
        ++hardPass;
    assertTrue(result.getHighLowSum(1) + result.getHighLowSum(2) < 1.0);
    assertTrue(result.high[0] > forest.getAnomalyScore(point) / 3);
    if (result.high[0] > 0.5 * forest.getAnomalyScore(point))
        ++causal;
    // the last check states that the first coordinate was high and was a
    // majority contributor to the score; the assertion before it states that
    // the contribution is twice the average of the six possible contributors
    // (high and low in each of three dimensions)
    // together, these checks cover all subparts of the score at once
    point = new double[] { -6.0, 0.0, 0.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    assertTrue(result.high[0] < 0.5);
    if (result.getHighLowSum(1) < 0.5)
        ++hardPass;
    if (result.getHighLowSum(2) < 0.5)
        ++hardPass;
    assertTrue(result.low[0] > forest.getAnomalyScore(point) / 3);
    if (result.low[0] > 0.5 * forest.getAnomalyScore(point))
        ++causal;
    point = new double[] { 0.0, 6.0, 0.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    if (result.getHighLowSum(0) < 0.5)
        ++hardPass;
    if (result.getHighLowSum(2) < 0.5)
        ++hardPass;
    assertTrue(result.low[1] < 0.5);
    assertTrue(result.high[1] > forest.getAnomalyScore(point) / 3);
    if (result.high[1] > 0.5 * forest.getAnomalyScore(point))
        ++causal;
    point = new double[] { 0.0, -6.0, 0.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    if (result.getHighLowSum(0) < 0.5)
        ++hardPass;
    if (result.getHighLowSum(2) < 0.5)
        ++hardPass;
    assertTrue(result.high[1] < 0.5);
    assertTrue(result.low[1] > forest.getAnomalyScore(point) / 3);
    if (result.low[1] > 0.5 * forest.getAnomalyScore(point))
        ++causal;
    point = new double[] { 0.0, 0.0, 6.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    if (result.getHighLowSum(0) < 0.5)
        ++hardPass;
    if (result.getHighLowSum(1) < 0.5)
        ++hardPass;
    assertTrue(result.low[2] < 0.5);
    assertTrue(result.high[2] > forest.getAnomalyScore(point) / 3);
    if (result.high[2] > 0.5 * forest.getAnomalyScore(point))
        ++causal;
    point = new double[] { 0.0, 0.0, -6.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    if (result.getHighLowSum(0) < 0.5)
        ++hardPass;
    if (result.getHighLowSum(1) < 0.5)
        ++hardPass;
    assertTrue(result.high[2] < 0.5);
    assertTrue(result.low[2] > forest.getAnomalyScore(point) / 3);
    if (result.low[2] > 0.5 * forest.getAnomalyScore(point))
        ++causal;
    // maximum is 6; there can be skew in one direction
    assertTrue(causal >= 5);
    point = new double[] { -3.0, 0.0, 0.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.high[0] < 0.5);
    if (result.getHighLowSum(1) < 0.5)
        ++hardPass;
    if (result.getHighLowSum(2) < 0.5)
        ++hardPass;
    assertTrue(result.low[0] > forest.getAnomalyScore(point) / 3);
    /*
     * With multiple causes, the relationship between attribution and score
     * only holds at larger distances.
     */
    point = new double[] { -3.0, 6.0, 0.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    if (result.low[0] > 0.5)
        ++hardPass;
    assertTrue(result.high[0] < 0.5);
    assertTrue(result.low[1] < 0.5);
    assertTrue(result.high[1] > 0.5);
    if (result.high[1] > 0.9)
        ++hardPass;
    assertTrue(result.getHighLowSum(2) < 0.5);
    assertTrue(result.high[1] + result.low[0] > 0.8 * forest.getAnomalyScore(point));
    point = new double[] { 6.0, -3.0, 0.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    assertTrue(result.low[0] < 0.5);
    assertTrue(result.high[0] > 0.5);
    if (result.high[0] > 0.9)
        ++hardPass;
    if (result.low[1] > 0.5)
        ++hardPass;
    assertTrue(result.high[1] < 0.5);
    assertTrue(result.getHighLowSum(2) < 0.5);
    assertTrue(result.high[0] + result.low[1] > 0.8 * forest.getAnomalyScore(point));
    point = new double[] { 20.0, -10.0, 0.0 };
    result = forest.getAnomalyAttribution(point);
    assertTrue(result.getHighLowSum() > 1.0);
    assertTrue(result.high[0] + result.low[1] > 0.8 * forest.getAnomalyScore(point));
    if (result.high[0] > 1.8 * result.low[1])
        ++hardPass;
    if (result.low[1] > result.high[0] / 2.2)
        ++hardPass;
    // maximum is 20
    assertTrue(hardPass >= 15);
}
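The assertions above repeatedly check whether a single directional component dominates the score, e.g. result.high[0] > 0.5 * forest.getAnomalyScore(point). A hypothetical helper along these lines (not part of the library API) makes that pattern reusable when inspecting attribution results:

import com.amazon.randomcutforest.returntypes.DiVector;

public class AttributionInspector {
    /**
     * Returns a label such as "+0" (high[0] dominates) or "-1" (low[1]
     * dominates) for the largest directional contributor. Hypothetical
     * convenience method, not part of the library API.
     */
    static String dominantDirection(DiVector vector) {
        int bestIndex = 0;
        boolean bestIsHigh = true;
        double best = vector.high[0];
        for (int i = 0; i < vector.high.length; i++) {
            if (vector.high[i] > best) {
                best = vector.high[i];
                bestIndex = i;
                bestIsHigh = true;
            }
            if (vector.low[i] > best) {
                best = vector.low[i];
                bestIndex = i;
                bestIsHigh = false;
            }
        }
        return (bestIsHigh ? "+" : "-") + bestIndex;
    }
}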