use of com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest in project random-cut-forest-by-aws by aws.
the class ThresholdedRandomCutForestMapperTest method testRoundTripStandard.
@ParameterizedTest
@EnumSource(value = TransformMethod.class)
public void testRoundTripStandard(TransformMethod method) {
int sampleSize = 256;
int baseDimensions = 1;
int shingleSize = 8;
int dimensions = baseDimensions * shingleSize;
long seed = 0;
new Random().nextLong();
ThresholdedRandomCutForest first = new ThresholdedRandomCutForest.Builder<>().compact(true).dimensions(dimensions).precision(Precision.FLOAT_32).randomSeed(seed).internalShinglingEnabled(true).shingleSize(shingleSize).anomalyRate(0.01).transformMethod(method).adjustThreshold(true).boundingBoxCacheFraction(0).weights(new double[] { 1.0 }).build();
ThresholdedRandomCutForest second = new ThresholdedRandomCutForest.Builder<>().compact(true).dimensions(dimensions).precision(Precision.FLOAT_32).randomSeed(seed).internalShinglingEnabled(true).shingleSize(shingleSize).anomalyRate(0.01).transformMethod(method).adjustThreshold(true).weights(new double[] { 1.0 }).build();
double value = 0.75 + 0.5 * new Random().nextDouble();
first.setLowerThreshold(value);
second.setLowerThreshold(value);
Random r = new Random();
MultiDimDataWithKey dataWithKeys = ShingledMultiDimDataWithKeys.getMultiDimData(10 * sampleSize, 50, 100, 5, seed, baseDimensions);
for (double[] point : dataWithKeys.data) {
AnomalyDescriptor firstResult = first.process(point, 0L);
AnomalyDescriptor secondResult = second.process(point, 0L);
assertEquals(firstResult.getRCFScore(), secondResult.getRCFScore(), 1e-10);
if (firstResult.getAnomalyGrade() > 0) {
assertEquals(secondResult.getAnomalyGrade(), firstResult.getAnomalyGrade(), 1e-10);
assert (firstResult.getRCFScore() >= value);
}
}
// serialize + deserialize
ThresholdedRandomCutForestMapper mapper = new ThresholdedRandomCutForestMapper();
ThresholdedRandomCutForest third = mapper.toModel(mapper.toState(second));
MultiDimDataWithKey testData = ShingledMultiDimDataWithKeys.getMultiDimData(100, 50, 100, 5, seed, baseDimensions);
// update re-instantiated forest
for (double[] point : testData.data) {
AnomalyDescriptor firstResult = first.process(point, 0L);
AnomalyDescriptor secondResult = second.process(point, 0L);
AnomalyDescriptor thirdResult = third.process(point, 0L);
assertEquals(firstResult.getRCFScore(), secondResult.getRCFScore(), 1e-10);
assertEquals(firstResult.getRCFScore(), thirdResult.getRCFScore(), 1e-10);
}
}
use of com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest in project random-cut-forest-by-aws by aws.
the class ThresholdedTime method run.
@Override
public void run() throws Exception {
// Create and populate a random cut forest
int shingleSize = 4;
int numberOfTrees = 50;
int sampleSize = 256;
Precision precision = Precision.FLOAT_32;
int dataSize = 4 * sampleSize;
// change this to try different number of attributes,
// this parameter is not expected to be larger than 5 for this example
int baseDimensions = 1;
int count = 0;
int dimensions = baseDimensions * shingleSize;
ThresholdedRandomCutForest forest = new ThresholdedRandomCutForest.Builder<>().compact(true).dimensions(dimensions).randomSeed(0).numberOfTrees(numberOfTrees).shingleSize(shingleSize).sampleSize(sampleSize).internalShinglingEnabled(true).precision(precision).anomalyRate(0.01).forestMode(ForestMode.TIME_AUGMENTED).normalizeTime(true).build();
long seed = new Random().nextLong();
double[] data = new double[] { 1.0 };
System.out.println("seed = " + seed);
NormalMixtureTestData normalMixtureTestData = new NormalMixtureTestData(10, 50);
MultiDimDataWithKey dataWithKeys = normalMixtureTestData.generateTestDataWithKey(dataSize, 1, 0);
/**
* the anomalies will move from normal -> anomalous -> normal starts from normal
*/
boolean anomalyState = false;
int keyCounter = 0;
for (double[] point : dataWithKeys.data) {
long time = (long) (1000L * count + Math.floor(10 * point[0]));
AnomalyDescriptor result = forest.process(data, time);
if (keyCounter < dataWithKeys.changeIndices.length && count == dataWithKeys.changeIndices[keyCounter]) {
System.out.print("Sequence " + count + " stamp " + (result.getInternalTimeStamp()) + " CHANGE ");
if (!anomalyState) {
System.out.println(" to Distribution 1 ");
} else {
System.out.println(" to Distribution 0 ");
}
anomalyState = !anomalyState;
++keyCounter;
}
if (result.getAnomalyGrade() != 0) {
System.out.print("Sequence " + count + " stamp " + (result.getInternalTimeStamp()) + " RESULT ");
System.out.print("score " + result.getRCFScore() + ", grade " + result.getAnomalyGrade() + ", ");
if (result.isExpectedValuesPresent()) {
if (result.getRelativeIndex() != 0 && result.isStartOfAnomaly()) {
System.out.print(-result.getRelativeIndex() + " steps ago, instead of stamp " + result.getPastTimeStamp());
System.out.print(", expected timestamp " + result.getExpectedTimeStamp() + " ( " + (result.getPastTimeStamp() - result.getExpectedTimeStamp() + ")"));
} else {
System.out.print("expected " + result.getExpectedTimeStamp() + " ( " + (result.getInternalTimeStamp() - result.getExpectedTimeStamp() + ")"));
}
}
System.out.println();
}
++count;
}
}
use of com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest in project random-cut-forest-by-aws by aws.
the class ThresholdedInternalShinglingExample method run.
@Override
public void run() throws Exception {
// Create and populate a random cut forest
int shingleSize = 4;
int numberOfTrees = 50;
int sampleSize = 256;
Precision precision = Precision.FLOAT_32;
int dataSize = 4 * sampleSize;
// change this to try different number of attributes,
// this parameter is not expected to be larger than 5 for this example
int baseDimensions = 1;
long count = 0;
int dimensions = baseDimensions * shingleSize;
TransformMethod transformMethod = TransformMethod.NORMALIZE_DIFFERENCE;
ThresholdedRandomCutForest forest = ThresholdedRandomCutForest.builder().compact(true).dimensions(dimensions).randomSeed(0).numberOfTrees(numberOfTrees).shingleSize(shingleSize).sampleSize(sampleSize).internalShinglingEnabled(true).precision(precision).anomalyRate(0.01).forestMode(ForestMode.STANDARD).weightTime(0).transformMethod(transformMethod).normalizeTime(true).outputAfter(32).initialAcceptFraction(0.125).build();
ThresholdedRandomCutForest second = ThresholdedRandomCutForest.builder().compact(true).dimensions(dimensions).randomSeed(0).numberOfTrees(numberOfTrees).shingleSize(shingleSize).sampleSize(sampleSize).internalShinglingEnabled(true).precision(precision).anomalyRate(0.01).forestMode(ForestMode.TIME_AUGMENTED).weightTime(0).transformMethod(transformMethod).normalizeTime(true).outputAfter(32).initialAcceptFraction(0.125).build();
// ensuring that the parameters are the same; otherwise the grades/scores cannot
// be the same
// weighTime has to be 0
forest.setLowerThreshold(1.1);
second.setLowerThreshold(1.1);
forest.setHorizon(0.75);
second.setHorizon(0.75);
long seed = new Random().nextLong();
Random noise = new Random(0);
System.out.println("seed = " + seed);
// change the last argument seed for a different run
MultiDimDataWithKey dataWithKeys = ShingledMultiDimDataWithKeys.getMultiDimData(dataSize + shingleSize - 1, 50, 100, 5, seed, baseDimensions);
int keyCounter = 0;
for (double[] point : dataWithKeys.data) {
// idea is that we expect the arrival order to be roughly 100 apart (say
// seconds)
// then the noise corresponds to a jitter; one can try TIME_AUGMENTED and
// .normalizeTime(true)
long timestamp = 100 * count + noise.nextInt(10) - 5;
AnomalyDescriptor result = forest.process(point, timestamp);
AnomalyDescriptor test = second.process(point, timestamp);
checkArgument(Math.abs(result.getRCFScore() - test.getRCFScore()) < 1e-10, " error");
checkArgument(Math.abs(result.getAnomalyGrade() - test.getAnomalyGrade()) < 1e-10, " error");
if (keyCounter < dataWithKeys.changeIndices.length && count == dataWithKeys.changeIndices[keyCounter]) {
System.out.println("timestamp " + count + " CHANGE " + Arrays.toString(dataWithKeys.changes[keyCounter]));
++keyCounter;
}
if (result.getAnomalyGrade() != 0) {
System.out.print("timestamp " + count + " RESULT value " + result.getInternalTimeStamp() + " ");
for (int i = 0; i < baseDimensions; i++) {
System.out.print(result.getCurrentInput()[i] + ", ");
}
System.out.print("score " + result.getRCFScore() + ", grade " + result.getAnomalyGrade() + ", ");
if (result.getRelativeIndex() != 0 && result.isStartOfAnomaly()) {
System.out.print(-result.getRelativeIndex() + " steps ago, ");
}
if (result.isExpectedValuesPresent()) {
if (result.getRelativeIndex() != 0 && result.isStartOfAnomaly()) {
System.out.print("instead of ");
for (int i = 0; i < baseDimensions; i++) {
System.out.print(result.getPastValues()[i] + ", ");
}
System.out.print("expected ");
for (int i = 0; i < baseDimensions; i++) {
System.out.print(result.getExpectedValuesList()[0][i] + ", ");
if (result.getPastValues()[i] != result.getExpectedValuesList()[0][i]) {
System.out.print("( " + (result.getPastValues()[i] - result.getExpectedValuesList()[0][i]) + " ) ");
}
}
} else {
System.out.print("expected ");
for (int i = 0; i < baseDimensions; i++) {
System.out.print(result.getExpectedValuesList()[0][i] + ", ");
if (result.getCurrentInput()[i] != result.getExpectedValuesList()[0][i]) {
System.out.print("( " + (result.getCurrentInput()[i] - result.getExpectedValuesList()[0][i]) + " ) ");
}
}
}
} else {
System.out.print("insufficient data to provide expected values");
}
System.out.println();
}
++count;
}
}
use of com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest in project ml-commons by opensearch-project.
the class FixedInTimeRandomCutForest method predict.
@Override
public MLOutput predict(DataFrame dataFrame, Model model) {
if (model == null) {
throw new IllegalArgumentException("No model found for FIT RCF prediction.");
}
ThresholdedRandomCutForestState state = (ThresholdedRandomCutForestState) ModelSerDeSer.deserialize(model.getContent());
ThresholdedRandomCutForest forest = trcfMapper.toModel(state);
List<Map<String, Object>> predictResult = process(dataFrame, forest);
return MLPredictionOutput.builder().predictionResult(DataFrameBuilder.load(predictResult)).build();
}
use of com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest in project ml-commons by opensearch-project.
the class FixedInTimeRandomCutForest method process.
private List<Map<String, Object>> process(DataFrame dataFrame, ThresholdedRandomCutForest forest) {
List<Double> pointList = new ArrayList<>();
ColumnMeta[] columnMetas = dataFrame.columnMetas();
List<Map<String, Object>> predictResult = new ArrayList<>();
for (int rowNum = 0; rowNum < dataFrame.size(); rowNum++) {
Row row = dataFrame.getRow(rowNum);
long timestamp = -1;
for (int i = 0; i < columnMetas.length; i++) {
ColumnMeta columnMeta = columnMetas[i];
ColumnValue value = row.getValue(i);
// TODO: sort dataframe by time field with asc order. Currently consider the date already sorted by time.
if (timeField != null && timeField.equals(columnMeta.getName())) {
ColumnType columnType = columnMeta.getColumnType();
if (columnType == ColumnType.LONG) {
timestamp = value.longValue();
} else if (columnType == ColumnType.STRING) {
try {
timestamp = simpleDateFormat.parse(value.stringValue()).getTime();
} catch (ParseException e) {
log.error("Failed to parse timestamp " + value.stringValue(), e);
throw new MLValidationException("Failed to parse timestamp " + value.stringValue());
}
} else {
throw new MLValidationException("Wrong data type of time field. Should use LONG or STRING, but got " + columnType);
}
} else {
pointList.add(value.doubleValue());
}
}
double[] point = pointList.stream().mapToDouble(d -> d).toArray();
pointList.clear();
Map<String, Object> result = new HashMap<>();
AnomalyDescriptor process = forest.process(point, timestamp);
result.put(timeField, timestamp);
result.put("score", process.getRCFScore());
result.put("anomaly_grade", process.getAnomalyGrade());
predictResult.add(result);
}
return predictResult;
}
Aggregations