Use of uk.ac.sussex.gdsc.smlm.math3.distribution.PoissonDistribution in project presto by prestodb.
The class TestTDigest, method testPoissonDistribution.
/**
 * Exercises the t-digest with Poisson-distributed integer samples.
 *
 * For every round {@code r} in {@code 1 .. trials - 1} a fresh digest is filled with
 * {@code NUMBER_OF_ENTRIES} draws from a Poisson distribution of mean {@code r * 0.1};
 * the same draws are kept in a reference list. The digest's sum is checked against the
 * list, then each configured quantile is checked against the sorted list.
 *
 * The test is currently disabled ({@code enabled = false}).
 */
@Test(enabled = false)
public void testPoissonDistribution() {
    final int trials = 10;
    int round = 1;
    while (round < trials) {
        TDigest digest = createTDigest(STANDARD_COMPRESSION_FACTOR);
        PoissonDistribution source = new PoissonDistribution(round * 0.1);
        List<Integer> observed = new ArrayList<>();

        // Feed identical draws to the digest and to the reference list.
        for (int n = 0; n < NUMBER_OF_ENTRIES; n++) {
            int drawn = source.sample();
            digest.add(drawn);
            observed.add(drawn);
        }

        assertSumInts(observed, digest);

        // Quantile assertions are made against the sorted reference values.
        Collections.sort(observed);
        for (int q = 0; q < quantile.length; q++) {
            assertDiscreteWithinBound(quantile[q], STANDARD_ERROR, observed, digest);
        }
        round++;
    }
}
Use of uk.ac.sussex.gdsc.smlm.math3.distribution.PoissonDistribution in project presto by prestodb.
The class MathFunctions, method inversePoissonCdf.
/**
 * SQL scalar function: inverse cumulative probability of a Poisson distribution.
 *
 * @param lambda mean of the distribution; must be strictly positive
 * @param p cumulative probability; must satisfy {@code 0 <= p < 1}
 * @return the value produced by {@code PoissonDistribution.inverseCumulativeProbability(p)}
 *         for a distribution built with {@code lambda}
 */
@Description("Inverse of Poisson cdf given lambda (mean) parameter and probability")
@ScalarFunction
@SqlType(StandardTypes.INTEGER)
public static long inversePoissonCdf(@SqlType(StandardTypes.DOUBLE) double lambda, @SqlType(StandardTypes.DOUBLE) double p) {
    // The probability is validated before lambda, so its message wins when both are invalid.
    boolean probabilityInRange = 0 <= p && p < 1;
    checkCondition(probabilityInRange, INVALID_FUNCTION_ARGUMENT, "p must be in the interval [0, 1)");

    boolean meanIsPositive = 0 < lambda;
    checkCondition(meanIsPositive, INVALID_FUNCTION_ARGUMENT, "lambda must be greater than 0");

    return new PoissonDistribution(lambda).inverseCumulativeProbability(p);
}
Use of uk.ac.sussex.gdsc.smlm.math3.distribution.PoissonDistribution in project shifu by ShifuML.
The class DTWorker, method init.
/**
 * Initializes this decision-tree worker from the properties and iteration state exposed by
 * {@code context}.
 *
 * In order, the method: loads the model and column configuration; builds the category-value
 * to bin-index mapping for categorical columns; creates the record splitter; reads the k-fold,
 * up-sampling and continuous-training switches; creates the worker thread pool and registers a
 * completion callback that shuts it down; resolves training parameters (optionally through grid
 * search); allocates the memory-limited training/validation lists; selects the impurity and loss
 * implementations; reads GBDT-only options; and finally prepares tree recovery for fail-over or
 * continuous training.
 *
 * @param context worker context supplying properties, the application id, first-iteration state,
 *        the last master result and callback registration
 * @throws RuntimeException wrapping the {@code IOException} raised while loading the model or
 *         column configuration
 */
@Override
public void init(WorkerContext<DTMasterParams, DTWorkerParams> context) {
Properties props = context.getProps();
// Load model config and column config list; the source type falls back to HDFS when the property is absent.
try {
SourceType sourceType = SourceType.valueOf(props.getProperty(CommonConstants.MODELSET_SOURCE_TYPE, SourceType.HDFS.toString()));
this.modelConfig = CommonUtils.loadModelConfig(props.getProperty(CommonConstants.SHIFU_MODEL_CONFIG), sourceType);
this.columnConfigList = CommonUtils.loadColumnConfigList(props.getProperty(CommonConstants.SHIFU_COLUMN_CONFIG), sourceType);
} catch (IOException e) {
throw new RuntimeException(e);
}
// For every categorical column that has bin categories, map each flattened category value to the
// index of the bin it belongs to; the per-column maps are keyed by column number.
this.columnCategoryIndexMapping = new HashMap<Integer, Map<String, Integer>>();
for (ColumnConfig config : this.columnConfigList) {
if (config.isCategorical()) {
if (config.getBinCategory() != null) {
Map<String, Integer> tmpMap = new HashMap<String, Integer>();
for (int i = 0; i < config.getBinCategory().size(); i++) {
List<String> catVals = CommonUtils.flattenCatValGrp(config.getBinCategory().get(i));
for (String cval : catVals) {
tmpMap.put(cval, i);
}
}
this.columnCategoryIndexMapping.put(config.getColumnNum(), tmpMap);
}
}
}
this.hasCandidates = CommonUtils.hasCandidateColumns(columnConfigList);
// create the splitter from the configured output data delimiter
String delimiter = context.getProps().getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER);
this.splitter = MapReduceUtils.generateShifuOutputSplitter(delimiter);
// k-fold cross validation is switched on only for a non-null, positive fold count
Integer kCrossValidation = this.modelConfig.getTrain().getNumKFold();
if (kCrossValidation != null && kCrossValidation > 0) {
isKFoldCV = true;
LOG.info("Cross validation is enabled by kCrossValidation: {}.", kCrossValidation);
}
// Up sampling applies when the weight differs from 1 and the model is regression or one-vs-all classification.
// NOTE(review): upSampleWeight is a boxed Double and is unboxed by Double.compare below; a null value
// would throw here - confirm getUpSampleWeight() never returns null.
Double upSampleWeight = modelConfig.getTrain().getUpSampleWeight();
if (Double.compare(upSampleWeight, 1d) != 0 && (modelConfig.isRegression() || (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()))) {
// The Poisson mean is upSampleWeight - 1; per the original note the caller adds 1 to each sample so that
// no sample value is zero (that addition is not visible in this method).
// NOTE(review): a weight <= 1 yields a non-positive mean - confirm PoissonDistribution accepts it.
LOG.info("Enable up sampling with weight {}.", upSampleWeight);
this.upSampleRng = new PoissonDistribution(upSampleWeight - 1);
}
this.isContinuousEnabled = Boolean.TRUE.toString().equalsIgnoreCase(context.getProps().getProperty(CommonConstants.CONTINUOUS_TRAINING));
// fixed-size pool sized by the configured worker thread count
this.workerThreadCount = modelConfig.getTrain().getWorkerThreadCount();
this.threadPool = Executors.newFixedThreadPool(this.workerThreadCount);
// enable shut down logic: on completion, stop the pool and wait up to 2 seconds for termination
context.addCompletionCallBack(new WorkerCompletionCallBack<DTMasterParams, DTWorkerParams>() {
@Override
public void callback(WorkerContext<DTMasterParams, DTWorkerParams> context) {
DTWorker.this.threadPool.shutdownNow();
try {
DTWorker.this.threadPool.awaitTermination(2, TimeUnit.SECONDS);
} catch (InterruptedException e) {
// restore the interrupt flag instead of swallowing the interruption
Thread.currentThread().interrupt();
}
}
});
// trainer id defaults to 0 when the property is not set
this.trainerId = Integer.valueOf(context.getProps().getProperty(CommonConstants.SHIFU_TRAINER_ID, "0"));
this.isOneVsAll = modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll();
// Use the plain training params unless grid search defines hyper parameters, in which case the
// params selected for this trainer id are used instead.
GridSearch gs = new GridSearch(modelConfig.getTrain().getParams(), modelConfig.getTrain().getGridConfigFileContent());
Map<String, Object> validParams = this.modelConfig.getTrain().getParams();
if (gs.hasHyperParam()) {
validParams = gs.getParams(this.trainerId);
LOG.info("Start grid search worker with params: {}", validParams);
}
this.treeNum = Integer.valueOf(validParams.get("TreeNum").toString());
// fraction of the max heap reserved for data; defaults to 0.6
double memoryFraction = Double.valueOf(context.getProps().getProperty("guagua.data.memoryFraction", "0.6"));
LOG.info("Max heap memory: {}, fraction: {}", Runtime.getRuntime().maxMemory(), memoryFraction);
double validationRate = this.modelConfig.getValidSetRate();
if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
// fixed 0.6 and 0.4 of max memory for trainingData and validationData
this.trainingData = new MemoryLimitedList<Data>((long) (Runtime.getRuntime().maxMemory() * memoryFraction * 0.6), new ArrayList<Data>());
this.validationData = new MemoryLimitedList<Data>((long) (Runtime.getRuntime().maxMemory() * memoryFraction * 0.4), new ArrayList<Data>());
} else {
if (Double.compare(validationRate, 0d) != 0) {
// split the data memory budget by the validation rate
this.trainingData = new MemoryLimitedList<Data>((long) (Runtime.getRuntime().maxMemory() * memoryFraction * (1 - validationRate)), new ArrayList<Data>());
this.validationData = new MemoryLimitedList<Data>((long) (Runtime.getRuntime().maxMemory() * memoryFraction * validationRate), new ArrayList<Data>());
} else {
// validation rate is 0: the whole budget goes to training data; validationData is not assigned in this branch
this.trainingData = new MemoryLimitedList<Data>((long) (Runtime.getRuntime().maxMemory() * memoryFraction), new ArrayList<Data>());
}
}
int[] inputOutputIndex = DTrainUtils.getNumericAndCategoricalInputAndOutputCounts(this.columnConfigList);
// numerical + categorical = # of all input
this.inputCount = inputOutputIndex[0] + inputOutputIndex[1];
// regression outputNodeCount is 1, binaryClassfication, it is 1, OneVsAll it is 1, Native classification it is
// 1, with index of 0,1,2,3 denotes different classes
// NOTE(review): the meaning of inputOutputIndex[3] is defined by DTrainUtils and is not visible here.
this.isAfterVarSelect = (inputOutputIndex[3] == 1);
this.isManualValidation = (modelConfig.getValidationDataSetRawPath() != null && !"".equals(modelConfig.getValidationDataSetRawPath()));
// classification uses the number of tags; everything else uses 2 classes
int numClasses = this.modelConfig.isClassification() ? this.modelConfig.getTags().size() : 2;
String imStr = validParams.get("Impurity").toString();
int minInstancesPerNode = Integer.valueOf(validParams.get("MinInstancesPerNode").toString());
double minInfoGain = Double.valueOf(validParams.get("MinInfoGain").toString());
// pick the impurity by case-insensitive name; any unrecognized name falls back to Variance
if (imStr.equalsIgnoreCase("entropy")) {
impurity = new Entropy(numClasses, minInstancesPerNode, minInfoGain);
} else if (imStr.equalsIgnoreCase("gini")) {
impurity = new Gini(numClasses, minInstancesPerNode, minInfoGain);
} else if (imStr.equalsIgnoreCase("friedmanmse")) {
impurity = new FriedmanMSE(minInstancesPerNode, minInfoGain);
} else {
impurity = new Variance(minInstancesPerNode, minInfoGain);
}
this.isRF = ALGORITHM.RF.toString().equalsIgnoreCase(modelConfig.getAlgorithm());
this.isGBDT = ALGORITHM.GBT.toString().equalsIgnoreCase(modelConfig.getAlgorithm());
// pick the loss by case-insensitive name; any other value is treated as a class name to instantiate
String lossStr = validParams.get("Loss").toString();
if (lossStr.equalsIgnoreCase("log")) {
this.loss = new LogLoss();
} else if (lossStr.equalsIgnoreCase("absolute")) {
this.loss = new AbsoluteLoss();
} else if (lossStr.equalsIgnoreCase("halfgradsquared")) {
this.loss = new HalfGradSquaredLoss();
} else if (lossStr.equalsIgnoreCase("squared")) {
this.loss = new SquaredLoss();
} else {
try {
this.loss = (Loss) ClassUtils.newInstance(Class.forName(lossStr));
} catch (ClassNotFoundException e) {
// unknown class name: warn and fall back to SquaredLoss (the exception itself is not logged)
LOG.warn("Class not found for {}, using default SquaredLoss", lossStr);
this.loss = new SquaredLoss();
}
}
// GBDT-only options: learning rate is required, sample-with-replacement and drop-out rate are optional
if (this.isGBDT) {
this.learningRate = Double.valueOf(validParams.get(CommonConstants.LEARNING_RATE).toString());
Object swrObj = validParams.get("GBTSampleWithReplacement");
if (swrObj != null) {
this.gbdtSampleWithReplacement = Boolean.TRUE.toString().equalsIgnoreCase(swrObj.toString());
}
Object dropoutObj = validParams.get(CommonConstants.DROPOUT_RATE);
if (dropoutObj != null) {
this.dropOutRate = Double.valueOf(dropoutObj.toString());
}
}
this.isStratifiedSampling = this.modelConfig.getTrain().getStratifiedSample();
// checkpoint folder defaults to tmp/cp_<appId> when not configured
this.checkpointOutput = new Path(context.getProps().getProperty(CommonConstants.SHIFU_DT_MASTER_CHECKPOINT_FOLDER, "tmp/cp_" + context.getAppId()));
LOG.info("Worker init params:isAfterVarSel={}, treeNum={}, impurity={}, loss={}, learningRate={}, gbdtSampleWithReplacement={}, isRF={}, isGBDT={}, isStratifiedSampling={}, isKFoldCV={}, kCrossValidation={}, dropOutRate={}", isAfterVarSelect, treeNum, impurity.getClass().getName(), loss.getClass().getName(), this.learningRate, this.gbdtSampleWithReplacement, this.isRF, this.isGBDT, this.isStratifiedSampling, this.isKFoldCV, kCrossValidation, this.dropOutRate);
// for fail over, load existing trees
if (!context.isFirstIteration()) {
if (this.isGBDT) {
// set flag here and recover later in doComputing, this is to make sure recover after load part which
// can load latest trees in #doCompute
isNeedRecoverGBDTPredict = true;
} else {
// RF , trees are recovered from last master results
recoverTrees = context.getLastMasterResult().getTrees();
}
}
// Continuous GBDT training on the first iteration: try to load the existing model from the output path
// and reuse its trees; a load failure is logged and training starts from scratch.
if (context.isFirstIteration() && this.isContinuousEnabled && this.isGBDT) {
Path modelPath = new Path(context.getProps().getProperty(CommonConstants.GUAGUA_OUTPUT));
TreeModel existingModel = null;
try {
existingModel = (TreeModel) ModelSpecLoaderUtils.loadModel(modelConfig, modelPath, ShifuFileUtils.getFileSystemBySourceType(this.modelConfig.getDataSet().getSource()));
} catch (IOException e) {
LOG.error("Error in get existing model, will ignore and start from scratch", e);
}
if (existingModel == null) {
LOG.warn("No model is found even set to continuous model training.");
return;
} else {
recoverTrees = existingModel.getTrees();
LOG.info("Loading existing {} trees", recoverTrees.size());
}
}
}
Use of uk.ac.sussex.gdsc.smlm.math3.distribution.PoissonDistribution in project shifu by ShifuML.
The class DTWorker, method sampleWeights.
/**
 * Draws the bagging weight(s) for one record.
 *
 * With a single tree, or with GBDT when sampling with replacement is off, one Bernoulli-style
 * weight (1 or 0) is returned. Otherwise one Poisson-distributed weight per tree is returned.
 * Random sources are cached per class value when stratified sampling is on, and under key 0
 * otherwise.
 *
 * @param label record label; it is shifted by 0.01 and truncated to obtain the class value
 * @return an array of length 1, or of length {@code treeNum}, holding the sampled weights
 */
private float[] sampleWeights(float label) {
    // sample negative or kFoldCV, sample rate is 1d
    final double sampleRate;
    if (modelConfig.getTrain().getSampleNegOnly() || this.isKFoldCV) {
        sampleRate = 1d;
    } else {
        sampleRate = modelConfig.getTrain().getBaggingSampleRate();
    }
    final int classValue = (int) (label + 0.01f);

    final boolean withoutReplacement = this.treeNum == 1 || (this.isGBDT && !this.gbdtSampleWithReplacement);
    if (withoutReplacement) {
        // if tree == 1 or GBDT, don't use with replacement sampling; for GBDT, every time is one tree
        float[] single = new float[1];
        // Stratified sampling keeps one generator per class; otherwise everything shares key 0.
        final int randomKey = this.isStratifiedSampling ? classValue : 0;
        Random generator = baggingRandomMap.get(randomKey);
        if (generator == null) {
            generator = DTrainUtils.generateRandomBySampleSeed(modelConfig.getTrain().getBaggingSampleSeed(), CommonConstants.NOT_CONFIGURED_BAGGING_SEED);
            baggingRandomMap.put(randomKey, generator);
        }
        single[0] = generator.nextDouble() <= sampleRate ? 1f : 0f;
        return single;
    }

    // if gbdt and gbdtSampleWithReplacement = true, still sampling with replacement
    float[] perTree = new float[this.treeNum];
    final int rngKey = this.isStratifiedSampling ? classValue : 0;
    PoissonDistribution[] samplers = this.baggingRngMap.get(rngKey);
    if (samplers == null) {
        // Lazily create one Poisson sampler per tree, all with the sample rate as mean.
        samplers = new PoissonDistribution[treeNum];
        for (int t = 0; t < treeNum; t++) {
            samplers[t] = new PoissonDistribution(sampleRate);
        }
        this.baggingRngMap.put(rngKey, samplers);
    }
    for (int t = 0; t < perTree.length; t++) {
        perTree[t] = samplers[t].sample();
    }
    return perTree;
}
Use of uk.ac.sussex.gdsc.smlm.math3.distribution.PoissonDistribution in project shifu by ShifuML.
The class AbstractNNWorker, method init.
/**
 * Initializes this neural-network worker from the properties exposed by {@code context}.
 *
 * In order, the method: loads the configuration files; resolves the training parameters
 * (optionally through grid search); reads the k-fold, Poisson-sampler and up-sampling switches;
 * reads epochs per iteration, drop-out rate and mini-batch count; derives input/output/candidate
 * node counts and the feature subset; reads weight initializer and loss name; creates the
 * training and validation data sets (on disk or memory-backed) and registers a JVM shutdown hook
 * that closes them; and finally creates the record splitter.
 *
 * @param context worker context supplying the job properties
 * @throws RuntimeException wrapping the {@code IOException} raised by the on-disk data set setup
 * @throws GuaguaRuntimeException wrapping the {@code IOException} raised by the memory-backed
 *         data set setup
 */
@Override
public void init(WorkerContext<NNParams, NNParams> context) {
// load props firstly
this.props = context.getProps();
loadConfigFiles(context.getProps());
// trainer id defaults to 0 when the property is not set
this.trainerId = Integer.valueOf(context.getProps().getProperty(CommonConstants.SHIFU_TRAINER_ID, "0"));
// Use the plain training params unless grid search defines hyper parameters, in which case the
// params selected for this trainer id are used instead.
// NOTE(review): the log message below says "master" although this is worker code - confirm wording.
GridSearch gs = new GridSearch(modelConfig.getTrain().getParams(), modelConfig.getTrain().getGridConfigFileContent());
this.validParams = this.modelConfig.getTrain().getParams();
if (gs.hasHyperParam()) {
this.validParams = gs.getParams(trainerId);
LOG.info("Start grid search master with params: {}", validParams);
}
// k-fold cross validation is switched on only for a non-null, positive fold count
Integer kCrossValidation = this.modelConfig.getTrain().getNumKFold();
if (kCrossValidation != null && kCrossValidation > 0) {
isKFoldCV = true;
LOG.info("Cross validation is enabled by kCrossValidation: {}.", kCrossValidation);
}
this.poissonSampler = Boolean.TRUE.toString().equalsIgnoreCase(context.getProps().getProperty(NNConstants.NN_POISON_SAMPLER));
// Poisson generator with mean 1.0 is always created, regardless of the poissonSampler flag
this.rng = new PoissonDistribution(1.0d);
// Up sampling applies when the weight differs from 1 and the model is regression or one-vs-all classification.
// NOTE(review): upSampleWeight is a boxed Double and is unboxed by Double.compare below; a null value
// would throw here - confirm getUpSampleWeight() never returns null.
Double upSampleWeight = modelConfig.getTrain().getUpSampleWeight();
if (Double.compare(upSampleWeight, 1d) != 0 && (modelConfig.isRegression() || (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()))) {
// The Poisson mean is upSampleWeight - 1; per the original note the caller adds 1 to each sample so that
// no sample value is zero (that addition is not visible in this method).
// NOTE(review): a weight <= 1 yields a non-positive mean - confirm PoissonDistribution accepts it.
LOG.info("Enable up sampling with weight {}.", upSampleWeight);
this.upSampleRng = new PoissonDistribution(upSampleWeight - 1);
}
// epochs per iteration defaults to 1 when not configured
Integer epochsPerIterationInteger = this.modelConfig.getTrain().getEpochsPerIteration();
this.epochsPerIteration = epochsPerIterationInteger == null ? 1 : epochsPerIterationInteger.intValue();
LOG.info("epochsPerIteration in worker is :{}", epochsPerIteration);
// Object elmObject = validParams.get(DTrainUtils.IS_ELM);
// isELM = elmObject == null ? false : "true".equalsIgnoreCase(elmObject.toString());
// LOG.info("Check isELM: {}", isELM);
Object dropoutRateObj = validParams.get(CommonConstants.DROPOUT_RATE);
if (dropoutRateObj != null) {
this.dropoutRate = Double.valueOf(dropoutRateObj.toString());
}
LOG.info("'dropoutRate' in worker is :{}", this.dropoutRate);
// Mini-batch count: an unparsable value becomes 1, a negative value becomes 1, a value above 1000 is
// capped at 1000, anything else is used as is.
// NOTE(review): a configured value of 0 passes through unchanged - confirm 0 batches is valid downstream.
Object miniBatchO = validParams.get(CommonConstants.MINI_BATCH);
if (miniBatchO != null) {
int miniBatchs;
try {
miniBatchs = Integer.parseInt(miniBatchO.toString());
} catch (Exception e) {
miniBatchs = 1;
}
if (miniBatchs < 0) {
this.batchs = 1;
} else if (miniBatchs > 1000) {
this.batchs = 1000;
} else {
this.batchs = miniBatchs;
}
LOG.info("'miniBatchs' in worker is : {}, batchs is {} ", miniBatchs, batchs);
}
// Input node count is element 0 of the counts, or element 2 (also stored as the candidate count) when element 0 is zero.
int[] inputOutputIndex = DTrainUtils.getInputOutputCandidateCounts(modelConfig.getNormalizeType(), this.columnConfigList);
this.inputNodeCount = inputOutputIndex[0] == 0 ? inputOutputIndex[2] : inputOutputIndex[0];
// if is one vs all classification, outputNodeCount is set to 1, if classes=2, outputNodeCount is also 1
int classes = modelConfig.getTags().size();
this.outputNodeCount = (isLinearTarget || modelConfig.isRegression()) ? inputOutputIndex[1] : (modelConfig.getTrain().isOneVsAll() ? inputOutputIndex[1] : (classes == 2 ? 1 : classes));
this.candidateCount = inputOutputIndex[2];
boolean isAfterVarSelect = inputOutputIndex[0] != 0;
LOG.info("isAfterVarSelect {}: Input count {}, output count {}, candidate count {}", isAfterVarSelect, inputNodeCount, outputNodeCount, candidateCount);
// cache all feature list for sampling features
this.allFeatures = NormalUtils.getAllFeatureList(columnConfigList, isAfterVarSelect);
// A blank subset property means all features are used; otherwise the property is parsed as a
// comma-separated list of integer feature indexes.
String subsetStr = context.getProps().getProperty(CommonConstants.SHIFU_NN_FEATURE_SUBSET);
if (StringUtils.isBlank(subsetStr)) {
this.subFeatures = this.allFeatures;
} else {
String[] splits = subsetStr.split(",");
this.subFeatures = new ArrayList<Integer>(splits.length);
for (String split : splits) {
int featureIndex = Integer.parseInt(split);
this.subFeatures.add(featureIndex);
}
}
this.subFeatureSet = new HashSet<Integer>(this.subFeatures);
LOG.info("subFeatures size is {}", subFeatures.size());
this.featureInputsCnt = DTrainUtils.getFeatureInputsCnt(this.modelConfig, this.columnConfigList, this.subFeatureSet);
// weight initializer defaults to "default" unless configured
this.wgtInit = "default";
Object wgtInitObj = validParams.get(CommonConstants.WEIGHT_INITIALIZER);
if (wgtInitObj != null) {
this.wgtInit = wgtInitObj.toString();
}
// loss name defaults to "squared" unless configured
Object lossObj = validParams.get("Loss");
this.lossStr = lossObj != null ? lossObj.toString() : "squared";
LOG.info("Loss str is {}", this.lossStr);
this.isDry = Boolean.TRUE.toString().equalsIgnoreCase(context.getProps().getProperty(CommonConstants.SHIFU_DRY_DTRAIN));
this.isSpecificValidation = (modelConfig.getValidationDataSetRawPath() != null && !"".equals(modelConfig.getValidationDataSetRawPath()));
this.isStratifiedSampling = this.modelConfig.getTrain().getStratifiedSample();
if (isOnDisk()) {
LOG.info("NNWorker is loading data into disk.");
try {
initDiskDataSet();
} catch (IOException e) {
throw new RuntimeException(e);
}
// cannot find a good place to close these two data set, using Shutdown hook
// NOTE(review): the hook casts both fields to BufferedFloatMLDataSet - presumably initDiskDataSet()
// assigns both; verify, since a null or differently typed field would fail inside the hook.
Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
@Override
public void run() {
((BufferedFloatMLDataSet) (AbstractNNWorker.this.trainingData)).close();
((BufferedFloatMLDataSet) (AbstractNNWorker.this.validationData)).close();
}
}));
} else {
LOG.info("NNWorker is loading data into memory.");
// fraction of the max heap reserved for data; defaults to 0.6
double memoryFraction = Double.valueOf(context.getProps().getProperty("guagua.data.memoryFraction", "0.6"));
long memoryStoreSize = (long) (Runtime.getRuntime().maxMemory() * memoryFraction);
LOG.info("Max heap memory: {}, fraction: {}", Runtime.getRuntime().maxMemory(), memoryFraction);
double crossValidationRate = this.modelConfig.getValidSetRate();
try {
if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
// fixed 0.6 and 0.4 of max memory for trainingData and validationData
this.trainingData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * 0.6), DTrainUtils.getTrainingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
this.validationData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * 0.4), DTrainUtils.getTestingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
} else {
// no separate validation path: split the data memory budget by the validation set rate
this.trainingData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * (1 - crossValidationRate)), DTrainUtils.getTrainingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
this.validationData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * crossValidationRate), DTrainUtils.getTestingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
}
// cannot find a good place to close these two data set, using Shutdown hook
Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
@Override
public void run() {
((MemoryDiskFloatMLDataSet) (AbstractNNWorker.this.trainingData)).close();
((MemoryDiskFloatMLDataSet) (AbstractNNWorker.this.validationData)).close();
}
}));
} catch (IOException e) {
throw new GuaguaRuntimeException(e);
}
}
// create the splitter from the configured output data delimiter
String delimiter = context.getProps().getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER);
this.splitter = MapReduceUtils.generateShifuOutputSplitter(delimiter);
}
Aggregations