Use of hex.VarImp in project h2o-2 by h2oai.
From the class SharedTreeModelBuilder, method doScoring:
protected TM doScoring(TM model, Frame fTrain, DTree[] ktrees, int tid, DTree.TreeModel.TreeStats tstats, boolean finalScoring, boolean oob, boolean build_tree_one_node) {
  long now = System.currentTimeMillis();
  if (_firstScore == 0) _firstScore = now;
  long sinceLastScore = now - _timeLastScoreStart;
  Score sc = null;
  // If validation is specified, we use the model for scoring, so we need to update it!
  // First we save the model with its trees (i.e., make them available for scoring),
  // then update it with the resulting error.
  model = makeModel(model, ktrees, tstats);
  model.update(self());
  // The model now contains the tid trees in serialized form.
  if (score_each_iteration || finalScoring ||
      (now - _firstScore < 4000) ||              // Score every time for the first 4 secs
      // Throttle scoring to keep the cost sane; limit to a 10% duty cycle & every 4 secs
      (sinceLastScore > 4000 &&                  // Limit scoring updates to every 4 secs
       (double) (_timeLastScoreEnd - _timeLastScoreStart) / sinceLastScore < 0.1)) { // 10% duty cycle
    _timeLastScoreStart = now;
    // Perform scoring - first get the adapted validation response
    Response2CMAdaptor vadaptor = getValidAdaptor();
    sc = new Score().doIt(model, fTrain, vadaptor, oob, build_tree_one_node).report(logTag(), tid, ktrees);
    _timeLastScoreEnd = System.currentTimeMillis();
  }
  // Compute variable importance for this tree if necessary
  VarImp varimp = null;
  if (importance && ktrees != null) {
    // Compute this tree's votes, but skip the first scoring call, which is done over an empty forest
    Timer vi_timer = new Timer();
    varimp = doVarImpCalc(model, ktrees, tid - 1, fTrain, false);
    Log.info(logTag(), "Computation of variable importance with " + tid + "th-tree took: " + vi_timer.toString());
  }
  // Double update - after scoring
  model = makeModel(model,
                    sc == null ? Double.NaN : sc.mse(),
                    sc == null ? null : (_nclass > 1 ? new ConfusionMatrix(sc._cm) : null),
                    varimp,
                    sc == null ? null : (_nclass == 2 ? makeAUC(toCMArray(sc._cms), ModelUtils.DEFAULT_THRESHOLDS) : null));
  model.update(self());
  return model;
}
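The condition above is a simple duty-cycle throttle: scoring always runs during the first 4 seconds, and afterwards only when at least 4 seconds have passed since the last scoring pass started and past scoring consumed under 10% of elapsed wall-clock time. Below is a minimal, self-contained sketch of that rule; the class and method names are hypothetical, not part of h2o-2.

// Sketch of the throttling rule used in doScoring (illustrative only).
class ScoreThrottle {
  private long _firstScore, _lastStart, _lastEnd;

  boolean shouldScore(long now, boolean force) {
    if (_firstScore == 0) _firstScore = now;
    long sinceLastStart = now - _lastStart;
    // fraction of wall-clock time spent scoring since the last scoring pass started
    double dutyCycle = sinceLastStart == 0 ? 1.0
        : (double) (_lastEnd - _lastStart) / sinceLastStart;
    return force
        || (now - _firstScore < 4000)                  // score freely for the first 4 secs
        || (sinceLastStart > 4000 && dutyCycle < 0.1); // afterwards, cap at a 10% duty cycle
  }

  void markScoring(long start, long end) { _lastStart = start; _lastEnd = end; }
}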
Use of hex.VarImp in project h2o-2 by h2oai.
From the class GLMModel, method maybeComputeVariableImportances:
protected void maybeComputeVariableImportances() {
  GLM2 params = get_params();
  this.variable_importances = null;
  final double[] b = beta();
  if (params.variable_importances && null != b) {
    // Warn if we may be returning results that might not include an important (base) level...
    if (!params.use_all_factor_levels)
      this.addWarning("Variable Importance may be missing important variables: because use_all_factor_levels is off the importance of base categorical levels will NOT be included.");
    // Don't include the Intercept
    float[] coefs_abs_value = new float[b.length - 1];
    String[] names = new String[b.length - 1];
    for (int i = 0; i < b.length - 1; ++i) {
      coefs_abs_value[i] = (float) Math.abs(b[i]);
      names[i] = coefficients_names[i];
    }
    this.variable_importances = new VarImp(coefs_abs_value, names);
  }
}
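Here the importances are simply absolute coefficient magnitudes, so a caller typically ranks them before display. The helper below is a hedged sketch (its name and signature are my own, not h2o-2 API); note that raw |beta| values are only comparable across predictors when the inputs are standardized.

// Hypothetical helper (not part of h2o-2): rank variables by |coefficient|.
static String[] topK(final float[] importances, final String[] names, int k) {
  Integer[] order = new Integer[importances.length];
  for (int i = 0; i < order.length; i++) order[i] = i;
  // sort indices by descending importance
  java.util.Arrays.sort(order, new java.util.Comparator<Integer>() {
    public int compare(Integer a, Integer b) { return Float.compare(importances[b], importances[a]); }
  });
  String[] top = new String[Math.min(k, order.length)];
  for (int i = 0; i < top.length; i++)
    top[i] = names[order[i]] + " = " + importances[order[i]];
  return top;
}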
Use of hex.VarImp in project h2o-2 by h2oai.
From the class DeepLearningModel, method doScoring:
/**
 * Score the model on training and (optionally) validation data.
 *
 * @param train training data from which the model is built (for epoch counting only)
 * @param ftrain potentially downsampled training data for scoring
 * @param ftest potentially downsampled validation data for scoring
 * @param job_key key of the owning job
 * @param vadaptor adaptor that maps model predictions onto the validation confusion-matrix domain
 * @return true if model building is ongoing
 */
boolean doScoring(Frame train, Frame ftrain, Frame ftest, Key job_key, Job.ValidatedJob.Response2CMAdaptor vadaptor) {
  try {
    final long now = System.currentTimeMillis();
    epoch_counter = (float) model_info().get_processed_total() / training_rows;
    final double time_last_iter_millis = now - _timeLastScoreEnter;
    // Note: actual communication time is estimated by the NetworkTest's collective test.
    if (H2O.CLOUD.size() > 1 && get_params().train_samples_per_iteration == -2 && time_for_communication_us > 1e4) {
      // Log.info("Time taken for communication: " + PrettyPrint.usecs((long) time_for_communication_us));
      // Log.info("Time taken for Map/Reduce iteration: " + PrettyPrint.msecs((long) time_last_iter_millis, true));
      final double comm_to_work_ratio = (time_for_communication_us * 1e-3) / time_last_iter_millis;
      // Log.info("Ratio of network communication to computation: " + String.format("%.3f", comm_to_work_ratio));
      // Log.info("target_comm_to_work: " + get_params().target_ratio_comm_to_comp);
      final double correction = get_params().target_ratio_comm_to_comp / comm_to_work_ratio;
      // Log.warn("Suggested value for train_samples_per_iteration: " + get_params().actual_train_samples_per_iteration / correction);
      actual_train_samples_per_iteration /= correction;
      actual_train_samples_per_iteration = Math.max(1, actual_train_samples_per_iteration);
    }
    run_time += time_last_iter_millis;
    _timeLastScoreEnter = now;
    boolean keep_running = (epoch_counter < get_params().epochs);
    final long sinceLastScore = now - _timeLastScoreStart;
    final long sinceLastPrint = now - _timeLastPrintStart;
    final long samples = model_info().get_processed_total();
    if (!keep_running || sinceLastPrint > get_params().score_interval * 1000) {
      _timeLastPrintStart = now;
      Log.info("Training time: " + PrettyPrint.msecs(run_time, true)
              + ". Processed " + String.format("%,d", samples) + " samples"
              + " (" + String.format("%.3f", epoch_counter) + " epochs)."
              + " Speed: " + String.format("%.3f", 1000. * samples / run_time) + " samples/sec.");
    }
    // this is potentially slow - only do it every so often
    if (!keep_running ||
        (sinceLastScore > get_params().score_interval * 1000 // don't score too often
         && (double) (_timeLastScoreEnd - _timeLastScoreStart) / sinceLastScore < get_params().score_duty_cycle)) { // duty cycle
      final boolean printme = !get_params().quiet_mode;
      final boolean adaptCM = (isClassifier() && vadaptor.needsAdaptation2CM());
      _timeLastScoreStart = now;
      if (get_params().diagnostics)
        model_info().computeStats();
      Errors err = new Errors();
      err.training_time_ms = run_time;
      err.epoch_counter = epoch_counter;
      err.training_samples = model_info().get_processed_total();
      err.validation = ftest != null;
      err.score_training_samples = ftrain.numRows();
      if (get_params().autoencoder) {
        if (printme)
          Log.info("Scoring the auto-encoder.");
        // training
        {
          final Frame mse_frame = scoreAutoEncoder(ftrain);
          final Vec l2 = mse_frame.anyVec();
          Log.info("Mean reconstruction error on training data: " + l2.mean() + "\n");
          err.train_mse = l2.mean();
          mse_frame.delete();
        }
      } else {
        if (printme)
          Log.info("Scoring the model.");
        // compute errors
        err.classification = isClassifier();
        assert (err.classification == get_params().classification);
        err.num_folds = get_params().n_folds;
        err.train_confusion_matrix = new ConfusionMatrix();
        final int hit_k = Math.min(nclasses(), get_params().max_hit_ratio_k);
        if (err.classification && nclasses() > 2 && hit_k > 0) {
          err.train_hitratio = new HitRatio();
          err.train_hitratio.set_max_k(hit_k);
        }
        final String m = model_info().toString();
        if (m.length() > 0)
          Log.info(m);
        final Frame trainPredict = score(ftrain, false);
        AUC trainAUC = null;
        if (err.classification && nclasses() == 2)
          trainAUC = new AUC();
        final double trainErr = calcError(ftrain, ftrain.lastVec(), trainPredict, trainPredict, "training",
                printme, get_params().max_confusion_matrix_size, err.train_confusion_matrix, trainAUC, err.train_hitratio);
        if (isClassifier())
          err.train_err = trainErr;
        if (trainAUC != null)
          err.trainAUC = trainAUC.data();
        else
          err.train_mse = trainErr;
        trainPredict.delete();
        if (err.validation) {
          assert ftest != null;
          err.score_validation_samples = ftest.numRows();
          err.valid_confusion_matrix = new ConfusionMatrix();
          if (err.classification && nclasses() > 2 && hit_k > 0) {
            err.valid_hitratio = new HitRatio();
            err.valid_hitratio.set_max_k(hit_k);
          }
          final String adaptRespName = vadaptor.adaptedValidationResponse(responseName());
          Vec adaptCMresp = null;
          if (adaptCM) {
            Vec[] v = ftest.vecs();
            // make sure the (adapted) response is in the test set
            assert (ftest.find(adaptRespName) == v.length - 1);
            // the model would remove any extra columns anyway (need to keep it here for later)
            adaptCMresp = ftest.remove(v.length - 1);
          }
          final Frame validPredict = score(ftest, adaptCM);
          final Frame hitratio_validPredict = new Frame(validPredict);
          Vec orig_label = validPredict.vecs()[0];
          // Note: this doesn't change the predictions, just the *possible* label domain
          if (adaptCM) {
            assert (adaptCMresp != null);
            assert (ftest.find(adaptRespName) == -1);
            ftest.add(adaptRespName, adaptCMresp);
            final Vec CMadapted = vadaptor.adaptModelResponse2CM(validPredict.vecs()[0]);
            // replace the label
            validPredict.replace(0, CMadapted);
            // keep the Vec around to be deleted later (no leak)
            validPredict.add("to_be_deleted", CMadapted);
          }
          AUC validAUC = null;
          if (err.classification && nclasses() == 2)
            validAUC = new AUC();
          final double validErr = calcError(ftest, ftest.lastVec(), validPredict, hitratio_validPredict, "validation",
                  printme, get_params().max_confusion_matrix_size, err.valid_confusion_matrix, validAUC, err.valid_hitratio);
          if (isClassifier())
            err.valid_err = validErr;
          if (trainAUC != null)
            err.validAUC = validAUC.data();
          else
            err.valid_mse = validErr;
          validPredict.delete();
          // also delete the replaced label
          if (adaptCM)
            orig_label.remove(new Futures()).blockForPending();
        }
        // only keep confusion matrices for the last step if there are fewer than the specified number of output classes
        if (err.train_confusion_matrix.cm != null && err.train_confusion_matrix.cm.length - 1 >= get_params().max_confusion_matrix_size) {
          err.train_confusion_matrix = null;
          err.valid_confusion_matrix = null;
        }
      }
      if (get_params().variable_importances) {
        if (!get_params().quiet_mode)
          Log.info("Computing variable importances.");
        final float[] vi = model_info().computeVariableImportances();
        err.variable_importances = new VarImp(vi, Arrays.copyOfRange(model_info().data_info().coefNames(), 0, vi.length));
      }
      _timeLastScoreEnd = System.currentTimeMillis();
      err.scoring_time = System.currentTimeMillis() - now;
      // enlarge the error array by one and append the latest score
      if (errors == null) {
        errors = new Errors[] { err };
      } else {
        Errors[] err2 = new Errors[errors.length + 1];
        System.arraycopy(errors, 0, err2, 0, errors.length);
        err2[err2.length - 1] = err;
        errors = err2;
      }
      if (!get_params().autoencoder) {
        // always keep a copy of the best model so far (based on the following criterion)
        if (actual_best_model_key != null &&
            // if we have a best model in the DKV, compare against its error()
            // (unless it's a different model, as judged by the network size)
            ((UKV.get(actual_best_model_key) != null
              && (error() < UKV.<DeepLearningModel>get(actual_best_model_key).error()
                  || !Arrays.equals(model_info().units, UKV.<DeepLearningModel>get(actual_best_model_key).model_info().units)))
             // otherwise, compare against our own _bestError
             || (UKV.get(actual_best_model_key) == null && error() < _bestError))) {
          if (!get_params().quiet_mode)
            Log.info("Error reduced from " + _bestError + " to " + error() + ". Storing best model so far under key " + actual_best_model_key.toString() + ".");
          _bestError = error();
          putMeAsBestModel(actual_best_model_key);
          // debugging check
          if (false) {
            DeepLearningModel bestModel = UKV.get(actual_best_model_key);
            final Frame fr = ftest != null ? ftest : ftrain;
            final Frame bestPredict = bestModel.score(fr, ftest != null ? adaptCM : false);
            final Frame hitRatio_bestPredict = new Frame(bestPredict);
            // Note: this doesn't change the predictions, just the *possible* label domain
            if (adaptCM) {
              final Vec CMadapted = vadaptor.adaptModelResponse2CM(bestPredict.vecs()[0]);
              // replace the label
              bestPredict.replace(0, CMadapted);
              // keep the Vec around to be deleted later (no leak)
              bestPredict.add("to_be_deleted", CMadapted);
            }
            final double err3 = calcError(fr, fr.lastVec(), bestPredict, hitRatio_bestPredict, "cross-check",
                    printme, get_params().max_confusion_matrix_size, new water.api.ConfusionMatrix(),
                    isClassifier() && nclasses() == 2 ? new AUC() : null, null);
            if (isClassifier())
              assert (ftest != null ? Math.abs(err.valid_err - err3) < 1e-5 : Math.abs(err.train_err - err3) < 1e-5);
            else
              assert (ftest != null ? Math.abs(err.valid_mse - err3) < 1e-5 : Math.abs(err.train_mse - err3) < 1e-5);
            bestPredict.delete();
          }
        }
        // print the freshly scored model to ASCII
        for (String s : toString().split("\n")) Log.info(s);
        if (printme)
          Log.info("Time taken for scoring and diagnostics: " + PrettyPrint.msecs(err.scoring_time, true));
      }
    }
    if (model_info().unstable()) {
      Log.warn(unstable_msg);
      keep_running = false;
    } else if ((isClassifier() && last_scored().train_err <= get_params().classification_stop)
               || (!isClassifier() && last_scored().train_mse <= get_params().regression_stop)) {
      Log.info("Achieved requested predictive accuracy on the training data. Model building completed.");
      keep_running = false;
    }
    update(job_key);
    // System.out.println(this);
    return keep_running;
  } catch (Exception ex) {
    return false;
  }
}
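The auto-tuning block near the top of this method keeps the observed communication-to-computation ratio near the requested target when train_samples_per_iteration == -2. Isolated below as a standalone sketch (the signature is hypothetical; the real code mutates actual_train_samples_per_iteration in place):

// Sketch of the train_samples_per_iteration auto-tuning step (illustrative only).
// correction = target_ratio / observed_ratio, and the workload is divided by it:
// if the cluster spends proportionally too long communicating (observed > target),
// correction < 1, so samples per iteration grow and the fixed network cost is amortized.
static long autoTuneSamplesPerIteration(long samplesPerIter, double commTimeUs, double iterTimeMs, double targetRatio) {
  double observedRatio = (commTimeUs * 1e-3) / iterTimeMs; // convert us -> ms before taking the ratio
  double correction = targetRatio / observedRatio;
  return Math.max(1, (long) (samplesPerIter / correction));
}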
Use of hex.VarImp in project h2o-2 by h2oai.
From the class Models, method summarizeModelCommonFields:
/**
* Summarize fields which are generic to water.Model.
*/
private static void summarizeModelCommonFields(ModelSummary summary, Model model) {
  String[] names = model._names;
  summary.warnings = model.warnings;
  // fallback only
  summary.model_algorithm = model.getClass().toString();
  // model.job() is a local copy; on multinode clusters we need to fetch the Job from the DKV
  Key job_key = ((Job) model.job()).self();
  // later, when we deserialize models from disk, we'll relax this constraint
  if (null == job_key)
    throw H2O.fail("Null job key for model: " + (model == null ? "null model" : model._key));
  Job job = DKV.get(job_key).get();
  summary.state = job.getState();
  summary.model_category = model.getModelCategory();
  UniqueId unique_id = model.getUniqueId();
  summary.id = unique_id.getId();
  summary.key = unique_id.getKey();
  summary.creation_epoch_time_millis = unique_id.getCreationEpochTimeMillis();
  summary.training_duration_in_ms = model.training_duration_in_ms;
  summary.response_column_name = names[names.length - 1];
  for (int i = 0; i < names.length - 1; i++)
    summary.input_column_names.add(names[i]);
  // flatten VarImp into a simple map for the summary
  VarImp vi = model.varimp();
  if (null != vi) {
    summary.variable_importances = new LinkedHashMap();
    summary.variable_importances.put("varimp", vi.varimp);
    summary.variable_importances.put("variables", vi.getVariables());
    summary.variable_importances.put("method", vi.method);
    summary.variable_importances.put("max_var", vi.max_var);
    summary.variable_importances.put("scaled", vi.scaled());
  }
}
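The variable_importances map built above holds two parallel arrays (values and variable names) plus metadata fields. A hypothetical consumer (not part of h2o-2) illustrating that layout:

// Hypothetical reader for the summary.variable_importances map (illustrative only).
static void printImportances(java.util.Map<String, Object> vimap) {
  float[] values = (float[]) vimap.get("varimp");          // importance values
  String[] variables = (String[]) vimap.get("variables");  // parallel array of names
  System.out.println("method=" + vimap.get("method") + ", scaled=" + vimap.get("scaled"));
  for (int i = 0; i < values.length; i++)
    System.out.println("  " + variables[i] + ": " + values[i]);
}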