Use of water.fvec.Vec in project h2o-2 by h2oai.
In the class DeepLearning, method prepareDataInfo.
/**
* Helper to create a DataInfo object from the source and response
* @return DataInfo object
*/
private DataInfo prepareDataInfo() {
final boolean del_enum_resp = classification && !response.isEnum();
final Frame train = FrameTask.DataInfo.prepareFrame(source, autoencoder ? null : response, ignored_cols, classification, ignore_const_cols, true);
final DataInfo dinfo = new FrameTask.DataInfo(train, autoencoder ? 0 : 1, true,
        autoencoder || use_all_factor_levels, //use all FactorLevels for auto-encoder
        autoencoder ? DataInfo.TransformType.NORMALIZE : DataInfo.TransformType.STANDARDIZE, //transform predictors
        classification ? DataInfo.TransformType.NONE : DataInfo.TransformType.STANDARDIZE); //transform response
if (!autoencoder) {
//convention from DataInfo: response is the last Vec
final Vec resp = dinfo._adaptedFrame.lastVec();
//either regression or enum response
assert (!classification ^ resp.isEnum()) : "Must have enum response for classification!";
if (del_enum_resp)
ltrash(resp);
}
return dinfo;
}
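
The DataInfo constructor above picks NORMALIZE for auto-encoder predictors and STANDARDIZE otherwise, and leaves a classification response untransformed. The standalone sketch below (plain Java, hypothetical names, not the h2o-2 DataInfo API) shows the usual meaning of those two transforms on a single numeric column; the exact h2o-2 formulas live in DataInfo itself.

final class ColumnTransform {
    // NORMALIZE (typical meaning): rescale to [0,1] using the column min/max.
    static double[] normalize(double[] col) {
        double min = Double.MAX_VALUE, max = -Double.MAX_VALUE;
        for (double v : col) { min = Math.min(min, v); max = Math.max(max, v); }
        double range = (max - min == 0) ? 1 : max - min; // guard constant columns
        double[] out = new double[col.length];
        for (int i = 0; i < col.length; i++) out[i] = (col[i] - min) / range;
        return out;
    }

    // STANDARDIZE (typical meaning): subtract the mean and divide by the standard deviation.
    static double[] standardize(double[] col) {
        double mean = 0, sd = 0;
        for (double v : col) mean += v;
        mean /= col.length;
        for (double v : col) sd += (v - mean) * (v - mean);
        sd = Math.sqrt(sd / col.length);
        if (sd == 0) sd = 1; // guard constant columns
        double[] out = new double[col.length];
        for (int i = 0; i < col.length; i++) out[i] = (col[i] - mean) / sd;
        return out;
    }
}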
Use of water.fvec.Vec in project h2o-2 by h2oai.
In the class KMeans2, method execImpl.
// ----------------------
@Override
public void execImpl() {
Frame fr;
KMeans2Model model = null;
try {
logStart();
source.read_lock(self());
if (source.numRows() < k)
throw new IllegalArgumentException("Cannot make " + k + " clusters out of " + source.numRows() + " rows.");
// Drop ignored cols and, if user asks for it, cols with too many NAs
fr = FrameTask.DataInfo.prepareFrame(source, ignored_cols, false, drop_na_cols);
// fr = source;
if (fr.numCols() == 0)
throw new IllegalArgumentException("No columns left to work with.");
// Sort columns, so the categoricals are all up front. They use a
// different distance metric than numeric columns.
Vec[] vecs = fr.vecs();
// Feature count
final int N = vecs.length;
int ncats = 0, len = N;
while (ncats != len) {
while (ncats < len && vecs[ncats].isEnum()) ncats++;
while (len > 0 && !vecs[len - 1].isEnum()) len--;
if (ncats < len - 1)
fr.swap(ncats, len - 1);
}
_ncats = ncats;
// The model to be built
model = new KMeans2Model(this, dest(), fr._key, fr.names(), fr.domains());
model.delete_and_lock(self());
// means are used to impute NAs
double[] means = new double[N];
for (int i = 0; i < N; i++) means[i] = vecs[i].mean();
// mults & means for normalization
double[] mults = null;
if (normalize) {
mults = new double[N];
for (int i = 0; i < N; i++) {
double sigma = vecs[i].sigma();
mults[i] = normalize(sigma) ? 1.0 / sigma : 1.0;
}
}
// Initialize clusters
Random rand = Utils.getRNG(seed - 1);
// Normalized cluster centers
double[][] clusters;
if (initialization == Initialization.None) {
// Initialize all clusters to random rows. Get 3x the number needed
clusters = model.centers = new double[k * 3][fr.numCols()];
for (double[] cluster : clusters) randomRow(vecs, rand, cluster, means, mults);
// for( int i=0; i<model.centers.length; i++ ) {
// Log.info("random model.centers["+i+"]: "+Arrays.toString(model.centers[i]));
// }
// Recluster down to K normalized clusters.
clusters = recluster(clusters, rand);
} else {
clusters = new double[1][vecs.length];
// Initialize first cluster to random row
randomRow(vecs, rand, clusters[0], means, mults);
while (model.iterations < 5) {
// Sum squares distances to clusters
SumSqr sqr = new SumSqr(clusters, means, mults, _ncats).doAll(vecs);
// Log.info("iteration: "+model.iterations+" sqr: "+sqr._sqr);
// Sample with probability inverse to square distance
long randomSeed = (long) rand.nextDouble();
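// Note: nextDouble() returns a value in [0.0, 1.0), so the (long) cast truncates it to 0
// and the sampler seed is effectively constant across these init iterations.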
Sampler sampler = new Sampler(clusters, means, mults, _ncats, sqr._sqr, k * 3, randomSeed).doAll(vecs);
clusters = Utils.append(clusters, sampler._sampled);
// Stopped/cancelled
if (!isRunning())
return;
model.centers = denormalize(clusters, ncats, means, mults);
// see below. this is sum of squared error now
model.total_within_SS = sqr._sqr;
// One iteration done
model.iterations++;
// Log.info("\nKMeans Centers during init models.iterations: "+model.iterations);
// for( int i=0; i<model.centers.length; i++ ) {
// Log.info("model.centers["+i+"]: "+Arrays.toString(model.centers[i]));
// }
// Log.info("model.total_within_SS: "+model.total_within_SS);
// Don't count these iterations as work for model building
// Early version of model is visible
model.update(self());
// Recluster down to K normalized clusters.
// It makes more sense to recluster each iteration, since the weighted k*3 samples affect sqr vs. _sqr:
// if there are too many "centers" (samples), then _sqr (the sum over all points) gets too big relative
// to sqr for a possible new point, so we don't gather any more samples
// (and the centers won't change during the init).
clusters = recluster(clusters, rand);
}
}
// Reset iteration count
model.iterations = 0;
// ---
// Run the main KMeans Clustering loop
// Stop after enough iterations
boolean done;
LOOP: for (; model.iterations < max_iter; model.iterations++) {
// Stopped/cancelled
if (!isRunning())
return;
Lloyds task = new Lloyds(clusters, means, mults, _ncats, k).doAll(vecs);
// Pick the max categorical level for clusters' center
max_cats(task._cMeans, task._cats);
// Handle the case where some clusters go dry. Rescue only 1 cluster
// per iteration ('cause we only tracked the 1 worst row)
boolean badrow = false;
for (int clu = 0; clu < k; clu++) {
if (task._rows[clu] == 0) {
// If this is the 2nd empty cluster seen in this pass, just re-run Lloyds to get some centers *at-all*.
if (badrow) {
Log.warn("KMeans: Re-running Lloyds to re-init another cluster");
// Do not count against iterations
model.iterations--;
if (reinit_attempts++ < k) {
// Rerun Lloyds, and assign points to centroids
continue LOOP;
} else {
reinit_attempts = 0;
//give up and accept empty cluster
break;
}
}
long row = task._worst_row;
Log.warn("KMeans: Re-initializing cluster " + clu + " to row " + row);
data(clusters[clu] = task._cMeans[clu], vecs, row, means, mults);
task._rows[clu] = 1;
badrow = true;
}
}
// Fill in the model; denormalized centers
model.centers = denormalize(task._cMeans, ncats, means, mults);
model.size = task._rows;
model.within_cluster_variances = task._cSqr;
// sum squared error
double ssq = 0;
for (int i = 0; i < k; i++) {
// sum squared error all clusters
ssq += model.within_cluster_variances[i];
// model.within_cluster_variances[i] /= task._rows[i]; // mse per-cluster
}
// model.total_within_SS = ssq/fr.numRows(); // mse total
//total within sum of squares
model.total_within_SS = ssq;
// Update model in K/V store
model.update(self());
reinit_attempts = 0;
// Compute change in clusters centers
double sum = 0;
for (int clu = 0; clu < k; clu++) sum += distance(clusters[clu], task._cMeans[clu], ncats);
// Average change per feature
sum /= N;
Log.info("KMeans: Change in cluster centers=" + sum);
done = (sum < 1e-6 || model.iterations == max_iter - 1);
if (done) {
Log.info("Writing clusters to key " + model._clustersKey);
Clusters cc = new Clusters();
cc._clusters = clusters;
cc._means = means;
cc._mults = mults;
cc.doAll(1, vecs);
Frame fr2 = cc.outputFrame(model._clustersKey, new String[] { "Cluster ID" }, new String[][] { Utils.toStringMap(0, cc._clusters.length - 1) });
fr2.delete_and_lock(self()).unlock(self());
break;
}
// Update cluster centers
clusters = task._cMeans;
StringBuilder sb = new StringBuilder();
sb.append("KMeans: iter: ").append(model.iterations).append(", MSE=").append(model.total_within_SS);
for (int i = 0; i < k; i++) sb.append(", ").append(task._cSqr[i]).append("/").append(task._rows[i]);
Log.info(sb);
}
} catch (Throwable t) {
t.printStackTrace();
cancel(t);
} finally {
// Remove Job
remove();
if (model != null)
model.unlock(self());
source.unlock(self());
state = UKV.<Job>get(self()).state;
new TAtomic<KMeans2Model>() {
@Override
public KMeans2Model atomic(KMeans2Model m) {
if (m != null)
m.get_params().state = state;
return m;
}
}.invoke(dest());
}
}
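
The main loop above is the standard Lloyd's iteration: assign every row to its nearest center, recompute each center as the mean of its rows, and stop once the average movement of the centers drops below 1e-6 or max_iter is reached. A minimal standalone sketch of that loop structure (hypothetical names, plain arrays, no MRTask, normalization, or categorical handling):

// Sketch only: KMeans2 performs the assignment and sums inside the distributed Lloyds task;
// this mirrors just the control flow and the convergence test.
static double[][] lloyds(double[][] rows, double[][] centers, int maxIter) {
    int k = centers.length, dim = rows[0].length;
    for (int iter = 0; iter < maxIter; iter++) {
        double[][] sums = new double[k][dim];
        int[] counts = new int[k];
        for (double[] row : rows) { // assignment step: nearest center by squared distance
            int best = 0;
            double bestD = Double.MAX_VALUE;
            for (int c = 0; c < k; c++) {
                double d = 0;
                for (int j = 0; j < dim; j++) d += (row[j] - centers[c][j]) * (row[j] - centers[c][j]);
                if (d < bestD) { bestD = d; best = c; }
            }
            counts[best]++;
            for (int j = 0; j < dim; j++) sums[best][j] += row[j];
        }
        double shift = 0; // update step: recompute means and track how far the centers moved
        for (int c = 0; c < k; c++) {
            if (counts[c] == 0) continue; // empty cluster: KMeans2 instead re-seeds it from the worst row
            for (int j = 0; j < dim; j++) {
                double m = sums[c][j] / counts[c];
                shift += (m - centers[c][j]) * (m - centers[c][j]);
                centers[c][j] = m;
            }
        }
        if (shift / dim < 1e-6) break; // same convergence idea as the 1e-6 check above
    }
    return centers;
}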
Use of water.fvec.Vec in project h2o-2 by h2oai.
In the class VariableImportance, method init.
private void init(Vec resp) {
Vec respData = _data.vecs()[_classcol];
int model_min = (int) resp.min();
int data_min = (int) respData.min();
if (resp._domain != null) {
assert respData._domain != null;
_model_classes_mapping = new int[resp._domain.length];
_data_classes_mapping = new int[respData._domain.length];
// compute mapping
alignEnumDomains(resp._domain, respData._domain, _model_classes_mapping, _data_classes_mapping);
} else {
assert respData._domain == null;
_model_classes_mapping = null;
_data_classes_mapping = null;
// compute mapping
_cmin_model_mapping = model_min - Math.min(model_min, data_min);
_cmin_data_mapping = data_min - Math.min(model_min, data_min);
}
}
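
When the response is an enum, init aligns the model's domain with the data's domain so class indices can be compared; when it is an integer response, the two offsets shift both into the same 0-based class space (for example, model_min = 2 and data_min = 0 give _cmin_model_mapping = 2 and _cmin_data_mapping = 0). A hedged sketch of the enum-alignment idea (hypothetical helper, not the actual alignEnumDomains implementation):

// Sketch: map each domain's levels into a shared, sorted union so that
// model class i and data class j refer to the same label.
static void alignDomains(String[] modelDom, String[] dataDom, int[] modelMap, int[] dataMap) {
    java.util.TreeSet<String> union = new java.util.TreeSet<String>();
    java.util.Collections.addAll(union, modelDom);
    java.util.Collections.addAll(union, dataDom);
    java.util.List<String> levels = new java.util.ArrayList<String>(union);
    for (int i = 0; i < modelDom.length; i++) modelMap[i] = levels.indexOf(modelDom[i]);
    for (int i = 0; i < dataDom.length; i++) dataMap[i] = levels.indexOf(dataDom[i]);
}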
Use of water.fvec.Vec in project h2o-2 by h2oai.
In the class DHistogram, method initialHist.
// The initial histogram bins are set up from the Vec rollups.
public static DHistogram[] initialHist(Frame fr, int ncols, int nbins, DHistogram[] hs, int min_rows, boolean doGrpSplit, boolean isBinom) {
Vec[] vecs = fr.vecs();
for (int c = 0; c < ncols; c++) {
Vec v = vecs[c];
// inclusive vector min
final float minIn = (float) Math.max(v.min(), -Float.MAX_VALUE);
// inclusive vector max
final float maxIn = (float) Math.min(v.max(), Float.MAX_VALUE);
// smallest exclusive max
final float maxEx = find_maxEx(maxIn, v.isInt() ? 1 : 0);
final long vlen = v.length();
hs[c] = v.naCnt() == vlen || v.min() == v.max() ? null : make(fr._names[c], nbins, (byte) (v.isEnum() ? 2 : (v.isInt() ? 1 : 0)), minIn, maxEx, vlen, min_rows, doGrpSplit, isBinom);
}
return hs;
}
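
Each histogram is built from an inclusive minimum and an exclusive maximum, so nbins equal-width bins cover the column's range and the true maximum still lands inside the last bin. The sketch below is only an assumption about find_maxEx's intent (its implementation is not shown here): bump an integer max by 1 and a float max to the next representable value.

// Assumption about find_maxEx's intent, not its actual code.
static float maxExclusive(float maxIn, boolean isInt) {
    return isInt ? maxIn + 1 : Math.nextUp(maxIn);
}

// Bin width then follows directly from the inclusive min and exclusive max.
static float binWidth(float minIn, float maxEx, int nbins) {
    return (maxEx - minIn) / nbins;
}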
Use of water.fvec.Vec in project h2o-2 by h2oai.
In the class SharedTreeModelBuilder, method init.
// Verify input parameters
@Override
protected void init() {
super.init();
// Sanity check
assert 0 <= ntrees && ntrees < 1000000;
//assert response.isEnum() : "Response is not enum";
assert (classification && (response.isInt() || response.isEnum())) || // Classify Int or Enums
       (!classification && !response.isEnum()) : // Regress Int or Float
       "Classification=" + classification + " and response=" + response.isInt();
if (source.numRows() - response.naCnt() <= 0)
throw new IllegalArgumentException("Dataset contains too many NAs!");
_ncols = _train.length;
_nrows = source.numRows() - response.naCnt();
assert (_nrows > 0) : "Dataset contains no rows - validation of input parameters is probably broken!";
// TODO: moved to shared model job
if (!response.isEnum() && classification) {
response = response.toEnum();
//_gen_enum = true;
gtrash(response);
}
_nclass = response.isEnum() ? (char) (response.domain().length) : 1;
if (classification && _nclass <= 1)
throw new IllegalArgumentException("Constant response column!");
if (_nclass > MAX_SUPPORTED_LEVELS)
throw new IllegalArgumentException("Too many levels in response column!");
int usableColumns = 0;
assert _ncols == _train.length : "Number of selected train columns does not correspond to a number of columns!";
for (int i = 0; i < _ncols; i++) {
Vec v = _train[i];
if (v.isBad() || v.isConst())
continue;
usableColumns++;
}
if (usableColumns == 0)
throw new IllegalArgumentException("There is no usable column to generate model!");
if (checkpoint != null && DKV.get(checkpoint) == null)
throw new IllegalArgumentException("Checkpoint " + checkpoint.toString() + " does not exists!");
}
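
The usable-column scan above rejects columns that are all-NA (isBad) or constant (isConst), since neither can produce a split. A standalone sketch of the same check on plain arrays (hypothetical names, not the Vec API):

// A column is unusable if every value is NA or all non-NA values are identical.
static int countUsable(double[][] cols) {
    int usable = 0;
    for (double[] col : cols) {
        boolean allNA = true, constant = true;
        double first = Double.NaN;
        for (double v : col) {
            if (!Double.isNaN(v)) {
                if (allNA) { first = v; allNA = false; }
                else if (v != first) constant = false;
            }
        }
        if (!allNA && !constant) usable++;
    }
    return usable;
}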