
Example 81 with Vec

use of water.fvec.Vec in project h2o-2 by h2oai.

the class DeepLearning method prepareDataInfo.

/**
   * Helper to create a DataInfo object from the source and response
   * @return DataInfo object
   */
private DataInfo prepareDataInfo() {
    final boolean del_enum_resp = classification && !response.isEnum();
    final Frame train = FrameTask.DataInfo.prepareFrame(source, autoencoder ? null : response, ignored_cols, classification, ignore_const_cols, true);
    final DataInfo dinfo = new FrameTask.DataInfo(
        train,
        autoencoder ? 0 : 1,
        true,
        autoencoder || use_all_factor_levels, // use all factor levels for auto-encoder
        autoencoder ? DataInfo.TransformType.NORMALIZE : DataInfo.TransformType.STANDARDIZE, // transform predictors
        classification ? DataInfo.TransformType.NONE : DataInfo.TransformType.STANDARDIZE);  // transform response
    if (!autoencoder) {
        //convention from DataInfo: response is the last Vec
        final Vec resp = dinfo._adaptedFrame.lastVec();
        //either regression or enum response
        assert (!classification ^ resp.isEnum()) : "Must have enum response for classification!";
        if (del_enum_resp)
            ltrash(resp);
    }
    return dinfo;
}
Also used: DataInfo (hex.FrameTask.DataInfo), MRUtils.sampleFrame (water.util.MRUtils.sampleFrame), Frame (water.fvec.Frame), Vec (water.fvec.Vec)
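
The response-type assert above packs its condition into an XOR, !classification ^ resp.isEnum(), which holds exactly when classification == resp.isEnum(). A minimal, dependency-free sketch of the same invariant (the names here are illustrative, not h2o API):

public class ResponseCheck {
    static void checkResponse(boolean classification, boolean responseIsEnum) {
        // Equivalent to the XOR form above: the assert holds exactly when
        // classification == responseIsEnum.
        if (classification != responseIsEnum)
            throw new IllegalArgumentException("Must have enum response for classification!");
    }

    public static void main(String[] args) {
        checkResponse(true, true);   // ok: classification with a categorical response
        checkResponse(false, false); // ok: regression with a numeric response
        checkResponse(true, false);  // throws: classification needs an enum response
    }
}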

Example 82 with Vec

use of water.fvec.Vec in project h2o-2 by h2oai.

the class KMeans2 method execImpl.

// ----------------------
@Override
public void execImpl() {
    Frame fr;
    KMeans2Model model = null;
    try {
        logStart();
        source.read_lock(self());
        if (source.numRows() < k)
            throw new IllegalArgumentException("Cannot make " + k + " clusters out of " + source.numRows() + " rows.");
        // Drop ignored cols and, if user asks for it, cols with too many NAs
        fr = FrameTask.DataInfo.prepareFrame(source, ignored_cols, false, drop_na_cols);
        //      fr = source;
        if (fr.numCols() == 0)
            throw new IllegalArgumentException("No columns left to work with.");
        // Sort columns, so the categoricals are all up front.  They use a
        // different distance metric than numeric columns.
        Vec[] vecs = fr.vecs();
        // Feature count
        final int N = vecs.length;
        int ncats = 0, len = N;
        while (ncats != len) {
            while (ncats < len && vecs[ncats].isEnum()) ncats++;
            while (len > 0 && !vecs[len - 1].isEnum()) len--;
            if (ncats < len - 1)
                fr.swap(ncats, len - 1);
        }
        _ncats = ncats;
        // The model to be built
        model = new KMeans2Model(this, dest(), fr._key, fr.names(), fr.domains());
        model.delete_and_lock(self());
        // means are used to impute NAs
        double[] means = new double[N];
        for (int i = 0; i < N; i++) means[i] = vecs[i].mean();
        // mults & means for normalization
        double[] mults = null;
        if (normalize) {
            mults = new double[N];
            for (int i = 0; i < N; i++) {
                double sigma = vecs[i].sigma();
                mults[i] = normalize(sigma) ? 1.0 / sigma : 1.0;
            }
        }
        // Initialize clusters
        Random rand = Utils.getRNG(seed - 1);
        // Normalized cluster centers
        double[][] clusters;
        if (initialization == Initialization.None) {
            // Initialize all clusters to random rows. Get 3x the number needed
            clusters = model.centers = new double[k * 3][fr.numCols()];
            for (double[] cluster : clusters) randomRow(vecs, rand, cluster, means, mults);
            // for( int i=0; i<model.centers.length; i++ ) {
            //   Log.info("random model.centers["+i+"]: "+Arrays.toString(model.centers[i]));
            // }
            // Recluster down to K normalized clusters. 
            clusters = recluster(clusters, rand);
        } else {
            clusters = new double[1][vecs.length];
            // Initialize first cluster to random row
            randomRow(vecs, rand, clusters[0], means, mults);
            while (model.iterations < 5) {
                // Sum squares distances to clusters
                SumSqr sqr = new SumSqr(clusters, means, mults, _ncats).doAll(vecs);
                // Log.info("iteration: "+model.iterations+" sqr: "+sqr._sqr);
                // Sample with probability inverse to square distance
                long randomSeed = (long) rand.nextDouble(); // note: nextDouble() is in [0,1), so this truncates to 0
                Sampler sampler = new Sampler(clusters, means, mults, _ncats, sqr._sqr, k * 3, randomSeed).doAll(vecs);
                clusters = Utils.append(clusters, sampler._sampled);
                // Stopped/cancelled
                if (!isRunning())
                    return;
                model.centers = denormalize(clusters, ncats, means, mults);
                // see below. this is sum of squared error now
                model.total_within_SS = sqr._sqr;
                // One iteration done
                model.iterations++;
                // Log.info("\nKMeans Centers during init models.iterations: "+model.iterations);
                // for( int i=0; i<model.centers.length; i++ ) {
                //   Log.info("model.centers["+i+"]: "+Arrays.toString(model.centers[i]));
                // }
                // Log.info("model.total_within_SS: "+model.total_within_SS);
                // Don't count these iterations as work for model building
                // Early version of model is visible
                model.update(self());
                // Recluster down to K normalized clusters. Reclustering each
                // iteration makes sense: when too many sampled "centers"
                // accumulate, the running total _sqr (summed over all samples)
                // grows large relative to sqr for any candidate point, so no
                // further samples would be gathered and the centers would stop
                // changing during init.
                clusters = recluster(clusters, rand);
            }
        }
        // Reset iteration count
        model.iterations = 0;
        // ---
        // Run the main KMeans Clustering loop
        // Stop after enough iterations
        boolean done;
        LOOP: for (; model.iterations < max_iter; model.iterations++) {
            // Stopped/cancelled
            if (!isRunning())
                return;
            Lloyds task = new Lloyds(clusters, means, mults, _ncats, k).doAll(vecs);
            // Pick the max categorical level for clusters' center
            max_cats(task._cMeans, task._cats);
            // Handle the case where some clusters go dry.  Rescue only 1 cluster
            // per iteration ('cause we only tracked the 1 worst row)
            boolean badrow = false;
            for (int clu = 0; clu < k; clu++) {
                if (task._rows[clu] == 0) {
                    // If we see 2 or more bad rows, just re-run Lloyds to get the
                    // next-worst row. We don't count this as an iteration, because
                    // we're not really adjusting the centers, we're trying to get
                    // some centers *at-all*.
                    if (badrow) {
                        Log.warn("KMeans: Re-running Lloyds to re-init another cluster");
                        // Do not count against iterations
                        model.iterations--;
                        if (reinit_attempts++ < k) {
                            // Rerun Lloyds, and assign points to centroids
                            continue LOOP;
                        } else {
                            reinit_attempts = 0;
                            //give up and accept empty cluster
                            break;
                        }
                    }
                    long row = task._worst_row;
                    Log.warn("KMeans: Re-initializing cluster " + clu + " to row " + row);
                    data(clusters[clu] = task._cMeans[clu], vecs, row, means, mults);
                    task._rows[clu] = 1;
                    badrow = true;
                }
            }
            // Fill in the model; denormalized centers
            model.centers = denormalize(task._cMeans, ncats, means, mults);
            model.size = task._rows;
            model.within_cluster_variances = task._cSqr;
            // sum squared error
            double ssq = 0;
            for (int i = 0; i < k; i++) {
                // sum squared error all clusters
                ssq += model.within_cluster_variances[i];
            //          model.within_cluster_variances[i] /= task._rows[i]; // mse per-cluster
            }
            //        model.total_within_SS = ssq/fr.numRows(); // mse total
            //total within sum of squares
            model.total_within_SS = ssq;
            // Update model in K/V store
            model.update(self());
            reinit_attempts = 0;
            // Compute change in clusters centers
            double sum = 0;
            for (int clu = 0; clu < k; clu++) sum += distance(clusters[clu], task._cMeans[clu], ncats);
            // Average change per feature
            sum /= N;
            Log.info("KMeans: Change in cluster centers=" + sum);
            done = (sum < 1e-6 || model.iterations == max_iter - 1);
            if (done) {
                Log.info("Writing clusters to key " + model._clustersKey);
                Clusters cc = new Clusters();
                cc._clusters = clusters;
                cc._means = means;
                cc._mults = mults;
                cc.doAll(1, vecs);
                Frame fr2 = cc.outputFrame(model._clustersKey, new String[] { "Cluster ID" }, new String[][] { Utils.toStringMap(0, cc._clusters.length - 1) });
                fr2.delete_and_lock(self()).unlock(self());
                break;
            }
            // Update cluster centers
            clusters = task._cMeans;
            StringBuilder sb = new StringBuilder();
            sb.append("KMeans: iter: ").append(model.iterations).append(", MSE=").append(model.total_within_SS);
            for (int i = 0; i < k; i++) sb.append(", ").append(task._cSqr[i]).append("/").append(task._rows[i]);
            Log.info(sb);
        }
    } catch (Throwable t) {
        t.printStackTrace();
        cancel(t);
    } finally {
        // Remove Job
        remove();
        if (model != null)
            model.unlock(self());
        source.unlock(self());
        state = UKV.<Job>get(self()).state;
        new TAtomic<KMeans2Model>() {

            @Override
            public KMeans2Model atomic(KMeans2Model m) {
                if (m != null)
                    m.get_params().state = state;
                return m;
            }
        }.invoke(dest());
    }
}
Also used: Frame (water.fvec.Frame), RString (water.util.RString), Random (java.util.Random), Vec (water.fvec.Vec), ColumnsJob (water.Job.ColumnsJob)
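
The mults computed above scale each column by the inverse of its standard deviation so that all features contribute comparably to the distance metric. A minimal, dependency-free sketch of that step; the code suggests KMeans2's normalize(sigma) helper rejects zero or undefined sigmas, but the exact threshold used here is an assumption:

public class NormalizeMults {
    static double[] mults(double[] sigmas) {
        double[] mults = new double[sigmas.length];
        for (int i = 0; i < sigmas.length; i++) {
            double sigma = sigmas[i];
            // Only scale by well-defined, non-zero standard deviations;
            // otherwise leave the multiplier at 1.0 (no scaling).
            boolean usable = !Double.isNaN(sigma) && sigma > 1e-6;
            mults[i] = usable ? 1.0 / sigma : 1.0;
        }
        return mults;
    }

    public static void main(String[] args) {
        double[] sigmas = { 2.0, 0.0, Double.NaN, 0.5 };
        for (double m : mults(sigmas))
            System.out.println(m); // prints 0.5, 1.0, 1.0, 2.0
    }
}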

Example 83 with Vec

use of water.fvec.Vec in project h2o-2 by h2oai.

the class VariableImportance method init.

private void init(Vec resp) {
    Vec respData = _data.vecs()[_classcol];
    int model_min = (int) resp.min();
    int data_min = (int) respData.min();
    if (resp._domain != null) {
        assert respData._domain != null;
        _model_classes_mapping = new int[resp._domain.length];
        _data_classes_mapping = new int[respData._domain.length];
        // compute mapping
        alignEnumDomains(resp._domain, respData._domain, _model_classes_mapping, _data_classes_mapping);
    } else {
        assert respData._domain == null;
        _model_classes_mapping = null;
        _data_classes_mapping = null;
        // compute mapping
        _cmin_model_mapping = model_min - Math.min(model_min, data_min);
        _cmin_data_mapping = data_min - Math.min(model_min, data_min);
    }
}
Also used: Vec (water.fvec.Vec)
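
alignEnumDomains above builds two index mappings so that equal categorical levels in the model and data domains map to the same integer. A minimal sketch of one way to compute such mappings, via the sorted union of both domains; whether the real h2o implementation merges this way is an assumption:

import java.util.Arrays;
import java.util.TreeSet;

public class DomainAlign {
    static void align(String[] a, String[] b, int[] aMap, int[] bMap) {
        // Merge both domains into a sorted, de-duplicated union.
        TreeSet<String> union = new TreeSet<String>();
        union.addAll(Arrays.asList(a));
        union.addAll(Arrays.asList(b));
        String[] merged = union.toArray(new String[0]);
        // Map each original level to its index in the merged domain.
        for (int i = 0; i < a.length; i++) aMap[i] = Arrays.binarySearch(merged, a[i]);
        for (int i = 0; i < b.length; i++) bMap[i] = Arrays.binarySearch(merged, b[i]);
    }

    public static void main(String[] args) {
        String[] model = { "cat", "dog" }, data = { "dog", "fish" };
        int[] m = new int[model.length], d = new int[data.length];
        align(model, data, m, d);
        System.out.println(Arrays.toString(m) + " " + Arrays.toString(d)); // [0, 1] [1, 2]
    }
}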

Example 84 with Vec

use of water.fvec.Vec in project h2o-2 by h2oai.

the class DHistogram method initialHist.

// The initial histogram bins are setup from the Vec rollups.
public static DHistogram[] initialHist(Frame fr, int ncols, int nbins, DHistogram[] hs, int min_rows, boolean doGrpSplit, boolean isBinom) {
    Vec[] vecs = fr.vecs();
    for (int c = 0; c < ncols; c++) {
        Vec v = vecs[c];
        // inclusive vector min
        final float minIn = (float) Math.max(v.min(), -Float.MAX_VALUE);
        // inclusive vector max
        final float maxIn = (float) Math.min(v.max(), Float.MAX_VALUE);
        // smallest exclusive max
        final float maxEx = find_maxEx(maxIn, v.isInt() ? 1 : 0);
        final long vlen = v.length();
        hs[c] = v.naCnt() == vlen || v.min() == v.max() ? null : make(fr._names[c], nbins, (byte) (v.isEnum() ? 2 : (v.isInt() ? 1 : 0)), minIn, maxEx, vlen, min_rows, doGrpSplit, isBinom);
    }
    return hs;
}
Also used: Vec (water.fvec.Vec)
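
find_maxEx converts the inclusive column maximum into the smallest exclusive upper bound, so the top value still falls inside the last histogram bin. A simplified, dependency-free reading of that contract (the real DHistogram.find_maxEx also guards against float overflow, which this sketch omits):

public class MaxEx {
    static float findMaxEx(float maxIn, int isInt) {
        // Integer columns: next integer above the max.
        // Float columns: next representable float above the max.
        return isInt == 1 ? maxIn + 1 : Math.nextUp(maxIn);
    }

    public static void main(String[] args) {
        System.out.println(findMaxEx(5.0f, 1)); // 6.0 (integer column)
        System.out.println(findMaxEx(5.0f, 0)); // 5.0000005 (next float above 5.0)
    }
}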

Example 85 with Vec

use of water.fvec.Vec in project h2o-2 by h2oai.

the class SharedTreeModelBuilder method init.

// Verify input parameters
@Override
protected void init() {
    super.init();
    // Sanity check
    assert 0 <= ntrees && ntrees < 1000000;
    //assert response.isEnum() : "Response is not enum";
    assert (classification && (response.isInt() || response.isEnum())) || // Classify Int or Enums
           (!classification && !response.isEnum()) :                      // Regress Int or Float
           "Classification=" + classification + " and response=" + response.isInt();
    if (source.numRows() - response.naCnt() <= 0)
        throw new IllegalArgumentException("Dataset contains too many NAs!");
    _ncols = _train.length;
    _nrows = source.numRows() - response.naCnt();
    assert (_nrows > 0) : "Dataset contains no rows - validation of input parameters is probably broken!";
    // TODO: moved to shared model job
    if (!response.isEnum() && classification) {
        response = response.toEnum();
        //_gen_enum = true;
        gtrash(response);
    }
    _nclass = response.isEnum() ? (char) (response.domain().length) : 1;
    if (classification && _nclass <= 1)
        throw new IllegalArgumentException("Constant response column!");
    if (_nclass > MAX_SUPPORTED_LEVELS)
        throw new IllegalArgumentException("Too many levels in response column!");
    int usableColumns = 0;
    assert _ncols == _train.length : "Number of selected train columns does not correspond to the number of columns!";
    for (int i = 0; i < _ncols; i++) {
        Vec v = _train[i];
        if (v.isBad() || v.isConst())
            continue;
        usableColumns++;
    }
    if (usableColumns == 0)
        throw new IllegalArgumentException("There is no usable column to generate model!");
    if (checkpoint != null && DKV.get(checkpoint) == null)
        throw new IllegalArgumentException("Checkpoint " + checkpoint.toString() + " does not exists!");
}
Also used: Vec (water.fvec.Vec)
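
The usable-column loop above skips columns that are bad (read here as all-missing) or constant. A minimal, dependency-free sketch of the same test over plain arrays, with NaN standing in for missing values:

public class UsableColumns {
    static boolean isUsable(double[] col) {
        double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
        int nonMissing = 0;
        for (double v : col) {
            if (Double.isNaN(v)) continue; // skip missing values
            nonMissing++;
            min = Math.min(min, v);
            max = Math.max(max, v);
        }
        if (nonMissing == 0) return false; // "bad": every value is missing
        return min != max;                 // constant columns are unusable
    }

    public static void main(String[] args) {
        System.out.println(isUsable(new double[] { 1, 2, 3 }));                 // true
        System.out.println(isUsable(new double[] { 7, 7, 7 }));                 // false: constant
        System.out.println(isUsable(new double[] { Double.NaN, Double.NaN })); // false: all missing
    }
}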

Aggregations

Vec (water.fvec.Vec): 280
Frame (water.fvec.Frame): 213
Test (org.junit.Test): 82
NFSFileVec (water.fvec.NFSFileVec): 48
ValFrame (water.rapids.vals.ValFrame): 47
Chunk (water.fvec.Chunk): 30
Random (java.util.Random): 25
NewChunk (water.fvec.NewChunk): 23
DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters): 22
Key (water.Key): 21
MRTask (water.MRTask): 17
Val (water.rapids.Val): 14
File (java.io.File): 11
ArrayList (java.util.ArrayList): 11
Futures (water.Futures): 11
H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException): 11
ValNum (water.rapids.vals.ValNum): 11
ShuffleSplitFrame (hex.splitframe.ShuffleSplitFrame): 10
BufferedString (water.parser.BufferedString): 10
AppendableVec (water.fvec.AppendableVec): 9