Example 1 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

In the class CoxPH, method init.

/**
 * Validates the CoxPH input parameters, then builds the DataInfo and the CoxPHModel.
 *
 * Checks performed (each failure raises an IllegalArgumentException):
 * start_column, if given, and stop_column must be integer columns; event_column must be an integer
 * or enum column and must not be constant; lre_min must be a positive, non-NaN number; iter_max
 * must be at least 1; the number of time bins must lie between 1 and MAX_TIME_BINS.
 *
 * NOTE(review): this excerpt is incomplete -- two if-statements have no visible body and the
 * closing brace of the method is missing. Consult the full CoxPH source before relying on it.
 *
 * @throws IllegalArgumentException if any of the parameter checks fails
 */
protected void init() {
    // the start column is optional, but when supplied it must be integer typed
    if ((start_column != null) && !start_column.isInt())
        throw new IllegalArgumentException("start time must be null or of type integer");
    if (!stop_column.isInt())
        throw new IllegalArgumentException("stop time must be of type integer");
    if (!event_column.isInt() && !event_column.isEnum())
        throw new IllegalArgumentException("event must be of type integer or factor");
    // reject a constant event column: integer with min == max, or enum with fewer than two levels
    if ((event_column.isInt() && (event_column.min() == event_column.max())) || (event_column.isEnum() && (event_column.cardinality() < 2)))
        throw new IllegalArgumentException("event column contains less than two distinct values");
    // the NaN test is explicit because NaN <= 0 evaluates to false
    if (Double.isNaN(lre_min) || lre_min <= 0)
        throw new IllegalArgumentException("lre_min must be a positive number");
    if (iter_max < 1)
        throw new IllegalArgumentException("iter_max must be a positive integer");
    // earliest time: smallest stop time when there is no start column, otherwise smallest start time + 1
    final long min_time = (start_column == null) ? (long) stop_column.min() : (long) start_column.min() + 1;
    // number of integer time points from min_time up to the largest stop time, inclusive
    final int n_time = (int) (stop_column.max() - min_time + 1);
    if (n_time < 1)
        throw new IllegalArgumentException("start times must be strictly less than stop times");
    if (n_time > MAX_TIME_BINS)
        throw new IllegalArgumentException("number of distinct stop times is " + n_time + "; maximum number allowed is " + MAX_TIME_BINS);
    source = getSubframe();
    int n_resp = 2;
    // NOTE(review): the bodies of the next two if-statements are missing from this excerpt;
    // presumably each one adjusts n_resp for the extra column -- confirm against the full source.
    if (weights_column != null)
    if (start_column != null)
    final DataInfo dinfo = new DataInfo(source, n_resp, false, false, DataInfo.TransformType.DEMEAN);
    model = new CoxPHModel(this, dest(), source._key, source, null);
    model.initStats(source, dinfo);
Also used : DataInfo(hex.FrameTask.DataInfo)

Example 2 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

In the class CoxPH, method execImpl.

/**
 * Iteratively fits the model coefficients and stores the model in the DKV.
 *
 * Each iteration runs a CoxPHTask over dinfo._adaptedFrame with the current coefficients and
 * computes the log-likelihood with model.calcLoglik. If the log-likelihood improved, a new step is
 * computed as step[j] = -sum_k var_coef[j][k] * gradient[k]; otherwise the previous step is halved.
 * The next coefficients are oldCoef - step.
 *
 * NOTE(review): this excerpt is incomplete -- several if/for statements have no visible body
 * (for example the convergence test on model.lre) and closing braces are missing.
 */
protected void execImpl() {
    final DataInfo dinfo = model.data_info;
    final int n_offsets = (model.parameters.offset_columns == null) ? 0 : model.parameters.offset_columns.length;
    // offset columns are counted in dinfo.fullN() but get no coefficient
    final int n_coef = dinfo.fullN() - n_offsets;
    final double[] step = MemoryManager.malloc8d(n_coef);
    final double[] oldCoef = MemoryManager.malloc8d(n_coef);
    final double[] newCoef = MemoryManager.malloc8d(n_coef);
    // step and oldCoef hold NaN until the first improving iteration overwrites them
    Arrays.fill(step, Double.NaN);
    Arrays.fill(oldCoef, Double.NaN);
    // every coefficient starts from the same value, init
    for (int j = 0; j < n_coef; ++j) newCoef[j] = init;
    // lowest finite starting value, so the comparison in iteration 0 can succeed
    double oldLoglik = -Double.MAX_VALUE;
    final int n_time = (int) (model.max_time - model.min_time + 1);
    final boolean has_start_column = (model.parameters.start_column != null);
    final boolean has_weights_column = (model.parameters.weights_column != null);
    // the loop bound is inclusive: i takes the values 0..iter_max
    for (int i = 0; i <= iter_max; ++i) {
        model.iter = i;
        final CoxPHTask coxMR = new CoxPHTask(self(), dinfo, newCoef, model.min_time, n_time, n_offsets, has_start_column, has_weights_column).doAll(dinfo._adaptedFrame);
        final double newLoglik = model.calcLoglik(coxMR);
        if (newLoglik > oldLoglik) {
            // NOTE(review): the indentation suggests the original body of this if was dropped from the excerpt
            if (i == 0)
            model.calcModelStats(newCoef, newLoglik);
            // lre is -log10 of the change in log-likelihood: absolute in the first assignment,
            // relative to newLoglik in the second.
            // NOTE(review): an else between the two assignments appears to be missing from the excerpt
            if (newLoglik == 0)
                model.lre = -Math.log10(Math.abs(oldLoglik - newLoglik));
                model.lre = -Math.log10(Math.abs((oldLoglik - newLoglik) / newLoglik));
            // NOTE(review): body missing from the excerpt; presumably leaves the loop on convergence -- confirm
            if (model.lre >= lre_min)
            Arrays.fill(step, 0);
            for (int j = 0; j < n_coef; ++j) for (int k = 0; k < n_coef; ++k) step[j] -= model.var_coef[j][k] * model.gradient[k];
            // NOTE(review): the action taken for a NaN or infinite step component is missing from the excerpt
            for (int j = 0; j < n_coef; ++j) if (Double.isNaN(step[j]) || Double.isInfinite(step[j]))
            oldLoglik = newLoglik;
            System.arraycopy(newCoef, 0, oldCoef, 0, oldCoef.length);
        } else {
            // no improvement: retry from oldCoef with half the previous step
            for (int j = 0; j < n_coef; ++j) step[j] /= 2;
        for (int j = 0; j < n_coef; ++j) newCoef[j] = oldCoef[j] - step[j];
    final Futures fs = new Futures();
    DKV.put(dest(), model, fs);
Also used : DataInfo(hex.FrameTask.DataInfo) Futures(water.Futures)

Example 3 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

In the class DeepLearning, method prepareDataInfo.

  /**
   * Helper to create a DataInfo object from the source and response
   * @return DataInfo object
   */
private DataInfo prepareDataInfo() {
    // true when classification is requested but the response column is not already an enum
    final boolean del_enum_resp = classification && !response.isEnum();
    // in autoencoder mode no response column is handed to prepareFrame
    final Frame train = FrameTask.DataInfo.prepareFrame(source, autoencoder ? null : response, ignored_cols, classification, ignore_const_cols, true);
    // NOTE(review): the trailing comments on the constructor arguments below look shifted and
    // duplicated by a reformatter. "use all FactorLevels for auto-encoder" most plausibly belongs to
    // the `autoencoder || use_all_factor_levels` argument, and "transform predictors" /
    // "transform response" to the two TransformType arguments -- confirm against the original source.
    final DataInfo dinfo = new //use all FactorLevels for auto-encoder
    FrameTask.DataInfo(//use all FactorLevels for auto-encoder
    train, //use all FactorLevels for auto-encoder
    autoencoder ? 0 : 1, //use all FactorLevels for auto-encoder
    true, //use all FactorLevels for auto-encoder
    autoencoder || use_all_factor_levels, //transform predictors
    autoencoder ? DataInfo.TransformType.NORMALIZE : DataInfo.TransformType.STANDARDIZE, //transform response
    classification ? DataInfo.TransformType.NONE : DataInfo.TransformType.STANDARDIZE);
    if (!autoencoder) {
        //convention from DataInfo: response is the last Vec
        final Vec resp = dinfo._adaptedFrame.lastVec();
        //either regression or enum response
        assert (!classification ^ resp.isEnum()) : "Must have enum response for classification!";
        // NOTE(review): the body of this if and the closing brace of the enclosing block are missing from the excerpt
        if (del_enum_resp)
    return dinfo;
Also used : DataInfo(hex.FrameTask.DataInfo) MRUtils.sampleFrame(water.util.MRUtils.sampleFrame) Frame(water.fvec.Frame) Vec(water.fvec.Vec)

Example 4 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

In the class GramMatrixTest, method testProstate.

/**
 * Parses the prostate test file, builds a DataInfo with no transformation, and compares the Gram
 * matrix from a GramTask (scaled by gt._nobs) against exp_result with a tolerance of 1e-5.
 *
 * NOTE(review): this excerpt is incomplete -- no call that actually runs either GramTask is
 * visible, and the body of the finally block is missing.
 */
public void testProstate() {
    File f2 = find_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
    Key ikey2 = NFSFileVec.make(f2);
    Key okey2 = Key.make("glm_model2");
    Frame fr2 = null;
    try {
        fr2 = ParseDataset2.parse(okey2, new Key[] { ikey2 });
        DataInfo dinfo = new DataInfo(fr2, 0, true, false, DataInfo.TransformType.NONE);
        GramTask gt = new GramTask(null, dinfo, true, false);
        double[][] res = gt._gram.getXX();
        // full comparison over every row and column of exp_result
        for (int i = 0; i < exp_result.length; ++i) for (int j = 0; j < exp_result.length; ++j) assertEquals(exp_result[i][j], gt._nobs * res[i][j], 1e-5);
        // second task differs only in the third constructor argument (false instead of true)
        gt = new GramTask(null, dinfo, false, false);
        // NOTE(review): res is not re-read from the new gt here, so as shown this loop still checks
        // the first task's matrix -- a line may be missing from the excerpt.
        // The last row and column of exp_result are skipped in this comparison.
        for (int i = 0; i < exp_result.length - 1; ++i) for (int j = 0; j < exp_result.length - 1; ++j) assertEquals(exp_result[i][j], gt._nobs * res[i][j], 1e-5);
    } finally {
Also used : DataInfo(hex.FrameTask.DataInfo) GramTask(hex.gram.Gram.GramTask) File(java.io.File) Test(org.junit.Test)

Example 5 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

In the class GLM2, method init.

/**
 * Validates the GLM parameters and prepares the data structures used for fitting.
 *
 * Visible steps: resolve the link function and build the GLMParams; check the response column
 * against the chosen family; build the training frame with DataInfo.prepareFrame, re-inserting the
 * offset column just in front of the response; create _srcDinfo with the selected predictor
 * transform; translate the optional beta_constraints frame into _lbs, _ubs, _bgs and _rho; and,
 * when non_negative is set, clamp negative lower bounds to 0.
 *
 * NOTE(review): this excerpt is incomplete -- many closing braces and several if-bodies are
 * missing, so block boundaries below cannot be trusted. Consult the full GLM2 source.
 *
 * @throws IllegalArgumentException on invalid parameter combinations or an invalid beta_constraints frame
 */
public void init() {
    try {
        // NOTE(review): the body of this if is missing from the excerpt
        if (family == Family.gamma)
        if (link == Link.family_default)
            link = family.defaultLink;
        _intercept = intercept ? 1 : 0;
        // TODO
        tweedie_link_power = 1 - tweedie_variance_power;
        // a link power of 0 is replaced by the log link
        if (tweedie_link_power == 0)
            link = Link.log;
        _glm = new GLMParams(family, tweedie_variance_power, link, tweedie_link_power);
        source2 = new Frame(source);
        assert sorted(ignored_cols);
        // NOTE(review): the statement guarded by this if is missing from the excerpt; only its comment survived
        if (offset != null)
            // remove offset and add it later explicitly (so that it does not interfere with DataInfo.prepareFrame)
        if (nlambdas == -1)
            nlambdas = 100;
        if (lambda_search && lambda.length > 1)
            throw new IllegalArgumentException("Can not supply both lambda_search and multiple lambdas. If lambda_search is on, GLM expects only one value of lambda_value, representing the lambda_value min (smallest lambda_value in the lambda_value search).");
        // check the response
        if (response.isEnum() && family != Family.binomial)
            throw new IllegalArgumentException("Invalid response variable, trying to run regression with categorical response!");
        // NOTE(review): no break statements are visible between the cases below. If that matches the
        // real source, poisson/tweedie fall through into the gamma and binomial checks -- confirm.
        switch(family) {
            case poisson:
            case tweedie:
                if (response.min() < 0)
                    throw new IllegalArgumentException("Illegal response column for family='" + family + "', response must be >= 0.");
            case gamma:
                if (response.min() <= 0)
                    throw new IllegalArgumentException("Invalid response for family='Gamma', response must be > 0!");
            case binomial:
                if (response.min() < 0 || response.max() > 1)
                    throw new IllegalArgumentException("Illegal response column for family='Binomial', response must in <0,1> range!");
        // toEnum is set for a binomial family whose non-enum response has values outside [0,1]
        toEnum = family == Family.binomial && (!response.isEnum() && (response.min() < 0 || response.max() > 1));
        if (source2.numCols() <= 1 && !intercept)
            throw new IllegalArgumentException("There are no predictors left after ignoring constant columns in the dataset and no intercept => No parameters to estimate.");
        Frame fr = DataInfo.prepareFrame(source2, response, new int[0], toEnum, true, true);
        if (offset != null) {
            // now put the offset just in front of response
            int id = source.find(offset);
            String name = source.names()[id];
            String responseName = fr.names()[fr.numCols() - 1];
            Vec responseVec = fr.remove(fr.numCols() - 1);
            fr.add(name, offset);
            fr.add(responseName, responseVec);
            _noffsets = 1;
        // predictor transform: NONE unless standardize is set; then STANDARDIZE with an intercept, DESCALE without
        TransformType dt = TransformType.NONE;
        if (standardize)
            dt = intercept ? TransformType.STANDARDIZE : TransformType.DESCALE;
        _srcDinfo = new DataInfo(fr, 1, intercept, use_all_factor_levels || lambda_search, dt, DataInfo.TransformType.NONE);
        if (offset != null && dt != TransformType.NONE) {
            // do not standardize offset
            // the last _normMul/_normSub entry is reset to the identity transform (multiply by 1, subtract 0)
            if (_srcDinfo._normMul != null)
                _srcDinfo._normMul[_srcDinfo._normMul.length - 1] = 1;
            if (_srcDinfo._normSub != null)
                _srcDinfo._normSub[_srcDinfo._normSub.length - 1] = 0;
        if (!intercept && _srcDinfo._cats > 0)
            throw new IllegalArgumentException("Models with no intercept are only supported with all-numeric predictors.");
        _activeData = _srcDinfo;
        // NOTE(review): the body of this if is missing from the excerpt
        if (higher_accuracy)
        if (beta_constraints != null) {
            Vec v = beta_constraints.vec("names");
            if (v == null)
                throw new IllegalArgumentException("Invalid beta constraints file, missing column with predictor names");
            // for now only enums allowed here
            String[] dom = v.domain();
            String[] names = Utils.append(_srcDinfo.coefNames(), "Intercept");
            int[] map = Utils.asInts(v);
            // each predictor may appear at most once in the constraints
            HashSet<Integer> s = new HashSet<Integer>();
            for (int i : map) if (!s.add(i))
                throw new IllegalArgumentException("Invalid beta constraints file, got duplicate constraints for '" + dom[i] + "'");
            if (!Arrays.deepEquals(dom, names)) {
                // need mapping
                HashMap<String, Integer> m = new HashMap<String, Integer>();
                for (int i = 0; i < names.length; ++i) {
                    m.put(names[i], i);
                int[] newMap = MemoryManager.malloc4(map.length);
                for (int i = 0; i < map.length; ++i) {
                    Integer I = m.get(dom[map[i]]);
                    if (I == null)
                        throw new IllegalArgumentException("unknown predictor name '" + dom[map[i]] + "'");
                    // NOTE(review): the -1 branch is unreachable as shown, because a null I has already thrown above
                    newMap[i] = I == null ? -1 : I;
                map = newMap;
            final int numoff = _srcDinfo.numStart();
            if ((v = beta_constraints.vec("lower_bounds")) != null) {
                _lbs = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, Double.NEGATIVE_INFINITY), map);
                //            for(int i = 0; i < _lbs.length; ++i)
                //            if(_lbs[i] > 0) throw new IllegalArgumentException("lower bounds must be non-positive");
                // NOTE(review): debug output goes straight to stdout, while Log.warn is used further down
                System.out.println("lower bounds = " + Arrays.toString(_lbs));
                if (_srcDinfo._normMul != null) {
                    for (int i = numoff; i < _srcDinfo.fullN(); ++i) {
                        // NOTE(review): the body of this if is missing from the excerpt; presumably infinite bounds are skipped -- confirm
                        if (Double.isInfinite(_lbs[i]))
                        _lbs[i] /= _srcDinfo._normMul[i - numoff];
            System.out.println("lbs = " + Arrays.toString(_lbs));
            if ((v = beta_constraints.vec("upper_bounds")) != null) {
                _ubs = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, Double.POSITIVE_INFINITY), map);
                System.out.println("upper bounds = " + Arrays.toString(_ubs));
                //            if (_ubs[i] < 0) throw new IllegalArgumentException("lower bounds must be non-positive");
                if (_srcDinfo._normMul != null)
                    for (int i = numoff; i < _srcDinfo.fullN(); ++i) {
                        // NOTE(review): the body of this if is missing from the excerpt; presumably infinite bounds are skipped -- confirm
                        if (Double.isInfinite(_ubs[i]))
                        _ubs[i] /= _srcDinfo._normMul[i - numoff];
            System.out.println("ubs = " + Arrays.toString(_ubs));
            if (_lbs != null && _ubs != null) {
                for (int i = 0; i < _lbs.length; ++i) if (_lbs[i] > _ubs[i])
                    throw new IllegalArgumentException("Invalid upper/lower bounds: lower bounds must be <= upper bounds for all variables.");
            if ((v = beta_constraints.vec("beta_given")) != null) {
                _bgs = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, 0), map);
                if (_srcDinfo._normMul != null) {
                    // norm accumulates _bgs[i] * _normSub[i - numoff] (using the values before they are divided
                    // by _normMul); it is subtracted from the last _bgs entry when an intercept is used
                    double norm = 0;
                    for (int i = numoff; i < _srcDinfo.fullN(); ++i) {
                        norm += _bgs[i] * _srcDinfo._normSub[i - numoff];
                        _bgs[i] /= _srcDinfo._normMul[i - numoff];
                    if (_intercept == 1)
                        _bgs[_bgs.length - 1] -= norm;
            // rho is required whenever beta_given values were supplied
            if ((v = beta_constraints.vec("rho")) != null)
                _rho = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, 0), map);
            else if (_bgs != null)
                throw new IllegalArgumentException("Missing vector of penalties (rho) in beta_constraints file.");
            // NOTE(review): Arrays.binarySearch requires its array to be sorted, and cols is not in sorted
            // order, so the lookup result is undefined and valid column names may be reported as unknown.
            String[] cols = new String[] { "names", "rho", "beta_given", "lower_bounds", "upper_bounds" };
            for (String str : beta_constraints.names()) if (Arrays.binarySearch(cols, str) < 0)
                Log.warn("unknown column in beta_constraints file: '" + str + "'");
        if (non_negative) {
            // make sure lb is >= 0
            if (_lbs == null)
                _lbs = new double[_srcDinfo.fullN() + 1];
            // no bounds for intercept
            _lbs[_srcDinfo.fullN()] = Double.NEGATIVE_INFINITY;
            // NOTE(review): as written the loop also visits the intercept slot set just above; its
            // -Infinity is < 0 and is therefore reset to 0 -- confirm whether that is intended.
            for (int i = 0; i < _lbs.length; ++i) if (_lbs[i] < 0)
                _lbs[i] = 0;
    } catch (RuntimeException e) {
        // rethrown unchanged; anything that originally preceded the throw is missing from the excerpt
        throw e;
Also used : DataInfo(hex.FrameTask.DataInfo) Frame(water.fvec.Frame) HashMap(java.util.HashMap) RString(water.util.RString) TransformType(hex.FrameTask.DataInfo.TransformType) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Vec(water.fvec.Vec) HashSet(java.util.HashSet)


