Search in sources:

Example 1 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

From class CoxPH, method init().

@Override
protected void init() {
    // Validate user-supplied arguments, then build the DataInfo and model skeleton.
    super.init();

    // --- column-type and parameter validation ---
    if (start_column != null && !start_column.isInt())
        throw new IllegalArgumentException("start time must be null or of type integer");
    if (!stop_column.isInt())
        throw new IllegalArgumentException("stop time must be of type integer");
    if (!event_column.isInt() && !event_column.isEnum())
        throw new IllegalArgumentException("event must be of type integer or factor");
    // The event column must carry at least two distinct values to be informative.
    final boolean constantIntEvent = event_column.isInt() && event_column.min() == event_column.max();
    final boolean constantEnumEvent = event_column.isEnum() && event_column.cardinality() < 2;
    if (constantIntEvent || constantEnumEvent)
        throw new IllegalArgumentException("event column contains less than two distinct values");
    if (Double.isNaN(lre_min) || lre_min <= 0)
        throw new IllegalArgumentException("lre_min must be a positive number");
    if (iter_max < 1)
        throw new IllegalArgumentException("iter_max must be a positive integer");

    // --- time-axis sanity checks ---
    // With a start column, events can only occur strictly after the earliest start.
    final long minTime = (start_column == null) ? (long) stop_column.min() : (long) start_column.min() + 1;
    final int timeBins = (int) (stop_column.max() - minTime + 1);
    if (timeBins < 1)
        throw new IllegalArgumentException("start times must be strictly less than stop times");
    if (timeBins > MAX_TIME_BINS)
        throw new IllegalArgumentException("number of distinct stop times is " + timeBins + "; maximum number allowed is " + MAX_TIME_BINS);

    // --- DataInfo / model construction ---
    source = getSubframe();
    // stop + event are always responses; weights and start columns add one each when present.
    final int responseCount = 2 + (weights_column != null ? 1 : 0) + (start_column != null ? 1 : 0);
    final DataInfo dinfo = new DataInfo(source, responseCount, false, false, DataInfo.TransformType.DEMEAN);
    model = new CoxPHModel(this, dest(), source._key, source, null);
    model.initStats(source, dinfo);
}
Also used : DataInfo(hex.FrameTask.DataInfo)

Example 2 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

From class CoxPH, method execImpl().

@Override
protected void execImpl() {
    // Fits the Cox proportional-hazards coefficients by Newton-Raphson with
    // step-halving: each pass scores the data via CoxPHTask, and the update is
    // newCoef = oldCoef - step, where step accumulates -(var_coef * gradient).
    final DataInfo dinfo = model.data_info;
    final int n_offsets = (model.parameters.offset_columns == null) ? 0 : model.parameters.offset_columns.length;
    // Offset columns carry no coefficients, so they are excluded from the count.
    final int n_coef = dinfo.fullN() - n_offsets;
    final double[] step = MemoryManager.malloc8d(n_coef);
    final double[] oldCoef = MemoryManager.malloc8d(n_coef);
    final double[] newCoef = MemoryManager.malloc8d(n_coef);
    // NaN-fill step/oldCoef: they are only meaningful after the first accepted iteration.
    Arrays.fill(step, Double.NaN);
    Arrays.fill(oldCoef, Double.NaN);
    // All coefficients start from the user-supplied initial value 'init'.
    for (int j = 0; j < n_coef; ++j) newCoef[j] = init;
    double oldLoglik = -Double.MAX_VALUE;
    final int n_time = (int) (model.max_time - model.min_time + 1);
    final boolean has_start_column = (model.parameters.start_column != null);
    final boolean has_weights_column = (model.parameters.weights_column != null);
    // Note: i runs 0..iter_max inclusive, i.e. at most iter_max + 1 scoring passes.
    for (int i = 0; i <= iter_max; ++i) {
        model.iter = i;
        // One distributed pass over the data accumulating partial-likelihood statistics.
        final CoxPHTask coxMR = new CoxPHTask(self(), dinfo, newCoef, model.min_time, n_time, n_offsets, has_start_column, has_weights_column).doAll(dinfo._adaptedFrame);
        final double newLoglik = model.calcLoglik(coxMR);
        if (newLoglik > oldLoglik) {
            // Likelihood improved: accept the candidate coefficients.
            if (i == 0)
                model.calcCounts(coxMR);
            model.calcModelStats(newCoef, newLoglik);
            model.calcCumhaz_0(coxMR);
            // Log-relative error of the log-likelihood is the convergence criterion.
            if (newLoglik == 0)
                model.lre = -Math.log10(Math.abs(oldLoglik - newLoglik));
            else
                model.lre = -Math.log10(Math.abs((oldLoglik - newLoglik) / newLoglik));
            if (model.lre >= lre_min)
                break;
            // Newton step: step[j] = -(sum_k var_coef[j][k] * gradient[k]),
            // so the update below (oldCoef - step) moves along var_coef * gradient.
            Arrays.fill(step, 0);
            for (int j = 0; j < n_coef; ++j) for (int k = 0; k < n_coef; ++k) step[j] -= model.var_coef[j][k] * model.gradient[k];
            // NOTE(review): this 'break' only exits the inner j-loop, not the outer
            // iteration loop — it looks intended to abort on a NaN/Inf step, but as
            // written it has no effect on the iteration; confirm against upstream.
            for (int j = 0; j < n_coef; ++j) if (Double.isNaN(step[j]) || Double.isInfinite(step[j]))
                break;
            oldLoglik = newLoglik;
            System.arraycopy(newCoef, 0, oldCoef, 0, oldCoef.length);
        } else {
            // Likelihood got worse: halve the previous step (step-halving line search).
            for (int j = 0; j < n_coef; ++j) step[j] /= 2;
        }
        for (int j = 0; j < n_coef; ++j) newCoef[j] = oldCoef[j] - step[j];
    }
    // Persist the fitted model into the distributed key-value store.
    final Futures fs = new Futures();
    DKV.put(dest(), model, fs);
    fs.blockForPending();
}
Also used : DataInfo(hex.FrameTask.DataInfo) Futures(water.Futures)

Example 3 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

From class DeepLearning, method prepareDataInfo().

/**
   * Helper to create a DataInfo object from the source and response.
   * @return DataInfo object wrapping the prepared (adapted) training frame
   */
private DataInfo prepareDataInfo() {
    // Classification may require the response to be adapted to an enum; remember
    // whether that adapted vector must be thrown away afterwards.
    final boolean dropEnumResponse = classification && !response.isEnum();
    // Auto-encoders are unsupervised, so no response column is appended.
    final Frame trainFrame = FrameTask.DataInfo.prepareFrame(source, autoencoder ? null : response, ignored_cols, classification, ignore_const_cols, true);
    final int responseCount = autoencoder ? 0 : 1;
    // Auto-encoders keep all factor levels and NORMALIZE predictors; otherwise
    // factor-level handling follows use_all_factor_levels and predictors are
    // STANDARDIZEd. The response is standardized only for regression.
    final DataInfo.TransformType predictorTransform = autoencoder ? DataInfo.TransformType.NORMALIZE : DataInfo.TransformType.STANDARDIZE;
    final DataInfo.TransformType responseTransform = classification ? DataInfo.TransformType.NONE : DataInfo.TransformType.STANDARDIZE;
    final DataInfo dinfo = new FrameTask.DataInfo(trainFrame, responseCount, true, autoencoder || use_all_factor_levels, predictorTransform, responseTransform);
    if (!autoencoder) {
        // DataInfo convention: the response is the last Vec of the adapted frame.
        final Vec resp = dinfo._adaptedFrame.lastVec();
        // Exactly one of {regression, enum response} must hold.
        assert (!classification ^ resp.isEnum()) : "Must have enum response for classification!";
        if (dropEnumResponse)
            ltrash(resp);
    }
    return dinfo;
}
Also used : DataInfo(hex.FrameTask.DataInfo) MRUtils.sampleFrame(water.util.MRUtils.sampleFrame) Frame(water.fvec.Frame) Vec(water.fvec.Vec)

Example 4 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

From class GramMatrixTest, method testProstate().

@Test
public void testProstate() {
    // Verifies the Gram matrix (X'X) produced by GramTask against precomputed
    // expected values, both with and without the intercept column.
    File f2 = find_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
    Key ikey2 = NFSFileVec.make(f2);
    Key okey2 = Key.make("glm_model2");
    Frame fr2 = null;
    try {
        fr2 = ParseDataset2.parse(okey2, new Key[] { ikey2 });
        DataInfo dinfo = new DataInfo(fr2, 0, true, false, DataInfo.TransformType.NONE);
        // Pass 1: Gram with intercept — full matrix must match exp_result.
        GramTask gt = new GramTask(null, dinfo, true, false);
        gt.doAll(dinfo._adaptedFrame);
        double[][] res = gt._gram.getXX();
        System.out.println(Utils.pprint(gt._gram.getXX()));
        for (int i = 0; i < exp_result.length; ++i) for (int j = 0; j < exp_result.length; ++j) assertEquals(exp_result[i][j], gt._nobs * res[i][j], 1e-5);
        // Pass 2: Gram without intercept — compare against the leading
        // (n-1)x(n-1) sub-block of the expected matrix.
        gt = new GramTask(null, dinfo, false, false);
        gt.doAll(dinfo._adaptedFrame);
        // BUG FIX: refresh 'res' from the second task; previously the stale
        // with-intercept matrix was re-checked, so this pass asserted nothing new.
        res = gt._gram.getXX();
        for (int i = 0; i < exp_result.length - 1; ++i) for (int j = 0; j < exp_result.length - 1; ++j) assertEquals(exp_result[i][j], gt._nobs * res[i][j], 1e-5);
    } finally {
        // Guard against NPE when parsing failed before fr2 was assigned.
        if (fr2 != null)
            fr2.delete();
    }
}
Also used : DataInfo(hex.FrameTask.DataInfo) GramTask(hex.gram.Gram.GramTask) File(java.io.File) Test(org.junit.Test)

Example 5 with DataInfo

Use of hex.FrameTask.DataInfo in project h2o-2 by h2oai.

From class GLM2, method init().

@Override
public void init() {
    // Validates all GLM arguments, prepares the training frame (optionally with an
    // un-standardized offset column placed just before the response), builds the
    // DataInfo, and decodes the optional beta_constraints frame into per-coefficient
    // lower/upper bounds, proximal targets (beta_given) and penalties (rho).
    try {
        super.init();
        if (family == Family.gamma)
            setHighAccuracy();
        // Resolve the family's default link when none was chosen explicitly.
        if (link == Link.family_default)
            link = family.defaultLink;
        _intercept = intercept ? 1 : 0;
        // TODO
        tweedie_link_power = 1 - tweedie_variance_power;
        if (tweedie_link_power == 0)
            link = Link.log;
        _glm = new GLMParams(family, tweedie_variance_power, link, tweedie_link_power);
        // Work on a shallow copy so column removals don't mutate the user's frame.
        source2 = new Frame(source);
        assert sorted(ignored_cols);
        source2.remove(ignored_cols);
        if (offset != null)
            // remove offset and add it later explicitly (so that it does not interfere with DataInfo.prepareFrame)
            source2.remove(source2.find(offset));
        if (nlambdas == -1)
            nlambdas = 100;
        if (lambda_search && lambda.length > 1)
            throw new IllegalArgumentException("Can not supply both lambda_search and multiple lambdas. If lambda_search is on, GLM expects only one value of lambda_value, representing the lambda_value min (smallest lambda_value in the lambda_value search).");
        // check the response
        if (response.isEnum() && family != Family.binomial)
            throw new IllegalArgumentException("Invalid response variable, trying to run regression with categorical response!");
        // Family-specific response-range validation.
        switch(family) {
            case poisson:
            case tweedie:
                if (response.min() < 0)
                    throw new IllegalArgumentException("Illegal response column for family='" + family + "', response must be >= 0.");
                break;
            case gamma:
                if (response.min() <= 0)
                    throw new IllegalArgumentException("Invalid response for family='Gamma', response must be > 0!");
                break;
            case binomial:
                if (response.min() < 0 || response.max() > 1)
                    throw new IllegalArgumentException("Illegal response column for family='Binomial', response must in <0,1> range!");
                break;
            default:
        }
        // NOTE(review): the binomial case above already throws when min < 0 or
        // max > 1, which seems to make this condition always false for a
        // non-enum response — confirm intent against the upstream source.
        toEnum = family == Family.binomial && (!response.isEnum() && (response.min() < 0 || response.max() > 1));
        if (source2.numCols() <= 1 && !intercept)
            throw new IllegalArgumentException("There are no predictors left after ignoring constant columns in the dataset and no intercept => No parameters to estimate.");
        Frame fr = DataInfo.prepareFrame(source2, response, new int[0], toEnum, true, true);
        if (offset != null) {
            // now put the offset just in front of response
            int id = source.find(offset);
            String name = source.names()[id];
            String responseName = fr.names()[fr.numCols() - 1];
            Vec responseVec = fr.remove(fr.numCols() - 1);
            fr.add(name, offset);
            fr.add(responseName, responseVec);
            _noffsets = 1;
        }
        // Predictor standardization: full standardize with intercept, descale without.
        TransformType dt = TransformType.NONE;
        if (standardize)
            dt = intercept ? TransformType.STANDARDIZE : TransformType.DESCALE;
        _srcDinfo = new DataInfo(fr, 1, intercept, use_all_factor_levels || lambda_search, dt, DataInfo.TransformType.NONE);
        if (offset != null && dt != TransformType.NONE) {
            // do not standardize offset
            if (_srcDinfo._normMul != null)
                _srcDinfo._normMul[_srcDinfo._normMul.length - 1] = 1;
            if (_srcDinfo._normSub != null)
                _srcDinfo._normSub[_srcDinfo._normSub.length - 1] = 0;
        }
        if (!intercept && _srcDinfo._cats > 0)
            throw new IllegalArgumentException("Models with no intercept are only supported with all-numeric predictors.");
        _activeData = _srcDinfo;
        if (higher_accuracy)
            setHighAccuracy();
        if (beta_constraints != null) {
            // Decode the beta_constraints frame; rows are keyed by predictor name.
            Vec v = beta_constraints.vec("names");
            if (v == null)
                throw new IllegalArgumentException("Invalid beta constraints file, missing column with predictor names");
            // for now only enums allowed here
            String[] dom = v.domain();
            String[] names = Utils.append(_srcDinfo.coefNames(), "Intercept");
            int[] map = Utils.asInts(v);
            // Reject duplicate constraint rows for the same predictor.
            HashSet<Integer> s = new HashSet<Integer>();
            for (int i : map) if (!s.add(i))
                throw new IllegalArgumentException("Invalid beta constraints file, got duplicate constraints for '" + dom[i] + "'");
            if (!Arrays.deepEquals(dom, names)) {
                // need mapping from constraint-file row order to coefficient order
                HashMap<String, Integer> m = new HashMap<String, Integer>();
                for (int i = 0; i < names.length; ++i) {
                    m.put(names[i], i);
                }
                int[] newMap = MemoryManager.malloc4(map.length);
                for (int i = 0; i < map.length; ++i) {
                    Integer I = m.get(dom[map[i]]);
                    if (I == null)
                        throw new IllegalArgumentException("unknown predictor name '" + dom[map[i]] + "'");
                    // NOTE(review): the null branch here is dead code — the throw
                    // above guarantees I != null at this point.
                    newMap[i] = I == null ? -1 : I;
                }
                map = newMap;
            }
            final int numoff = _srcDinfo.numStart();
            // Lower bounds (missing entries default to -Inf); rescale numeric
            // entries into the standardized coefficient space.
            if ((v = beta_constraints.vec("lower_bounds")) != null) {
                _lbs = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, Double.NEGATIVE_INFINITY), map);
                //            for(int i = 0; i < _lbs.length; ++i)
                //            if(_lbs[i] > 0) throw new IllegalArgumentException("lower bounds must be non-positive");
                System.out.println("lower bounds = " + Arrays.toString(_lbs));
                if (_srcDinfo._normMul != null) {
                    for (int i = numoff; i < _srcDinfo.fullN(); ++i) {
                        if (Double.isInfinite(_lbs[i]))
                            continue;
                        _lbs[i] /= _srcDinfo._normMul[i - numoff];
                    }
                }
            }
            System.out.println("lbs = " + Arrays.toString(_lbs));
            // Upper bounds (missing entries default to +Inf); same rescaling as above.
            if ((v = beta_constraints.vec("upper_bounds")) != null) {
                _ubs = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, Double.POSITIVE_INFINITY), map);
                System.out.println("upper bounds = " + Arrays.toString(_ubs));
                //            if (_ubs[i] < 0) throw new IllegalArgumentException("lower bounds must be non-positive");
                if (_srcDinfo._normMul != null)
                    for (int i = numoff; i < _srcDinfo.fullN(); ++i) {
                        if (Double.isInfinite(_ubs[i]))
                            continue;
                        _ubs[i] /= _srcDinfo._normMul[i - numoff];
                    }
            }
            System.out.println("ubs = " + Arrays.toString(_ubs));
            if (_lbs != null && _ubs != null) {
                for (int i = 0; i < _lbs.length; ++i) if (_lbs[i] > _ubs[i])
                    throw new IllegalArgumentException("Invalid upper/lower bounds: lower bounds must be <= upper bounds for all variables.");
            }
            // Proximal-penalty targets; rescale and fold the mean shift into the intercept.
            if ((v = beta_constraints.vec("beta_given")) != null) {
                _bgs = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, 0), map);
                if (_srcDinfo._normMul != null) {
                    double norm = 0;
                    for (int i = numoff; i < _srcDinfo.fullN(); ++i) {
                        norm += _bgs[i] * _srcDinfo._normSub[i - numoff];
                        _bgs[i] /= _srcDinfo._normMul[i - numoff];
                    }
                    if (_intercept == 1)
                        _bgs[_bgs.length - 1] -= norm;
                }
            }
            if ((v = beta_constraints.vec("rho")) != null)
                _rho = map == null ? Utils.asDoubles(v) : mapVec(Utils.asDoubles(v), makeAry(names.length, 0), map);
            else if (_bgs != null)
                throw new IllegalArgumentException("Missing vector of penalties (rho) in beta_constraints file.");
            // Warn about any unrecognized columns in the constraints frame.
            String[] cols = new String[] { "names", "rho", "beta_given", "lower_bounds", "upper_bounds" };
            Arrays.sort(cols);
            for (String str : beta_constraints.names()) if (Arrays.binarySearch(cols, str) < 0)
                Log.warn("unknown column in beta_constraints file: '" + str + "'");
        }
        if (non_negative) {
            // make sure the lower bound is >= 0 for every coefficient
            if (_lbs == null)
                _lbs = new double[_srcDinfo.fullN() + 1];
            // no bounds for intercept
            _lbs[_srcDinfo.fullN()] = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < _lbs.length; ++i) if (_lbs[i] < 0)
                _lbs[i] = 0;
        }
    } catch (RuntimeException e) {
        // Surface init failures after cleaning up any partially-created resources.
        e.printStackTrace();
        cleanup();
        throw e;
    }
}
Also used : DataInfo(hex.FrameTask.DataInfo) Frame(water.fvec.Frame) HashMap(java.util.HashMap) RString(water.util.RString) TransformType(hex.FrameTask.DataInfo.TransformType) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Vec(water.fvec.Vec) HashSet(java.util.HashSet)

Aggregations

DataInfo (hex.FrameTask.DataInfo)12 Frame (water.fvec.Frame)5 Vec (water.fvec.Vec)5 GramTask (hex.gram.Gram.GramTask)3 Test (org.junit.Test)3 RString (water.util.RString)2 TransformType (hex.FrameTask.DataInfo.TransformType)1 Source (hex.glm.GLM2.Source)1 GLMIterationTask (hex.glm.GLMTask.GLMIterationTask)1 File (java.io.File)1 Field (java.lang.reflect.Field)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 Futures (water.Futures)1 NFSFileVec (water.fvec.NFSFileVec)1 RebalanceDataSet (water.fvec.RebalanceDataSet)1 MRUtils.sampleFrame (water.util.MRUtils.sampleFrame)1