Search in sources :

Example 11 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class TimeSeriesTests method testIsax.

/**
 * Runs a two-step Rapids pipeline ({@code cumsum} followed by {@code isax}) and checks
 * the string produced in row 0 of the first column of the result against a fixed value.
 *
 * NOTE(review): {@code f}, {@code fr1} and {@code fr2} are not declared in this excerpt;
 * presumably they are fields of the test class set up / cleaned up elsewhere - confirm.
 * NOTE(review): the closing brace of the method is missing from this excerpt.
 */
public void testIsax() {
    // Step 1: apply the "cumsum" Rapids expression to frame f (second argument is the literal 1).
    Val res1 = Rapids.exec("(cumsum " + f._key + " 1)");
    fr1 = res1.getFrame();
    // Step 2: iSAX over the cumsum output.
    // Arguments, in order: 10 words, 10 max cardinality, 0 optimize card
    Val res2 = Rapids.exec("(isax " + fr1._key + " 10 10 0)");
    fr2 = res2.getFrame();
    // Expected value has ten '_'-separated tokens, each of the form <symbol>^10.
    String expected = "0^10_0^10_0^10_0^10_5^10_7^10_8^10_9^10_9^10_8^10";
    // Read the string stored in row 0 of column 0 of the isax result.
    final String actual = fr2.vec(0).atStr(new BufferedString(), 0).toString();
    Assert.assertEquals(expected, actual);
Also used : Val(water.rapids.Val) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString) Test(org.junit.Test)

Example 12 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class GLMTest method testBounds.

// Leaks xval keys
//  @Test public void testXval() {
//    GLMModel model = null;
//    Frame fr = parse_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
//    Frame score = null;
//    try{
//      Scope.enter();
//      // R results
////      Coefficients:
////        (Intercept)           ID          AGE       RACER2       RACER3        DPROS        DCAPS          PSA          VOL      GLEASON
////          -8.894088     0.001588    -0.009589     0.231777    -0.459937     0.556231     0.556395     0.027854    -0.011355     1.010179
//      String [] cfs1 = new String [] {"Intercept","AGE", "RACE.R2","RACE.R3", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"};
//      double [] vals = new double [] {-8.14867, -0.01368, 0.32337, -0.38028, 0.55964, 0.49548, 0.02794, -0.01104, 0.97704};
//      GLMParameters params = new GLMParameters(Family.binomial);
//      params._n_folds = 10;
//      params._response_column = "CAPSULE";
//      params._ignored_columns = new String[]{"ID"};
//      params._train = fr._key;
//      params._lambda = new double[]{0};
//      model = new GLM(params,Key.make("prostate_model")).trainModel().get();
//      HashMap<String, Double> coefs = model.coefficients();
//      for(int i = 0; i < cfs1.length; ++i)
//        assertEquals(vals[i], coefs.get(cfs1[i]),1e-4);
//      GLMValidation val = model.trainVal();
////      assertEquals(512.3, val.nullDeviance(),1e-1);
////      assertEquals(378.3, val.residualDeviance(),1e-1);
////      assertEquals(396.3, val.AIC(),1e-1);
////      score = model.score(fr);
////      hex.ModelMetrics mm = hex.ModelMetrics.getFromDKV(model,fr);
////      AUCData adata = mm._aucdata;
////      assertEquals(val.auc(),adata.AUC(),1e-2);
////      GLMValidation val2 = new GLMValidationTsk(params,model._ymu,rank(model.beta())).doAll(new Vec[]{fr.vec("CAPSULE"),score.vec("1")})._val;
////      assertEquals(val.residualDeviance(),val2.residualDeviance(),1e-6);
////      assertEquals(val.nullDeviance(),val2.nullDeviance(),1e-6);
//    } finally {
//      fr.delete();
//      if(model != null)model.delete();
//      if(score != null)score.delete();
//      Scope.exit();
//    }
//  }
  /**
   * Test bounds on the prostate dataset, 2 cases:
   * 1) test against a known glmnet result (with elastic net regularization),
   * 2) test with no regularization, check the ginfo in the end.
   */
public void testBounds() {
    // Trains a binomial GLM on the prostate data twice under per-coefficient bounds:
    // first with alpha = 1 and a small lambda (deviances are checked), then with
    // alpha = 0 and lambda = 0 (the gradient of every coefficient that is not sitting
    // on one of its bounds is checked to be close to zero).
    // NOTE(review): several closing braces and the body of the finally block are
    // missing from this excerpt.
    //
    //    glmnet's result:
    //    res2 <- glmnet(x=M,y=D$CAPSULE,lower.limits=-.5,upper.limits=.5,family='binomial')
    //    res2$beta[,58]
    //    AGE        RACE          DPROS       PSA         VOL         GLEASON
    //    -0.00616326 -0.50000000  0.50000000  0.03628192 -0.01249324  0.50000000 //    res2$a0[100]
    //    res2$a0[58]
    //    s57
    //    -4.155864
    //    lambda = 0.001108, null dev =  512.2888, res dev = 379.7597
    GLMModel model = null;
    Key parsed = Key.make("prostate_parsed");
    Key modelKey = Key.make("prostate_model");
    Frame fr = parse_test_file(parsed, "smalldata/logreg/prostate.csv");
    Key betaConsKey = Key.make("beta_constraints");
    // cfs1 / vals are only referenced by the commented-out coefficient check inside the try block.
    String[] cfs1 = new String[] { "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept" };
    double[] vals = new double[] { -0.006502588, -0.500000000, 0.500000000, 0.400000000, 0.034826559, -0.011661747, 0.500000000, -4.564024 };
    //    [AGE, RACE, DPROS, DCAPS, PSA, VOL, GLEASON, Intercept]
    // The bounds are supplied as a small CSV (columns: names, lower_bounds, upper_bounds)
    // written under betaConsKey and then parsed into a frame.
    FVecTest.makeByteVec(betaConsKey, "names, lower_bounds, upper_bounds\n AGE, -.5, .5\n RACE, -.5, .5\n DCAPS, -.4, .4\n DPROS, -.5, .5 \nPSA, -.5, .5\n VOL, -.5, .5\nGLEASON, -.5, .5");
    Frame betaConstraints = ParseDataset.parse(Key.make("beta_constraints.hex"), betaConsKey);
    try {
        // Case 1: alpha = 1, lambda = 0.001607, bounds on all seven predictors.
        // H2O differs on intercept and race, same residual deviance though
        GLMParameters params = new GLMParameters();
        params._standardize = true;
        params._family = Family.binomial;
        params._beta_constraints = betaConstraints._key;
        params._response_column = "CAPSULE";
        params._ignored_columns = new String[] { "ID" };
        params._train = fr._key;
        params._objective_epsilon = 0;
        params._alpha = new double[] { 1 };
        params._lambda = new double[] { 0.001607 };
        params._obj_reg = 1.0 / 380;
        GLM glm = new GLM(params, modelKey);
        model = glm.trainModel().get();
        //      Map<String, Double> coefs =  model.coefficients();
        //      for (int i = 0; i < cfs1.length; ++i)
        //        assertEquals(vals[i], coefs.get(cfs1[i]), 1e-1);
        ModelMetricsBinomialGLM val = (ModelMetricsBinomialGLM) model._output._training_metrics;
        // Null deviance must match the glmnet value quoted above (within 0.1).
        assertEquals(512.2888, val._nullDev, 1e-1);
        // 388.4952716196743
        assertTrue(val._resDev <= 388.5);
        // Case 2: lambda = 0 and alpha = 0, with a constraints CSV that no longer lists AGE and GLEASON.
        params._lambda = new double[] { 0 };
        params._alpha = new double[] { 0 };
        FVecTest.makeByteVec(betaConsKey, "names, lower_bounds, upper_bounds\n RACE, -.5, .5\n DCAPS, -.4, .4\n DPROS, -.5, .5 \nPSA, -.5, .5\n VOL, -.5, .5");
        // NOTE(review): params._beta_constraints is not reassigned here; this relies on
        // Key.make("beta_constraints.hex") resolving to the same key as in the first parse - confirm.
        betaConstraints = ParseDataset.parse(Key.make("beta_constraints.hex"), betaConsKey);
        glm = new GLM(params, modelKey);
        model = glm.trainModel().get();
        double[] beta = model.beta();
        System.out.println("beta = " + Arrays.toString(beta));
        // Remove CAPSULE and add it back, then republish the modified frame in the DKV.
        // NOTE(review): presumably this moves the response to the last column for DataInfo - confirm.
        fr.add("CAPSULE", fr.remove("CAPSULE"));
        DKV.put(fr._key, fr);
        // now check the ginfo
        DataInfo dinfo = new DataInfo(fr, null, 1, true, TransformType.NONE, DataInfo.TransformType.NONE, true, false, false, false, false, false);
        GLMGradientTask lt = new GLMBinomialGradientTask(null, dinfo, params, 0, beta).doAll(dinfo._adaptedFrame);
        double[] grad = lt._gradient;
        String[] names = model.dinfo().coefNames();
        // Reusable buffer for reading the "names" column of the constraints frame.
        BufferedString tmpStr = new BufferedString();
        // For each coefficient: look up its row in the constraints frame by name; if the
        // coefficient lies within 1e-4 of its lower or upper bound, skip it (continue outer).
        // Otherwise its gradient entry is asserted to be 0 within 1e-2.
        outer: for (int i = 0; i < names.length; ++i) {
            for (int j = 0; j < betaConstraints.numRows(); ++j) {
                if (betaConstraints.vec("names").atStr(tmpStr, j).toString().equals(names[i])) {
                    if (Math.abs(beta[i] - betaConstraints.vec("lower_bounds").at(j)) < 1e-4 || Math.abs(beta[i] - betaConstraints.vec("upper_bounds").at(j)) < 1e-4) {
                        continue outer;
            assertEquals(0, grad[i], 1e-2);
    } finally {
        // NOTE(review): the cleanup statements that follow are cut off in this excerpt.
        if (model != null)
Also used : BufferedString(water.parser.BufferedString) GLMParameters(hex.glm.GLMModel.GLMParameters) BufferedString(water.parser.BufferedString)

Example 13 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class ShuffleSplitFrameTest method testShuffleSplitWithMultipleColumns.

public /* this test makes sure that the rows of the split frames are preserved (including UUID) */
void testShuffleSplitWithMultipleColumns() {
    // Three chunks holding 2, 2 and 3 rows; the second chunk starts with a null string.
    long[] chunkLayout = ar(2L, 2L, 3L);
    String[][] data = ar(ar("1", "2"), ar(null, "3"), ar("4", "5", "6"));
    Frame f = null;
    Frame tmpFrm = createFrame("test1.hex", chunkLayout, data);
    try {
        // Build a 4-column frame (STR, NUM, NUM, UUID) from the single string column.
        // NOTE(review): only the write to output column 3 is visible in this excerpt; the
        // writes to columns 0-2 and the closing braces appear to have been dropped.
        f = new MRTask() {

            public void map(Chunk[] cs, NewChunk[] ncs) {
                for (int i = 0; i < cs[0]._len; i++) {
                    BufferedString bs = cs[0].atStr(new BufferedString(), i);
                    // A null string is mapped to 0, otherwise the string is parsed as an int.
                    int val = bs == null ? 0 : Integer.parseInt(bs.toString());
                    // UUID column: called with the in-chunk row index i and the parsed value.
                    ncs[3].addUUID(i, val);
        }.doAll(new byte[] { Vec.T_STR, Vec.T_NUM, Vec.T_NUM, Vec.T_UUID }, tmpFrm).outputFrame();
    } finally {
    // NOTE(review): the body of the finally block is missing from this excerpt.
    // The checker task below is handed to testScenario along with the frame and the flattened data.
    testScenario(f, flat(data), new MRTask() {

        public void map(Chunk[] cs) {
            for (int i = 0; i < cs[0]._len; i++) {
                BufferedString bs = cs[0].atStr(new BufferedString(), i);
                // Recompute the value from column 0 exactly as the builder task did.
                int expectedVal = bs == null ? 0 : Integer.parseInt(bs.toString());
                int expectedIndex = (int) cs[2].atd(i);
                // Column 1 must hold the parsed value; the UUID's at16l() must equal column 2
                // and its at16h() must equal the parsed value.
                Assert.assertEquals((double) expectedVal, cs[1].atd(i), 0.00001);
                Assert.assertEquals(expectedIndex, (int) cs[3].at16l(i));
                Assert.assertEquals(expectedVal, (int) cs[3].at16h(i));
Also used : FrameTestUtil.createFrame(water.fvec.FrameTestUtil.createFrame) Frame(water.fvec.Frame) MRTask(water.MRTask) BufferedString(water.parser.BufferedString) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) NewChunk(water.fvec.NewChunk) Test(org.junit.Test)

Example 14 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class AstRectangleAssign method assign_frame_frame.

/**
 * Rectangular array copy from src into dst.
 *
 * NOTE(review): this excerpt has lost lines (closing braces, and apparently the
 * statements that end the whole-column and small-case branches), so the three cases
 * below read as if they fall through into each other; consult the original source.
 *
 * @param dst  frame being assigned into
 * @param cols indices of the dst columns to overwrite; must have one entry per src column
 * @param rows list of dst row numbers to overwrite; its count must equal src.numRows()
 * @param src  frame supplying the values
 * @param ses  session whose copyOnWrite(dst, cols) supplies the vecs that get updated
 * @throws IllegalArgumentException if the column counts differ, the row counts differ,
 *         a column pair has different types, or a categorical pair has different domains
 */
private void assign_frame_frame(Frame dst, int[] cols, AstNumList rows, Frame src, Session ses) {
    // Sanity check
    if (cols.length != src.numCols())
        throw new IllegalArgumentException("Source and destination frames must have the same count of columns");
    long nrows = rows.cnt();
    if (src.numRows() != nrows)
        throw new IllegalArgumentException("Requires same count of rows in the number-list (" + nrows + ") as in the source (" + src.numRows() + ")");
    // Whole-column case: the row list is dense and covers every row of dst, so the
    // source vecs are swapped into dst directly.
    // optimization happens here on the apply() exit.
    if (dst.numRows() == nrows && rows.isDense()) {
        for (int i = 0; i < cols.length; i++) dst.replace(cols[i], src.vecs()[i]);
        // NOTE(review): the statement guarded by this if is missing from the excerpt.
        if (dst._key != null)
    // Partial update; needs to preserve type, and may need to copy to support
    // copy-on-write
    Vec[] dvecs = dst.vecs();
    final Vec[] svecs = src.vecs();
    // Every source column must match the type of its target column; categorical
    // columns must additionally have identical domains.
    for (int col = 0; col < cols.length; col++) {
        int dtype = dvecs[cols[col]].get_type();
        if (dtype != svecs[col].get_type())
            throw new IllegalArgumentException("Columns must be the same type; " + "column " + col + ", \'" + dst._names[cols[col]] + "\', is of type " + dvecs[cols[col]].get_type_str() + " and the source is " + svecs[col].get_type_str());
        if ((dtype == Vec.T_CAT) && (!Arrays.equals(dvecs[cols[col]].domain(), svecs[col].domain())))
            throw new IllegalArgumentException("Cannot assign to a categorical column with a different domain; " + "source column " + src._names[col] + ", target column " + dst._names[cols[col]]);
    // Handle fast small case
    if (nrows <= 1 || (cols.length * nrows) <= 1000) {
        // Go parallel for more than 1000 random updates
        // Copy dst columns as-needed to allow update-in-place
        // Update dst columns
        dvecs = ses.copyOnWrite(dst, cols);
        // Just these rows
        long[] rownums = rows.expand8();
        // svecs.length equals cols.length here (enforced by the first sanity check).
        // String columns are read through one reusable BufferedString and written as
        // String (null stays null); every other column is copied via at()/set().
        for (int col = 0; col < svecs.length; col++) if (svecs[col].get_type() == Vec.T_STR) {
            BufferedString bStr = new BufferedString();
            for (int ridx = 0; ridx < rownums.length; ridx++) {
                BufferedString s = svecs[col].atStr(bStr, ridx);
                dvecs[cols[col]].set(rownums[ridx], s != null ? s.toString() : null);
        } else {
            for (int ridx = 0; ridx < rownums.length; ridx++) dvecs[cols[col]].set(rownums[ridx], svecs[col].at(ridx));
    // Handle large case
    Vec[] vecs = ses.copyOnWrite(dst, cols);
    // Just the selected columns get updated
    Vec[] vecs2 = new Vec[cols.length];
    for (int i = 0; i < cols.length; i++) vecs2[i] = vecs[cols[i]];
    // Side-effect internal sort; needed for fast row lookup
    new AssignFrameFrameTask(rows, svecs).doAll(vecs2);
Also used : Vec(water.fvec.Vec) BufferedString(water.parser.BufferedString)

Example 15 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class CStrChunkTest method test_sparse.

public void test_sparse() {
    NewChunk nc = new NewChunk(null, 0);
    for (int i = 0; i < 100; i++) nc.addNA();
    nc.addStr(new BufferedString("foo"));
    nc.addStr(new BufferedString("bar"));
    Chunk c = nc.compress();
    Assert.assertTrue("first 100 entries are NA", c.isNA(0) && c.isNA(99));
    Assert.assertTrue("Sparse string has values", c.atStr(new BufferedString(), 100).sameString("foo"));
    Assert.assertTrue("NA", c.isNA(101));
    final BufferedString bufferedString = c.atStr(new BufferedString(), 102);
    Assert.assertTrue("Sparse string has values: expected `bar`, got " + bufferedString, bufferedString.sameString("bar"));
Also used : BufferedString(water.parser.BufferedString)


BufferedString (water.parser.BufferedString)43 Frame (water.fvec.Frame)12 Test (org.junit.Test)9 MRTask (water.MRTask)8 Vec (water.fvec.Vec)8 Chunk (water.fvec.Chunk)7 NewChunk (water.fvec.NewChunk)6 ValFrame (water.rapids.vals.ValFrame)5 IcedLong (water.util.IcedLong)5 IOException ( ByteBuffer (java.nio.ByteBuffer)2 Random (java.util.Random)2 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)2 TestFrameBuilder (water.fvec.TestFrameBuilder)2 BackendModel (deepwater.backends.BackendModel)1 BackendParams (deepwater.backends.BackendParams)1 RuntimeOptions (deepwater.backends.RuntimeOptions)1 ImageDataSet (deepwater.datasets.ImageDataSet)1 GenModel (hex.genmodel.GenModel)1 EasyPredictModelWrapper (hex.genmodel.easy.EasyPredictModelWrapper)1