use of water.parser.BufferedString in project h2o-3 by h2oai.
the class TimeSeriesTests method testIsax.
@Test
public void testIsax() {
  // cumulative sum via Rapids; the result feeds the isax call below
  Val res1 = Rapids.exec("(cumsum " + f._key + " 1)");
  fr1 = res1.getFrame();
  DKV.put(fr1);
  // 10 words, 10 max cardinality, 0 = don't optimize cardinality
  Val res2 = Rapids.exec("(isax " + fr1._key + " 10 10 0)");
  fr2 = res2.getFrame();
  String expected = "0^10_0^10_0^10_0^10_5^10_7^10_8^10_9^10_9^10_8^10";
  final String actual = fr2.vec(0).atStr(new BufferedString(), 0).toString();
  Assert.assertEquals(expected, actual);
}
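The last two lines show the core BufferedString idiom: hand a reusable buffer to Vec.atStr(...) and call toString() only when a real Java String is needed. A minimal, hedged sketch of that idiom as a standalone helper (class and method names are illustrative, and a running H2O cluster with the frame already in the DKV is assumed):

import water.fvec.Frame;
import water.fvec.Vec;
import water.parser.BufferedString;

public class StringCellReader {
  /** Returns row `row` of string column `col` as a Java String, or null when the cell is NA. */
  public static String stringAt(Frame fr, int col, long row) {
    Vec v = fr.vec(col);
    BufferedString bs = v.atStr(new BufferedString(), row); // atStr fills the buffer; null signals NA
    return bs == null ? null : bs.toString();               // materialize a String only when needed
  }
}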
use of water.parser.BufferedString in project h2o-3 by h2oai.
the class GLMTest method testBounds.
// Leaks xval keys
// @Test public void testXval() {
//   GLMModel model = null;
//   Frame fr = parse_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
//   Frame score = null;
//   try {
//     Scope.enter();
//     // R results
////     Coefficients:
////     (Intercept)        ID       AGE    RACER2    RACER3     DPROS     DCAPS       PSA       VOL   GLEASON
////       -8.894088  0.001588 -0.009589  0.231777 -0.459937  0.556231  0.556395  0.027854 -0.011355  1.010179
//     String[] cfs1 = new String[] {"Intercept", "AGE", "RACE.R2", "RACE.R3", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"};
//     double[] vals = new double[] {-8.14867, -0.01368, 0.32337, -0.38028, 0.55964, 0.49548, 0.02794, -0.01104, 0.97704};
//     GLMParameters params = new GLMParameters(Family.binomial);
//     params._n_folds = 10;
//     params._response_column = "CAPSULE";
//     params._ignored_columns = new String[]{"ID"};
//     params._train = fr._key;
//     params._lambda = new double[]{0};
//     model = new GLM(params, Key.make("prostate_model")).trainModel().get();
//     HashMap<String, Double> coefs = model.coefficients();
//     for (int i = 0; i < cfs1.length; ++i)
//       assertEquals(vals[i], coefs.get(cfs1[i]), 1e-4);
//     GLMValidation val = model.trainVal();
////     assertEquals(512.3, val.nullDeviance(), 1e-1);
////     assertEquals(378.3, val.residualDeviance(), 1e-1);
////     assertEquals(396.3, val.AIC(), 1e-1);
////     score = model.score(fr);
////
////     hex.ModelMetrics mm = hex.ModelMetrics.getFromDKV(model, fr);
////
////     AUCData adata = mm._aucdata;
////     assertEquals(val.auc(), adata.AUC(), 1e-2);
////     GLMValidation val2 = new GLMValidationTsk(params, model._ymu, rank(model.beta())).doAll(new Vec[]{fr.vec("CAPSULE"), score.vec("1")})._val;
////     assertEquals(val.residualDeviance(), val2.residualDeviance(), 1e-6);
////     assertEquals(val.nullDeviance(), val2.nullDeviance(), 1e-6);
//   } finally {
//     fr.delete();
//     if (model != null) model.delete();
//     if (score != null) score.delete();
//     Scope.exit();
//   }
// }
/**
 * Test bounds on the prostate dataset, 2 cases:
 *   1) test against a known glmnet result with elastic net penalty,
 *   2) test with no regularization and check the gradient (ginfo) at the end.
 */
@Test
public void testBounds() {
  // glmnet's result:
  //   res2 <- glmnet(x=M,y=D$CAPSULE,lower.limits=-.5,upper.limits=.5,family='binomial')
  //   res2$beta[,58]
  //           AGE        RACE       DPROS         PSA         VOL     GLEASON
  //   -0.00616326 -0.50000000  0.50000000  0.03628192 -0.01249324  0.50000000
  //   res2$a0[58]
  //          s57
  //    -4.155864
  //   lambda = 0.001108, null dev = 512.2888, res dev = 379.7597
  GLMModel model = null;
  Key parsed = Key.make("prostate_parsed");
  Key modelKey = Key.make("prostate_model");
  Frame fr = parse_test_file(parsed, "smalldata/logreg/prostate.csv");
  Key betaConsKey = Key.make("beta_constraints");
  String[] cfs1 = new String[] {"AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"};
  double[] vals = new double[] {-0.006502588, -0.500000000, 0.500000000, 0.400000000, 0.034826559, -0.011661747, 0.500000000, -4.564024};
  // [AGE, RACE, DPROS, DCAPS, PSA, VOL, GLEASON, Intercept]
  FVecTest.makeByteVec(betaConsKey, "names, lower_bounds, upper_bounds\n AGE, -.5, .5\n RACE, -.5, .5\n DCAPS, -.4, .4\n DPROS, -.5, .5 \nPSA, -.5, .5\n VOL, -.5, .5\nGLEASON, -.5, .5");
  Frame betaConstraints = ParseDataset.parse(Key.make("beta_constraints.hex"), betaConsKey);
  try {
    // H2O differs on intercept and race, same residual deviance though
    GLMParameters params = new GLMParameters();
    params._standardize = true;
    params._family = Family.binomial;
    params._beta_constraints = betaConstraints._key;
    params._response_column = "CAPSULE";
    params._ignored_columns = new String[] {"ID"};
    params._train = fr._key;
    params._objective_epsilon = 0;
    params._alpha = new double[] {1};
    params._lambda = new double[] {0.001607};
    params._obj_reg = 1.0 / 380;
    GLM glm = new GLM(params, modelKey);
    model = glm.trainModel().get();
    assertTrue(glm.isStopped());
    // Map<String, Double> coefs = model.coefficients();
    // for (int i = 0; i < cfs1.length; ++i)
    //   assertEquals(vals[i], coefs.get(cfs1[i]), 1e-1);
    ModelMetricsBinomialGLM val = (ModelMetricsBinomialGLM) model._output._training_metrics;
    assertEquals(512.2888, val._nullDev, 1e-1);
    // 388.4952716196743
    assertTrue(val._resDev <= 388.5);
    model.delete();
    params._lambda = new double[] {0};
    params._alpha = new double[] {0};
    FVecTest.makeByteVec(betaConsKey, "names, lower_bounds, upper_bounds\n RACE, -.5, .5\n DCAPS, -.4, .4\n DPROS, -.5, .5 \nPSA, -.5, .5\n VOL, -.5, .5");
    betaConstraints = ParseDataset.parse(Key.make("beta_constraints.hex"), betaConsKey);
    glm = new GLM(params, modelKey);
    model = glm.trainModel().get();
    assertTrue(glm.isStopped());
    double[] beta = model.beta();
    System.out.println("beta = " + Arrays.toString(beta));
    fr.add("CAPSULE", fr.remove("CAPSULE"));
    fr.remove("ID").remove();
    DKV.put(fr._key, fr);
    // now check the ginfo
    DataInfo dinfo = new DataInfo(fr, null, 1, true, TransformType.NONE, DataInfo.TransformType.NONE, true, false, false, false, false, false);
    GLMGradientTask lt = new GLMBinomialGradientTask(null, dinfo, params, 0, beta).doAll(dinfo._adaptedFrame);
    double[] grad = lt._gradient;
    String[] names = model.dinfo().coefNames();
    BufferedString tmpStr = new BufferedString();
    outer:
    for (int i = 0; i < names.length; ++i) {
      for (int j = 0; j < betaConstraints.numRows(); ++j) {
        if (betaConstraints.vec("names").atStr(tmpStr, j).toString().equals(names[i])) {
          if (Math.abs(beta[i] - betaConstraints.vec("lower_bounds").at(j)) < 1e-4 || Math.abs(beta[i] - betaConstraints.vec("upper_bounds").at(j)) < 1e-4) {
            continue outer;
          }
        }
      }
      assertEquals(0, grad[i], 1e-2);
    }
  } finally {
    fr.delete();
    betaConstraints.delete();
    if (model != null)
      model.delete();
  }
}
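The constraint check above reuses one BufferedString (tmpStr) for every row it scans, so the inner loop allocates nothing per comparison. A hedged sketch of that lookup pattern in isolation, using sameString to avoid even the toString() call (helper name and frame layout are assumptions, not part of the test):

import water.fvec.Frame;
import water.fvec.Vec;
import water.parser.BufferedString;

public class ConstraintLookup {
  /** Returns the first row of string column `colName` whose value equals `target`, or -1 if absent. */
  public static long findRow(Frame frame, String colName, String target) {
    Vec names = frame.vec(colName);
    BufferedString tmp = new BufferedString(); // one buffer reused for every row
    for (long r = 0; r < names.length(); r++) {
      BufferedString bs = names.atStr(tmp, r);
      if (bs != null && bs.sameString(target)) // sameString compares without building a String
        return r;
    }
    return -1;
  }
}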
use of water.parser.BufferedString in project h2o-3 by h2oai.
the class ShuffleSplitFrameTest method testShuffleSplitWithMultipleColumns.
@Test
public void testShuffleSplitWithMultipleColumns() {
  // this test makes sure that the rows of the split frames are preserved (including UUID)
  long[] chunkLayout = ar(2L, 2L, 3L);
  String[][] data = ar(ar("1", "2"), ar(null, "3"), ar("4", "5", "6"));
  Frame f = null;
  Frame tmpFrm = createFrame("test1.hex", chunkLayout, data);
  try {
    f = new MRTask() {
      @Override
      public void map(Chunk[] cs, NewChunk[] ncs) {
        for (int i = 0; i < cs[0]._len; i++) {
          BufferedString bs = cs[0].atStr(new BufferedString(), i);
          int val = bs == null ? 0 : Integer.parseInt(bs.toString());
          ncs[0].addStr(bs);
          ncs[1].addNum(val);
          ncs[2].addNum(i);
          ncs[3].addUUID(i, val);
        }
      }
    }.doAll(new byte[] {Vec.T_STR, Vec.T_NUM, Vec.T_NUM, Vec.T_UUID}, tmpFrm).outputFrame();
  } finally {
    tmpFrm.delete();
  }
  testScenario(f, flat(data), new MRTask() {
    @Override
    public void map(Chunk[] cs) {
      for (int i = 0; i < cs[0]._len; i++) {
        BufferedString bs = cs[0].atStr(new BufferedString(), i);
        int expectedVal = bs == null ? 0 : Integer.parseInt(bs.toString());
        int expectedIndex = (int) cs[2].atd(i);
        Assert.assertEquals((double) expectedVal, cs[1].atd(i), 0.00001);
        Assert.assertEquals(expectedIndex, (int) cs[3].at16l(i));
        Assert.assertEquals(expectedVal, (int) cs[3].at16h(i));
      }
    }
  });
}
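Both map methods above read string cells chunk by chunk and treat a null return from atStr as an NA. The same convention works for any reduction over a string column; a small sketch under those assumptions (the task name and the counting logic are illustrative only):

import water.MRTask;
import water.fvec.Chunk;
import water.parser.BufferedString;

/** Counts the non-NA cells of a string Vec; one BufferedString is reused per chunk. */
public class CountStringsTask extends MRTask<CountStringsTask> {
  public long _count; // per-chunk counts are merged in reduce()

  @Override
  public void map(Chunk[] cs) {
    BufferedString tmp = new BufferedString();
    for (int i = 0; i < cs[0]._len; i++)
      if (cs[0].atStr(tmp, i) != null) // null means the row is NA
        _count++;
  }

  @Override
  public void reduce(CountStringsTask other) {
    _count += other._count;
  }
}

Something like new CountStringsTask().doAll(stringVec)._count would run it, assuming stringVec is a string-typed Vec already in the DKV.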
use of water.parser.BufferedString in project h2o-3 by h2oai.
the class AstRectangleAssign method assign_frame_frame.
// Rectangular array copy from src into dst
private void assign_frame_frame(Frame dst, int[] cols, AstNumList rows, Frame src, Session ses) {
  // Sanity check
  if (cols.length != src.numCols())
    throw new IllegalArgumentException("Source and destination frames must have the same count of columns");
  long nrows = rows.cnt();
  if (src.numRows() != nrows)
    throw new IllegalArgumentException("Requires same count of rows in the number-list (" + nrows + ") as in the source (" + src.numRows() + ")");
  // Whole-column assignment: reuse the source columns directly; the copy-on-write
  // optimization happens here on the apply() exit.
  if (dst.numRows() == nrows && rows.isDense()) {
    for (int i = 0; i < cols.length; i++)
      dst.replace(cols[i], src.vecs()[i]);
    if (dst._key != null)
      DKV.put(dst);
    return;
  }
  // Partial update; needs to preserve type, and may need to copy to support
  // copy-on-write
  Vec[] dvecs = dst.vecs();
  final Vec[] svecs = src.vecs();
  for (int col = 0; col < cols.length; col++) {
    int dtype = dvecs[cols[col]].get_type();
    if (dtype != svecs[col].get_type())
      throw new IllegalArgumentException("Columns must be the same type; " + "column " + col + ", \'" + dst._names[cols[col]] + "\', is of type " + dvecs[cols[col]].get_type_str() + " and the source is " + svecs[col].get_type_str());
    if ((dtype == Vec.T_CAT) && (!Arrays.equals(dvecs[cols[col]].domain(), svecs[col].domain())))
      throw new IllegalArgumentException("Cannot assign to a categorical column with a different domain; " + "source column " + src._names[col] + ", target column " + dst._names[cols[col]]);
  }
  // Handle fast small case; go parallel only for more than 1000 random updates
  if (nrows <= 1 || (cols.length * nrows) <= 1000) {
    // Copy dst columns as-needed to allow update-in-place
    dvecs = ses.copyOnWrite(dst, cols);
    // Update dst columns, just these rows
    long[] rownums = rows.expand8();
    for (int col = 0; col < svecs.length; col++)
      if (svecs[col].get_type() == Vec.T_STR) {
        BufferedString bStr = new BufferedString();
        for (int ridx = 0; ridx < rownums.length; ridx++) {
          BufferedString s = svecs[col].atStr(bStr, ridx);
          dvecs[cols[col]].set(rownums[ridx], s != null ? s.toString() : null);
        }
      } else {
        for (int ridx = 0; ridx < rownums.length; ridx++)
          dvecs[cols[col]].set(rownums[ridx], svecs[col].at(ridx));
      }
    return;
  }
  // Handle large case
  Vec[] vecs = ses.copyOnWrite(dst, cols);
  // Just the selected columns get updated
  Vec[] vecs2 = new Vec[cols.length];
  for (int i = 0; i < cols.length; i++)
    vecs2[i] = vecs[cols[i]];
  // Side-effect internal sort; needed for fast row lookup
  rows.sort();
  new AssignFrameFrameTask(rows, svecs).doAll(vecs2);
}
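The string branch above is the general recipe for copying string data between Vecs: read each source cell into a reused BufferedString and write it with Vec.set(row, String), passing null to write an NA. A hedged sketch of just that branch (the helper is illustrative; as in the original, the destination Vec is assumed to be already safe to update in place, e.g. after the session's copyOnWrite):

import water.fvec.Vec;
import water.parser.BufferedString;

public class StringColumnCopy {
  /** Copies the i-th value of string Vec `src` into row `rows[i]` of `dst`, preserving NAs. */
  public static void copyRows(Vec src, Vec dst, long[] rows) {
    BufferedString tmp = new BufferedString();           // reused for every source read
    for (int i = 0; i < rows.length; i++) {
      BufferedString s = src.atStr(tmp, i);              // null when the source cell is NA
      dst.set(rows[i], s != null ? s.toString() : null); // null writes an NA, as in the assign above
    }
  }
}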
use of water.parser.BufferedString in project h2o-3 by h2oai.
the class CStrChunkTest method test_sparse.
@Test
public void test_sparse() {
  NewChunk nc = new NewChunk(null, 0);
  for (int i = 0; i < 100; i++)
    nc.addNA();
  nc.addStr(new BufferedString("foo"));
  nc.addNA();
  nc.addStr(new BufferedString("bar"));
  Chunk c = nc.compress();
  Assert.assertTrue("first 100 entries are NA", c.isNA(0) && c.isNA(99));
  Assert.assertTrue("Sparse string has values", c.atStr(new BufferedString(), 100).sameString("foo"));
  Assert.assertTrue("NA", c.isNA(101));
  final BufferedString bufferedString = c.atStr(new BufferedString(), 102);
  Assert.assertTrue("Sparse string has values: expected `bar`, got " + bufferedString, bufferedString.sameString("bar"));
}