Use of hex.glm.GLMModel.GLMWeightsFun in project h2o-3 by h2oai.
In class GLMTest, method testSparseGramComputation.
// // test categorical autoexpansions, run on airlines which has several categorical columns,
// // once on explicitly expanded data, once on h2o autoexpanded and compare the results
// @Test public void testSparseCategoricals() {
// GLMModel model1 = null, model2 = null, model3 = null, model4 = null;
//
// Frame frMM = parse_test_file("smalldata/glm_test/train-2.csv");
//
//// Vec xy = frG.remove("xy");
// frMM.remove("").remove();
// frMM.add("IsDepDelayed", frMM.remove("IsDepDelayed"));
// DKV.put(frMM._key,frMM);
// Frame fr = parse_test_file("smalldata/airlines/AirlinesTrain.csv.zip"), res = null;
// // Distance + Origin + Dest + UniqueCarrier
// String [] ignoredCols = new String[]{"fYear", "fMonth", "fDayofMonth", "fDayOfWeek", "DepTime","ArrTime","IsDepDelayed_REC"};
// try{
// Scope.enter();
// GLMParameters params = new GLMParameters(Family.gaussian);
// params._response_column = "IsDepDelayed";
// params._ignored_columns = ignoredCols;
// params._train = fr._key;
// params._l2pen = new double[]{1e-5};
// params._standardize = false;
// model1 = new GLM(params,glmkey("airlines_cat_nostd")).trainModel().get();
// Frame score1 = model1.score(fr);
// ModelMetricsRegressionGLM mm = (ModelMetricsRegressionGLM) ModelMetrics.getFromDKV(model1, fr);
// Assert.assertEquals(model1.validation().residual_deviance, mm._resDev, 1e-4);
// System.out.println("NDOF = " + model1.validation().nullDOF() + ", numRows = " + score1.numRows());
// Assert.assertEquals(model1.validation().residual_deviance, mm._MSE * score1.numRows(), 1e-4);
// mm.remove();
// res = model1.score(fr);
// // Build a POJO, validate same results
// Assert.assertTrue(model1.testJavaScoring(fr, res, 1e-15));
//
// params._train = frMM._key;
// params._ignored_columns = new String[]{"X"};
// model2 = new GLM(params,glmkey("airlines_mm")).trainModel().get();
// params._standardize = true;
// params._train = frMM._key;
// params._use_all_factor_levels = true;
// // test the gram
// DataInfo dinfo = new DataInfo(Key.make(),frMM, null, 1, true, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true);
// GLMIterationTask glmt = new GLMIterationTask(null,dinfo,1e-5,params,false,null,0,null, null).doAll(dinfo._adaptedFrame);
// for(int i = 0; i < glmt._xy.length; ++i) {
// for(int j = 0; j <= i; ++j ) {
// assertEquals(frG.vec(j).at(i), glmt._gram.get(i, j), 1e-5);
// }
// assertEquals(xy.at(i), glmt._xy[i], 1e-5);
// }
// frG.delete();
// xy.remove();
// params._standardize = true;
// params._family = Family.binomial;
// params._link = Link.logit;
// model3 = new GLM(params,glmkey("airlines_mm")).trainModel().get();
// params._train = fr._key;
// params._ignored_columns = ignoredCols;
// model4 = new GLM(params,glmkey("airlines_mm")).trainModel().get();
// assertEquals(model3.validation().null_deviance,model4.validation().nullDeviance(),1e-4);
// assertEquals(model4.validation().residual_deviance, model3.validation().residualDeviance(), model3.validation().null_deviance * 1e-3);
// HashMap<String, Double> coefs1 = model1.coefficients();
// HashMap<String, Double> coefs2 = model2.coefficients();
// GLMValidation val1 = model1.validation();
// GLMValidation val2 = model2.validation();
// // compare against each other
// for(String s:coefs2.keySet()) {
// String s1 = s;
// if(s.startsWith("Origin"))
// s1 = "Origin." + s.substring(6);
// if(s.startsWith("Dest"))
// s1 = "Dest." + s.substring(4);
// if(s.startsWith("UniqueCarrier"))
// s1 = "UniqueCarrier." + s.substring(13);
// assertEquals("coeff " + s1 + " differs, " + coefs1.get(s1) + " != " + coefs2.get(s), coefs1.get(s1), coefs2.get(s),1e-4);
// DKV.put(frMM._key,frMM); // update the frame in the KV after removing the vec!
// }
// assertEquals(val1.nullDeviance(), val2.nullDeviance(),1e-4);
// assertEquals(val1.residualDeviance(), val2.residualDeviance(),1e-4);
// assertEquals(val1._aic, val2._aic,1e-2);
// // compare result against glmnet
// assertEquals(5336.918,val1.residualDeviance(),1);
// assertEquals(6051.613,val1.nullDeviance(),1);
//
//
// // lbfgs
//// params._solver = Solver.L_BFGS;
//// params._train = fr._key;
//// params._lambda = new double[]{.3};
//// model3 = new GLM(params,glmkey("lbfgs_cat")).trainModel().get();
//// params._train = frMM._key;
//// model4 = new GLM(params,glmkey("lbfgs_mm")).trainModel().get();
//// HashMap<String, Double> coefs3 = model3.coefficients();
//// HashMap<String, Double> coefs4 = model4.coefficients();
//// // compare against each other
//// for(String s:coefs4.keySet()) {
//// String s1 = s;
//// if(s.startsWith("Origin"))
//// s1 = "Origin." + s.substring(6);
//// if(s.startsWith("Dest"))
//// s1 = "Dest." + s.substring(4);
//// if(s.startsWith("UniqueCarrier"))
//// s1 = "UniqueCarrier." + s.substring(13);
//// assertEquals("coeff " + s1 + " differs, " + coefs3.get(s1) + " != " + coefs4.get(s), coefs3.get(s1), coefs4.get(s),1e-4);
//// }
//
// } finally {
// fr.delete();
// frMM.delete();
// if(res != null)res.delete();
// if(model1 != null)model1.delete();
// if(model2 != null)model2.delete();
// if(model3 != null)model3.delete();
// if(model4 != null)model4.delete();
//// if(score != null)score.delete();
// Scope.exit();
// }
// }
/**
* Test that we get the correct gram on a dataset which contains categoricals as well as sparse and dense numeric columns.
*/
@Test
public void testSparseGramComputation() {
Random rnd = new Random(123456789L);
double[] d0 = MemoryManager.malloc8d(1000);
double[] d1 = MemoryManager.malloc8d(1000);
double[] d2 = MemoryManager.malloc8d(1000);
double[] d3 = MemoryManager.malloc8d(1000);
double[] d4 = MemoryManager.malloc8d(1000);
double[] d5 = MemoryManager.malloc8d(1000);
double[] d6 = MemoryManager.malloc8d(1000);
double[] d7 = MemoryManager.malloc8d(1000);
double[] d8 = MemoryManager.malloc8d(1000);
double[] d9 = MemoryManager.malloc8d(1000);
long[] c1 = MemoryManager.malloc8(1000);
long[] c2 = MemoryManager.malloc8(1000);
String[] dom = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" };
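// Dense part: every row of d0 and d1 gets a uniform random value; c1 and c2 get random categorical levels from dom.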
for (int i = 0; i < d1.length; ++i) {
c1[i] = rnd.nextInt(dom.length);
c2[i] = rnd.nextInt(dom.length);
d0[i] = rnd.nextDouble();
d1[i] = rnd.nextDouble();
}
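// Sparse part: at most 30 of the 1000 entries in each of d2..d9 are set to a non-zero value.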
for (int i = 0; i < 30; ++i) {
d2[rnd.nextInt(d2.length)] = rnd.nextDouble();
d3[rnd.nextInt(d2.length)] = rnd.nextDouble();
d4[rnd.nextInt(d2.length)] = rnd.nextDouble();
d5[rnd.nextInt(d2.length)] = rnd.nextDouble();
d6[rnd.nextInt(d2.length)] = rnd.nextDouble();
d7[rnd.nextInt(d2.length)] = rnd.nextDouble();
d8[rnd.nextInt(d2.length)] = rnd.nextDouble();
d9[rnd.nextInt(d2.length)] = 1;
}
Vec.VectorGroup vg_1 = Vec.VectorGroup.VG_LEN1;
Vec v01 = Vec.makeVec(c1, dom, vg_1.addVec());
Vec v02 = Vec.makeVec(c2, dom, vg_1.addVec());
Vec v03 = Vec.makeVec(d0, vg_1.addVec());
Vec v04 = Vec.makeVec(d1, vg_1.addVec());
Vec v05 = Vec.makeVec(d2, vg_1.addVec());
Vec v06 = Vec.makeVec(d3, vg_1.addVec());
Vec v07 = Vec.makeVec(d4, vg_1.addVec());
Vec v08 = Vec.makeVec(d5, vg_1.addVec());
Vec v09 = Vec.makeVec(d6, vg_1.addVec());
Vec v10 = Vec.makeVec(d7, vg_1.addVec());
Vec v11 = Vec.makeVec(d8, vg_1.addVec());
Vec v12 = Vec.makeVec(d9, vg_1.addVec());
Frame f = new Frame(Key.<Frame>make("TestData"), null, new Vec[] { v01, v02, v03, v04, v05, v05, v06, v07, v08, v09, v10, v11, v12 });
DKV.put(f);
DataInfo dinfo = new DataInfo(f, null, 1, true, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true, false, false, false, false, false);
GLMParameters params = new GLMParameters(Family.gaussian);
// constructor used below: GLMIterationTask(Key jobKey, DataInfo dinfo, GLMWeightsFun glmw, double[] beta)
final GLMIterationTask glmtSparse = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), null).setSparse(true).doAll(dinfo._adaptedFrame);
final GLMIterationTask glmtDense = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), null).setSparse(false).doAll(dinfo._adaptedFrame);
for (int i = 0; i < glmtDense._xy.length; ++i) {
for (int j = 0; j <= i; ++j) {
assertEquals(glmtDense._gram.get(i, j), glmtSparse._gram.get(i, j), 1e-8);
}
assertEquals(glmtDense._xy[i], glmtSparse._xy[i], 1e-8);
}
final double[] beta = MemoryManager.malloc8d(dinfo.fullN() + 1);
// now do the same but weighted; use the LSM solution as beta to generate meaningful weights
H2O.submitTask(new H2OCountedCompleter() {
@Override
public void compute2() {
new GLM.GramSolver(glmtDense._gram, glmtDense._xy, true, 1e-5, 0, null, null, null, null).solve(null, beta);
tryComplete();
}
}).join();
final GLMIterationTask glmtSparse2 = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), beta).setSparse(true).doAll(dinfo._adaptedFrame);
final GLMIterationTask glmtDense2 = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), beta).setSparse(false).doAll(dinfo._adaptedFrame);
for (int i = 0; i < glmtDense2._xy.length; ++i) {
for (int j = 0; j <= i; ++j) {
assertEquals(glmtDense2._gram.get(i, j), glmtSparse2._gram.get(i, j), 1e-8);
}
assertEquals(glmtDense2._xy[i], glmtSparse2._xy[i], 1e-8);
}
dinfo.remove();
f.delete();
}
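The test above runs the same GLMIterationTask twice, once with setSparse(true) and once with setSparse(false), and checks that the Gram matrix and the X'y vector agree entry by entry, first with a null beta and then with the least-squares solution produced by GLM.GramSolver. That comparison can be distilled into a small helper; the sketch below is illustrative only: the helper name assertSparseDenseMatch is hypothetical, while the GLMIterationTask, GLMWeightsFun and DataInfo calls mirror the ones used in the test.
// Hypothetical helper distilled from testSparseGramComputation: runs the iteration task
// in sparse and in dense mode over the same DataInfo and beta, then checks that the
// Gram matrix and X'y agree to within 1e-8.
private static void assertSparseDenseMatch(DataInfo dinfo, GLMParameters params, double[] beta) {
  GLMIterationTask sparse = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), beta).setSparse(true).doAll(dinfo._adaptedFrame);
  GLMIterationTask dense = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), beta).setSparse(false).doAll(dinfo._adaptedFrame);
  for (int i = 0; i < dense._xy.length; ++i) {
    for (int j = 0; j <= i; ++j) {
      assertEquals(dense._gram.get(i, j), sparse._gram.get(i, j), 1e-8);
    }
    assertEquals(dense._xy[i], sparse._xy[i], 1e-8);
  }
}
In the test this would be invoked once with beta = null and once with the beta produced by the GramSolver run.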
Use of hex.glm.GLMModel.GLMWeightsFun in project h2o-3 by h2oai.
In class GLMTest, method testAirlines.
// Test categorical auto-expansion: run on airlines, which has several categorical columns,
// once on explicitly expanded data and once on H2O auto-expanded data, and compare the results.
@Test
public void testAirlines() {
GLMModel model1 = null, model2 = null, model3 = null, model4 = null;
Frame frMM = parse_test_file(Key.make("AirlinesMM"), "smalldata/airlines/AirlinesTrainMM.csv.zip");
Frame frG = parse_test_file(Key.make("gram"), "smalldata/airlines/gram_std.csv", true);
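// gram_std.csv holds the expected (standardized) Gram matrix; its "xy" column is the expected X'y vector.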
Vec xy = frG.remove("xy");
frMM.remove("C1").remove();
Vec v;
frMM.add("IsDepDelayed", (v = frMM.remove("IsDepDelayed")).makeCopy(null));
v.remove();
DKV.put(frMM._key, frMM);
Frame fr = parse_test_file(Key.make("Airlines"), "smalldata/airlines/AirlinesTrain.csv.zip"), res = null;
fr.add("IsDepDelayed", (v = fr.remove("IsDepDelayed")).makeCopy(null));
v.remove();
DKV.put(fr._key, fr);
// Distance + Origin + Dest + UniqueCarrier
String[] ignoredCols = new String[] { "fYear", "fMonth", "fDayofMonth", "fDayOfWeek", "DepTime", "ArrTime", "IsDepDelayed_REC" };
try {
Scope.enter();
GLMParameters params = new GLMParameters(Family.gaussian);
params._response_column = "IsDepDelayed";
params._ignored_columns = ignoredCols;
params._train = fr._key;
params._lambda = new double[] { 0 };
params._alpha = new double[] { 0 };
params._standardize = false;
params._use_all_factor_levels = false;
model1 = new GLM(params).trainModel().get();
testScoring(model1, fr);
Frame score1 = model1.score(fr);
ModelMetricsRegressionGLM mm = (ModelMetricsRegressionGLM) ModelMetrics.getFromDKV(model1, fr);
Assert.assertEquals(((ModelMetricsRegressionGLM) model1._output._training_metrics)._resDev, mm._resDev, 1e-4);
Assert.assertEquals(((ModelMetricsRegressionGLM) model1._output._training_metrics)._resDev, mm._MSE * score1.numRows(), 1e-4);
score1.delete();
mm.remove();
res = model1.score(fr);
// Build a POJO, validate same results
params._train = frMM._key;
params._ignored_columns = new String[] { "X" };
model2 = new GLM(params).trainModel().get();
HashMap<String, Double> coefs1 = model1.coefficients();
testScoring(model2, frMM);
HashMap<String, Double> coefs2 = model2.coefficients();
boolean failed = false;
// compare against each other
for (String s : coefs2.keySet()) {
String s1 = s;
if (s.startsWith("Origin"))
s1 = "Origin." + s.substring(6);
if (s.startsWith("Dest"))
s1 = "Dest." + s.substring(4);
if (s.startsWith("UniqueCarrier"))
s1 = "UniqueCarrier." + s.substring(13);
if (Math.abs(coefs1.get(s1) - coefs2.get(s)) > 1e-4) {
System.out.println("coeff " + s1 + " differs, " + coefs1.get(s1) + " != " + coefs2.get(s));
failed = true;
}
// assertEquals("coeff " + s1 + " differs, " + coefs1.get(s1) + " != " + coefs2.get(s), coefs1.get(s1), coefs2.get(s), 1e-4);
}
assertFalse(failed);
params._standardize = true;
params._train = frMM._key;
params._use_all_factor_levels = true;
// test the gram
DataInfo dinfo = new DataInfo(frMM, null, 1, true, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true, false, false, false, false, false);
GLMIterationTask glmt = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), null).doAll(dinfo._adaptedFrame);
for (int i = 0; i < glmt._xy.length; ++i) {
for (int j = 0; j <= i; ++j) {
assertEquals(frG.vec(j).at(i), glmt._gram.get(i, j), 1e-5);
}
assertEquals(xy.at(i), glmt._xy[i], 1e-5);
}
xy.remove();
params = (GLMParameters) params.clone();
params._standardize = false;
params._family = Family.binomial;
params._link = Link.logit;
model3 = new GLM(params).trainModel().get();
testScoring(model3, frMM);
params._train = fr._key;
params._ignored_columns = ignoredCols;
model4 = new GLM(params).trainModel().get();
testScoring(model4, fr);
assertEquals(nullDeviance(model3), nullDeviance(model4), 1e-4);
assertEquals(residualDeviance(model4), residualDeviance(model3), nullDeviance(model3) * 1e-3);
assertEquals(nullDeviance(model1), nullDeviance(model2), 1e-4);
assertEquals(residualDeviance(model1), residualDeviance(model2), 1e-4);
// assertEquals(val1._aic, val2._aic,1e-2);
// compare result against glmnet
assertEquals(5336.918, residualDeviance(model1), 1);
assertEquals(6051.613, nullDeviance(model2), 1);
// lbfgs
// params._solver = Solver.L_BFGS;
// params._train = fr._key;
// params._lambda = new double[]{.3};
// model3 = new GLM(params,glmkey("lbfgs_cat")).trainModel().get();
// params._train = frMM._key;
// model4 = new GLM(params,glmkey("lbfgs_mm")).trainModel().get();
// HashMap<String, Double> coefs3 = model3.coefficients();
// HashMap<String, Double> coefs4 = model4.coefficients();
// // compare against each other
// for(String s:coefs4.keySet()) {
// String s1 = s;
// if(s.startsWith("Origin"))
// s1 = "Origin." + s.substring(6);
// if(s.startsWith("Dest"))
// s1 = "Dest." + s.substring(4);
// if(s.startsWith("UniqueCarrier"))
// s1 = "UniqueCarrier." + s.substring(13);
// assertEquals("coeff " + s1 + " differs, " + coefs3.get(s1) + " != " + coefs4.get(s), coefs3.get(s1), coefs4.get(s),1e-4);
// }
} finally {
fr.delete();
frMM.delete();
frG.delete();
if (res != null)
res.delete();
if (model1 != null)
model1.delete();
if (model2 != null)
model2.delete();
if (model3 != null)
model3.delete();
if (model4 != null)
model4.delete();
// if(score != null)score.delete();
Scope.exit();
}
}
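The coefficient comparison in testAirlines hinges on a naming convention: the model trained on the raw frame reports auto-expanded coefficients with dotted names such as Origin.ABQ, while the explicitly expanded frame uses column names such as OriginABQ. A hedged sketch of that remapping follows; the helper name expandedName is hypothetical, and the prefixes are the ones handled in the loop above.
// Hypothetical helper mirroring the remapping loop in testAirlines: maps a column name
// from the explicitly expanded frame (e.g. "OriginABQ") to the dotted coefficient name
// used by the auto-expanded model (e.g. "Origin.ABQ").
private static String expandedName(String s) {
  for (String prefix : new String[] { "Origin", "Dest", "UniqueCarrier" }) {
    if (s.startsWith(prefix) && s.length() > prefix.length())
      return prefix + "." + s.substring(prefix.length());
  }
  return s; // numeric columns and the intercept keep their names
}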
Use of hex.glm.GLMModel.GLMWeightsFun in project h2o-3 by h2oai.
In class L_BFGS_Test, method logistic.
@Test
public void logistic() {
Key parsedKey = Key.make("prostate");
DataInfo dinfo = null;
try {
GLMParameters glmp = new GLMParameters(Family.binomial, Family.binomial.defaultLink);
glmp._alpha = new double[] { 0 };
glmp._lambda = new double[] { 1e-5 };
Frame source = parse_test_file(parsedKey, "smalldata/glm_test/prostate_cat_replaced.csv");
source.add("CAPSULE", source.remove("CAPSULE"));
source.remove("ID").remove();
Frame valid = new Frame(source._names.clone(), source.vecs().clone());
dinfo = new DataInfo(source, valid, 1, false, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true, false, false, /* weights */ false, /* offset */ false, /* fold */ false);
DKV.put(dinfo._key, dinfo);
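// scale the objective by 1/N; prostate_cat_replaced.csv has 380 rows (cf. the numRows() factor in the final assertion)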
glmp._obj_reg = 1 / 380.0;
GLMGradientSolver solver = new GLMGradientSolver(null, glmp, dinfo, 1e-5, null);
L_BFGS lbfgs = new L_BFGS().setGradEps(1e-8);
double[] beta = MemoryManager.malloc8d(dinfo.fullN() + 1);
beta[beta.length - 1] = new GLMWeightsFun(glmp).link(source.vec("CAPSULE").mean());
L_BFGS.Result r = lbfgs.solve(solver, beta, solver.getGradient(beta), new L_BFGS.ProgressMonitor() {
int _i = 0;
public boolean progress(double[] beta, GradientInfo ginfo) {
System.out.println(++_i + ":" + ginfo._objVal + ", " + ArrayUtils.l2norm2(ginfo._gradient, false));
return true;
}
});
assertEquals(378.34, 2 * r.ginfo._objVal * source.numRows(), 1e-1);
} finally {
if (dinfo != null)
DKV.remove(dinfo._key);
Value v = DKV.get(parsedKey);
if (v != null) {
v.<Frame>get().delete();
}
}
}
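The solver in logistic is started from an intercept-only model: the last coefficient is set to the link of the response mean. For the binomial family used here the canonical link is the logit, so the sketch below, with a hypothetical logit helper, is assumed to be equivalent to the GLMWeightsFun.link call in the test.
// Hedged sketch: for Family.binomial the canonical link is the logit, so setting
// beta[beta.length - 1] = new GLMWeightsFun(glmp).link(p), with p = mean(CAPSULE),
// is assumed to reduce to log(p / (1 - p)), the optimum of an intercept-only model.
static double logit(double p) {
  return Math.log(p / (1 - p));
}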
Use of hex.glm.GLMModel.GLMWeightsFun in project h2o-3 by h2oai.
In class L_BFGS_Test, method testArcene.
// Test LSM on arcene - a wide dataset with ~10k columns.
// Test warm start and the max-iterations limit.
@Test
public void testArcene() {
Key parsedKey = Key.make("arcene_parsed");
DataInfo dinfo = null;
try {
Frame source = parse_test_file(parsedKey, "smalldata/glm_test/arcene.csv");
Frame valid = new Frame(source._names.clone(), source.vecs().clone());
GLMParameters glmp = new GLMParameters(Family.gaussian);
glmp._lambda = new double[] { 1e-5 };
glmp._alpha = new double[] { 0 };
glmp._obj_reg = 0.01;
dinfo = new DataInfo(source, valid, 1, false, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true, false, false, /* weights */ false, /* offset */ false, /* fold */ false);
DKV.put(dinfo._key, dinfo);
GradientSolver solver = new GLMGradientSolver(null, glmp, dinfo, 1e-5, null);
L_BFGS lbfgs = new L_BFGS().setMaxIter(20);
double[] beta = MemoryManager.malloc8d(dinfo.fullN() + 1);
beta[beta.length - 1] = new GLMWeightsFun(glmp).link(source.lastVec().mean());
L_BFGS.Result r1 = lbfgs.solve(solver, beta.clone(), solver.getGradient(beta), new L_BFGS.ProgressMonitor() {
int _i = 0;
public boolean progress(double[] beta, GradientInfo ginfo) {
System.out.println(++_i + ":" + ginfo._objVal);
return true;
}
});
lbfgs.setMaxIter(50);
final int iter = r1.iter;
L_BFGS.Result r2 = lbfgs.solve(solver, r1.coefs, r1.ginfo, new L_BFGS.ProgressMonitor() {
int _i = 0;
public boolean progress(double[] beta, GradientInfo ginfo) {
System.out.println(iter + " + " + ++_i + ":" + ginfo._objVal);
return true;
}
});
System.out.println();
lbfgs = new L_BFGS().setMaxIter(100);
L_BFGS.Result r3 = lbfgs.solve(solver, beta.clone(), solver.getGradient(beta), new L_BFGS.ProgressMonitor() {
int _i = 0;
public boolean progress(double[] beta, GradientInfo ginfo) {
System.out.println(++_i + ":" + ginfo._objVal + ", " + ArrayUtils.l2norm2(ginfo._gradient, false));
return true;
}
});
assertEquals(r1.iter, 20);
// assertEquals (r1.iter + r2.iter,r3.iter); // should be equal? got mismatch by 2
assertEquals(r2.ginfo._objVal, r3.ginfo._objVal, 1e-8);
assertEquals(.5 * glmp._lambda[0] * ArrayUtils.l2norm(r3.coefs, true) + r3.ginfo._objVal, 1e-4, 5e-4);
assertTrue("iter# expected < 100, got " + r3.iter, r3.iter < 100);
} finally {
if (dinfo != null)
DKV.remove(dinfo._key);
Value v = DKV.get(parsedKey);
if (v != null) {
v.<Frame>get().delete();
}
}
}
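The warm-start check in testArcene relies on L_BFGS.Result carrying both the final coefficients and the gradient info of the last iterate, so a second solve can resume where the first one stopped. A minimal sketch of the pattern, with a no-op ProgressMonitor standing in for the printing monitors used in the test:
// Warm start: run 20 iterations, then resume from the returned state for up to 50 more.
L_BFGS.ProgressMonitor quiet = new L_BFGS.ProgressMonitor() {
  public boolean progress(double[] beta, GradientInfo ginfo) { return true; } // keep iterating
};
L_BFGS lbfgs = new L_BFGS().setMaxIter(20);
L_BFGS.Result first = lbfgs.solve(solver, beta.clone(), solver.getGradient(beta), quiet);
lbfgs.setMaxIter(50);
L_BFGS.Result resumed = lbfgs.solve(solver, first.coefs, first.ginfo, quiet);
// resumed should end near the objective of a single long run, as the assertions above check.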