Example usage of water.fvec.NFSFileVec in the h2o-3 project by h2oai.
From the class GBMMissingTest, method run().
/**
 * Trains a GBM on the weather dataset while injecting increasing fractions of
 * missing values into the training predictors (the response column is kept
 * clean), scoring each model against a clean 25% holdout split.
 * Logs the validation logloss per missing fraction and the sum over all runs.
 */
@Test
public void run() {
long seed = 1234;
GBMModel mymodel = null;
Frame train = null;
Frame test = null;
Frame data = null;
GBMModel.GBMParameters p;
Log.info("");
Log.info("STARTING.");
Log.info("Using seed " + seed);
StringBuilder sb = new StringBuilder();
// Accumulates logloss across all missing fractions for the final report.
double sumerr = 0;
// TreeMap keeps the final report ordered by missing fraction.
Map<Double, Double> map = new TreeMap<>();
for (double missing_fraction : new double[] { 0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99 }) {
double err = 0;
try {
Scope.enter();
NFSFileVec nfs = TestUtil.makeNfsFileVec("smalldata/junit/weather.csv");
data = ParseDataset.parse(Key.make("data.hex"), nfs._key);
Log.info("FrameSplitting");
// Create holdout test data on clean data (before adding missing values)
FrameSplitter fs = new FrameSplitter(data, new double[] { 0.75f }, generateNumKeys(data._key, 2), null);
//.join();
H2O.submitTask(fs);
// NOTE(review): getResult() appears to block until the split finishes
// (the commented-out .join() above suggests so) — confirm.
Frame[] train_test = fs.getResult();
train = train_test[0];
test = train_test[1];
Log.info("Done...");
// add missing values to the training data (excluding the response)
if (missing_fraction > 0) {
// Shallow wrapper frame sharing the training Vecs, so the inserter
// pollutes the actual training data.
Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
//exclude the response
frtmp.remove(frtmp.numCols() - 1);
//need to put the frame (to be modified) into DKV for MissingInserter to pick up
DKV.put(frtmp._key, frtmp);
FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
//MissingInserter is non-blocking, must block here explicitly
j.execImpl().get();
//Delete the frame header (not the data)
DKV.remove(frtmp._key);
}
// Build a regularized GBM model with polluted training data, score on clean validation set
p = new GBMModel.GBMParameters();
p._train = train._key;
p._valid = test._key;
// Response is the last column of the weather data.
p._response_column = train._names[train.numCols() - 1];
//only for weather data
// NOTE(review): column indices 1 and 22 are hard-coded to this dataset.
p._ignored_columns = new String[] { train._names[1], train._names[22] };
p._seed = seed;
// Convert response to categorical
int ri = train.numCols() - 1;
int ci = test.find(p._response_column);
// Scope.track the replaced (old) vecs so they are cleaned up on Scope.exit().
Scope.track(train.replace(ri, train.vecs()[ri].toCategoricalVec()));
Scope.track(test.replace(ci, test.vecs()[ci].toCategoricalVec()));
DKV.put(train);
DKV.put(test);
GBM gbm = new GBM(p);
Log.info("Starting with " + missing_fraction * 100 + "% missing values added.");
mymodel = gbm.trainModel().get();
// Extract the scoring on validation set from the model
err = ((ModelMetricsBinomial) mymodel._output._validation_metrics).logloss();
// Sanity-check POJO/Java scoring against in-H2O predictions.
Frame train_preds = mymodel.score(train);
Assert.assertTrue(mymodel.testJavaScoring(train, train_preds, 1e-15));
train_preds.remove();
Log.info("Missing " + missing_fraction * 100 + "% -> logloss: " + err);
} catch (Throwable t) {
t.printStackTrace();
// Sentinel value: a failed run is reported as logloss 100.
err = 100;
} finally {
Scope.exit();
// cleanup
if (mymodel != null) {
mymodel.delete();
}
if (train != null)
train.delete();
if (test != null)
test.delete();
if (data != null)
data.delete();
}
map.put(missing_fraction, err);
sumerr += err;
}
sb.append("missing fraction --> Error\n");
// Render the TreeMap entries as "fraction --> logloss" lines.
for (String s : Arrays.toString(map.entrySet().toArray()).split(",")) sb.append(s.replace("=", " --> ")).append("\n");
sb.append('\n');
sb.append("Sum Err: ").append(sumerr).append("\n");
Log.info(sb.toString());
}
Example usage of water.fvec.NFSFileVec in the h2o-3 project by h2oai.
From the class RapidsTest, method testChicago().
/**
 * Replays the Chicago-crime demo as a sequence of Rapids expressions:
 * parses the weather/crimes/census files, derives date features on the
 * crimes frame, renames columns, merges the three frames, and splits the
 * merged result by a runif threshold. Verifies frame sanity along the way
 * via checkSaneFrame(). All temporary keys are removed in the finally block.
 */
@Test
public void testChicago() {
// Remember the global time zone so it can be restored afterwards.
String oldtz = Rapids.exec("(getTimeZone)").getStr();
Session ses = new Session();
try {
parse_test_file(Key.make("weather.hex"), "smalldata/chicago/chicagoAllWeather.csv");
parse_test_file(Key.make("crimes.hex"), "smalldata/chicago/chicagoCrimes10k.csv.zip");
String fname = "smalldata/chicago/chicagoCensus.csv";
File f = FileUtils.locateFile(fname);
assert f != null && f.exists() : " file not found: " + fname;
NFSFileVec nfs = NFSFileVec.make(f);
ParseSetup ps = ParseSetup.guessSetup(new Key[] { nfs._key }, false, 1);
// Force column 1 (community area name) to be parsed as categorical.
ps.getColumnTypes()[1] = Vec.T_CAT;
ParseDataset.parse(Key.make("census.hex"), new Key[] { nfs._key }, true, ps);
exec_str("(assign census.hex (colnames= census.hex\t[0 1 2 3 4 5 6 7 8] \n" + "['Community.Area.Number' 'COMMUNITY.AREA.NAME' \"PERCENT.OF.HOUSING.CROWDED\" \r\n" + " \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" " + " \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" " + " \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
exec_str("(assign crimes.hex (colnames= crimes.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] [\"ID\" \"Case.Number\" \"Date\" \"Block\" \"IUCR\" \"Primary.Type\" \"Description\" \"Location.Description\" \"Arrest\" \"Domestic\" \"Beat\" \"District\" \"Ward\" \"Community.Area\" \"FBI.Code\" \"X.Coordinate\" \"Y.Coordinate\" \"Year\" \"Updated.On\" \"Latitude\" \"Longitude\" \"Location\"]))", ses);
exec_str("(setTimeZone \"Etc/UTC\")", ses);
// Derive date-part features from the Date column (nary_op_5 holds the date col).
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_6 (day (tmp= nary_op_5 (cols crimes.hex [2])))) \"Day\"))", ses);
checkSaneFrame();
exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_31 (+ (tmp= unary_op_7 (month nary_op_5)) 1)) \"Month\"))", ses);
// NOTE(review): nary_op_30 is never created above — this rm looks like a
// leftover from the captured demo script; confirm it is intentional.
exec_str("(rm nary_op_30)", ses);
exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_32 (+ (tmp= binary_op_9 (- (tmp= unary_op_8 (year nary_op_5)) 1900)) 1900)) \"Year\"))", ses);
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_10 (week nary_op_5)) \"WeekNum\"))", ses);
exec_str("(rm binary_op_32)", ses);
exec_str("(rm binary_op_31)", ses);
exec_str("(rm unary_op_8)", ses);
checkSaneFrame();
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_11 (dayOfWeek nary_op_5)) \"WeekDay\"))", ses);
// Removes the raw NFS key of the zip file (path captured from a Windows machine).
exec_str("(rm 'nfs:\\\\C:\\\\Users\\\\cliffc\\\\Desktop\\\\h2o-3\\\\smalldata\\\\chicago\\\\chicagoCrimes10k.csv.zip')", ses);
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_12 (hour nary_op_5)) \"HourOfDay\"))", ses);
exec_str("(assign crimes.hex (append crimes.hex (tmp= nary_op_16 (ifelse (tmp= binary_op_15 (| (tmp= binary_op_13 (== unary_op_11 \"Sun\")) (tmp= binary_op_14 (== unary_op_11 \"Sat\")))) 1 0)) \"Weekend\"))", ses);
// Season is incorrectly assigned in the original chicago demo; picks up the Weekend flag
exec_str("(assign crimes.hex (append crimes.hex nary_op_16 \"Season\"))", ses);
// Standard "head of 10 rows" pattern for printing
exec_str("(tmp= subset_33 (rows crimes.hex [0:10]))", ses);
exec_str("(rm subset_33)", ses);
// NOTE(review): duplicate rm of subset_33 (already removed above) —
// harmless in the captured script but likely redundant.
exec_str("(rm subset_33)", ses);
exec_str("(rm unary_op_29)", ses);
exec_str("(rm nary_op_28)", ses);
exec_str("(rm nary_op_27)", ses);
exec_str("(rm nary_op_26)", ses);
exec_str("(rm binary_op_25)", ses);
exec_str("(rm binary_op_24)", ses);
exec_str("(rm binary_op_23)", ses);
exec_str("(rm binary_op_22)", ses);
exec_str("(rm binary_op_21)", ses);
exec_str("(rm binary_op_20)", ses);
exec_str("(rm binary_op_19)", ses);
exec_str("(rm binary_op_18)", ses);
exec_str("(rm binary_op_17)", ses);
exec_str("(rm nary_op_16)", ses);
exec_str("(rm binary_op_15)", ses);
exec_str("(rm binary_op_14)", ses);
exec_str("(rm binary_op_13)", ses);
exec_str("(rm unary_op_12)", ses);
exec_str("(rm unary_op_11)", ses);
exec_str("(rm unary_op_10)", ses);
exec_str("(rm binary_op_9)", ses);
exec_str("(rm unary_op_8)", ses);
exec_str("(rm unary_op_7)", ses);
exec_str("(rm unary_op_6)", ses);
exec_str("(rm nary_op_5)", ses);
checkSaneFrame();
// Standard "head of 10 rows" pattern for printing
exec_str("(tmp= subset_34 (rows crimes.hex [0:10]))", ses);
exec_str("(rm subset_34)", ses);
exec_str("(assign census.hex (colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
// NOTE(review): duplicate rm of subset_34 (already removed above).
exec_str("(rm subset_34)", ses);
// Drop column 2 (Date) from crimes and column 1 from weather before merging.
exec_str("(tmp= subset_35 (cols crimes.hex [-3]))", ses);
exec_str("(tmp= subset_36 (cols weather.hex [-1]))", ses);
exec_str("(tmp= subset_36_2 (colnames= subset_36 [0 1 2 3 4 5] [\"Month\" \"Day\" \"Year\" \"maxTemp\" \"meanTemp\" \"minTemp\"]))", ses);
exec_str("(rm crimes.hex)", ses);
exec_str("(rm weather.hex)", ses);
// nary_op_37 = merge( X Y ); Vecs in X & nary_op_37 shared
exec_str("(tmp= nary_op_37 (merge subset_35 census.hex TRUE FALSE [] [] \"auto\"))", ses);
// nary_op_38 = merge( nary_op_37 subset_36_2); Vecs in nary_op_38 and nary_pop_37 and X shared
exec_str("(tmp= subset_41 (rows (tmp= nary_op_38 (merge nary_op_37 subset_36_2 TRUE FALSE [] [] \"auto\")) (tmp= binary_op_40 (<= (tmp= nary_op_39 (h2o.runif nary_op_38 30792152736.5179)) 0.8))))", ses);
// Standard "head of 10 rows" pattern for printing
exec_str("(tmp= subset_44 (rows subset_41 [0:10]))", ses);
exec_str("(rm subset_44)", ses);
exec_str("(rm subset_44)", ses);
exec_str("(rm binary_op_40)", ses);
exec_str("(rm nary_op_37)", ses);
// Complement split: rows where runif > 0.8 (the ~20% holdout).
exec_str("(tmp= subset_43 (rows nary_op_38 (tmp= binary_op_42 (> nary_op_39 0.8))))", ses);
// Chicago demo continues on past, but this is all I've captured for now
checkSaneFrame();
ses.end(null);
} catch (Throwable ex) {
throw ses.endQuietly(ex);
} finally {
// Restore time zone (which is global, and will affect following tests)
Rapids.exec("(setTimeZone \"" + oldtz + "\")");
for (String s : new String[] { "weather.hex", "crimes.hex", "census.hex", "nary_op_5", "unary_op_6", "unary_op_7", "unary_op_8", "binary_op_9", "unary_op_10", "unary_op_11", "unary_op_12", "binary_op_13", "binary_op_14", "binary_op_15", "nary_op_16", "binary_op_17", "binary_op_18", "binary_op_19", "binary_op_20", "binary_op_21", "binary_op_22", "binary_op_23", "binary_op_24", "binary_op_25", "nary_op_26", "nary_op_27", "nary_op_28", "unary_op_29", "binary_op_30", "binary_op_31", "binary_op_32", "subset_33", "subset_34", "subset_35", "subset_36", "subset_36_2", "nary_op_37", "nary_op_38", "nary_op_39", "binary_op_40", "subset_41", "binary_op_42", "subset_43", "subset_44" }) Keyed.remove(Key.make(s));
}
}
Example usage of water.fvec.NFSFileVec in the h2o-3 project by h2oai.
From the class GrepTest, method testIris().
/**
 * Trains a Grep model over the raw iris_wheader.csv file and verifies that
 * the regex matching "versicolor" (with wildcards) hits exactly the 50
 * expected rows, each yielding the literal match "versicolor".
 */
@Test
public void testIris() {
GrepModel grepModel = null;
Frame textFrame = null;
try {
//TODO: fix with original regex
//String regex = "Iris-versicolor";
String regex = "ver..c\\wl[ob]r";
// Wrap the raw file bytes in a single-column frame named "text".
NFSFileVec fileVec = TestUtil.makeNfsFileVec("smalldata/iris/iris_wheader.csv");
textFrame = new Frame(Key.<Frame>make(), new String[] { "text" }, new Vec[] { fileVec });
DKV.put(textFrame);
// long now = System.nanoTime();
GrepModel.GrepParameters params = new GrepModel.GrepParameters();
params._train = textFrame._key;
params._regex = regex;
Job<GrepModel> trainJob = new Grep(params).trainModel();
grepModel = trainJob.get();
// final long dt = System.nanoTime() - now;
// System.out.println(dt);
String[] found = grepModel._output._matches;
assertEquals("Number of matches", 50, found.length);
for (int idx = 0; idx < found.length; idx++) {
assertEquals("Wrong @" + idx, "versicolor", found[idx]);
}
trainJob.remove();
} finally {
// Clean up the frame and model regardless of test outcome.
if (textFrame != null)
textFrame.remove();
if (grepModel != null)
grepModel.delete();
}
}
Example usage of water.fvec.NFSFileVec in the h2o-3 project by h2oai.
From the class ParseFolderTestBig, method testBIGSVM().
/**
 * Parses a large (~1M row) SVMLight file while printing parse-job progress
 * to stdout. Marked @Ignore because it requires the big-data files to be
 * checked out locally.
 *
 * Fix: the InterruptedException from Thread.sleep was previously swallowed,
 * leaving the interrupt flag cleared and the poll loop spinning through
 * immediate re-interrupts. We now restore the flag and stop polling;
 * job.get() below still waits for the parse to finish.
 */
@Test
@Ignore
public void testBIGSVM() {
String fname = "bigdata/cust_K/1m.svm";
Frame k1 = null;
try {
File f = FileUtils.getFile(fname);
NFSFileVec nfs = NFSFileVec.make(f);
// Non-blocking parse; guessSetup sniffs the file format/header.
Job<Frame> job = ParseDataset.parse(Key.make("BIGSVM.hex"), new Key[] { nfs._key }, true, ParseSetup.guessSetup(new Key[] { nfs._key }, false, ParseSetup.GUESS_HEADER), false)._job;
// Poll the job and print progress as a percentage with one decimal.
while (job.progress() < 1.0) {
System.out.print(((int) (job.progress() * 1000.0)) / 10.0 + "% ");
try {
Thread.sleep(1000);
} catch (InterruptedException ie) {
// Restore the interrupt status and stop polling; the blocking
// job.get() below still completes (or propagates job failure).
Thread.currentThread().interrupt();
break;
}
}
System.out.println();
k1 = job.get();
System.out.println(k1.toString());
} catch (IOException ioe) {
Assert.fail("File not found: " + fname + " - " + ioe.getMessage());
} finally {
// Always delete the parsed frame to free cluster memory.
if (k1 != null)
k1.delete();
}
}
Example usage of water.fvec.NFSFileVec in the h2o-3 project by h2oai.
From the class KMeansDroplet, method main().
/**
 * Minimal K-means driver: parses a CSV into a Frame, initializes k cluster
 * centers from randomly chosen data rows, runs a fixed number of
 * Lloyd-style iterations (assign points, recompute centers as cluster
 * means), printing the error after each pass and the final centers.
 *
 * Fix: the random row index was drawn from frame.vecs().length (the COLUMN
 * count) instead of the row count, so "random row" initialization only ever
 * sampled the first few rows. It now uses frame.numRows().
 */
public static void main(String[] args) throws Exception {
initCloud();
// Load and parse a file. Data is distributed to other nodes in a round-robin way
File f = new File("smalldata/glm_test/gaussian.csv");
NFSFileVec nfs = NFSFileVec.make(f);
Frame frame = water.parser.ParseDataset.parse(Key.make(), nfs._key);
// Optionally create a frame with fewer columns, e.g. skip first
frame.remove(0);
// Create k centers as arrays of doubles
int k = 7;
double[][] centers = new double[k][frame.vecs().length];
// Initialize each cluster center from a randomly chosen data row.
Random rand = new Random();
for (int cluster = 0; cluster < centers.length; cluster++) {
// BUG FIX: sample the row index from the row count, not the column count.
long row = Math.max(0, (long) (rand.nextDouble() * frame.numRows()) - 1);
for (int i = 0; i < frame.vecs().length; i++) {
Vec v = frame.vecs()[i];
centers[cluster][i] = v.at(row);
}
}
// Iterate over the dataset and show error for each step
int NUM_ITERS = 10;
for (int i = 0; i < NUM_ITERS; i++) {
KMeans task = new KMeans();
task._centers = centers;
task.doAll(frame);
// Recompute each center as the mean of its assigned points;
// empty clusters keep their previous center.
for (int c = 0; c < centers.length; c++) {
if (task._size[c] > 0) {
for (int v = 0; v < frame.vecs().length; v++) {
double value = task._sums[c][v] / task._size[c];
centers[c][v] = value;
}
}
}
System.out.println("Error is " + task._error);
}
System.out.println("Cluster Centers:");
DecimalFormat df = new DecimalFormat("#.00");
for (double[] center : centers) {
for (int v = 0; v < frame.vecs().length; v++) System.out.print(df.format(center[v]) + ", ");
System.out.println();
}
System.exit(0);
}
Aggregations