Search in sources:

Example 1 with NFSFileVec

use of water.fvec.NFSFileVec in project h2o-3 by h2oai.

From class GBMMissingTest, method run:

// Robustness test: trains a GBM on the weather dataset after injecting an
// increasing fraction of missing values into the training columns (response
// excluded), scoring each model against a clean validation split.  Logs the
// validation logloss per missing fraction plus the summed error; any failed
// iteration contributes a sentinel error of 100.
@Test
public void run() {
    long seed = 1234;
    GBMModel mymodel = null;
    Frame train = null;
    Frame test = null;
    Frame data = null;
    GBMModel.GBMParameters p;
    Log.info("");
    Log.info("STARTING.");
    Log.info("Using seed " + seed);
    StringBuilder sb = new StringBuilder();
    double sumerr = 0;
    // missing fraction -> validation logloss, kept sorted for the final report
    Map<Double, Double> map = new TreeMap<>();
    for (double missing_fraction : new double[] { 0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99 }) {
        double err = 0;
        try {
            Scope.enter();
            NFSFileVec nfs = TestUtil.makeNfsFileVec("smalldata/junit/weather.csv");
            data = ParseDataset.parse(Key.make("data.hex"), nfs._key);
            Log.info("FrameSplitting");
            // Create holdout test data on clean data (before adding missing values)
            // 75/25 train/test split of the parsed frame.
            FrameSplitter fs = new FrameSplitter(data, new double[] { 0.75f }, generateNumKeys(data._key, 2), null);
            //.join();
            H2O.submitTask(fs);
            Frame[] train_test = fs.getResult();
            train = train_test[0];
            test = train_test[1];
            Log.info("Done...");
            // add missing values to the training data (excluding the response)
            if (missing_fraction > 0) {
                // Shallow wrapper over the training Vecs, so the inserter
                // mutates the shared training data in place.
                Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
                //exclude the response
                frtmp.remove(frtmp.numCols() - 1);
                //need to put the frame (to be modified) into DKV for MissingInserter to pick up
                DKV.put(frtmp._key, frtmp);
                FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
                //MissingInserter is non-blocking, must block here explicitly
                j.execImpl().get();
                //Delete the frame header (not the data)
                DKV.remove(frtmp._key);
            }
            // Build a regularized GBM model with polluted training data, score on clean validation set
            p = new GBMModel.GBMParameters();
            p._train = train._key;
            p._valid = test._key;
            // the last column is the response
            p._response_column = train._names[train.numCols() - 1];
            //only for weather data
            // NOTE(review): hard-coded column indexes 1 and 22 — presumably
            // non-predictive columns of weather.csv; confirm against the data.
            p._ignored_columns = new String[] { train._names[1], train._names[22] };
            p._seed = seed;
            // Convert response to categorical
            int ri = train.numCols() - 1;
            int ci = test.find(p._response_column);
            // Track the replaced Vecs so Scope.exit() cleans them up.
            Scope.track(train.replace(ri, train.vecs()[ri].toCategoricalVec()));
            Scope.track(test.replace(ci, test.vecs()[ci].toCategoricalVec()));
            DKV.put(train);
            DKV.put(test);
            GBM gbm = new GBM(p);
            Log.info("Starting with " + missing_fraction * 100 + "% missing values added.");
            mymodel = gbm.trainModel().get();
            // Extract the scoring on validation set from the model
            err = ((ModelMetricsBinomial) mymodel._output._validation_metrics).logloss();
            // POJO (generated Java) scoring must agree with in-H2O scoring to 1e-15.
            Frame train_preds = mymodel.score(train);
            Assert.assertTrue(mymodel.testJavaScoring(train, train_preds, 1e-15));
            train_preds.remove();
            Log.info("Missing " + missing_fraction * 100 + "% -> logloss: " + err);
        } catch (Throwable t) {
            // Record any failure as a sentinel error value and keep iterating.
            t.printStackTrace();
            err = 100;
        } finally {
            Scope.exit();
            // cleanup
            if (mymodel != null) {
                mymodel.delete();
            }
            if (train != null)
                train.delete();
            if (test != null)
                test.delete();
            if (data != null)
                data.delete();
        }
        map.put(missing_fraction, err);
        sumerr += err;
    }
    // Report: one "fraction --> error" line per run, then the summed error.
    sb.append("missing fraction --> Error\n");
    for (String s : Arrays.toString(map.entrySet().toArray()).split(",")) sb.append(s.replace("=", " --> ")).append("\n");
    sb.append('\n');
    sb.append("Sum Err: ").append(sumerr).append("\n");
    Log.info(sb.toString());
}
Also used : FrameUtils(water.util.FrameUtils) Frame(water.fvec.Frame) NFSFileVec(water.fvec.NFSFileVec) TreeMap(java.util.TreeMap) FrameSplitter(hex.FrameSplitter) Test(org.junit.Test)

Example 2 with NFSFileVec

use of water.fvec.NFSFileVec in project h2o-3 by h2oai.

From class RapidsTest, method testChicago:

// Replays a captured Rapids script from the Chicago crime demo: parses the
// weather/crimes/census datasets, renames columns, derives date features,
// merges the frames, and splits rows via a runif mask — calling
// checkSaneFrame() along the way to validate the session's frame state.
@Test
public void testChicago() {
    // Remember the cluster-global time zone so it can be restored afterwards.
    String oldtz = Rapids.exec("(getTimeZone)").getStr();
    Session ses = new Session();
    try {
        parse_test_file(Key.make("weather.hex"), "smalldata/chicago/chicagoAllWeather.csv");
        parse_test_file(Key.make("crimes.hex"), "smalldata/chicago/chicagoCrimes10k.csv.zip");
        String fname = "smalldata/chicago/chicagoCensus.csv";
        File f = FileUtils.locateFile(fname);
        assert f != null && f.exists() : " file not found: " + fname;
        NFSFileVec nfs = NFSFileVec.make(f);
        // Force column 1 (COMMUNITY.AREA.NAME, per the rename below) to parse
        // as categorical rather than whatever the guesser picked.
        ParseSetup ps = ParseSetup.guessSetup(new Key[] { nfs._key }, false, 1);
        ps.getColumnTypes()[1] = Vec.T_CAT;
        ParseDataset.parse(Key.make("census.hex"), new Key[] { nfs._key }, true, ps);
        // Column renames; the mixed quote styles/whitespace exercise the Rapids tokenizer.
        exec_str("(assign census.hex (colnames= census.hex\t[0 1 2 3 4 5 6 7 8] \n" + "['Community.Area.Number' 'COMMUNITY.AREA.NAME' \"PERCENT.OF.HOUSING.CROWDED\" \r\n" + " \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" " + " \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" " + " \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
        exec_str("(assign crimes.hex (colnames= crimes.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] [\"ID\" \"Case.Number\" \"Date\" \"Block\" \"IUCR\" \"Primary.Type\" \"Description\" \"Location.Description\" \"Arrest\" \"Domestic\" \"Beat\" \"District\" \"Ward\" \"Community.Area\" \"FBI.Code\" \"X.Coordinate\" \"Y.Coordinate\" \"Year\" \"Updated.On\" \"Latitude\" \"Longitude\" \"Location\"]))", ses);
        exec_str("(setTimeZone \"Etc/UTC\")", ses);
        // Derive date features from the Date column (cached as nary_op_5).
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_6 (day (tmp= nary_op_5 (cols crimes.hex [2])))) \"Day\"))", ses);
        checkSaneFrame();
        exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_31 (+ (tmp= unary_op_7 (month nary_op_5)) 1)) \"Month\"))", ses);
        // NOTE(review): nary_op_30 is never created above — presumably a
        // leftover no-op rm from the captured demo session.
        exec_str("(rm nary_op_30)", ses);
        exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_32 (+ (tmp= binary_op_9 (- (tmp= unary_op_8 (year nary_op_5)) 1900)) 1900)) \"Year\"))", ses);
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_10 (week nary_op_5)) \"WeekNum\"))", ses);
        exec_str("(rm binary_op_32)", ses);
        exec_str("(rm binary_op_31)", ses);
        exec_str("(rm unary_op_8)", ses);
        checkSaneFrame();
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_11 (dayOfWeek nary_op_5)) \"WeekDay\"))", ses);
        // Removes the raw-file frame keyed by the absolute Windows path of the
        // machine the demo was originally captured on.
        exec_str("(rm 'nfs:\\\\C:\\\\Users\\\\cliffc\\\\Desktop\\\\h2o-3\\\\smalldata\\\\chicago\\\\chicagoCrimes10k.csv.zip')", ses);
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_12 (hour nary_op_5)) \"HourOfDay\"))", ses);
        exec_str("(assign crimes.hex (append crimes.hex (tmp= nary_op_16 (ifelse (tmp= binary_op_15 (| (tmp= binary_op_13 (== unary_op_11 \"Sun\")) (tmp= binary_op_14 (== unary_op_11 \"Sat\")))) 1 0)) \"Weekend\"))", ses);
        // Season is incorrectly assigned in the original chicago demo; picks up the Weekend flag
        exec_str("(assign crimes.hex (append crimes.hex nary_op_16 \"Season\"))", ses);
        // Standard "head of 10 rows" pattern for printing
        exec_str("(tmp= subset_33 (rows crimes.hex [0:10]))", ses);
        exec_str("(rm subset_33)", ses);
        // NOTE(review): duplicated rm of subset_33 — presumably a harmless
        // leftover from the captured session.
        exec_str("(rm subset_33)", ses);
        exec_str("(rm unary_op_29)", ses);
        exec_str("(rm nary_op_28)", ses);
        exec_str("(rm nary_op_27)", ses);
        exec_str("(rm nary_op_26)", ses);
        exec_str("(rm binary_op_25)", ses);
        exec_str("(rm binary_op_24)", ses);
        exec_str("(rm binary_op_23)", ses);
        exec_str("(rm binary_op_22)", ses);
        exec_str("(rm binary_op_21)", ses);
        exec_str("(rm binary_op_20)", ses);
        exec_str("(rm binary_op_19)", ses);
        exec_str("(rm binary_op_18)", ses);
        exec_str("(rm binary_op_17)", ses);
        exec_str("(rm nary_op_16)", ses);
        exec_str("(rm binary_op_15)", ses);
        exec_str("(rm binary_op_14)", ses);
        exec_str("(rm binary_op_13)", ses);
        exec_str("(rm unary_op_12)", ses);
        exec_str("(rm unary_op_11)", ses);
        exec_str("(rm unary_op_10)", ses);
        exec_str("(rm binary_op_9)", ses);
        exec_str("(rm unary_op_8)", ses);
        exec_str("(rm unary_op_7)", ses);
        exec_str("(rm unary_op_6)", ses);
        exec_str("(rm nary_op_5)", ses);
        checkSaneFrame();
        // Standard "head of 10 rows" pattern for printing
        exec_str("(tmp= subset_34 (rows crimes.hex [0:10]))", ses);
        exec_str("(rm subset_34)", ses);
        exec_str("(assign census.hex (colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
        // NOTE(review): subset_34 was already removed above — duplicate rm.
        exec_str("(rm subset_34)", ses);
        // Drop the Date column from crimes; drop the date column from weather,
        // then rename the remaining weather columns for the merge.
        exec_str("(tmp= subset_35 (cols  crimes.hex [-3]))", ses);
        exec_str("(tmp= subset_36 (cols weather.hex [-1]))", ses);
        exec_str("(tmp= subset_36_2 (colnames= subset_36 [0 1 2 3 4 5] [\"Month\" \"Day\" \"Year\" \"maxTemp\" \"meanTemp\" \"minTemp\"]))", ses);
        exec_str("(rm crimes.hex)", ses);
        exec_str("(rm weather.hex)", ses);
        // nary_op_37 = merge( X Y ); Vecs in X & nary_op_37 shared
        exec_str("(tmp= nary_op_37 (merge subset_35 census.hex TRUE FALSE [] [] \"auto\"))", ses);
        // nary_op_38 = merge( nary_op_37 subset_36_2); Vecs in nary_op_38 and nary_pop_37 and X shared
        // Train split: rows where the seeded runif mask is <= 0.8.
        exec_str("(tmp= subset_41 (rows (tmp= nary_op_38 (merge nary_op_37 subset_36_2 TRUE FALSE [] [] \"auto\")) (tmp= binary_op_40 (<= (tmp= nary_op_39 (h2o.runif nary_op_38 30792152736.5179)) 0.8))))", ses);
        // Standard "head of 10 rows" pattern for printing
        exec_str("(tmp= subset_44 (rows subset_41 [0:10]))", ses);
        exec_str("(rm subset_44)", ses);
        // NOTE(review): duplicated rm of subset_44 — see subset_33 above.
        exec_str("(rm subset_44)", ses);
        exec_str("(rm binary_op_40)", ses);
        exec_str("(rm nary_op_37)", ses);
        // Test split: complement of the 0.8 mask.
        exec_str("(tmp= subset_43 (rows nary_op_38 (tmp= binary_op_42 (> nary_op_39 0.8))))", ses);
        // Chicago demo continues on past, but this is all I've captured for now
        checkSaneFrame();
        ses.end(null);
    } catch (Throwable ex) {
        throw ses.endQuietly(ex);
    } finally {
        // Restore time zone (which is global, and will affect following tests)
        Rapids.exec("(setTimeZone \"" + oldtz + "\")");
        // Belt-and-braces cleanup: remove every key the script may have created.
        for (String s : new String[] { "weather.hex", "crimes.hex", "census.hex", "nary_op_5", "unary_op_6", "unary_op_7", "unary_op_8", "binary_op_9", "unary_op_10", "unary_op_11", "unary_op_12", "binary_op_13", "binary_op_14", "binary_op_15", "nary_op_16", "binary_op_17", "binary_op_18", "binary_op_19", "binary_op_20", "binary_op_21", "binary_op_22", "binary_op_23", "binary_op_24", "binary_op_25", "nary_op_26", "nary_op_27", "nary_op_28", "unary_op_29", "binary_op_30", "binary_op_31", "binary_op_32", "subset_33", "subset_34", "subset_35", "subset_36", "subset_36_2", "nary_op_37", "nary_op_38", "nary_op_39", "binary_op_40", "subset_41", "binary_op_42", "subset_43", "subset_44" }) Keyed.remove(Key.make(s));
    }
}
Also used : ParseSetup(water.parser.ParseSetup) NFSFileVec(water.fvec.NFSFileVec) File(java.io.File) Test(org.junit.Test)

Example 3 with NFSFileVec

use of water.fvec.NFSFileVec in project h2o-3 by h2oai.

From class GrepTest, method testIris:

// Runs the Grep model over the iris dataset with an obfuscated regex and
// verifies it finds exactly the 50 "versicolor" rows.
@Test
public void testIris() {
    GrepModel model = null;
    Frame frame = null;
    try {
        //TODO: fix with original regex
        //String regex = "Iris-versicolor";
        // Deliberately obfuscated pattern; still matches only "versicolor".
        String pattern = "ver..c\\wl[ob]r";
        NFSFileVec nfs = TestUtil.makeNfsFileVec("smalldata/iris/iris_wheader.csv");
        // Wrap the raw file Vec in a single-column text frame and publish it.
        frame = new Frame(Key.<Frame>make(), new String[] { "text" }, new Vec[] { nfs });
        DKV.put(frame);
        GrepModel.GrepParameters params = new GrepModel.GrepParameters();
        params._train = frame._key;
        params._regex = pattern;
        Job<GrepModel> job = new Grep(params).trainModel();
        model = job.get();
        String[] hits = model._output._matches;
        assertEquals("Number of matches", 50, hits.length);
        int i = 0;
        for (String hit : hits) {
            assertEquals("Wrong @" + i, "versicolor", hit);
            i++;
        }
        job.remove();
    } finally {
        // Free the frame and model regardless of test outcome.
        if (frame != null)
            frame.remove();
        if (model != null)
            model.delete();
    }
}
Also used : Frame(water.fvec.Frame) NFSFileVec(water.fvec.NFSFileVec)

Example 4 with NFSFileVec

use of water.fvec.NFSFileVec in project h2o-3 by h2oai.

From class ParseFolderTestBig, method testBIGSVM:

// Parses a large (~1M row) SVMLight file, polling and printing parse progress
// until completion; @Ignore'd by default since it needs the local bigdata
// checkout.  Fix: the InterruptedException handler previously swallowed the
// interrupt; it now restores the thread's interrupt status (standard Java
// practice) so callers can still observe the interruption.
@Test
@Ignore
public void testBIGSVM() {
    String fname = "bigdata/cust_K/1m.svm";
    Frame k1 = null;
    try {
        File f = FileUtils.getFile(fname);
        NFSFileVec nfs = NFSFileVec.make(f);
        Job<Frame> job = ParseDataset.parse(Key.make("BIGSVM.hex"), new Key[] { nfs._key }, true, ParseSetup.guessSetup(new Key[] { nfs._key }, false, ParseSetup.GUESS_HEADER), false)._job;
        // Poll once a second, printing progress as a percentage with one decimal.
        while (job.progress() < 1.0) {
            System.out.print(((int) (job.progress() * 1000.0)) / 10.0 + "% ");
            try {
                Thread.sleep(1000);
            } catch (InterruptedException ignored) {
                // Restore the interrupt flag rather than swallowing it; keep
                // polling so the parse result is still collected below.
                Thread.currentThread().interrupt();
            }
        }
        System.out.println();
        k1 = job.get();
        System.out.println(k1.toString());
    } catch (IOException ioe) {
        Assert.fail("File not found: " + fname + " - " + ioe.getMessage());
    } finally {
        // Delete the parsed frame to free the distributed store.
        if (k1 != null)
            k1.delete();
    }
}
Also used : Frame(water.fvec.Frame) NFSFileVec(water.fvec.NFSFileVec) IOException(java.io.IOException) File(java.io.File) Key(water.Key)

Example 5 with NFSFileVec

use of water.fvec.NFSFileVec in project h2o-3 by h2oai.

From class KMeansDroplet, method main:

// Minimal K-means droplet: parses a CSV, seeds k centers from random data
// rows, then runs a fixed number of Lloyd iterations, printing the error per
// iteration and the final centers.
public static void main(String[] args) throws Exception {
    initCloud();
    // Load and parse a file. Data is distributed to other nodes in a round-robin way
    File f = new File("smalldata/glm_test/gaussian.csv");
    NFSFileVec nfs = NFSFileVec.make(f);
    Frame frame = water.parser.ParseDataset.parse(Key.make(), nfs._key);
    // Optionally create a frame with fewer columns, e.g. skip first
    frame.remove(0);
    // Create k centers as arrays of doubles
    int k = 7;
    int dim = frame.vecs().length;
    double[][] centers = new double[k][dim];
    // Initialize each cluster center from a randomly chosen data ROW.
    // BUG FIX: the row index was previously scaled by frame.vecs().length
    // (the number of COLUMNS), so only rows 0..numCols-1 could ever be
    // picked as seeds; scale by the row count instead.  nextDouble() is in
    // [0, 1), so the index is always a valid row.
    Random rand = new Random();
    for (int cluster = 0; cluster < centers.length; cluster++) {
        long row = (long) (rand.nextDouble() * frame.numRows());
        for (int i = 0; i < dim; i++) {
            Vec v = frame.vecs()[i];
            centers[cluster][i] = v.at(row);
        }
    }
    // Iterate over the dataset and show error for each step
    int NUM_ITERS = 10;
    for (int i = 0; i < NUM_ITERS; i++) {
        KMeans task = new KMeans();
        task._centers = centers;
        task.doAll(frame);
        // Recompute each center as the mean of its assigned points; empty
        // clusters keep their previous center.
        for (int c = 0; c < centers.length; c++) {
            if (task._size[c] > 0) {
                for (int v = 0; v < dim; v++) {
                    double value = task._sums[c][v] / task._size[c];
                    centers[c][v] = value;
                }
            }
        }
        System.out.println("Error is " + task._error);
    }
    System.out.println("Cluster Centers:");
    DecimalFormat df = new DecimalFormat("#.00");
    for (double[] center : centers) {
        for (int v = 0; v < dim; v++) System.out.print(df.format(center[v]) + ", ");
        System.out.println("");
    }
    System.exit(0);
}
Also used : Frame(water.fvec.Frame) Random(java.util.Random) Vec(water.fvec.Vec) NFSFileVec(water.fvec.NFSFileVec) NFSFileVec(water.fvec.NFSFileVec) DecimalFormat(java.text.DecimalFormat) File(java.io.File)

Aggregations

NFSFileVec (water.fvec.NFSFileVec)27 Frame (water.fvec.Frame)21 Test (org.junit.Test)13 File (java.io.File)12 DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters)9 Vec (water.fvec.Vec)8 ShuffleSplitFrame (hex.splitframe.ShuffleSplitFrame)5 Key (water.Key)5 ModelMetricsMultinomial (hex.ModelMetricsMultinomial)3 FrameSplitter (hex.FrameSplitter)2 ModelMetricsBinomial (hex.ModelMetricsBinomial)2 Random (java.util.Random)2 ExecutionException (java.util.concurrent.ExecutionException)2 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)2 Ignore (org.junit.Ignore)2 FrameUtils (water.util.FrameUtils)2 TwoDimTable (water.util.TwoDimTable)2 ConfusionMatrix (hex.ConfusionMatrix)1 SplitFrame (hex.SplitFrame)1 ClassSamplingMethod (hex.deeplearning.DeepLearningModel.DeepLearningParameters.ClassSamplingMethod)1