Search in sources :

Example 1 with ParseSetup

use of water.parser.ParseSetup in project h2o-3 by h2oai.

the class ParseSetupV3 method fillImpl.

/**
 * Fills the given {@link ParseSetup} from this schema and resolves the textual
 * {@code parse_type} field into a {@link ParserInfo}.
 *
 * <p>The field {@code "parse_type"} is passed to the two-argument
 * {@code fillImpl} overload (presumably as a field to skip during the generic
 * copy - confirm against that overload) and is then set explicitly here.
 *
 * @param impl the parse setup to fill
 * @return the object returned by the two-argument {@code fillImpl}, with its parse type set
 * @throws H2OIllegalValueException if {@code parse_type} is non-null but no parser
 *         provider is registered under that name
 */
@Override
public ParseSetup fillImpl(ParseSetup impl) {
    final ParseSetup filled = fillImpl(impl, new String[] { "parse_type" });

    // No explicit parser requested: fall back to the guessing parser.
    if (this.parse_type == null) {
        filled.setParseType(GUESS_INFO);
        return filled;
    }

    // Look the requested parser up by name; an unknown name is a caller error.
    final ParserProvider provider = ParserService.INSTANCE.getByName(this.parse_type);
    if (provider == null) {
        throw new H2OIllegalValueException("Cannot find right parser for specified parser type!", this.parse_type);
    }

    filled.setParseType(provider.info());
    return filled;
}
Also used : ParseSetup(water.parser.ParseSetup) ParserInfo(water.parser.ParserInfo) ParserProvider(water.parser.ParserProvider) H2OIllegalValueException(water.exceptions.H2OIllegalValueException)

Example 2 with ParseSetup

use of water.parser.ParseSetup in project h2o-3 by h2oai.

the class RapidsTest method testChicago.

/**
 * Replays a captured sequence of Rapids expressions from the Chicago crimes demo
 * (weather, crimes and census data) inside a single {@link Session}, calling
 * {@code checkSaneFrame()} at several points along the way.
 *
 * <p>The script is a verbatim capture: it contains repeated {@code rm} calls on the
 * same key and {@code rm} calls on keys that this method never creates. Those are
 * kept exactly as captured; do not "clean them up" without checking what the test
 * is meant to exercise.
 *
 * <p>The global time zone is saved up front and restored in {@code finally}, and
 * all keys the script may have created are removed there as well.
 */
@Test
public void testChicago() {
    // Remember the current (global) time zone so it can be restored in the finally block.
    String oldtz = Rapids.exec("(getTimeZone)").getStr();
    Session ses = new Session();
    try {
        // Load the three input datasets: weather and crimes via the helper, census by hand
        // so that the type of column 1 can be overridden before parsing.
        parse_test_file(Key.make("weather.hex"), "smalldata/chicago/chicagoAllWeather.csv");
        parse_test_file(Key.make("crimes.hex"), "smalldata/chicago/chicagoCrimes10k.csv.zip");
        String fname = "smalldata/chicago/chicagoCensus.csv";
        File f = FileUtils.locateFile(fname);
        assert f != null && f.exists() : " file not found: " + fname;
        NFSFileVec nfs = NFSFileVec.make(f);
        ParseSetup ps = ParseSetup.guessSetup(new Key[] { nfs._key }, false, 1);
        // Force census column 1 to be parsed as categorical.
        ps.getColumnTypes()[1] = Vec.T_CAT;
        ParseDataset.parse(Key.make("census.hex"), new Key[] { nfs._key }, true, ps);
        // Rename the census columns. The expression mixes tab, \n and \r\n whitespace and
        // both single- and double-quoted names (presumably on purpose, to exercise the
        // Rapids parser - confirm before normalizing).
        exec_str("(assign census.hex (colnames= census.hex\t[0 1 2 3 4 5 6 7 8] \n" + "['Community.Area.Number' 'COMMUNITY.AREA.NAME' \"PERCENT.OF.HOUSING.CROWDED\" \r\n" + " \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" " + " \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" " + " \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
        // Rename all 22 crimes columns.
        exec_str("(assign crimes.hex (colnames= crimes.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] [\"ID\" \"Case.Number\" \"Date\" \"Block\" \"IUCR\" \"Primary.Type\" \"Description\" \"Location.Description\" \"Arrest\" \"Domestic\" \"Beat\" \"District\" \"Ward\" \"Community.Area\" \"FBI.Code\" \"X.Coordinate\" \"Y.Coordinate\" \"Year\" \"Updated.On\" \"Latitude\" \"Longitude\" \"Location\"]))", ses);
        // Switch the time zone to UTC for the date-derived columns below.
        exec_str("(setTimeZone \"Etc/UTC\")", ses);
        // nary_op_5 holds column 2 ("Date") of crimes.hex; the following appends derive
        // Day, Month, Year, WeekNum, WeekDay and HourOfDay columns from it.
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_6 (day (tmp= nary_op_5 (cols crimes.hex [2])))) \"Day\"))", ses);
        checkSaneFrame();
        exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_31 (+ (tmp= unary_op_7 (month nary_op_5)) 1)) \"Month\"))", ses);
        // NOTE(review): nary_op_30 is never created in this method; the cleanup list in the
        // finally block names binary_op_30 instead. Kept as captured.
        exec_str("(rm nary_op_30)", ses);
        exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_32 (+ (tmp= binary_op_9 (- (tmp= unary_op_8 (year nary_op_5)) 1900)) 1900)) \"Year\"))", ses);
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_10 (week nary_op_5)) \"WeekNum\"))", ses);
        exec_str("(rm binary_op_32)", ses);
        exec_str("(rm binary_op_31)", ses);
        exec_str("(rm unary_op_8)", ses);
        checkSaneFrame();
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_11 (dayOfWeek nary_op_5)) \"WeekDay\"))", ses);
        // Hard-coded Windows path from the machine the script was captured on.
        exec_str("(rm 'nfs:\\\\C:\\\\Users\\\\cliffc\\\\Desktop\\\\h2o-3\\\\smalldata\\\\chicago\\\\chicagoCrimes10k.csv.zip')", ses);
        exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_12 (hour nary_op_5)) \"HourOfDay\"))", ses);
        // Weekend flag: 1 when WeekDay is "Sun" or "Sat", else 0.
        exec_str("(assign crimes.hex (append crimes.hex (tmp= nary_op_16 (ifelse (tmp= binary_op_15 (| (tmp= binary_op_13 (== unary_op_11 \"Sun\")) (tmp= binary_op_14 (== unary_op_11 \"Sat\")))) 1 0)) \"Weekend\"))", ses);
        // Season is incorrectly assigned in the original chicago demo; picks up the Weekend flag
        exec_str("(assign crimes.hex (append crimes.hex nary_op_16 \"Season\"))", ses);
        // Standard "head of 10 rows" pattern for printing
        exec_str("(tmp= subset_33 (rows crimes.hex [0:10]))", ses);
        // subset_33 is removed twice in a row, as captured.
        exec_str("(rm subset_33)", ses);
        exec_str("(rm subset_33)", ses);
        // Remove the temporaries in descending number order. Ids 17-29 are not created
        // anywhere in this method, and unary_op_8 was already removed above.
        exec_str("(rm unary_op_29)", ses);
        exec_str("(rm nary_op_28)", ses);
        exec_str("(rm nary_op_27)", ses);
        exec_str("(rm nary_op_26)", ses);
        exec_str("(rm binary_op_25)", ses);
        exec_str("(rm binary_op_24)", ses);
        exec_str("(rm binary_op_23)", ses);
        exec_str("(rm binary_op_22)", ses);
        exec_str("(rm binary_op_21)", ses);
        exec_str("(rm binary_op_20)", ses);
        exec_str("(rm binary_op_19)", ses);
        exec_str("(rm binary_op_18)", ses);
        exec_str("(rm binary_op_17)", ses);
        exec_str("(rm nary_op_16)", ses);
        exec_str("(rm binary_op_15)", ses);
        exec_str("(rm binary_op_14)", ses);
        exec_str("(rm binary_op_13)", ses);
        exec_str("(rm unary_op_12)", ses);
        exec_str("(rm unary_op_11)", ses);
        exec_str("(rm unary_op_10)", ses);
        exec_str("(rm binary_op_9)", ses);
        exec_str("(rm unary_op_8)", ses);
        exec_str("(rm unary_op_7)", ses);
        exec_str("(rm unary_op_6)", ses);
        exec_str("(rm nary_op_5)", ses);
        checkSaneFrame();
        // Standard "head of 10 rows" pattern for printing
        exec_str("(tmp= subset_34 (rows crimes.hex [0:10]))", ses);
        exec_str("(rm subset_34)", ses);
        // Rename the census columns again; column 0 becomes "Community.Area", the same
        // name as a crimes column (presumably the key for the merge below - confirm).
        exec_str("(assign census.hex (colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
        // Second rm of subset_34, as captured.
        exec_str("(rm subset_34)", ses);
        // Column subsets of crimes and weather, then rename the six weather columns.
        exec_str("(tmp= subset_35 (cols  crimes.hex [-3]))", ses);
        exec_str("(tmp= subset_36 (cols weather.hex [-1]))", ses);
        exec_str("(tmp= subset_36_2 (colnames= subset_36 [0 1 2 3 4 5] [\"Month\" \"Day\" \"Year\" \"maxTemp\" \"meanTemp\" \"minTemp\"]))", ses);
        // The source frames are removed while subsets derived from them are still in use.
        exec_str("(rm crimes.hex)", ses);
        exec_str("(rm weather.hex)", ses);
        // nary_op_37 = merge( X Y ); Vecs in X & nary_op_37 shared
        exec_str("(tmp= nary_op_37 (merge subset_35 census.hex TRUE FALSE [] [] \"auto\"))", ses);
        // nary_op_38 = merge( nary_op_37 subset_36_2); Vecs in nary_op_38 and nary_pop_37 and X shared
        // subset_41 keeps the rows of nary_op_38 whose h2o.runif value (nary_op_39) is <= 0.8.
        exec_str("(tmp= subset_41 (rows (tmp= nary_op_38 (merge nary_op_37 subset_36_2 TRUE FALSE [] [] \"auto\")) (tmp= binary_op_40 (<= (tmp= nary_op_39 (h2o.runif nary_op_38 30792152736.5179)) 0.8))))", ses);
        // Standard "head of 10 rows" pattern for printing
        exec_str("(tmp= subset_44 (rows subset_41 [0:10]))", ses);
        // subset_44 is removed twice in a row, as captured.
        exec_str("(rm subset_44)", ses);
        exec_str("(rm subset_44)", ses);
        exec_str("(rm binary_op_40)", ses);
        exec_str("(rm nary_op_37)", ses);
        // subset_43 keeps the complementary rows of nary_op_38 (nary_op_39 > 0.8).
        exec_str("(tmp= subset_43 (rows nary_op_38 (tmp= binary_op_42 (> nary_op_39 0.8))))", ses);
        // Chicago demo continues on past, but this is all I've captured for now
        checkSaneFrame();
        ses.end(null);
    } catch (Throwable ex) {
        // End the session on failure and rethrow whatever endQuietly returns.
        throw ses.endQuietly(ex);
    } finally {
        // Restore time zone (which is global, and will affect following tests)
        Rapids.exec("(setTimeZone \"" + oldtz + "\")");
        // Remove every key the script may have created, whether or not it still exists.
        // NOTE(review): the list has "binary_op_30" while the script removes "nary_op_30".
        for (String s : new String[] { "weather.hex", "crimes.hex", "census.hex", "nary_op_5", "unary_op_6", "unary_op_7", "unary_op_8", "binary_op_9", "unary_op_10", "unary_op_11", "unary_op_12", "binary_op_13", "binary_op_14", "binary_op_15", "nary_op_16", "binary_op_17", "binary_op_18", "binary_op_19", "binary_op_20", "binary_op_21", "binary_op_22", "binary_op_23", "binary_op_24", "binary_op_25", "nary_op_26", "nary_op_27", "nary_op_28", "unary_op_29", "binary_op_30", "binary_op_31", "binary_op_32", "subset_33", "subset_34", "subset_35", "subset_36", "subset_36_2", "nary_op_37", "nary_op_38", "nary_op_39", "binary_op_40", "subset_41", "binary_op_42", "subset_43", "subset_44" }) Keyed.remove(Key.make(s));
    }
}
Also used : ParseSetup(water.parser.ParseSetup) NFSFileVec(water.fvec.NFSFileVec) File(java.io.File) Test(org.junit.Test)

Example 3 with ParseSetup

use of water.parser.ParseSetup in project h2o-3 by h2oai.

the class ParseHandler method parseSVMLight.

/**
 * Starts an SVMLight parse of the given source frames and returns the job as a schema.
 *
 * <p>When the request carries no destination frame (or its key is {@code null}),
 * a destination key is derived from the first source frame via
 * {@code ParseSetup.createHexName}.
 *
 * @param version API version; not used by this method
 * @param parse   request schema carrying {@code source_frames} and an optional
 *                {@code destination_frame}
 * @return a {@link JobV3} filled from the forked parse job
 */
// called through reflection by RequestServer
@SuppressWarnings("unused")
public JobV3 parseSVMLight(int version, ParseSVMLightV3 parse) {
    // Collect the key of every source frame, in request order.
    Key[] sourceKeys = new Key[parse.source_frames.length];
    int slot = 0;
    while (slot < sourceKeys.length) {
        sourceKeys[slot] = parse.source_frames[slot].key();
        ++slot;
    }

    // Use the requested destination if there is one, else derive a name from the first source.
    Key<Frame> target = null;
    if (parse.destination_frame != null) {
        target = parse.destination_frame.key();
    }
    if (target == null) {
        String firstSource = parse.source_frames[0].toString();
        target = Key.make(ParseSetup.createHexName(firstSource));
    }

    ParseSetup svmSetup = ParseSetup.guessSetup(sourceKeys, ParseSetup.makeSVMLightSetup());
    return new JobV3().fillFromImpl(ParseDataset.forkParseSVMLight(target, sourceKeys, svmSetup));
}
Also used : Frame(water.fvec.Frame) ParseSetup(water.parser.ParseSetup) JobV3(water.api.schemas3.JobV3) Key(water.Key)

Example 4 with ParseSetup

use of water.parser.ParseSetup in project h2o-3 by h2oai.

the class TestUtil method parse_test_file.

/**
 * Parses a single test file, optionally registering one NA string for every column
 * and overriding the guessed column types.
 *
 * @param fname        name of the file to parse
 * @param na_string    string to treat as NA in every column; {@code null} to leave
 *                     the guessed setup's NA strings untouched
 * @param check_header header-check value passed to the {@link ParseSetup} used for guessing
 * @param column_types column types to set on the setup; {@code null} to keep the guessed types
 * @return the frame returned by {@code ParseDataset.parse}
 */
protected Frame parse_test_file(String fname, String na_string, int check_header, byte[] column_types) {
    NFSFileVec nfs = makeNfsFileVec(fname);
    Key[] res = { nfs._key };
    // create new parseSetup in order to store our na_string
    ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO, (byte) ',', true, check_header, 0, null, null, null, null, null));
    // add the na_strings into p.
    if (na_string != null) {
        int column_number = p.getColumnTypes().length;
        // One single-element array per column. The previous code sized each row by
        // na_string.length() and wrote at index length-1, which left null padding and
        // threw ArrayIndexOutOfBoundsException for an empty na_string.
        String[][] na_strings = new String[column_number][];
        for (int index = 0; index < column_number; index++) {
            na_strings[index] = new String[] { na_string };
        }
        p.setNAStrings(na_strings);
    }
    if (column_types != null) {
        p.setColumnTypes(column_types);
    }
    return ParseDataset.parse(Key.make(), res, true, p);
}
Also used : ParseSetup(water.parser.ParseSetup) BufferedString(water.parser.BufferedString)

Example 5 with ParseSetup

use of water.parser.ParseSetup in project h2o-3 by h2oai.

the class TestUtil method parse_test_folder.

/**
 * Parse a folder with csv files when a single na_string is specified.
 *
 * <p>Only regular files directly inside the folder are parsed; they are taken in
 * sorted order.
 *
 * @param fname        name of folder
 * @param na_string    string for NA in a column; applied to every column, or
 *                     {@code null} to leave the guessed setup's NA strings untouched
 * @param check_header header-check value passed to the {@link ParseSetup} used for guessing
 * @param column_types column types to set on the setup; {@code null} to keep the guessed types
 * @return the frame returned by {@code ParseDataset.parse} for all files in the folder
 */
protected static Frame parse_test_folder(String fname, String na_string, int check_header, byte[] column_types) {
    File folder = FileUtils.locateFile(fname);
    File[] files = contentsOf(fname, folder);
    // Sort so the key order does not depend on directory listing order.
    Arrays.sort(files);
    ArrayList<Key> keys = new ArrayList<>();
    for (File f : files) {
        if (f.isFile()) {
            keys.add(NFSFileVec.make(f)._key);
        }
    }
    Key[] res = new Key[keys.size()];
    // generated the necessary key here
    keys.toArray(res);
    // create new parseSetup in order to store our na_string
    ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO, (byte) ',', true, check_header, 0, null, null, null, null, null));
    // add the na_strings into p.
    if (na_string != null) {
        int column_number = p.getColumnTypes().length;
        // One single-element array per column. The previous code sized each row by
        // na_string.length() and wrote at index length-1, which left null padding and
        // threw ArrayIndexOutOfBoundsException for an empty na_string.
        String[][] na_strings = new String[column_number][];
        for (int index = 0; index < column_number; index++) {
            na_strings[index] = new String[] { na_string };
        }
        p.setNAStrings(na_strings);
    }
    if (column_types != null) {
        p.setColumnTypes(column_types);
    }
    return ParseDataset.parse(Key.make(), res, true, p);
}
Also used : ParseSetup(water.parser.ParseSetup) ArrayList(java.util.ArrayList) BufferedString(water.parser.BufferedString)

Aggregations

ParseSetup (water.parser.ParseSetup): 7; Key (water.Key): 3; ArrayList (java.util.ArrayList): 2; JobV3 (water.api.schemas3.JobV3): 2; H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException): 2; Frame (water.fvec.Frame): 2; BufferedString (water.parser.BufferedString): 2; ParserInfo (water.parser.ParserInfo): 2; File (java.io.File): 1; Matcher (java.util.regex.Matcher): 1; Pattern (java.util.regex.Pattern): 1; Test (org.junit.Test): 1; H2OIllegalValueException (water.exceptions.H2OIllegalValueException): 1; NFSFileVec (water.fvec.NFSFileVec): 1; ParseDataset (water.parser.ParseDataset): 1; ParseWriter (water.parser.ParseWriter): 1; ParserProvider (water.parser.ParserProvider): 1; DistributedException (water.util.DistributedException): 1