use of water.parser.ParseSetup in project h2o-3 by h2oai.
the class ParseSetupV3 method fillImpl.
/**
 * Fills the given {@link ParseSetup} from this schema and then resolves the
 * textual {@code parse_type} field into a {@code ParserInfo}.
 *
 * <p>The two-argument {@code fillImpl} is called first with {@code "parse_type"}
 * listed by name (presumably so that field is left out of the generic copy -
 * confirm against the overload). The parse type is then set explicitly here.
 *
 * @param impl the setup object to populate
 * @return the object returned by the two-argument {@code fillImpl}, with its parse type set
 * @throws H2OIllegalValueException if {@code parse_type} is non-null and no parser
 *         provider is registered under that name
 */
@Override
public ParseSetup fillImpl(ParseSetup impl) {
  final ParseSetup filled = fillImpl(impl, new String[] { "parse_type" });

  // Map the parse_type string onto a ParserInfo; no value means "guess".
  final ParserInfo resolved;
  if (this.parse_type == null) {
    resolved = GUESS_INFO;
  } else {
    final ParserProvider provider = ParserService.INSTANCE.getByName(this.parse_type);
    if (provider == null) {
      throw new H2OIllegalValueException("Cannot find right parser for specified parser type!", this.parse_type);
    }
    resolved = provider.info();
  }

  filled.setParseType(resolved);
  return filled;
}
use of water.parser.ParseSetup in project h2o-3 by h2oai.
the class RapidsTest method testChicago.
/**
 * Replays a captured sequence of Rapids expressions over the Chicago weather,
 * crimes and census sample files: the files are parsed, columns are renamed,
 * date-derived columns are appended to crimes.hex, temporaries are removed,
 * the frames are merged and the merged result is split on a random column.
 *
 * <p>The statement order matters: later expressions refer to temporaries
 * (for example nary_op_5) assigned by earlier ones. The time zone is changed
 * during the test and restored in the finally block, where every key named in
 * the cleanup list is also removed.
 */
@Test
public void testChicago() {
// Remember the current time zone so the finally block can restore it.
String oldtz = Rapids.exec("(getTimeZone)").getStr();
Session ses = new Session();
try {
// Load the weather and crimes data under fixed key names used by the expressions below.
parse_test_file(Key.make("weather.hex"), "smalldata/chicago/chicagoAllWeather.csv");
parse_test_file(Key.make("crimes.hex"), "smalldata/chicago/chicagoCrimes10k.csv.zip");
// The census file is parsed by hand so that column 1 can be forced to Vec.T_CAT before parsing.
String fname = "smalldata/chicago/chicagoCensus.csv";
File f = FileUtils.locateFile(fname);
assert f != null && f.exists() : " file not found: " + fname;
NFSFileVec nfs = NFSFileVec.make(f);
ParseSetup ps = ParseSetup.guessSetup(new Key[] { nfs._key }, false, 1);
ps.getColumnTypes()[1] = Vec.T_CAT;
ParseDataset.parse(Key.make("census.hex"), new Key[] { nfs._key }, true, ps);
// Rename the census columns. The expression text contains tab, newline and CRLF whitespace
// and mixes single- and double-quoted names.
exec_str("(assign census.hex (colnames= census.hex\t[0 1 2 3 4 5 6 7 8] \n" + "['Community.Area.Number' 'COMMUNITY.AREA.NAME' \"PERCENT.OF.HOUSING.CROWDED\" \r\n" + " \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" " + " \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" " + " \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
// Rename all 22 crimes columns.
exec_str("(assign crimes.hex (colnames= crimes.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] [\"ID\" \"Case.Number\" \"Date\" \"Block\" \"IUCR\" \"Primary.Type\" \"Description\" \"Location.Description\" \"Arrest\" \"Domestic\" \"Beat\" \"District\" \"Ward\" \"Community.Area\" \"FBI.Code\" \"X.Coordinate\" \"Y.Coordinate\" \"Year\" \"Updated.On\" \"Latitude\" \"Longitude\" \"Location\"]))", ses);
// Switch to UTC before the date-part expressions run; the original zone is restored in finally.
exec_str("(setTimeZone \"Etc/UTC\")", ses);
// Append "Day", computed from column 2 of crimes.hex (named "Date" above). That column is kept
// as the temporary nary_op_5 and reused by the month/year/week/dayOfWeek/hour expressions below.
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_6 (day (tmp= nary_op_5 (cols crimes.hex [2])))) \"Day\"))", ses);
checkSaneFrame();
exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_31 (+ (tmp= unary_op_7 (month nary_op_5)) 1)) \"Month\"))", ses);
// NOTE(review): nary_op_30 is never assigned in this test, and the cleanup list in the finally
// block names binary_op_30 instead - confirm which name is intended.
exec_str("(rm nary_op_30)", ses);
exec_str("(assign crimes.hex (append crimes.hex (tmp= binary_op_32 (+ (tmp= binary_op_9 (- (tmp= unary_op_8 (year nary_op_5)) 1900)) 1900)) \"Year\"))", ses);
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_10 (week nary_op_5)) \"WeekNum\"))", ses);
exec_str("(rm binary_op_32)", ses);
exec_str("(rm binary_op_31)", ses);
exec_str("(rm unary_op_8)", ses);
checkSaneFrame();
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_11 (dayOfWeek nary_op_5)) \"WeekDay\"))", ses);
// Removes a key named after an absolute Windows path; presumably left over from the machine
// the session was captured on - TODO confirm this is expected to be a no-op elsewhere.
exec_str("(rm 'nfs:\\\\C:\\\\Users\\\\cliffc\\\\Desktop\\\\h2o-3\\\\smalldata\\\\chicago\\\\chicagoCrimes10k.csv.zip')", ses);
exec_str("(assign crimes.hex (append crimes.hex (tmp= unary_op_12 (hour nary_op_5)) \"HourOfDay\"))", ses);
// "Weekend" is 1 when WeekDay (unary_op_11) equals "Sun" or "Sat", otherwise 0.
exec_str("(assign crimes.hex (append crimes.hex (tmp= nary_op_16 (ifelse (tmp= binary_op_15 (| (tmp= binary_op_13 (== unary_op_11 \"Sun\")) (tmp= binary_op_14 (== unary_op_11 \"Sat\")))) 1 0)) \"Weekend\"))", ses);
// Season is incorrectly assigned in the original chicago demo; picks up the Weekend flag
exec_str("(assign crimes.hex (append crimes.hex nary_op_16 \"Season\"))", ses);
// Standard "head of 10 rows" pattern for printing
exec_str("(tmp= subset_33 (rows crimes.hex [0:10]))", ses);
// Remove temporaries. Several names below (binary_op_17 through unary_op_29) are never assigned
// in this test, and some (subset_33, unary_op_8) are removed more than once.
exec_str("(rm subset_33)", ses);
exec_str("(rm subset_33)", ses);
exec_str("(rm unary_op_29)", ses);
exec_str("(rm nary_op_28)", ses);
exec_str("(rm nary_op_27)", ses);
exec_str("(rm nary_op_26)", ses);
exec_str("(rm binary_op_25)", ses);
exec_str("(rm binary_op_24)", ses);
exec_str("(rm binary_op_23)", ses);
exec_str("(rm binary_op_22)", ses);
exec_str("(rm binary_op_21)", ses);
exec_str("(rm binary_op_20)", ses);
exec_str("(rm binary_op_19)", ses);
exec_str("(rm binary_op_18)", ses);
exec_str("(rm binary_op_17)", ses);
exec_str("(rm nary_op_16)", ses);
exec_str("(rm binary_op_15)", ses);
exec_str("(rm binary_op_14)", ses);
exec_str("(rm binary_op_13)", ses);
exec_str("(rm unary_op_12)", ses);
exec_str("(rm unary_op_11)", ses);
exec_str("(rm unary_op_10)", ses);
exec_str("(rm binary_op_9)", ses);
exec_str("(rm unary_op_8)", ses);
exec_str("(rm unary_op_7)", ses);
exec_str("(rm unary_op_6)", ses);
exec_str("(rm nary_op_5)", ses);
checkSaneFrame();
// Standard "head of 10 rows" pattern for printing
exec_str("(tmp= subset_34 (rows crimes.hex [0:10]))", ses);
exec_str("(rm subset_34)", ses);
// Rename the census columns again; the first column is now "Community.Area", the same name
// given to a crimes.hex column above.
exec_str("(assign census.hex (colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"]))", ses);
exec_str("(rm subset_34)", ses);
// Take column subsets of crimes.hex and weather.hex, rename the weather subset's six columns,
// then remove the two source frames.
exec_str("(tmp= subset_35 (cols crimes.hex [-3]))", ses);
exec_str("(tmp= subset_36 (cols weather.hex [-1]))", ses);
exec_str("(tmp= subset_36_2 (colnames= subset_36 [0 1 2 3 4 5] [\"Month\" \"Day\" \"Year\" \"maxTemp\" \"meanTemp\" \"minTemp\"]))", ses);
exec_str("(rm crimes.hex)", ses);
exec_str("(rm weather.hex)", ses);
// nary_op_37 = merge( X Y ); Vecs in X & nary_op_37 shared
exec_str("(tmp= nary_op_37 (merge subset_35 census.hex TRUE FALSE [] [] \"auto\"))", ses);
// nary_op_38 = merge( nary_op_37 subset_36_2); Vecs in nary_op_38 and nary_pop_37 and X shared
// subset_41 keeps the rows of nary_op_38 whose h2o.runif value (nary_op_39) is <= 0.8.
exec_str("(tmp= subset_41 (rows (tmp= nary_op_38 (merge nary_op_37 subset_36_2 TRUE FALSE [] [] \"auto\")) (tmp= binary_op_40 (<= (tmp= nary_op_39 (h2o.runif nary_op_38 30792152736.5179)) 0.8))))", ses);
// Standard "head of 10 rows" pattern for printing
exec_str("(tmp= subset_44 (rows subset_41 [0:10]))", ses);
exec_str("(rm subset_44)", ses);
exec_str("(rm subset_44)", ses);
exec_str("(rm binary_op_40)", ses);
exec_str("(rm nary_op_37)", ses);
// subset_43 keeps the complementary rows: those whose nary_op_39 value is > 0.8.
exec_str("(tmp= subset_43 (rows nary_op_38 (tmp= binary_op_42 (> nary_op_39 0.8))))", ses);
// Chicago demo continues on past, but this is all I've captured for now
checkSaneFrame();
ses.end(null);
} catch (Throwable ex) {
// Close the session on failure and rethrow whatever endQuietly returns for the failure.
throw ses.endQuietly(ex);
} finally {
// Restore time zone (which is global, and will affect following tests)
Rapids.exec("(setTimeZone \"" + oldtz + "\")");
// Remove every key name this test may have used, whether or not it still exists.
for (String s : new String[] { "weather.hex", "crimes.hex", "census.hex", "nary_op_5", "unary_op_6", "unary_op_7", "unary_op_8", "binary_op_9", "unary_op_10", "unary_op_11", "unary_op_12", "binary_op_13", "binary_op_14", "binary_op_15", "nary_op_16", "binary_op_17", "binary_op_18", "binary_op_19", "binary_op_20", "binary_op_21", "binary_op_22", "binary_op_23", "binary_op_24", "binary_op_25", "nary_op_26", "nary_op_27", "nary_op_28", "unary_op_29", "binary_op_30", "binary_op_31", "binary_op_32", "subset_33", "subset_34", "subset_35", "subset_36", "subset_36_2", "nary_op_37", "nary_op_38", "nary_op_39", "binary_op_40", "subset_41", "binary_op_42", "subset_43", "subset_44" }) Keyed.remove(Key.make(s));
}
}
use of water.parser.ParseSetup in project h2o-3 by h2oai.
the class ParseHandler method parseSVMLight.
/**
 * Starts an SVMLight parse of the given source frames and returns the job
 * schema describing it.
 *
 * <p>Invoked through reflection by RequestServer, hence the "unused" suppression.
 *
 * @param version API version supplied by the caller; not read by this method
 * @param parse   request schema carrying the source frames and an optional destination frame
 * @return a {@code JobV3} filled from the result of {@code ParseDataset.forkParseSVMLight}
 */
@SuppressWarnings("unused")
public JobV3 parseSVMLight(int version, ParseSVMLightV3 parse) {
  // Collect the key of every source frame, in request order.
  final Key[] sourceKeys = new Key[parse.source_frames.length];
  int idx = 0;
  while (idx < sourceKeys.length) {
    sourceKeys[idx] = parse.source_frames[idx].key();
    idx++;
  }

  // Use the requested destination key when there is one; otherwise derive a
  // name from the first source frame.
  Key<Frame> target = null;
  if (parse.destination_frame != null) {
    target = parse.destination_frame.key();
  }
  if (target == null) {
    target = Key.make(ParseSetup.createHexName(parse.source_frames[0].toString()));
  }

  final ParseSetup svmSetup = ParseSetup.guessSetup(sourceKeys, ParseSetup.makeSVMLightSetup());
  return new JobV3().fillFromImpl(ParseDataset.forkParseSVMLight(target, sourceKeys, svmSetup));
}
use of water.parser.ParseSetup in project h2o-3 by h2oai.
the class TestUtil method parse_test_file.
/**
 * Parses a single test file into a Frame, optionally installing one NA string
 * for every column and optionally overriding the guessed column types.
 *
 * @param fname        name of the file to parse, passed to {@code makeNfsFileVec}
 * @param na_string    string to treat as NA in every column, or {@code null} to
 *                     leave the guessed setup's NA strings untouched; an empty
 *                     string is accepted
 * @param check_header header-check value forwarded to the {@code ParseSetup} constructor
 * @param column_types column types to set on the setup, or {@code null} to keep the guessed ones
 * @return the Frame returned by {@code ParseDataset.parse}
 */
protected Frame parse_test_file(String fname, String na_string, int check_header, byte[] column_types) {
  NFSFileVec nfs = makeNfsFileVec(fname);
  Key[] res = { nfs._key };
  // create new parseSetup in order to store our na_string
  ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO, (byte) ',', true, check_header, 0, null, null, null, null, null));
  // add the na_strings into p.
  if (na_string != null) {
    int column_number = p.getColumnTypes().length;
    // One single-entry row per column. The row length used to be derived from
    // na_string.length(), which left null padding in front of the entry and
    // threw ArrayIndexOutOfBoundsException (index -1) for an empty na_string.
    String[][] na_strings = new String[column_number][];
    for (int index = 0; index < column_number; index++) {
      na_strings[index] = new String[] { na_string };
    }
    p.setNAStrings(na_strings);
  }
  if (column_types != null) {
    p.setColumnTypes(column_types);
  }
  return ParseDataset.parse(Key.make(), res, true, p);
}
use of water.parser.ParseSetup in project h2o-3 by h2oai.
the class TestUtil method parse_test_folder.
/**
 * Parse a folder with csv files when a single na_string is specified.
 *
 * <p>Only regular files directly inside the folder are parsed; they are sorted
 * with {@code Arrays.sort} first so the key order is deterministic.
 *
 * @param fname        name of folder
 * @param na_string    string for NA in a column; applied to every column, or
 *                     {@code null} to leave the guessed setup's NA strings
 *                     untouched. An empty string is accepted.
 * @param check_header header-check value forwarded to the {@code ParseSetup} constructor
 * @param column_types column types to set on the setup, or {@code null} to keep the guessed ones
 * @return the Frame returned by {@code ParseDataset.parse} for all files in the folder
 */
protected static Frame parse_test_folder(String fname, String na_string, int check_header, byte[] column_types) {
  File folder = FileUtils.locateFile(fname);
  File[] files = contentsOf(fname, folder);
  Arrays.sort(files);
  ArrayList<Key> keys = new ArrayList<>();
  for (File f : files) {
    if (f.isFile()) {
      keys.add(NFSFileVec.make(f)._key);
    }
  }
  Key[] res = new Key[keys.size()];
  // generated the necessary key here
  keys.toArray(res);
  // create new parseSetup in order to store our na_string
  ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO, (byte) ',', true, check_header, 0, null, null, null, null, null));
  // add the na_strings into p.
  if (na_string != null) {
    int column_number = p.getColumnTypes().length;
    // One single-entry row per column. The row length used to be derived from
    // na_string.length(), which left null padding in front of the entry and
    // threw ArrayIndexOutOfBoundsException (index -1) for an empty na_string.
    String[][] na_strings = new String[column_number][];
    for (int index = 0; index < column_number; index++) {
      na_strings[index] = new String[] { na_string };
    }
    p.setNAStrings(na_strings);
  }
  if (column_types != null) {
    p.setColumnTypes(column_types);
  }
  return ParseDataset.parse(Key.make(), res, true, p);
}
Aggregations