Search in sources :

Example 6 with ParseSetup

Use of water.parser.ParseSetup in the project h2o-3 by h2oai.

In the class ParseHandler, the method parse.

/**
 * Entry point for parsing: builds a {@link ParseSetup} from the request schema and
 * starts a parse of the source frames into the destination frame.
 * <p>
 * Called through reflection by RequestServer, which is why the method looks unused.
 *
 * @param version API version of the request; not read by this method
 * @param parse   request schema describing the parse; it is updated in place
 *                ({@code job} always, {@code rows} only for a blocking parse)
 * @return the same {@code parse} instance that was passed in
 * @throws H2OIllegalArgumentException if {@code parse.source_frames} is {@code null}
 */
@SuppressWarnings("unused")
public ParseV3 parse(int version, ParseV3 parse) {
    // Validate first: there is no point resolving the parser or building a setup
    // for a request that has nothing to parse.
    if (parse.source_frames == null) {
        throw new H2OIllegalArgumentException("Data for Frame '" + parse.destination_frame.name + "' is not available. Please check that the path is valid (for all H2O nodes).");
    }
    ParserInfo parserInfo = ParserService.INSTANCE.getByName(parse.parse_type).info();
    ParseSetup setup = new ParseSetup(parserInfo, parse.separator, parse.single_quotes, parse.check_header, parse.number_columns, delNulls(parse.column_names), ParseSetup.strToColumnTypes(parse.column_types), parse.domains, parse.na_strings, null, new ParseWriter.ParseErr[0], parse.chunk_size);
    // Translate the schema-level frame references into the keys the parser works on.
    Key[] srcs = new Key[parse.source_frames.length];
    for (int i = 0; i < parse.source_frames.length; i++) {
        srcs[i] = parse.source_frames[i].key();
    }
    parse.job = new JobV3(ParseDataset.parse(parse.destination_frame.key(), srcs, parse.delete_on_done, setup, parse.blocking)._job);
    if (parse.blocking) {
        // Only a blocking request looks up the destination frame to report its row count.
        Frame fr = DKV.getGet(parse.destination_frame.key());
        parse.rows = fr.numRows();
    }
    return parse;
}
Also used : Frame(water.fvec.Frame) ParseSetup(water.parser.ParseSetup) H2OIllegalArgumentException(water.exceptions.H2OIllegalArgumentException) ParseWriter(water.parser.ParseWriter) ParserInfo(water.parser.ParserInfo) JobV3(water.api.schemas3.JobV3) Key(water.Key)

Example 7 with ParseSetup

Use of water.parser.ParseSetup in the project h2o-3 by h2oai.

In the class ParseSetupHandler, the method guessSetup.

/**
 * Guesses a parse setup for the given source frames and writes the result back into
 * the request schema, together with a preview of the data. The preview can be
 * restricted by a column-name regex ({@code column_name_filter}) and/or paginated by
 * {@code column_offset} / {@code column_count}.
 *
 * @param version API version of the request; not read by this method
 * @param p       request schema; it is updated in place and returned
 * @return the same {@code p} instance, filled in with the guessed setup
 * @throws H2OIllegalArgumentException if no source frames are given, or if guessing
 *                                     fails with a {@code ParseDataset.H2OParseException}
 * @throws IllegalArgumentException    if one of the source keys is not present in the DKV
 */
public ParseSetupV3 guessSetup(int version, ParseSetupV3 p) {
    // An empty array is as unusable as a missing one: source_frames[0] is needed
    // further down to derive the destination frame name.
    if (p.source_frames == null || p.source_frames.length == 0) {
        throw new H2OIllegalArgumentException("No file names given for parsing.");
    }
    Key[] fkeys = new Key[p.source_frames.length];
    for (int i = 0; i < p.source_frames.length; i++) {
        fkeys[i] = p.source_frames[i].key();
        if (DKV.get(fkeys[i]) == null) {
            throw new IllegalArgumentException("Key not loaded: " + p.source_frames[i]);
        }
    }
    // corrects for json putting in empty strings in the place of empty sub-arrays
    if (p.na_strings != null) {
        for (int i = 0; i < p.na_strings.length; i++) {
            if (p.na_strings[i] != null && p.na_strings[i].length == 0) {
                p.na_strings[i] = null;
            }
        }
    }
    ParseSetup ps;
    try {
        ps = ParseSetup.guessSetup(fkeys, new ParseSetup(p));
    } catch (Throwable ex) {
        // Unwrap a DistributedException so a parse error is reported as an
        // illegal-argument error; anything else is rethrown untouched.
        Throwable ex2 = ex;
        if (ex instanceof DistributedException) {
            ex2 = ex.getCause();
        }
        if (ex2 instanceof ParseDataset.H2OParseException) {
            throw new H2OIllegalArgumentException(ex2.getMessage());
        }
        throw ex;
    }
    if (ps._errs != null && ps._errs.length > 0) {
        p.warnings = new String[ps._errs.length];
        for (int i = 0; i < ps._errs.length; ++i) {
            p.warnings[i] = ps._errs[i].toString();
        }
    }
    // TODO: ParseSetup throws away the srcs list. . .
    boolean hasFilter = null != p.column_name_filter && !"".equals(p.column_name_filter);
    if (!hasFilter && (0 == p.column_offset) && (0 == p.column_count)) {
        // return the entire data preview
        PojoUtils.copyProperties(p, ps, PojoUtils.FieldNaming.ORIGIN_HAS_UNDERSCORES, new String[] { "destination_key", "source_keys", "column_types", "parse_type" });
        p.total_filtered_column_count = p.number_columns;
    } else {
        // have to manually copy the desired parts of p.data to apply either column_name_filter or column pagination or both
        PojoUtils.copyProperties(p, ps, PojoUtils.FieldNaming.ORIGIN_HAS_UNDERSCORES, new String[] { "destination_key", "source_keys", "column_types", "data", "parse_type" });
        String[] all_col_names = ps.getColumnNames();
        String[][] data = ps.getData();
        ArrayList<Integer> keep_indexes = new ArrayList<>();
        if (hasFilter) {
            // filter and then paginate columns
            Pattern pattern = Pattern.compile(p.column_name_filter);
            Matcher m = pattern.matcher("dummy");
            for (int column = 0; column < all_col_names.length; column++) {
                m.reset(all_col_names[column]);
                if (m.matches()) {
                    keep_indexes.add(column);
                }
            }
        } else {
            // note: we do a little extra work below by treating this like the filter case, but the code is simpler
            for (int column = 0; column < all_col_names.length; column++) {
                keep_indexes.add(column);
            }
        }
        int width_to_return = Math.max(0, keep_indexes.size() - p.column_offset);
        if (p.column_count > 0) {
            width_to_return = Math.min(width_to_return, p.column_count);
        }
        if (data == null) {
            // No preview rows to filter; the header check below already treats a
            // null preview as valid, so pass that state through instead of failing.
            p.data = null;
        } else {
            String[][] filtered_data = new String[data.length][width_to_return];
            for (int row = 0; row < data.length; row++) {
                int output_column = 0;
                for (int input_column_index = p.column_offset; input_column_index < p.column_offset + width_to_return; input_column_index++) {
                    // indirect through keep_indexes
                    filtered_data[row][output_column++] = data[row][keep_indexes.get(input_column_index)];
                }
            }
            p.data = filtered_data;
        }
        p.total_filtered_column_count = keep_indexes.size();
    }
    p.destination_frame = ParseSetup.createHexName(p.source_frames[0].toString());
    // Drop the first preview row when it merely repeats the header. The length check
    // guards against a preview with zero rows.
    if (p.check_header == ParseSetup.HAS_HEADER && p.data != null && p.data.length > 0 && Arrays.equals(p.column_names, p.data[0])) {
        p.data = Arrays.copyOfRange(p.data, 1, p.data.length);
    }
    // Fill in data type names for each column.
    p.column_types = ps.getColumnTypeStrings();
    p.parse_type = ps.getParseType() != null ? ps.getParseType().name() : GUESS_INFO.name();
    return p;
}
Also used : Pattern(java.util.regex.Pattern) ParseSetup(water.parser.ParseSetup) DistributedException(water.util.DistributedException) Matcher(java.util.regex.Matcher) H2OIllegalArgumentException(water.exceptions.H2OIllegalArgumentException) ArrayList(java.util.ArrayList) ParseDataset(water.parser.ParseDataset) Key(water.Key) H2OIllegalArgumentException(water.exceptions.H2OIllegalArgumentException)

Aggregations

ParseSetup (water.parser.ParseSetup)7 Key (water.Key)3 ArrayList (java.util.ArrayList)2 JobV3 (water.api.schemas3.JobV3)2 H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException)2 Frame (water.fvec.Frame)2 BufferedString (water.parser.BufferedString)2 ParserInfo (water.parser.ParserInfo)2 File (java.io.File)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 Test (org.junit.Test)1 H2OIllegalValueException (water.exceptions.H2OIllegalValueException)1 NFSFileVec (water.fvec.NFSFileVec)1 ParseDataset (water.parser.ParseDataset)1 ParseWriter (water.parser.ParseWriter)1 ParserProvider (water.parser.ParserProvider)1 DistributedException (water.util.DistributedException)1