Search in sources :

Example 1 with Enum

Use of water.parser.Enum in project h2o-2 by h2oai.

The method parse_impl of the class ParseDataset2.

// --------------------------------------------------------------------------
// Top-level parser driver
/**
 * Parses the given file keys into the frame stored under {@code job.dest()}.
 *
 * <p>In order, this method: reserves vec keys in the vector group of the first
 * file, runs a {@code MultiFileParseTask} over all files, tries to return the
 * reserved keys that were not used, computes the domain of every column that
 * should be an enum, builds the frame, runs {@code SVFTask} on all nodes,
 * applies the {@code EnumUpdateTask} to the enum columns, rolls up the stats,
 * logs the results, unlocks the frame, and finally deletes or unlocks the
 * input keys before removing the job.
 *
 * @param job            the parse job; supplies the destination key, the
 *                       progress key and the lock owner ({@code job.self()})
 * @param fkeys          keys of the input files; when empty the job is
 *                       cancelled and nothing else is done
 * @param setup          parser setup; {@code setup._ncols} must be positive
 *                       (checked with an {@code assert})
 * @param delete_on_done when {@code true} the input keys are deleted once the
 *                       parse is finished, otherwise they are only unlocked
 */
private static void parse_impl(ParseDataset2 job, Key[] fkeys, CustomParser.ParserSetup setup, boolean delete_on_done) {
    assert setup._ncols > 0;
    if (fkeys.length == 0) {
        job.cancel();
        return;
    }
    // Remove any previous instance and insert a sentinel (to ensure no one has
    // been writing to the same keys during our parse)!
    Vec v = getVec(fkeys[0]);
    // SVMLight input reserves a fixed, large number of keys instead of _ncols.
    int reserveKeys = setup._pType == ParserType.SVMLight ? 25000000 : setup._ncols;
    VectorGroup vg = v.group();
    int vecIdStart = vg.reserveKeys(reserveKeys);
    MultiFileParseTask mfpt = job._mfpt = new MultiFileParseTask(v.group(), vecIdStart, setup, job._progress);
    mfpt.invoke(fkeys);
    EnumUpdateTask eut = null;
    // Calculate enum domain
    int n = 0;
    AppendableVec[] avs = mfpt.vecs();
    // Hand back the reserved keys that the parse did not use.
    // NOTE(review): the guard compares (avs.length + vecIdStart) against
    // reserveKeys, while the call below works with vecIdStart + reserveKeys;
    // confirm the guard is intended and not meant to be avs.length < reserveKeys.
    if ((avs.length + vecIdStart) < reserveKeys) {
        Future<?> f = vg.tryReturnKeys(vecIdStart + reserveKeys, vecIdStart + avs.length);
        if (f != null) {
            try {
                f.get();
            } catch (InterruptedException e) {
                // Returning the keys is best-effort, so the parse continues,
                // but the interrupt status must not be lost.
                Thread.currentThread().interrupt();
            } catch (ExecutionException ignored) {
                // Best-effort: a failure to return the keys does not fail the parse.
            }
        }
    }
    // Collect the indices of the columns that should be enums.
    int[] ecols = new int[avs.length];
    for (int i = 0; i < ecols.length; ++i) {
        if (avs[i].shouldBeEnum()) {
            ecols[n++] = i;
        }
    }
    ecols = Arrays.copyOf(ecols, n);
    if (ecols.length > 0) {
        EnumFetchTask eft = new EnumFetchTask(H2O.SELF.index(), mfpt._eKey, ecols).invokeOnAllNodes();
        Enum[] enums = eft._gEnums;
        ValueString[][] ds = new ValueString[ecols.length][];
        int j = 0;
        for (int i : ecols) {
            // ds is indexed by position in ecols, avs by column index.
            ds[j] = enums[i].computeColumnDomain();
            avs[i]._domain = ValueString.toString(ds[j]);
            j++;
        }
        eut = new EnumUpdateTask(ds, eft._lEnums, mfpt._chunk2Enum, mfpt._eKey, ecols);
    }
    final Frame fr = new Frame(job.dest(), setup._columnNames != null ? setup._columnNames : genericColumnNames(avs.length), AppendableVec.closeAll(avs));
    // SVMLight is sparse format, there may be missing chunks with all 0s, fill them in
    new SVFTask(fr).invokeOnAllNodes();
    fr.checkCompatible();
    // Update enums to the globally agreed numbering
    if (eut != null) {
        Vec[] evecs = new Vec[ecols.length];
        for (int i = 0; i < evecs.length; ++i) {
            evecs[i] = fr.vecs()[ecols[i]];
        }
        eut.doAll(evecs);
    }
    // Kick off the stats rollup for every vec and wait for all of them.
    Futures fs = new Futures();
    for (Vec v2 : fr.vecs()) {
        v2.rollupStats(fs);
    }
    fs.blockForPending();
    logParseResults(job, fr);
    // Release the frame for overwriting
    fr.unlock(job.self());
    // Remove CSV files from H2O memory
    if (delete_on_done) {
        for (Key k : fkeys) {
            Lockable.delete(k, job.self());
        }
    } else {
        for (Key k : fkeys) {
            Lockable l = UKV.get(k);
            l.unlock(job.self());
        }
    }
    job.remove();
}
Also used : Enum(water.parser.Enum) Future(java.util.concurrent.Future) VectorGroup(water.fvec.Vec.VectorGroup) ExecutionException(java.util.concurrent.ExecutionException)

Aggregations

ExecutionException (java.util.concurrent.ExecutionException): 1, Future (java.util.concurrent.Future): 1, VectorGroup (water.fvec.Vec.VectorGroup): 1, Enum (water.parser.Enum): 1