Search in sources :

Example 1 with VectorGroup

use of water.fvec.Vec.VectorGroup in project h2o-2 by h2oai.

the class MRTask2 method compute2.

/**
 * Called from FJ threads to do local work.  The first called Task (which is
 * also the last one to Complete) also reduces any global work.  Called
 * internal by F/J.  Not expected to be user-called.
 *
 * <p>Flow as visible in this method:
 * <ul>
 *   <li>If the range {@code [_lo, _hi)} covers two or more chunks, it is split
 *       at the mid-point into two clones; the left clone is forked and the
 *       right clone is computed inline on the current thread.</li>
 *   <li>If the range covers exactly one chunk, and either {@code _run_local}
 *       is set or the chunk key of {@code _fr.anyVec()} reports {@code home()},
 *       the chunks are fetched, the {@code map(...)} overloads are invoked and
 *       all chunks are closed.</li>
 *   <li>Otherwise no map work is done here.</li>
 * </ul>
 * In the two non-splitting cases the method finishes with {@code tryComplete()};
 * the splitting case returns without calling it.
 */
@Override
public final void compute2() {
    // Record start-of-work timestamps (nanosecond tick in _t0, wall clock in the profile)
    _t0 = System.nanoTime();
    assert _left == null && _rite == null && _res == null;
    _profile._mapstart = System.currentTimeMillis();
    if (_hi - _lo >= 2) {
        // Multi-chunk case: just divide-and-conquer to 1 chunk
        // Mid-point; the unsigned shift keeps the average correct even if (_lo + _hi) overflows int
        final int mid = (_lo + _hi) >>> 1;
        _left = clone();
        _rite = clone();
        // Each half gets its own profile object, constructed from this task
        _left._profile = new MRProfile(this);
        _rite._profile = new MRProfile(this);
        // Left half covers [_lo, mid)
        _left._hi = mid;
        // Right half covers [mid, _hi)
        _rite._lo = mid;
        // One fork awaiting completion
        addToPendingCount(1);
        // Runs in another thread/FJ instance
        _left.fork();
        // Runs in THIS F/J thread
        _rite.compute2();
        _profile._mapdone = System.currentTimeMillis();
        // Not complete until the fork completes
        return;
    }
    // Zero or 1 chunks, and further chunk might not be homed here
    if (_hi > _lo) {
        // Single chunk?
        Vec v0 = _fr.anyVec();
        if (_run_local || v0.chunkKey(_lo).home()) {
            // And chunk is homed here?
            // Make decompression chunk headers for these chunks
            Vec[] vecs = _fr.vecs();
            Chunk[] bvs = new Chunk[vecs.length];
            NewChunk[] appendableChunks = null;
            // Null vecs are skipped, which leaves the matching bvs[i] entry null
            for (int i = 0; i < vecs.length; i++) if (vecs[i] != null) {
                assert _run_local || vecs[i].chunkKey(_lo).home() : "Chunk=" + _lo + " v0=" + v0 + ", k=" + v0.chunkKey(_lo) + "   v[" + i + "]=" + vecs[i] + ", k=" + vecs[i].chunkKey(_lo);
                try {
                    bvs[i] = vecs[i].chunkForChunkIdx(_lo);
                } catch (Throwable t) {
                    // Report the failing task class on stderr, then rethrow unchecked with the cause preserved
                    System.err.println("missing chunk in MRTask " + getClass().getName());
                    t.printStackTrace();
                    throw new RuntimeException(t);
                }
            }
            if (_noutputs > 0) {
                // Output vecs take their keys from the vector group of the first input vec,
                // at consecutive indices starting from _vid
                final VectorGroup vg = vecs[0].group();
                _appendables = new AppendableVec[_noutputs];
                appendableChunks = new NewChunk[_noutputs];
                for (int i = 0; i < _appendables.length; ++i) {
                    _appendables[i] = new AppendableVec(vg.vecKey(_vid + i), new long[4], 0);
                    appendableChunks[i] = (NewChunk) _appendables[i].chunkForChunkIdx(_lo);
                }
            }
            // Call all the various map() calls that apply.
            // The checks below are independent ifs, not an else-chain: the fixed-arity
            // overload matching the vec count AND the array overload are both invoked.
            // NOTE(review): presumably the overloads a subclass does not override are no-ops - confirm.
            _profile._userstart = System.currentTimeMillis();
            if (_fr.vecs().length == 1)
                map(bvs[0]);
            if (_fr.vecs().length == 2)
                map(bvs[0], bvs[1]);
            if (_fr.vecs().length == 3)
                map(bvs[0], bvs[1], bvs[2]);
            if (true)
                map(bvs);
            if (_noutputs == 1) {
                // convenience versions for cases with single output.
                if (_fr.vecs().length == 1)
                    map(bvs[0], appendableChunks[0]);
                if (_fr.vecs().length == 2)
                    map(bvs[0], bvs[1], appendableChunks[0]);
                if (_fr.vecs().length == 3)
                    map(bvs[0], bvs[1], bvs[2], appendableChunks[0]);
                if (true)
                    map(bvs, appendableChunks[0]);
            }
            if (_noutputs == 2) {
                // convenience versions for cases with 2 outputs (e.g split).
                if (_fr.vecs().length == 1)
                    map(bvs[0], appendableChunks[0], appendableChunks[1]);
                if (_fr.vecs().length == 2)
                    map(bvs[0], bvs[1], appendableChunks[0], appendableChunks[1]);
                if (_fr.vecs().length == 3)
                    map(bvs[0], bvs[1], bvs[2], appendableChunks[0], appendableChunks[1]);
                if (true)
                    map(bvs, appendableChunks[0], appendableChunks[1]);
            }
            // Fully general overload; appendableChunks is null here when _noutputs is not positive
            map(bvs, appendableChunks);
            // Save results since called map() at least once!
            _res = self();
            // Further D/K/V put any new vec results.
            _profile._closestart = System.currentTimeMillis();
            // NOTE(review): every bvs entry is dereferenced here, although the fetch loop above
            // leaves entries null for null vecs - confirm null vecs cannot reach this point.
            for (Chunk bv : bvs) bv.close(_lo, _fs);
            if (_noutputs > 0)
                for (NewChunk nch : appendableChunks) nch.close(_lo, _fs);
        }
    }
    _profile._mapdone = System.currentTimeMillis();
    // And this task is complete
    tryComplete();
}
Also used : VectorGroup(water.fvec.Vec.VectorGroup)

Example 2 with VectorGroup

use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.

the class ParseDataset method forkParseSVMLight.

/**
 * Applies the configured parse chunk size to every input key, write-locks the
 * destination frame and starts the parse job.
 *
 * <p>Each key is resolved through the DKV. A {@code FileVec} is re-chunked
 * directly; a {@code Frame} whose first vec is a {@code FileVec} is re-chunked
 * through that vec. Any other value is ignored. The chunk counts of all handled
 * inputs are summed and passed to {@code Job.start} as its second argument.
 *
 * @param dest  key of the frame the parse writes to
 * @param keys  keys of the raw inputs
 * @param setup parse configuration; {@code _chunk_size} is applied to the inputs
 * @return the value returned by starting the parse job
 */
public static Job forkParseSVMLight(final Key<Frame> dest, final Key[] keys, final ParseSetup setup) {
    int totalChunks = 0;
    Vec firstVec = null;
    // set the parse chunk size for files
    for (int idx = 0; idx < keys.length; ++idx) {
        final Iced value = DKV.getGet(keys[idx]);
        if (value instanceof FileVec) {
            final FileVec fileVec = (FileVec) value;
            if (idx == 0) {
                firstVec = fileVec;
            }
            fileVec.setChunkSize(setup._chunk_size);
            totalChunks += fileVec.nChunks();
            Log.info("Parse chunk size " + setup._chunk_size);
        } else if (value instanceof Frame) {
            final Frame frame = (Frame) value;
            // vec(0) is deliberately re-read at each use instead of being cached,
            // so the value seen after setChunkSize() is whatever the frame then holds.
            if (frame.vec(0) instanceof FileVec) {
                if (idx == 0) {
                    firstVec = frame.vec(0);
                }
                ((FileVec) frame.vec(0)).setChunkSize(frame, setup._chunk_size);
                totalChunks += frame.vec(0).nChunks();
                Log.info("Parse chunk size " + setup._chunk_size);
            }
        }
    }
    // The group is looked up but not referenced again in this method; firstVec is
    // still null here when the first key was not one of the handled kinds.
    final VectorGroup firstGroup = firstVec.group();
    final ParseDataset parse = new ParseDataset(dest);
    // Write-Lock BEFORE returning
    new Frame(parse._job._result, new String[0], new Vec[0]).delete_and_lock(parse._job);
    return parse._job.start(new H2OCountedCompleter() {

        @Override
        public void compute2() {
            ParseDataset.parseAllKeys(parse, keys, setup, true);
            tryComplete();
        }
    }, totalChunks);
}
Also used : VectorGroup(water.fvec.Vec.VectorGroup) H2OCountedCompleter(water.H2O.H2OCountedCompleter)

Example 3 with VectorGroup

use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.

the class ParseDataset method parseAllKeys.

// --------------------------------------------------------------------------
/**
 * Top-level parser driver.
 *
 * <p>Stages, in order:
 * <ol>
 *   <li>Normalize {@code setup}: an empty or single-empty-string column-name
 *       array and a wrong-length NA-string array are both reset to null.</li>
 *   <li>Ingest all files with a {@code MultiFileParseTask}.</li>
 *   <li>For columns typed categorical, either gather domains across nodes or
 *       take them from {@code setup._domains}; then close the appendable vecs
 *       into the result frame and, when domains were gathered, renumber the
 *       categorical chunks to the unified domain.</li>
 *   <li>For SVMLight input, run {@code SVFTask} over the frame.</li>
 *   <li>Convert parse errors to globally numbered, sorted warnings on the job.</li>
 *   <li>Log the results, update and unlock the frame, and optionally delete the
 *       input keys.</li>
 * </ol>
 * The method returns early, without unlocking the frame, when {@code fkeys} is
 * empty or when the job reports a stop request at one of the checkpoints.
 *
 * @param pds          parse state; its {@code _job} drives progress and its {@code _mfpt} is set here
 * @param fkeys        keys of the raw inputs to parse
 * @param setup        parse configuration (mutated: column names, NA strings)
 * @param deleteOnDone when true, the input keys are removed from the DKV at the end
 * @return {@code pds}
 * @throws H2OParseException if a gathered categorical domain reaches {@code Categorical.MAX_CATEGORICAL_COUNT}
 */
private static ParseDataset parseAllKeys(ParseDataset pds, Key[] fkeys, ParseSetup setup, boolean deleteOnDone) {
    final Job<Frame> job = pds._job;
    assert setup._number_columns > 0;
    // FIXME: annoyingly front end sends column names as String[] {""} even if setup returned null
    if (setup._column_names != null && ((setup._column_names.length == 0) || (setup._column_names.length == 1 && setup._column_names[0].isEmpty())))
        setup._column_names = null;
    // NA strings are only usable when there is exactly one entry per column
    if (setup._na_strings != null && setup._na_strings.length != setup._number_columns)
        setup._na_strings = null;
    if (fkeys.length == 0) {
        job.stop();
        return pds;
    }
    job.update(0, "Ingesting files.");
    VectorGroup vg = getByteVec(fkeys[0]).group();
    MultiFileParseTask mfpt = pds._mfpt = new MultiFileParseTask(vg, setup, job._key, fkeys, deleteOnDone);
    mfpt.doAll(fkeys);
    Log.trace("Done ingesting files.");
    if (job.stop_requested())
        return pds;
    final AppendableVec[] avs = mfpt.vecs();
    setup._column_names = getColumnNames(avs.length, setup._column_names);
    Frame fr = null;
    // Calculate categorical domain
    // Filter down to columns with some categoricals
    int n = 0;
    int[] ecols2 = new int[avs.length];
    for (int i = 0; i < avs.length; ++i) {
        // Intended type is categorical (even though no domain has been set)?
        if (avs[i].get_type() == Vec.T_CAT)
            ecols2[n++] = i;
    }
    final int[] ecols = Arrays.copyOf(ecols2, n);
    // If we have any, go gather unified categorical domains
    if (n > 0) {
        if (!setup.getParseType().isDomainProvided) {
            // Domains are not provided via setup we need to collect them
            job.update(0, "Collecting categorical domains across nodes.");
            {
                GatherCategoricalDomainsTask gcdt = new GatherCategoricalDomainsTask(mfpt._cKey, ecols).doAllNodes();
                // Test domains for excessive length.
                List<String> offendingColNames = new ArrayList<>();
                for (int i = 0; i < ecols.length; i++) {
                    if (gcdt.getDomainLength(i) < Categorical.MAX_CATEGORICAL_COUNT) {
                        if (gcdt.getDomainLength(i) == 0)
                            // The all-NA column
                            avs[ecols[i]].setBad();
                        else
                            avs[ecols[i]].setDomain(gcdt.getDomain(i));
                    } else
                        offendingColNames.add(setup._column_names[ecols[i]]);
                }
                if (offendingColNames.size() > 0)
                    throw new H2OParseException("Exceeded categorical limit on columns " + offendingColNames + ".   Consider reparsing these columns as a string.");
            }
            Log.trace("Done collecting categorical domains across nodes.");
        } else {
            // Domains come from setup; no length check is applied to them
            for (int i = 0; i < ecols.length; i++) {
                avs[ecols[i]].setDomain(setup._domains[ecols[i]]);
            }
        }
        job.update(0, "Compressing data.");
        fr = new Frame(job._result, setup._column_names, AppendableVec.closeAll(avs));
        fr.update(job);
        Log.trace("Done compressing data.");
        if (!setup.getParseType().isDomainProvided) {
            // Update categoricals to the globally agreed numbering
            Vec[] evecs = new Vec[ecols.length];
            for (int i = 0; i < evecs.length; ++i) evecs[i] = fr.vecs()[ecols[i]];
            job.update(0, "Unifying categorical domains across nodes.");
            {
                // new CreateParse2GlobalCategoricalMaps(mfpt._cKey).doAll(evecs);
                // Using Dtask since it starts and returns faster than an MRTask
                CreateParse2GlobalCategoricalMaps[] fcdt = new CreateParse2GlobalCategoricalMaps[H2O.CLOUD.size()];
                RPC[] rpcs = new RPC[H2O.CLOUD.size()];
                // Launch one RPC per cloud member, then wait for all of them
                for (int i = 0; i < fcdt.length; i++) {
                    H2ONode[] nodes = H2O.CLOUD.members();
                    fcdt[i] = new CreateParse2GlobalCategoricalMaps(mfpt._cKey, fr._key, ecols);
                    rpcs[i] = new RPC<>(nodes[i], fcdt[i]).call();
                }
                for (RPC rpc : rpcs) rpc.get();
                new UpdateCategoricalChunksTask(mfpt._cKey, mfpt._chunk2ParseNodeMap).doAll(evecs);
                MultiFileParseTask._categoricals.remove(mfpt._cKey);
            }
            Log.trace("Done unifying categoricals across nodes.");
        }
    } else {
        // No categoricals case
        job.update(0, "Compressing data.");
        fr = new Frame(job._result, setup._column_names, AppendableVec.closeAll(avs));
        Log.trace("Done closing all Vecs.");
    }
    // Check for job cancellation
    if (job.stop_requested())
        return pds;
    // SVMLight is sparse format, there may be missing chunks with all 0s, fill them in
    if (setup._parse_type.equals(SVMLight_INFO))
        new SVFTask(fr).doAllNodes();
    // Check for job cancellation
    if (job.stop_requested())
        return pds;
    ParseWriter.ParseErr[] errs = ArrayUtils.append(setup._errs, mfpt._errors);
    if (errs.length > 0) {
        // compute global line numbers for warnings/errs
        HashMap<String, Integer> fileChunkOffsets = new HashMap<>();
        for (int i = 0; i < mfpt._fileChunkOffsets.length; ++i) fileChunkOffsets.put(fkeys[i].toString(), mfpt._fileChunkOffsets[i]);
        long[] espc = fr.anyVec().espc();
        for (int i = 0; i < errs.length; ++i) {
            if (fileChunkOffsets.containsKey(errs[i]._file)) {
                int espcOff = fileChunkOffsets.get(errs[i]._file);
                // Global line = rows before this chunk + line within the chunk;
                // the per-file line is then the global line minus the file's first row.
                errs[i]._gLineNum = espc[espcOff + errs[i]._cidx] + errs[i]._lineNum;
                errs[i]._lineNum = errs[i]._gLineNum - espc[espcOff];
            }
        }
        // Order by global line number, then byte offset, then message text.
        // Long.compare is used instead of subtracting and narrowing to int: the narrowing
        // cast discarded the high bits and could flip the sign of large differences.
        SortedSet<ParseWriter.ParseErr> s = new TreeSet<>(new Comparator<ParseWriter.ParseErr>() {

            @Override
            public int compare(ParseWriter.ParseErr o1, ParseWriter.ParseErr o2) {
                int cmp = Long.compare(o1._gLineNum, o2._gLineNum);
                if (cmp != 0)
                    return cmp;
                cmp = Long.compare(o1._byteOffset, o2._byteOffset);
                if (cmp != 0)
                    return cmp;
                return o1._err.compareTo(o2._err);
            }
        });
        Collections.addAll(s, errs);
        String[] warns = new String[s.size()];
        int i = 0;
        for (ParseWriter.ParseErr err : s) Log.warn(warns[i++] = err.toString());
        job.setWarnings(warns);
    }
    job.update(0, "Calculating data summary.");
    logParseResults(fr);
    // Release the frame for overwriting
    fr.update(job);
    Frame fr2 = DKV.getGet(fr._key);
    assert fr2._names.length == fr2.numCols();
    fr.unlock(job);
    // Remove CSV files from H2O memory
    if (deleteOnDone)
        for (Key k : fkeys) {
            DKV.remove(k);
            assert DKV.get(k) == null : "Input key " + k + " not deleted during parse";
        }
    return pds;
}
Also used : NonBlockingHashMap(water.nbhm.NonBlockingHashMap) VectorGroup(water.fvec.Vec.VectorGroup)

Example 4 with VectorGroup

use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.

the class MakeGLMModelHandler method computeGram.

/**
 * Computes the Gram matrix of the frame named by {@code input.X}, optionally
 * using the column named by {@code input.W} as weights, and stores the result
 * as a new frame in the DKV.
 *
 * <p>The result frame has one vec per row of the Gram matrix; its column names
 * are the coefficient names followed by {@code "Intercept"}. Its key is derived
 * from the input frame key (a trailing {@code ".hex"} is stripped) plus
 * {@code "_gram"}, plus {@code "_" + weightName} when weights are used. If that
 * key is taken, numeric suffixes {@code _0 .. _999} are tried.
 *
 * <p>The temporary {@code DataInfo} is removed in a {@code finally} block so it
 * is not left behind in the DKV when the Gram computation throws.
 *
 * @param v     not used by this method
 * @param input request schema; {@code destination_frame} is filled in on success
 * @return {@code input}
 * @throws IllegalArgumentException if the input frame does not exist, the weight
 *         column is not found, or no unused destination key could be made
 */
public GramV3 computeGram(int v, GramV3 input) {
    if (DKV.get(input.X.key()) == null)
        throw new IllegalArgumentException("Frame " + input.X.key() + " does not exist.");
    Frame fr = input.X.key().get();
    // Work on a shallow copy so removing the weight column leaves the original frame untouched
    Frame frcpy = new Frame(fr._names.clone(), fr.vecs().clone());
    String wname = null;
    Vec weight = null;
    // A missing, null or empty weight column name all mean "no weights"
    if (input.W != null && input.W.column_name != null && !input.W.column_name.isEmpty()) {
        wname = input.W.column_name;
        if (fr.find(wname) == -1)
            throw new IllegalArgumentException("Did not find weight vector " + wname);
        weight = frcpy.remove(wname);
    }
    DataInfo dinfo = new DataInfo(frcpy, null, 0, input.use_all_factor_levels, input.standardize ? TransformType.STANDARDIZE : TransformType.NONE, TransformType.NONE, input.skip_missing, false, !input.skip_missing, /* weight */
    false, /* offset */
    false, /* fold */
    false, /* intercept */
    true);
    DKV.put(dinfo);
    final double[][] gram;
    try {
        if (weight != null)
            dinfo.setWeights(wname, weight);
        Gram.GramTask gt = new Gram.GramTask(null, dinfo, false, true).doAll(dinfo._adaptedFrame);
        gram = gt._gram.getXX();
    } finally {
        // Always drop the temporary DataInfo, including on failure
        dinfo.remove();
    }
    String[] names = water.util.ArrayUtils.append(dinfo.coefNames(), "Intercept");
    Vec[] vecs = new Vec[gram.length];
    Key[] keys = new VectorGroup().addVecs(vecs.length);
    for (int i = 0; i < vecs.length; ++i) vecs[i] = Vec.makeVec(gram[i], keys[i]);
    input.destination_frame = new KeyV3.FrameKeyV3();
    String keyname = input.X.key().toString();
    if (keyname.endsWith(".hex"))
        keyname = keyname.substring(0, keyname.lastIndexOf("."));
    keyname = keyname + "_gram";
    if (weight != null)
        keyname = keyname + "_" + wname;
    Key k = Key.make(keyname);
    if (DKV.get(k) != null) {
        // Base key is taken: probe numbered suffixes until a free one is found
        int cnt = 0;
        while (cnt < 1000 && DKV.get(k = Key.make(keyname + "_" + cnt)) != null) cnt++;
        if (cnt == 1000)
            throw new IllegalArgumentException("unable to make unique key");
    }
    input.destination_frame.fillFromImpl(k);
    DKV.put(new Frame(k, names, vecs));
    return input;
}
Also used : DataInfo(hex.DataInfo) ValFrame(water.rapids.vals.ValFrame) KeyV3(water.api.schemas3.KeyV3) Gram(hex.gram.Gram) VectorGroup(water.fvec.Vec.VectorGroup) Key(water.Key)

Example 5 with VectorGroup

use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.

the class MRTask method compute2.

/**
 * Local map phase, invoked by the fork/join framework rather than by users.
 * The first task called is also the last one to complete, and it additionally
 * reduces any global work.
 *
 * <p>A range of two or more chunks is halved: the left half is forked and the
 * right half is computed on the current thread. A range of at most one chunk is
 * handled here, either in key mode ({@code _fr == null}) or in frame mode, and
 * only when the work is homed on this node (or {@code _run_local} is set in
 * frame mode). All profile timestamps are recorded only when {@code _profile}
 * is non-null.
 */
@Override
public final void compute2() {
    assert _left == null && _rite == null && _res == null;
    if (_profile != null) {
        _profile._mapstart = System.currentTimeMillis();
    }
    if ((_hi - _lo) >= 2) {
        // Multi-chunk case: divide-and-conquer down to single chunks
        final int split = (_lo + _hi) >>> 1;
        _left = copyAndInit();
        _rite = copyAndInit();
        // Left half ends at the split point, right half starts there
        _left._hi = split;
        _rite._lo = split;
        // One fork awaiting completion
        addToPendingCount(1);
        // Left half runs in another thread/FJ instance
        if (!isCompletedAbnormally()) {
            _left.fork();
        }
        // Right half runs in THIS F/J thread
        if (!isCompletedAbnormally()) {
            _rite.compute2();
        }
        if (_profile != null) {
            _profile._mapdone = System.currentTimeMillis();
        }
        // Not complete until the fork completes
        return;
    }
    // Zero or 1 chunks, and further chunk might not be homed here
    if (_fr == null) {
        // No Frame, so working on Keys: either once-per-node mode (no keys at all),
        // or a single key that is homed on this node
        if (_keys == null || (_hi > _lo && _keys[_lo].home())) {
            assert (_keys == null || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";
            if (_profile != null) {
                _profile._userstart = System.currentTimeMillis();
            }
            if (_keys != null) {
                map(_keys[_lo]);
            }
            // Save results since called map() at least once!
            _res = self();
            if (_profile != null) {
                _profile._closestart = System.currentTimeMillis();
            }
        }
    } else if (_hi > _lo) {
        // Frame mode with a single chunk
        final Vec anchor = _fr.anyVec();
        if (_run_local || anchor.chunkKey(_lo).home()) {
            // The chunk is homed here (or local execution was requested)
            assert (_run_local || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";
            // Make decompression chunk headers for these chunks
            final Vec[] vecs = _fr.vecs();
            final Chunk[] chunks = new Chunk[vecs.length];
            NewChunk[] outputs = null;
            for (int col = 0; col < vecs.length; col++) {
                if (vecs[col] == null) {
                    continue;
                }
                assert _run_local || vecs[col].chunkKey(_lo).home() : "Chunk=" + _lo + " v0=" + anchor + ", k=" + anchor.chunkKey(_lo) + "   v[" + col + "]=" + vecs[col] + ", k=" + vecs[col].chunkKey(_lo);
                chunks[col] = vecs[col].chunkForChunkIdx(_lo);
            }
            if (_output_types != null) {
                // Output vecs are keyed in the first input vec's group, starting at _vid
                final VectorGroup group = vecs[0].group();
                _appendables = new AppendableVec[_output_types.length];
                outputs = new NewChunk[_output_types.length];
                for (int out = 0; out < _appendables.length; ++out) {
                    _appendables[out] = new AppendableVec(group.vecKey(_vid + out), _output_types[out]);
                    outputs[out] = _appendables[out].chunkForChunkIdx(_lo);
                }
            }
            // Call all the various map() calls that apply
            if (_profile != null) {
                _profile._userstart = System.currentTimeMillis();
            }
            final int vecCount = _fr.vecs().length;
            final int outputCount = (_output_types == null) ? 0 : _output_types.length;
            switch (outputCount) {
                case 0:
                    // Fixed-arity convenience overload first, then the array overload
                    switch (vecCount) {
                        case 1:
                            map(chunks[0]);
                            break;
                        case 2:
                            map(chunks[0], chunks[1]);
                            break;
                        case 3:
                            map(chunks[0], chunks[1], chunks[2]);
                            break;
                        default:
                            break;
                    }
                    map(chunks);
                    break;
                case 1:
                    // Convenience versions for a single output; only the 1- and 2-input
                    // fixed-arity forms are dispatched here
                    assert outputs != null;
                    switch (vecCount) {
                        case 1:
                            map(chunks[0], outputs[0]);
                            break;
                        case 2:
                            map(chunks[0], chunks[1], outputs[0]);
                            break;
                        default:
                            break;
                    }
                    map(chunks, outputs[0]);
                    break;
                case 2:
                    // Convenience versions for 2 outputs (e.g split); only the 1-input
                    // fixed-arity form is dispatched here
                    assert outputs != null;
                    if (vecCount == 1) {
                        map(chunks[0], outputs[0], outputs[1]);
                    }
                    map(chunks, outputs[0], outputs[1]);
                    break;
                default:
                    break;
            }
            // The fully general overload runs for every output count
            map(chunks, outputs);
            // Save results since called map() at least once!
            _res = self();
            // Further D/K/V put any new vec results.
            if (_profile != null) {
                _profile._closestart = System.currentTimeMillis();
            }
            for (Chunk chunk : chunks) {
                chunk.close(_lo, _fs);
            }
            if (_output_types != null) {
                for (NewChunk output : outputs) {
                    output.close(_lo, _fs);
                }
            }
        }
    }
    if (_profile != null) {
        _profile._mapdone = System.currentTimeMillis();
    }
    tryComplete();
}
Also used : VectorGroup(water.fvec.Vec.VectorGroup) PrettyPrint(water.util.PrettyPrint)

Aggregations

VectorGroup (water.fvec.Vec.VectorGroup): 6, DataInfo (hex.DataInfo): 1, Gram (hex.gram.Gram): 1, ExecutionException (java.util.concurrent.ExecutionException): 1, Future (java.util.concurrent.Future): 1, H2OCountedCompleter (water.H2O.H2OCountedCompleter): 1, Key (water.Key): 1, KeyV3 (water.api.schemas3.KeyV3): 1, NonBlockingHashMap (water.nbhm.NonBlockingHashMap): 1, Enum (water.parser.Enum): 1, ValFrame (water.rapids.vals.ValFrame): 1, PrettyPrint (water.util.PrettyPrint): 1