Search in sources :

Example 1 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class DeepWaterParameters method guessProblemType.

  /**
   * Attempt to guess the problem type from the dataset
   * @return the guessed problem type, or the configured one if it is not auto
   */
// Returns the configured problem type unless it is `auto`; in that case the type is
// guessed from the first column of the training frame (image, text, or dataset).
// NOTE(review): this excerpt is incomplete — closing braces, the bodies of both try
// blocks and the `else` before the final dataset return are not visible here.
ProblemType guessProblemType() {
    if (_problem_type == auto) {
        boolean image = false;
        boolean text = false;
        String first = null;
        // Only column 0 of the training frame is inspected.
        Vec v = train().vec(0);
        if (v.isString() || v.isCategorical()) /*small data parser artefact*/
            // Read row 0 of that column as a String.
            BufferedString bs = new BufferedString();
            first = v.atStr(bs, 0).toString();
            // NOTE(review): as shown, nothing inside this try can throw; presumably a statement
            // that attempts to load `first` as an image was dropped — confirm against the full source.
            try {
                image = true;
            } catch (Throwable t) {
            // NOTE(review): second attempt is identical as shown; presumably it uses a different
            // way of loading `first` in the full source — confirm.
            try {
                image = true;
            } catch (Throwable t) {
        if (first != null) {
            // Not flagged as an image above, but the value ends with an image file extension:
            // still classify as image and warn that the first image could not be read.
            if (!image && (first.endsWith(".jpg") || first.endsWith(".png") || first.endsWith(".tif"))) {
                image = true;
                Log.warn("Cannot read first image at " + first + " - Check data.");
            } else if (v.isString() && train().numCols() <= 4) {
                // A string first column in a frame of at most four columns is treated as text.
                //at most text, label, fold_col, weight
                text = true;
        // Image takes precedence over text; anything else is a plain dataset.
        if (image)
            return ProblemType.image;
        else if (text)
            return ProblemType.text;
            return ProblemType.dataset;
    } else {
        // An explicitly configured problem type is returned unchanged.
        return _problem_type;
Also used : Vec(water.fvec.Vec) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString) File(java.io.File) URL(java.net.URL)

Example 2 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class DeepWaterTask method setupLocal.

  /**
   * Transfer ownership from global (shared) model to local model which will be worked on
   */
// Takes over the shared model as the local model, collects the rows to train on for this
// iteration into trainData/trainLabels, builds the iterator matching the problem type, and
// then loops over the iterator while the job is not stopping.
// NOTE(review): this excerpt is incomplete — many closing braces and several statements
// (e.g. the ones guarded by `if (weight == 0)` and `if (file != null)`, and whatever adds
// rows to trainData/trainLabels) are not visible here.
protected void setupLocal() {
    //    long start = System.currentTimeMillis();
    // Hand the shared model over to this task; the shared reference is cleared afterwards.
    assert (_localmodel == null);
    _localmodel = _sharedmodel;
    _sharedmodel = null;
    // Column indices are looked up by name; weightIdx is compared against -1 below to
    // detect a missing weights column.
    final int weightIdx = _fr.find(_localmodel.get_params()._weights_column);
    final int respIdx = _fr.find(_localmodel.get_params()._response_column);
    final int batchSize = _localmodel.get_params()._mini_batch_size;
    //    long nativetime = 0;
    DeepWaterIterator iter = null;
    // The seed is derived from the value returned by get_processed_global().
    long seed = 0xDECAF + 0xD00D * _localmodel.get_processed_global();
    Random rng = RandomUtils.getRNG(seed);
    // The row count is cast to int below, so larger frames are rejected up front.
    if (_fr.numRows() > Integer.MAX_VALUE) {
        throw H2O.unimpl("Need to implement batching into int-sized chunks.");
    int len = (int) _fr.numRows();
    // j counts completed full passes in the while loops below.
    int j = 0;
    Futures fs = new Futures();
    ArrayList trainLabels = new ArrayList<>();
    ArrayList trainData = new ArrayList<>();
    try {
        // Binary data (Images/Documents/etc.)
        if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image || _localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
            //must be the first column //FIXME
            int dataIdx = 0;
            // NOTE(review): the operand between the two `+` after "Using column " is missing
            // in this excerpt — presumably the column name or index; confirm.
            Log.debug("Using column " + + " for " + ((_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image) ? "path to image data" : ((_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) ? "text data" : "path to arbitrary bytes")));
            // full passes over the data
            BufferedString bs = new BufferedString();
            // Example: train_samples_per_iteration = 4700, and train.numRows()=1000 -> _useFraction = 4.7 -> fullpasses = 4
            int fullpasses = (int) _useFraction;
            while (j++ < fullpasses) {
                for (int i = 0; i < _fr.numRows(); ++i) {
                    // Without a weights column every row gets weight 1.
                    double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                    // NOTE(review): the statement guarded by this `if` is missing here
                    // (presumably zero-weight rows are skipped) — confirm.
                    if (weight == 0)
                    BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
                    // NOTE(review): the statement guarded by this `if` is missing here.
                    if (file != null)
                    float response = (float) _fr.vec(respIdx).at(i);
            // fractional passes // 0.7
            // Keep drawing random rows until at least _useFraction * len entries are collected
            // and the count is a multiple of batchSize.
            while (trainData.size() < _useFraction * len || trainData.size() % batchSize != 0) {
                assert (_shuffle);
                int i = rng.nextInt(len);
                double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                if (weight == 0)
                BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
                if (file != null)
                float response = (float) _fr.vec(respIdx).at(i);
        } else // Numeric data (H2O Frame full with numeric columns)
        if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
            // Response scaling: (response - sub) / mul, falling back to sub = 0 and mul = 1
            // when the corresponding array in _dataInfo is null.
            double mul = _localmodel._dataInfo._normRespMul != null ? _localmodel._dataInfo._normRespMul[0] : 1;
            double sub = _localmodel._dataInfo._normRespSub != null ? _localmodel._dataInfo._normRespSub[0] : 0;
            // full passes over the data
            int fullpasses = (int) _useFraction;
            while (j++ < fullpasses) {
                for (int i = 0; i < _fr.numRows(); ++i) {
                    double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                    if (weight == 0)
                    float response = (float) ((_fr.vec(respIdx).at(i) - sub) / mul);
            // fractional passes
            // Same stopping rule as in the image/text branch above.
            while (trainData.size() < _useFraction * len || trainData.size() % batchSize != 0) {
                int i = rng.nextInt(len);
                double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                if (weight == 0)
                float response = (float) ((_fr.vec(respIdx).at(i) - sub) / mul);
        // shuffle the (global) list
        // NOTE(review): as shown, both shuffles draw consecutively from the same rng and would
        // therefore apply different permutations to labels and data; verify that the full source
        // resets the rng before each call so the two lists stay aligned.
        if (_shuffle) {
            Collections.shuffle(trainLabels, rng);
            Collections.shuffle(trainData, rng);
        // Pick the iterator implementation that matches the problem type.
        if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image) {
            iter = new DeepWaterImageIterator(trainData, trainLabels, _localmodel._meanData, batchSize, _localmodel._width, _localmodel._height, _localmodel._channels, _localmodel.get_params()._cache_data);
        } else if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
            assert (_localmodel._dataInfo != null);
            iter = new DeepWaterDatasetIterator(trainData, trainLabels, _localmodel._dataInfo, batchSize, _localmodel.get_params()._cache_data);
        } else if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
            // NOTE(review): the remaining constructor arguments are cut off in this excerpt.
            iter = new DeepWaterTextIterator(trainData, trainLabels, batchSize, 56, /*FIXME*/
        NativeTrainTask ntt;
        // Loop until the iterator is exhausted or the job is asked to stop.
        while (iter.Next(fs) && !_job.isStopping()) {
            //        if (ntt != null) nativetime += ntt._timeInMillis;
            long n = _localmodel.get_processed_total();
            //        if(!_localmodel.get_params()._quiet_mode)
            //"Trained " + n + " samples. Training on " + Arrays.toString(((DeepWaterImageIterator)iter).getFiles()));
            // Learning rate and momentum are recomputed from the processed-sample count n
            // and pushed to the backend before each batch.
            _localmodel._backend.setParameter(_localmodel._model, "learning_rate", _localmodel.get_params().learningRate((double) n));
            _localmodel._backend.setParameter(_localmodel._model, "momentum", _localmodel.get_params().momentum((double) n));
            //fork off GPU work, but let the iterator.Next() wait on completion before swapping again
            //System.err.println("data: " + Arrays.toString(iter.getData()));
            // A NaN anywhere in the predictions makes the sum NaN; that is reported as an
            // unstable model.
            float[] preds = _localmodel._backend.predict(_localmodel._model, iter.getData());
            if (Float.isNaN(ArrayUtils.sum(preds))) {
                throw new UnsupportedOperationException(DeepWaterModel.unstable_msg);
            //        System.err.println("pred: " + Arrays.toString(preds));
            // NOTE(review): what is done with ntt after construction is not visible here.
            ntt = new NativeTrainTask(_localmodel._backend, _localmodel._model, iter.getData(), iter.getLabel());
    //      nativetime += ntt._timeInMillis;
    } catch (IOException e) {
        // NOTE(review): the handler body is not visible in this excerpt.
        //gracefully continue if we can't find files etc.
//    long end = System.currentTimeMillis();
//    if (!_localmodel.get_params()._quiet_mode) {
//"Time for one iteration: " + PrettyPrint.msecs(end - start, true));
//"Time for native training : " + PrettyPrint.msecs(nativetime, true));
//    }
Also used : Futures(water.Futures) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Random(java.util.Random) BufferedString(water.parser.BufferedString)

Example 3 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class Word2VecTest method testTransformAggregate.

// Checks Word2VecModel.transform with AggregateMethod.AVERAGE on a string column in which
// null entries separate sentences, including a sentence that spans a chunk boundary.
// NOTE(review): the body of the `finally` block and the closing braces are not visible
// in this excerpt.
public void testTransformAggregate() {
    try {
        // Training vocabulary consists of the two words "a" and "b".
        Vec v = Scope.track(svec("a", "b"));
        Frame fr = Scope.track(new Frame(Key.<Frame>make(), new String[] { "Words" }, new Vec[] { v }));
        // build an arbitrary w2v model & overwrite the learned vector with fixed values
        Word2VecModel.Word2VecParameters p = new Word2VecModel.Word2VecParameters();
        p._train = fr._key;
        p._min_word_freq = 0;
        p._epochs = 1;
        p._vec_size = 2;
        Word2VecModel w2vm = (Word2VecModel) Scope.track_generic(new Word2Vec(p).trainModel().get());
        // Four floats with _vec_size = 2: presumably one 2-element vector per word,
        // (1, 0) and (0, 1) — confirm the layout of _vecs.
        w2vm._output._vecs = new float[] { 1.0f, 0.0f, 0.0f, 1.0f };
        // Five sentences: [a b], [a c], [c], [a a a b], [b]; "c" is not in the training words.
        // The chunk layout (10, 4) below splits the fourth sentence across two chunks.
        String[] sentences = { "a", "b", null, "a", "c", null, "c", null, "a", "a", /*chunk end*/
        "a", "b", null, // no terminator at the end
        "b" };
        Frame f = new TestFrameBuilder().withName("data").withColNames("Sentences").withVecTypes(Vec.T_STR).withDataForCol(0, sentences).withChunkLayout(10, 4).build();
        Frame result = Scope.track(w2vm.transform(f.vec(0), Word2VecModel.AggregateMethod.AVERAGE));
        // One expected value per sentence; the sentence made up only of "c" is expected
        // to yield NaN in both columns.
        Vec expectedAs = Scope.track(dvec(0.5, 1.0, Double.NaN, 0.75, 0.0));
        Vec expectedBs = Scope.track(dvec(0.5, 0.0, Double.NaN, 0.25, 1.0));
        // The result column to compare is selected via the vocabulary index of each word.
        assertVecEquals(expectedAs, result.vec(w2vm._output._vocab.get(new BufferedString("a"))), 0.0001);
        assertVecEquals(expectedBs, result.vec(w2vm._output._vocab.get(new BufferedString("b"))), 0.0001);
    } finally {
Also used : Frame(water.fvec.Frame) TestFrameBuilder(water.fvec.TestFrameBuilder) Vec(water.fvec.Vec) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString)

Example 4 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class RollupStats method map.

// Computes the rollup statistics of a single chunk (size, checksum, counts, mean, sigma, ...)
// into the fields of this object and returns this.
// Fast paths handle constant chunks, all-NA constant chunks and 0/1 chunks; UUID and
// String chunks only contribute to the checksum; everything else is delegated to
// RollupStatsHelpers.numericChunkRollup.
// NOTE(review): this excerpt is incomplete — closing braces and a number of statements
// (bodies of several `if`/`for` constructs, an `else` before the generic rollup call) are
// not visible here.
private RollupStats map(Chunk c) {
    _size = c.byteSize();
    boolean isUUID = c._vec.isUUID();
    boolean isString = c._vec.isString();
    // Reused buffer for reading string values in the String branch below.
    BufferedString tmpStr = new BufferedString();
    if (isString)
        _isInt = false;
    // Checksum support
    long checksum = 0;
    long start = c._start;
    long l = 81985529216486895L;
    // Check for popular easy cases: All Constant
    double min = c.min(), max = c.max();
    if (min == max) {
        // All constant or all NaN
        // It's the min, it's the max, it's the alpha and omega
        double d = min;
        // Checksum of a constant chunk: the value's bit pattern (or its long value when the
        // chunk has no floats) multiplied by the chunk length.
        _checksum = (c.hasFloat() ? Double.doubleToRawLongBits(d) : (long) d) * c._len;
        Arrays.fill(_mins, d);
        Arrays.fill(_maxs, d);
        // NOTE(review): the statements for the +/- infinity cases are missing in this excerpt.
        if (d == Double.POSITIVE_INFINITY)
        else if (d == Double.NEGATIVE_INFINITY)
        else {
            if (Double.isNaN(d))
                _naCnt = c._len;
            else if (d != 0)
                _nzCnt = c._len;
            _mean = d;
            _rows = c._len;
        _isInt = ((long) d) == d;
        // No variance for constants
        _sigma = 0;
        return this;
    //all const NaNs
    if ((c instanceof C0DChunk && c.isNA_impl(0))) {
        //count of non-NAs * variance of non-NAs
        _sigma = 0;
        //sum of non-NAs (will get turned into mean)
        _mean = 0;
        _naCnt = c._len;
        _nzCnt = 0;
        return this;
    // Check for popular easy cases: Boolean, possibly sparse, possibly NaN
    if (min == 0 && max == 1) {
        // Easy zeros
        int zs = c._len - c.sparseLenZero();
        int nans = 0;
        // Hard-count sparse-but-zero (weird case of setting a zero over a non-zero)
        // NOTE(review): the statements executed for NA and for explicit-zero entries are
        // missing in this excerpt (presumably they bump nans and zs) — confirm.
        for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) if (c.isNA(i))
        else if (c.at8(i) == 0)
        // Ones
        int os = c._len - zs - nans;
        _nzCnt += os;
        _naCnt += nans;
        // NOTE(review): both loop bodies are missing in this excerpt.
        for (int i = 0; i < Math.min(_mins.length, zs); i++) {
        for (int i = 0; i < Math.min(_mins.length, os); i++) {
        _rows += zs + os;
        _mean = (double) os / _rows;
        // Here _sigma holds the sum of squared deviations from the mean over the zs zeros
        // and os ones, not yet a standard deviation.
        _sigma = zs * (0.0 - _mean) * (0.0 - _mean) + os * (1.0 - _mean) * (1.0 - _mean);
        return this;
    // Walk the non-zeros
    if (isUUID) {
        // UUID columns do not compute min/max/mean/sigma
        for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
            // NOTE(review): the statement for the NA case is missing in this excerpt.
            if (c.isNA(i))
            else {
                long lo = c.at16l(i), hi = c.at16h(i);
                // NOTE(review): as shown, the assignment to l is what this `if` guards; a
                // statement may have been dropped between the two lines — confirm.
                // `*` binds tighter than `^`, so this is lo ^ (37 * hi).
                if (lo != 0 || hi != 0)
                l = lo ^ 37 * hi;
            // Mixes the absolute row number (start + i) with the per-row hash l:
            // (17 * (start + i)) ^ (23 * l).
            if (// ignore 0s in checksum to be consistent with sparse chunks
            l != 0)
                checksum ^= (17 * (start + i)) ^ 23 * l;
    } else if (isString) {
        // String columns do not compute min/max/mean/sigma
        for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
            // NOTE(review): the statement for the NA case is missing in this excerpt.
            if (c.isNA(i))
            else {
                // The per-row hash is the hashCode of the string value read into tmpStr.
                l = c.atStr(tmpStr, i).hashCode();
            if (// ignore 0s in checksum to be consistent with sparse chunks
            l != 0)
                checksum ^= (17 * (start + i)) ^ 23 * l;
    } else {
        // Work off all numeric rows, or only the nonzeros for sparse
        // The casts select the numericChunkRollup overload for each concrete chunk class.
        if (c instanceof C1Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C1Chunk) c, start, checksum);
        else if (c instanceof C1SChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C1SChunk) c, start, checksum);
        else if (c instanceof C1NChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C1NChunk) c, start, checksum);
        else if (c instanceof C2Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C2Chunk) c, start, checksum);
        else if (c instanceof C2SChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C2SChunk) c, start, checksum);
        else if (c instanceof C4SChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C4SChunk) c, start, checksum);
        else if (c instanceof C4FChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C4FChunk) c, start, checksum);
        else if (c instanceof C4Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C4Chunk) c, start, checksum);
        else if (c instanceof C8Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C8Chunk) c, start, checksum);
        else if (c instanceof C8DChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C8DChunk) c, start, checksum);
            // NOTE(review): the `else` introducing this generic fallback is missing in this excerpt.
            checksum = new RollupStatsHelpers(this).numericChunkRollup(c, start, checksum);
        // handle the zeros
        if (c.isSparseZero()) {
            // Number of implicit zeros: chunk length minus the count reported by sparseLenZero().
            int zeros = c._len - c.sparseLenZero();
            if (zeros > 0) {
                // NOTE(review): the loop body is missing in this excerpt.
                for (int i = 0; i < Math.min(_mins.length, zeros); i++) {
                // Fold a group of `zeros` values, whose mean and sum of squared deviations are
                // both 0, into the running _mean / _sigma / _rows.
                double zeromean = 0;
                double zeroM2 = 0;
                double delta = _mean - zeromean;
                _mean = (_mean * _rows + zeromean * zeros) / (_rows + zeros);
                //this is the variance*(N-1), will do sqrt(_sigma/(N-1)) later in postGlobal
                _sigma += zeroM2 + delta * delta * _rows * zeros / (_rows + zeros);
                _rows += zeros;
        } else if (c.isSparseNA()) {
            // NA count for NA-sparse chunks: chunk length minus the count reported by sparseLenNA().
            _naCnt = c._len - c.sparseLenNA();
    _checksum = checksum;
    // UUID and String columns do not compute min/max/mean/sigma
    if (isUUID || isString) {
        Arrays.fill(_mins, Double.NaN);
        Arrays.fill(_maxs, Double.NaN);
        _mean = _sigma = Double.NaN;
    return this;
Also used : BufferedString(water.parser.BufferedString)

Example 5 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class AstFlatten method apply.

// Evaluates asts[1] to a frame and, when that frame is exactly 1x1, returns its single
// cell as a scalar Val chosen by the column type; any other frame is returned unchanged
// wrapped in a ValFrame.
// NOTE(review): this excerpt is incomplete — the ValNum argument, one `case` label, the
// label before the final throw and the closing braces are not visible here.
public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    // did not flatten
    if (fr.numCols() != 1 || fr.numRows() != 1)
        return new ValFrame(fr);
    Vec vec = fr.anyVec();
    switch(vec.get_type()) {
        // T_BAD falls through to the T_NUM handling.
        case Vec.T_BAD:
        case Vec.T_NUM:
            // NOTE(review): the argument of ValNum is cut off in this excerpt.
            return new ValNum(;
        case Vec.T_TIME:
            // check for missing values
            return vec.isNA(0) ? new ValNum(Double.NaN) : new ValNum(vec.at8(0));
        case Vec.T_STR:
            return new ValStr(vec.atStr(new BufferedString(), 0).toString());
        // NOTE(review): the case label is cut off; the use of vec.factor(...) below suggests
        // the categorical type — confirm. A missing value is returned as the string "NA".
        case // check for missing values
            return vec.isNA(0) ? new ValStr("NA") : new ValStr(vec.factor(vec.at8(0)));
            // Reached for any type not handled above (the introducing label is not visible).
            throw H2O.unimpl("The type of vector: " + vec.get_type_str() + " is not supported by " + str());
Also used : ValFrame(water.rapids.vals.ValFrame) ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) Vec(water.fvec.Vec) ValStr(water.rapids.vals.ValStr) BufferedString(water.parser.BufferedString) ValNum(water.rapids.vals.ValNum)


BufferedString (water.parser.BufferedString)43 Frame (water.fvec.Frame)12 Test (org.junit.Test)9 MRTask (water.MRTask)8 Vec (water.fvec.Vec)8 Chunk (water.fvec.Chunk)7 NewChunk (water.fvec.NewChunk)6 ValFrame (water.rapids.vals.ValFrame)5 IcedLong (water.util.IcedLong)5 IOException ( ByteBuffer (java.nio.ByteBuffer)2 Random (java.util.Random)2 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)2 TestFrameBuilder (water.fvec.TestFrameBuilder)2 BackendModel (deepwater.backends.BackendModel)1 BackendParams (deepwater.backends.BackendParams)1 RuntimeOptions (deepwater.backends.RuntimeOptions)1 ImageDataSet (deepwater.datasets.ImageDataSet)1 GenModel (hex.genmodel.GenModel)1 EasyPredictModelWrapper (hex.genmodel.easy.EasyPredictModelWrapper)1