Search in sources :

Example 1 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class DeepWaterParameters method guessProblemType.

/**
 * Attempts to guess the problem type from the training dataset.
 *
 * <p>If {@code _problem_type} is anything other than {@code auto} it is returned unchanged.
 * Otherwise the first cell of the first training column is inspected:
 * <ul>
 *   <li>if it can be opened through {@code ImageIO.read} as a local file or as a URL, or it
 *       ends with {@code .jpg}, {@code .png} or {@code .tif}, the guess is
 *       {@link ProblemType#image};</li>
 *   <li>else if the column is a string column and the frame has at most 4 columns, the guess
 *       is {@link ProblemType#text};</li>
 *   <li>otherwise the guess is {@link ProblemType#dataset}.</li>
 * </ul>
 *
 * @return the configured problem type, or the guessed one when it was {@code auto}
 */
ProblemType guessProblemType() {
    if (_problem_type != auto) {
        return _problem_type;
    }
    boolean image = false;
    boolean text = false;
    String first = null;
    Vec v = train().vec(0);
    if (v.isString() || v.isCategorical()) /*small data parser artefact*/
    {
        BufferedString bs = new BufferedString();
        BufferedString firstCell = v.atStr(bs, 0);
        // A missing first cell appears to come back as null (other callers of atStr null-check
        // the result); guard it so guessing does not die with a NullPointerException.
        if (firstCell != null) {
            first = firstCell.toString();
            try {
                // NOTE(review): ImageIO.read returns null (it does not throw) when no registered
                // reader can decode the input, so "no exception" only proves the source could be
                // opened, not that it is a decodable image.
                ImageIO.read(new File(first));
                image = true;
            } catch (Throwable ignored) {
                // best-effort probe: not a readable local file, try it as a URL below
            }
            // Only probe the URL form when the local-file probe failed; the outcome cannot
            // change once image is already true, and this avoids a needless second read.
            if (!image) {
                try {
                    ImageIO.read(new URL(first));
                    image = true;
                } catch (Throwable ignored) {
                    // best-effort probe: not a readable URL either, fall back to the heuristics below
                }
            }
        }
    }
    if (first != null) {
        if (!image && (first.endsWith(".jpg") || first.endsWith(".png") || first.endsWith(".tif"))) {
            // Looks like an image path by extension even though it could not be read.
            image = true;
            Log.warn("Cannot read first image at " + first + " - Check data.");
        } else if (v.isString() && train().numCols() <= 4) {
            //at most text, label, fold_col, weight
            text = true;
        }
    }
    if (image)
        return ProblemType.image;
    else if (text)
        return ProblemType.text;
    else
        return ProblemType.dataset;
}
Also used : Vec(water.fvec.Vec) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString) File(java.io.File) URL(java.net.URL)

Example 2 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class DeepWaterTask method setupLocal.

/**
 * Transfer ownership from global (shared) model to local model which will be worked on,
 * then run one local training iteration on it.
 *
 * <p>Steps performed:
 * <ol>
 *   <li>Move {@code _sharedmodel} into {@code _localmodel} and reset its local sample counter.</li>
 *   <li>Build parallel lists of training data and labels: {@code (int) _useFraction} full passes
 *       over the frame, followed by random draws until the list holds at least
 *       {@code _useFraction * numRows} entries and its size is a multiple of the mini-batch
 *       size. Rows with weight 0 are skipped. For image/text problems the data entry is the
 *       string in column 0; for dataset problems it is the row index.</li>
 *   <li>Optionally shuffle both lists with the same seed so they stay aligned.</li>
 *   <li>Create the iterator matching the problem type and, for every batch, update learning
 *       rate and momentum, run a prediction as a NaN stability check, and submit a
 *       {@code NativeTrainTask}.</li>
 * </ol>
 *
 * @throws UnsupportedOperationException if the predictions of a batch sum to NaN
 */
@Override
protected void setupLocal() {
    //    long start = System.currentTimeMillis();
    assert (_localmodel == null);
    _localmodel = _sharedmodel;
    _sharedmodel = null;
    _localmodel.set_processed_local(0);
    final int weightIdx = _fr.find(_localmodel.get_params()._weights_column);
    final int respIdx = _fr.find(_localmodel.get_params()._response_column);
    final int batchSize = _localmodel.get_params()._mini_batch_size;
    //    long nativetime = 0;
    DeepWaterIterator iter = null;
    // Seed depends on the global progress so each iteration draws/shuffles differently.
    long seed = 0xDECAF + 0xD00D * _localmodel.get_processed_global();
    Random rng = RandomUtils.getRNG(seed);
    if (_fr.numRows() > Integer.MAX_VALUE) {
        throw H2O.unimpl("Need to implement batching into int-sized chunks.");
    }
    int len = (int) _fr.numRows();
    int j = 0;
    Futures fs = new Futures();
    // trainData and trainLabels are parallel lists: entry k of one belongs to entry k of the other.
    ArrayList trainLabels = new ArrayList<>();
    ArrayList trainData = new ArrayList<>();
    try {
        // Binary data (Images/Documents/etc.)
        if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image || _localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
            //must be the first column //FIXME
            int dataIdx = 0;
            Log.debug("Using column " + _fr.name(dataIdx) + " for " + ((_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image) ? "path to image data" : ((_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) ? "text data" : "path to arbitrary bytes")));
            // full passes over the data
            BufferedString bs = new BufferedString();
            // Example: train_samples_per_iteration = 4700, and train.numRows()=1000 -> _useFraction = 4.7 -> fullpasses = 4
            int fullpasses = (int) _useFraction;
            while (j++ < fullpasses) {
                for (int i = 0; i < _fr.numRows(); ++i) {
                    double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                    if (weight == 0)
                        continue;
                    BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
                    // Skip the whole row when there is no data entry. Adding the label without
                    // its data would leave the two lists with different lengths, and the
                    // same-seed shuffles below would then pair labels with the wrong samples.
                    if (file == null)
                        continue;
                    trainData.add(file.toString());
                    float response = (float) _fr.vec(respIdx).at(i);
                    trainLabels.add(response);
                }
            }
            // fractional passes // 0.7
            while (trainData.size() < _useFraction * len || trainData.size() % batchSize != 0) {
                assert (_shuffle);
                int i = rng.nextInt(len);
                double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                if (weight == 0)
                    continue;
                BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
                // Same alignment rule as in the full passes: no data entry, no label.
                if (file == null)
                    continue;
                trainData.add(file.toString());
                float response = (float) _fr.vec(respIdx).at(i);
                trainLabels.add(response);
            }
        } else // Numeric data (H2O Frame full with numeric columns)
        if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
            // Response standardization: label = (raw - sub) / mul, identity when not configured.
            double mul = _localmodel._dataInfo._normRespMul != null ? _localmodel._dataInfo._normRespMul[0] : 1;
            double sub = _localmodel._dataInfo._normRespSub != null ? _localmodel._dataInfo._normRespSub[0] : 0;
            // full passes over the data
            int fullpasses = (int) _useFraction;
            while (j++ < fullpasses) {
                for (int i = 0; i < _fr.numRows(); ++i) {
                    double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                    if (weight == 0)
                        continue;
                    float response = (float) ((_fr.vec(respIdx).at(i) - sub) / mul);
                    trainData.add(i);
                    trainLabels.add(response);
                }
            }
            // fractional passes
            while (trainData.size() < _useFraction * len || trainData.size() % batchSize != 0) {
                int i = rng.nextInt(len);
                double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
                if (weight == 0)
                    continue;
                float response = (float) ((_fr.vec(respIdx).at(i) - sub) / mul);
                trainData.add(i);
                trainLabels.add(response);
            }
        }
        // shuffle the (global) list
        if (_shuffle) {
            // Re-seeding before each shuffle applies the same permutation to both lists,
            // which keeps data and labels paired (requires the lists to have equal size).
            rng.setSeed(seed);
            Collections.shuffle(trainLabels, rng);
            rng.setSeed(seed);
            Collections.shuffle(trainData, rng);
        }
        if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image) {
            iter = new DeepWaterImageIterator(trainData, trainLabels, _localmodel._meanData, batchSize, _localmodel._width, _localmodel._height, _localmodel._channels, _localmodel.get_params()._cache_data);
        } else if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
            assert (_localmodel._dataInfo != null);
            iter = new DeepWaterDatasetIterator(trainData, trainLabels, _localmodel._dataInfo, batchSize, _localmodel.get_params()._cache_data);
        } else if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
            iter = new DeepWaterTextIterator(trainData, trainLabels, batchSize, 56, /*FIXME*/
            _localmodel.get_params()._cache_data);
        }
        // NOTE(review): iter stays null for any other problem type, so iter.Next below would
        // throw a NullPointerException - confirm callers always resolve the type beforehand.
        NativeTrainTask ntt;
        while (iter.Next(fs) && !_job.isStopping()) {
            //        if (ntt != null) nativetime += ntt._timeInMillis;
            long n = _localmodel.get_processed_total();
            //        if(!_localmodel.get_params()._quiet_mode)
            //          Log.info("Trained " + n + " samples. Training on " + Arrays.toString(((DeepWaterImageIterator)iter).getFiles()));
            _localmodel._backend.setParameter(_localmodel._model, "learning_rate", _localmodel.get_params().learningRate((double) n));
            _localmodel._backend.setParameter(_localmodel._model, "momentum", _localmodel.get_params().momentum((double) n));
            //fork off GPU work, but let the iterator.Next() wait on completion before swapping again
            //System.err.println("data: " + Arrays.toString(iter.getData()));
            // Stability check: a NaN anywhere in the predictions makes the sum NaN.
            float[] preds = _localmodel._backend.predict(_localmodel._model, iter.getData());
            if (Float.isNaN(ArrayUtils.sum(preds))) {
                Log.err(DeepWaterModel.unstable_msg);
                throw new UnsupportedOperationException(DeepWaterModel.unstable_msg);
            }
            //        System.err.println("pred: " + Arrays.toString(preds));
            ntt = new NativeTrainTask(_localmodel._backend, _localmodel._model, iter.getData(), iter.getLabel());
            fs.add(H2O.submitTask(ntt));
            _localmodel.add_processed_local(iter._batch_size);
        }
        fs.blockForPending();
    //      nativetime += ntt._timeInMillis;
    } catch (IOException e) {
        //gracefully continue if we can't find files etc.
        e.printStackTrace();
    }
//    long end = System.currentTimeMillis();
//    if (!_localmodel.get_params()._quiet_mode) {
//      Log.info("Time for one iteration: " + PrettyPrint.msecs(end - start, true));
//      Log.info("Time for native training : " + PrettyPrint.msecs(nativetime, true));
//    }
}
Also used : Futures(water.Futures) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Random(java.util.Random) BufferedString(water.parser.BufferedString)

Example 3 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class Word2VecTest method testTransformAggregate.

/**
 * Checks that transforming a column of words with {@code AggregateMethod.AVERAGE} produces
 * the expected per-sentence averages, including for a string column that spans two chunks
 * and whose last sentence has no trailing terminator.
 */
@Test
public void testTransformAggregate() {
    Scope.enter();
    try {
        // Training frame: a single "Words" column containing the vocabulary {a, b}.
        Vec vocabWords = Scope.track(svec("a", "b"));
        String[] trainColNames = { "Words" };
        Vec[] trainCols = { vocabWords };
        Frame trainFrame = Scope.track(new Frame(Key.<Frame>make(), trainColNames, trainCols));
        DKV.put(trainFrame);

        // Build an arbitrary w2v model, then replace the learned vectors with fixed values
        // so the expected averages below are deterministic.
        Word2VecModel.Word2VecParameters params = new Word2VecModel.Word2VecParameters();
        params._train = trainFrame._key;
        params._min_word_freq = 0;
        params._epochs = 1;
        params._vec_size = 2;
        Word2VecModel model = (Word2VecModel) Scope.track_generic(new Word2Vec(params).trainModel().get());
        model._output._vecs = new float[] { 1.0f, 0.0f, 0.0f, 1.0f };
        DKV.put(model);

        // Input words; the chunk layout below is (10, 4), so the first ten entries
        // fill the first chunk and the remaining four the second.
        String[] sentences = {
            "a", "b", null, "a", "c", null, "c", null, "a", "a", /*chunk end*/
            "a", "b", null,
            // no terminator at the end
            "b"
        };
        Frame sentenceFrame = new TestFrameBuilder()
            .withName("data")
            .withColNames("Sentences")
            .withVecTypes(Vec.T_STR)
            .withDataForCol(0, sentences)
            .withChunkLayout(10, 4)
            .build();

        Frame aggregated = Scope.track(model.transform(sentenceFrame.vec(0), Word2VecModel.AggregateMethod.AVERAGE));

        // Expected values of the output columns associated with words "a" and "b".
        Vec expectedForA = Scope.track(dvec(0.5, 1.0, Double.NaN, 0.75, 0.0));
        Vec expectedForB = Scope.track(dvec(0.5, 0.0, Double.NaN, 0.25, 1.0));
        assertVecEquals(expectedForA, aggregated.vec(model._output._vocab.get(new BufferedString("a"))), 0.0001);
        assertVecEquals(expectedForB, aggregated.vec(model._output._vocab.get(new BufferedString("b"))), 0.0001);
    } finally {
        Scope.exit();
    }
}
Also used : Frame(water.fvec.Frame) TestFrameBuilder(water.fvec.TestFrameBuilder) Vec(water.fvec.Vec) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString)

Example 4 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class RollupStats method map.

/**
 * Computes the rollup statistics of a single chunk and stores them in this object's fields
 * (size, checksum, mins/maxs, NA / non-zero / infinity counts, row count, mean and the
 * running sigma accumulator).
 *
 * <p>Two shortcuts are tried first: a chunk whose min equals its max (constant), and a chunk
 * whose min is 0 and max is 1 (boolean-like). Otherwise the rows are walked: UUID and String
 * chunks only get counts and a checksum, numeric chunks are delegated to
 * {@code RollupStatsHelpers.numericChunkRollup}.
 *
 * @param c the chunk to summarize
 * @return this object, with its fields filled in for {@code c}
 */
private RollupStats map(Chunk c) {
    _size = c.byteSize();
    boolean isUUID = c._vec.isUUID();
    boolean isString = c._vec.isString();
    // Scratch buffer reused for every atStr call in the String loop below.
    BufferedString tmpStr = new BufferedString();
    if (isString)
        _isInt = false;
    // Checksum support
    long checksum = 0;
    long start = c._start;
    // Per-row hash used by the UUID/String loops; starts at 0x0123456789ABCDEF and is
    // overwritten by each non-NA row.
    long l = 81985529216486895L;
    // Check for popular easy cases: All Constant
    double min = c.min(), max = c.max();
    if (min == max) {
        // All constant or all NaN
        // It's the min, it's the max, it's the alpha and omega
        // NOTE(review): in Java NaN == NaN is false, so d cannot be NaN inside this branch;
        // the Double.isNaN(d) case below looks unreachable and all-NaN chunks appear to be
        // covered by the C0DChunk check after this block - confirm.
        double d = min;
        _checksum = (c.hasFloat() ? Double.doubleToRawLongBits(d) : (long) d) * c._len;
        Arrays.fill(_mins, d);
        Arrays.fill(_maxs, d);
        if (d == Double.POSITIVE_INFINITY)
            _pinfs++;
        else if (d == Double.NEGATIVE_INFINITY)
            _ninfs++;
        else {
            if (Double.isNaN(d))
                _naCnt = c._len;
            else if (d != 0)
                _nzCnt = c._len;
            _mean = d;
            _rows = c._len;
        }
        // True when d has no fractional part.
        _isInt = ((long) d) == d;
        // No variance for constants
        _sigma = 0;
        return this;
    }
    //all const NaNs
    if ((c instanceof C0DChunk && c.isNA_impl(0))) {
        //count of non-NAs * variance of non-NAs
        _sigma = 0;
        //sum of non-NAs (will get turned into mean)
        _mean = 0;
        _naCnt = c._len;
        _nzCnt = 0;
        return this;
    }
    // Check for popular easy cases: Boolean, possibly sparse, possibly NaN
    if (min == 0 && max == 1) {
        // Easy zeros
        int zs = c._len - c.sparseLenZero();
        int nans = 0;
        // Hard-count sparse-but-zero (weird case of setting a zero over a non-zero)
        for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) if (c.isNA(i))
            nans++;
        else if (c.at8(i) == 0)
            zs++;
        // Ones
        int os = c._len - zs - nans;
        _nzCnt += os;
        _naCnt += nans;
        // Feed at most _mins.length zeros and ones into the min/max trackers; more
        // repetitions of the same value than there are slots would add nothing.
        for (int i = 0; i < Math.min(_mins.length, zs); i++) {
            min(0);
            max(0);
        }
        for (int i = 0; i < Math.min(_mins.length, os); i++) {
            min(1);
            max(1);
        }
        _rows += zs + os;
        _mean = (double) os / _rows;
        // Sum of squared deviations from the mean over the zeros and ones (not yet divided
        // by the row count).
        _sigma = zs * (0.0 - _mean) * (0.0 - _mean) + os * (1.0 - _mean) * (1.0 - _mean);
        return this;
    }
    // Walk the non-zeros
    if (isUUID) {
        // UUID columns do not compute min/max/mean/sigma
        for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
            if (c.isNA(i))
                _naCnt++;
            else {
                long lo = c.at16l(i), hi = c.at16h(i);
                if (lo != 0 || hi != 0)
                    _nzCnt++;
                l = lo ^ 37 * hi;
            }
            // NOTE(review): on an NA row l is not reassigned, so the checksum mixes in the
            // value left by the previous row (or the initial constant) - confirm intended.
            if (// ignore 0s in checksum to be consistent with sparse chunks
            l != 0)
                checksum ^= (17 * (start + i)) ^ 23 * l;
        }
    } else if (isString) {
        // String columns do not compute min/max/mean/sigma
        for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
            if (c.isNA(i))
                _naCnt++;
            else {
                _nzCnt++;
                l = c.atStr(tmpStr, i).hashCode();
            }
            // NOTE(review): same as the UUID loop - l keeps its previous value on NA rows.
            if (// ignore 0s in checksum to be consistent with sparse chunks
            l != 0)
                checksum ^= (17 * (start + i)) ^ 23 * l;
        }
    } else {
        // Work off all numeric rows, or only the nonzeros for sparse
        // Each instanceof test casts to the concrete chunk class so the matching
        // numericChunkRollup overload is selected (presumably type-specialized versions of
        // the same rollup - confirm in RollupStatsHelpers); the final else is the generic one.
        if (c instanceof C1Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C1Chunk) c, start, checksum);
        else if (c instanceof C1SChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C1SChunk) c, start, checksum);
        else if (c instanceof C1NChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C1NChunk) c, start, checksum);
        else if (c instanceof C2Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C2Chunk) c, start, checksum);
        else if (c instanceof C2SChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C2SChunk) c, start, checksum);
        else if (c instanceof C4SChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C4SChunk) c, start, checksum);
        else if (c instanceof C4FChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C4FChunk) c, start, checksum);
        else if (c instanceof C4Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C4Chunk) c, start, checksum);
        else if (c instanceof C8Chunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C8Chunk) c, start, checksum);
        else if (c instanceof C8DChunk)
            checksum = new RollupStatsHelpers(this).numericChunkRollup((C8DChunk) c, start, checksum);
        else
            checksum = new RollupStatsHelpers(this).numericChunkRollup(c, start, checksum);
        // handle the zeros
        if (c.isSparseZero()) {
            // Rows not stored by the sparse chunk are zeros; fold them into the statistics.
            int zeros = c._len - c.sparseLenZero();
            if (zeros > 0) {
                for (int i = 0; i < Math.min(_mins.length, zeros); i++) {
                    min(0);
                    max(0);
                }
                // Merge two groups of rows: the ones processed so far (_mean, _sigma, _rows)
                // and a group of `zeros` rows whose mean and sum of squared deviations are 0.
                double zeromean = 0;
                double zeroM2 = 0;
                double delta = _mean - zeromean;
                _mean = (_mean * _rows + zeromean * zeros) / (_rows + zeros);
                //this is the variance*(N-1), will do sqrt(_sigma/(N-1)) later in postGlobal
                _sigma += zeroM2 + delta * delta * _rows * zeros / (_rows + zeros);
                _rows += zeros;
            }
        } else if (c.isSparseNA()) {
            // Rows not stored by an NA-sparse chunk are counted as missing.
            _naCnt = c._len - c.sparseLenNA();
        }
    }
    _checksum = checksum;
    // UUID and String columns do not compute min/max/mean/sigma
    if (isUUID || isString) {
        Arrays.fill(_mins, Double.NaN);
        Arrays.fill(_maxs, Double.NaN);
        _mean = _sigma = Double.NaN;
    }
    return this;
}
Also used : BufferedString(water.parser.BufferedString)

Example 5 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

the class AstFlatten method apply.

/**
 * Flattens a 1x1 frame into a scalar value.
 *
 * <p>The frame produced by {@code asts[1]} is returned unchanged (wrapped in a
 * {@code ValFrame}) unless it has exactly one column and one row. Otherwise its single cell
 * is converted according to the column type: numeric and bad columns to {@code ValNum},
 * time columns to {@code ValNum} (NaN when missing), string and categorical columns to
 * {@code ValStr} ({@code "NA"} when missing).
 *
 * @param env  execution environment used to evaluate the argument
 * @param stk  stack helper that tracks the evaluated argument
 * @param asts the call's AST nodes; {@code asts[1]} is the frame to flatten
 * @return the flattened scalar, or the original frame if it is not 1x1
 */
@Override
public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    // did not flatten
    if (fr.numCols() != 1 || fr.numRows() != 1)
        return new ValFrame(fr);
    Vec vec = fr.anyVec();
    switch(vec.get_type()) {
        case Vec.T_BAD:
        case Vec.T_NUM:
            return new ValNum(vec.at(0));
        case Vec.T_TIME:
            // check for missing values
            return vec.isNA(0) ? new ValNum(Double.NaN) : new ValNum(vec.at8(0));
        case Vec.T_STR:
            // check for missing values: guard before dereferencing the atStr result, and
            // use the same "NA" marker as the categorical case below
            return vec.isNA(0) ? new ValStr("NA") : new ValStr(vec.atStr(new BufferedString(), 0).toString());
        case Vec.T_CAT:
            // check for missing values
            return vec.isNA(0) ? new ValStr("NA") : new ValStr(vec.factor(vec.at8(0)));
        default:
            throw H2O.unimpl("The type of vector: " + vec.get_type_str() + " is not supported by " + str());
    }
}
Also used : ValFrame(water.rapids.vals.ValFrame) ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) Vec(water.fvec.Vec) ValStr(water.rapids.vals.ValStr) BufferedString(water.parser.BufferedString) ValNum(water.rapids.vals.ValNum)

Aggregations

BufferedString (water.parser.BufferedString)43 Frame (water.fvec.Frame)12 Test (org.junit.Test)9 MRTask (water.MRTask)8 Vec (water.fvec.Vec)8 Chunk (water.fvec.Chunk)7 NewChunk (water.fvec.NewChunk)6 ValFrame (water.rapids.vals.ValFrame)5 IcedLong (water.util.IcedLong)5 IOException (java.io.IOException)2 ByteBuffer (java.nio.ByteBuffer)2 Random (java.util.Random)2 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)2 TestFrameBuilder (water.fvec.TestFrameBuilder)2 BackendModel (deepwater.backends.BackendModel)1 BackendParams (deepwater.backends.BackendParams)1 RuntimeOptions (deepwater.backends.RuntimeOptions)1 ImageDataSet (deepwater.datasets.ImageDataSet)1 GenModel (hex.genmodel.GenModel)1 EasyPredictModelWrapper (hex.genmodel.easy.EasyPredictModelWrapper)1