Use of water.parser.BufferedString in project h2o-3 by h2oai.
From the class DeepWaterParameters, method guessProblemType:
/**
* Attempt to guess the problem type from the dataset
* @return the guessed ProblemType (image, text or dataset), or the explicitly set type when not auto
*/
ProblemType guessProblemType() {
if (_problem_type == ProblemType.auto) {
boolean image = false;
boolean text = false;
String first = null;
Vec v = train().vec(0);
if (v.isString() || v.isCategorical()) { // small data parser artefact
BufferedString bs = new BufferedString();
first = v.atStr(bs, 0).toString();
try {
ImageIO.read(new File(first));
image = true;
} catch (Throwable t) {
// ignored: not a readable local image file, fall through and try it as a URL
}
try {
ImageIO.read(new URL(first));
image = true;
} catch (Throwable t) {
// ignored: not a readable image URL (harmless if the local-file attempt already succeeded)
}
}
if (first != null) {
if (!image && (first.endsWith(".jpg") || first.endsWith(".png") || first.endsWith(".tif"))) {
// has an image file extension but could not be decoded above: still assume image data, but warn
image = true;
Log.warn("Cannot read first image at " + first + " - Check data.");
} else if (v.isString() && train().numCols() <= 4) {
//at most text, label, fold_col, weight
text = true;
}
}
if (image)
return ProblemType.image;
else if (text)
return ProblemType.text;
else
return ProblemType.dataset;
} else {
return _problem_type;
}
}
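A note on the pattern above: Vec.atStr fills a caller-supplied BufferedString instead of allocating a new java.lang.String per row, which is why a single buffer is created before the read. A minimal sketch of that reuse pattern when scanning a whole column (an illustration only, not from the h2o-3 source; v is assumed to be a string Vec):
// One buffer is allocated up front and refilled for every row, so the scan
// itself does not create a String per row.
BufferedString bs = new BufferedString();
for (long row = 0; row < v.length(); ++row) {
BufferedString cell = v.atStr(bs, row); // returns null for missing cells
if (cell == null)
continue;
// toString() materializes a copy; defer it until the value is actually needed
String s = cell.toString();
}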
Use of water.parser.BufferedString in project h2o-3 by h2oai.
From the class DeepWaterTask, method setupLocal:
/**
* Transfer ownership from global (shared) model to local model which will be worked on
*/
@Override
protected void setupLocal() {
// long start = System.currentTimeMillis();
assert (_localmodel == null);
_localmodel = _sharedmodel;
_sharedmodel = null;
_localmodel.set_processed_local(0);
final int weightIdx = _fr.find(_localmodel.get_params()._weights_column);
final int respIdx = _fr.find(_localmodel.get_params()._response_column);
final int batchSize = _localmodel.get_params()._mini_batch_size;
// long nativetime = 0;
DeepWaterIterator iter = null;
long seed = 0xDECAF + 0xD00D * _localmodel.get_processed_global();
Random rng = RandomUtils.getRNG(seed);
if (_fr.numRows() > Integer.MAX_VALUE) {
throw H2O.unimpl("Need to implement batching into int-sized chunks.");
}
int len = (int) _fr.numRows();
int j = 0;
Futures fs = new Futures();
ArrayList<Float> trainLabels = new ArrayList<>();
// holds String paths for image/text problems, Integer row indices for numeric datasets
ArrayList<Object> trainData = new ArrayList<>();
try {
// Binary data (Images/Documents/etc.)
if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image || _localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
//must be the first column //FIXME
int dataIdx = 0;
Log.debug("Using column " + _fr.name(dataIdx) + " for " + ((_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image) ? "path to image data" : ((_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) ? "text data" : "path to arbitrary bytes")));
// full passes over the data
BufferedString bs = new BufferedString();
// Example: train_samples_per_iteration = 4700, and train.numRows()=1000 -> _useFraction = 4.7 -> fullpasses = 4
int fullpasses = (int) _useFraction;
while (j++ < fullpasses) {
for (int i = 0; i < _fr.numRows(); ++i) {
double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
if (weight == 0)
continue;
BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
if (file != null)
trainData.add(file.toString());
float response = (float) _fr.vec(respIdx).at(i);
trainLabels.add(response);
}
}
// fractional passes // 0.7
while (trainData.size() < _useFraction * len || trainData.size() % batchSize != 0) {
assert (_shuffle);
int i = rng.nextInt(len);
double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
if (weight == 0)
continue;
BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
if (file != null)
trainData.add(file.toString());
float response = (float) _fr.vec(respIdx).at(i);
trainLabels.add(response);
}
} else if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
// Numeric data (an H2O Frame of numeric columns)
double mul = _localmodel._dataInfo._normRespMul != null ? _localmodel._dataInfo._normRespMul[0] : 1;
double sub = _localmodel._dataInfo._normRespSub != null ? _localmodel._dataInfo._normRespSub[0] : 0;
// full passes over the data
int fullpasses = (int) _useFraction;
while (j++ < fullpasses) {
for (int i = 0; i < _fr.numRows(); ++i) {
double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
if (weight == 0)
continue;
float response = (float) ((_fr.vec(respIdx).at(i) - sub) / mul);
trainData.add(i);
trainLabels.add(response);
}
}
// fractional passes
while (trainData.size() < _useFraction * len || trainData.size() % batchSize != 0) {
int i = rng.nextInt(len);
double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
if (weight == 0)
continue;
float response = (float) ((_fr.vec(respIdx).at(i) - sub) / mul);
trainData.add(i);
trainLabels.add(response);
}
}
// shuffle the (global) list
if (_shuffle) {
rng.setSeed(seed);
Collections.shuffle(trainLabels, rng);
rng.setSeed(seed);
Collections.shuffle(trainData, rng);
}
if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.image) {
iter = new DeepWaterImageIterator(trainData, trainLabels, _localmodel._meanData, batchSize, _localmodel._width, _localmodel._height, _localmodel._channels, _localmodel.get_params()._cache_data);
} else if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
assert (_localmodel._dataInfo != null);
iter = new DeepWaterDatasetIterator(trainData, trainLabels, _localmodel._dataInfo, batchSize, _localmodel.get_params()._cache_data);
} else if (_localmodel.get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
iter = new DeepWaterTextIterator(trainData, trainLabels, batchSize, /*FIXME: hardcoded sequence length*/ 56, _localmodel.get_params()._cache_data);
}
NativeTrainTask ntt;
while (iter.Next(fs) && !_job.isStopping()) {
// if (ntt != null) nativetime += ntt._timeInMillis;
long n = _localmodel.get_processed_total();
// if(!_localmodel.get_params()._quiet_mode)
// Log.info("Trained " + n + " samples. Training on " + Arrays.toString(((DeepWaterImageIterator)iter).getFiles()));
_localmodel._backend.setParameter(_localmodel._model, "learning_rate", _localmodel.get_params().learningRate((double) n));
_localmodel._backend.setParameter(_localmodel._model, "momentum", _localmodel.get_params().momentum((double) n));
//fork off GPU work, but let the iterator.Next() wait on completion before swapping again
//System.err.println("data: " + Arrays.toString(iter.getData()));
float[] preds = _localmodel._backend.predict(_localmodel._model, iter.getData());
if (Float.isNaN(ArrayUtils.sum(preds))) {
Log.err(DeepWaterModel.unstable_msg);
throw new UnsupportedOperationException(DeepWaterModel.unstable_msg);
}
// System.err.println("pred: " + Arrays.toString(preds));
ntt = new NativeTrainTask(_localmodel._backend, _localmodel._model, iter.getData(), iter.getLabel());
fs.add(H2O.submitTask(ntt));
_localmodel.add_processed_local(iter._batch_size);
}
fs.blockForPending();
// nativetime += ntt._timeInMillis;
} catch (IOException e) {
//gracefully continue if we can't find files etc.
e.printStackTrace();
}
// long end = System.currentTimeMillis();
// if (!_localmodel.get_params()._quiet_mode) {
// Log.info("Time for one iteration: " + PrettyPrint.msecs(end - start, true));
// Log.info("Time for native training : " + PrettyPrint.msecs(nativetime, true));
// }
}
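The split into full and fractional passes above follows the arithmetic in the code comment: the integer part of _useFraction gives the number of sequential full passes, and random rows are then drawn until the sample count reaches the target and divides evenly into mini-batches. A hedged sketch of just that arithmetic, using the example numbers from the comment (batchSize = 32 is an invented value):
double useFraction = 4.7; // train_samples_per_iteration = 4700 over 1000 rows
int len = 1000, batchSize = 32;
int fullPasses = (int) useFraction; // 4 sequential full passes
int collected = fullPasses * len; // 4000 samples so far
// fractional pass: one random draw per step, until the target count is
// reached AND the total is a multiple of the mini-batch size
while (collected < useFraction * len || collected % batchSize != 0)
collected++;
System.out.println(collected); // 4704 = 147 * 32, the first multiple of 32 >= 4700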
Use of water.parser.BufferedString in project h2o-3 by h2oai.
From the class Word2VecTest, method testTransformAggregate:
@Test
public void testTransformAggregate() {
Scope.enter();
try {
Vec v = Scope.track(svec("a", "b"));
Frame fr = Scope.track(new Frame(Key.<Frame>make(), new String[] { "Words" }, new Vec[] { v }));
DKV.put(fr);
// build an arbitrary w2v model & overwrite the learned vector with fixed values
Word2VecModel.Word2VecParameters p = new Word2VecModel.Word2VecParameters();
p._train = fr._key;
p._min_word_freq = 0;
p._epochs = 1;
p._vec_size = 2;
Word2VecModel w2vm = (Word2VecModel) Scope.track_generic(new Word2Vec(p).trainModel().get());
w2vm._output._vecs = new float[] { 1.0f, 0.0f, 0.0f, 1.0f };
DKV.put(w2vm);
String[] sentences = { "a", "b", null, "a", "c", null, "c", null, "a", "a", /*chunk end*/
"a", "b", null, // no terminator at the end
"b" };
Frame f = new TestFrameBuilder().withName("data").withColNames("Sentences").withVecTypes(Vec.T_STR).withDataForCol(0, sentences).withChunkLayout(10, 4).build();
Frame result = Scope.track(w2vm.transform(f.vec(0), Word2VecModel.AggregateMethod.AVERAGE));
Vec expectedAs = Scope.track(dvec(0.5, 1.0, Double.NaN, 0.75, 0.0));
Vec expectedBs = Scope.track(dvec(0.5, 0.0, Double.NaN, 0.25, 1.0));
assertVecEquals(expectedAs, result.vec(w2vm._output._vocab.get(new BufferedString("a"))), 0.0001);
assertVecEquals(expectedBs, result.vec(w2vm._output._vocab.get(new BufferedString("b"))), 0.0001);
} finally {
Scope.exit();
}
}
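The expected values follow from the fixed embeddings: with _vecs = {1, 0, 0, 1} the vocabulary maps "a" to (1, 0) and "b" to (0, 1); "c" never appears in the training frame, so it is out-of-vocabulary, and nulls terminate sentences. AVERAGE then means each sentence's known-word embeddings are averaged, with NaN for a sentence containing no known words. A hedged sketch of that aggregation (an illustration, not the h2o-3 implementation):
// Average the embeddings of in-vocabulary words; NaN when none are known.
static float[] average(String[] sentence, java.util.Map<String, float[]> vecs, int dim) {
float[] sum = new float[dim];
int known = 0;
for (String w : sentence) {
float[] e = vecs.get(w);
if (e == null)
continue; // out-of-vocabulary words, like "c" above, are skipped
for (int d = 0; d < dim; d++) sum[d] += e[d];
known++;
}
if (known == 0) {
java.util.Arrays.fill(sum, Float.NaN); // sentence of only unknown words
return sum;
}
for (int d = 0; d < dim; d++) sum[d] /= known;
return sum;
}
// ["a","b"] -> (0.5, 0.5); ["a","c"] -> (1.0, 0.0); ["c"] -> (NaN, NaN);
// ["a","a","a","b"] -> (0.75, 0.25); ["b"] -> (0.0, 1.0), matching expectedAs/expectedBs.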
Use of water.parser.BufferedString in project h2o-3 by h2oai.
From the class RollupStats, method map:
private RollupStats map(Chunk c) {
_size = c.byteSize();
boolean isUUID = c._vec.isUUID();
boolean isString = c._vec.isString();
BufferedString tmpStr = new BufferedString();
if (isString)
_isInt = false;
// Checksum support
long checksum = 0;
long start = c._start;
long l = 0x0123456789ABCDEFL; // arbitrary non-zero initial per-row hash value
// Check for popular easy cases: All Constant
double min = c.min(), max = c.max();
if (min == max) {
// All constant or all NaN
// It's the min, it's the max, it's the alpha and omega
double d = min;
_checksum = (c.hasFloat() ? Double.doubleToRawLongBits(d) : (long) d) * c._len;
Arrays.fill(_mins, d);
Arrays.fill(_maxs, d);
if (d == Double.POSITIVE_INFINITY)
_pinfs++;
else if (d == Double.NEGATIVE_INFINITY)
_ninfs++;
else {
if (Double.isNaN(d))
_naCnt = c._len;
else if (d != 0)
_nzCnt = c._len;
_mean = d;
_rows = c._len;
}
_isInt = ((long) d) == d;
// No variance for constants
_sigma = 0;
return this;
}
//all const NaNs
if (c instanceof C0DChunk && c.isNA_impl(0)) {
//count of non-NAs * variance of non-NAs
_sigma = 0;
//sum of non-NAs (will get turned into mean)
_mean = 0;
_naCnt = c._len;
_nzCnt = 0;
return this;
}
// Check for popular easy cases: Boolean, possibly sparse, possibly NaN
if (min == 0 && max == 1) {
// Easy zeros
int zs = c._len - c.sparseLenZero();
int nans = 0;
// Hard-count sparse-but-zero (weird case of setting a zero over a non-zero)
for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
if (c.isNA(i))
nans++;
else if (c.at8(i) == 0)
zs++;
}
// Ones
int os = c._len - zs - nans;
_nzCnt += os;
_naCnt += nans;
for (int i = 0; i < Math.min(_mins.length, zs); i++) {
min(0);
max(0);
}
for (int i = 0; i < Math.min(_mins.length, os); i++) {
min(1);
max(1);
}
_rows += zs + os;
_mean = (double) os / _rows;
_sigma = zs * (0.0 - _mean) * (0.0 - _mean) + os * (1.0 - _mean) * (1.0 - _mean);
return this;
}
// Walk the non-zeros
if (isUUID) {
// UUID columns do not compute min/max/mean/sigma
for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
if (c.isNA(i))
_naCnt++;
else {
long lo = c.at16l(i), hi = c.at16h(i);
if (lo != 0 || hi != 0)
_nzCnt++;
l = lo ^ 37 * hi;
}
// ignore 0s in checksum to be consistent with sparse chunks
if (l != 0)
checksum ^= (17 * (start + i)) ^ 23 * l;
}
} else if (isString) {
// String columns do not compute min/max/mean/sigma
for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
if (c.isNA(i))
_naCnt++;
else {
_nzCnt++;
l = c.atStr(tmpStr, i).hashCode();
}
// ignore 0s in checksum to be consistent with sparse chunks
if (l != 0)
checksum ^= (17 * (start + i)) ^ 23 * l;
}
} else {
// Work off all numeric rows, or only the nonzeros for sparse
if (c instanceof C1Chunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C1Chunk) c, start, checksum);
else if (c instanceof C1SChunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C1SChunk) c, start, checksum);
else if (c instanceof C1NChunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C1NChunk) c, start, checksum);
else if (c instanceof C2Chunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C2Chunk) c, start, checksum);
else if (c instanceof C2SChunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C2SChunk) c, start, checksum);
else if (c instanceof C4SChunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C4SChunk) c, start, checksum);
else if (c instanceof C4FChunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C4FChunk) c, start, checksum);
else if (c instanceof C4Chunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C4Chunk) c, start, checksum);
else if (c instanceof C8Chunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C8Chunk) c, start, checksum);
else if (c instanceof C8DChunk)
checksum = new RollupStatsHelpers(this).numericChunkRollup((C8DChunk) c, start, checksum);
else
checksum = new RollupStatsHelpers(this).numericChunkRollup(c, start, checksum);
// handle the zeros
if (c.isSparseZero()) {
int zeros = c._len - c.sparseLenZero();
if (zeros > 0) {
for (int i = 0; i < Math.min(_mins.length, zeros); i++) {
min(0);
max(0);
}
double zeromean = 0;
double zeroM2 = 0;
double delta = _mean - zeromean;
_mean = (_mean * _rows + zeromean * zeros) / (_rows + zeros);
//this is the variance*(N-1), will do sqrt(_sigma/(N-1)) later in postGlobal
_sigma += zeroM2 + delta * delta * _rows * zeros / (_rows + zeros);
_rows += zeros;
}
} else if (c.isSparseNA()) {
_naCnt = c._len - c.sparseLenNA();
}
}
_checksum = checksum;
// UUID and String columns do not compute min/max/mean/sigma
if (isUUID || isString) {
Arrays.fill(_mins, Double.NaN);
Arrays.fill(_maxs, Double.NaN);
_mean = _sigma = Double.NaN;
}
return this;
}
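The sparse-zero branch near the end is the standard two-block mean/variance merge (Chan et al.'s parallel algorithm) specialized to a second block of zeros constant-zero rows, whose mean and M2 are both 0. The general form, as a hedged sketch:
// Combine block A (nA rows, mean meanA, sum of squared deviations m2A) with
// block B; the sparse-zero code above is the special case meanB = 0, m2B = 0.
static double[] mergeMoments(long nA, double meanA, double m2A,
                             long nB, double meanB, double m2B) {
double delta = meanA - meanB;
long n = nA + nB;
double mean = (meanA * nA + meanB * nB) / n;
double m2 = m2A + m2B + delta * delta * nA * (double) nB / n; // M2 = variance * (n - 1)
return new double[] { n, mean, m2 }; // sigma = sqrt(m2 / (n - 1)), done later in postGlobal
}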
Use of water.parser.BufferedString in project h2o-3 by h2oai.
From the class AstFlatten, method apply:
@Override
public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
Frame fr = stk.track(asts[1].exec(env)).getFrame();
// did not flatten
if (fr.numCols() != 1 || fr.numRows() != 1)
return new ValFrame(fr);
Vec vec = fr.anyVec();
switch(vec.get_type()) {
case Vec.T_BAD:
case Vec.T_NUM:
return new ValNum(vec.at(0));
case Vec.T_TIME:
// check for missing values
return vec.isNA(0) ? new ValNum(Double.NaN) : new ValNum(vec.at8(0));
case Vec.T_STR:
return new ValStr(vec.atStr(new BufferedString(), 0).toString());
case Vec.T_CAT: // check for missing values
return vec.isNA(0) ? new ValStr("NA") : new ValStr(vec.factor(vec.at8(0)));
default:
throw H2O.unimpl("The type of vector: " + vec.get_type_str() + " is not supported by " + str());
}
}
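For context, flatten reduces a 1x1 frame to a scalar Val. A hedged usage sketch via Rapids (assuming the Rapids token for this AST is "flatten", consistent with str() in the error message above, and that frame_key names a 1x1 frame already in the DKV):
Val v = Rapids.exec("(flatten frame_key)"); // frame_key is a hypothetical key
double num = v.isNum() ? v.getNum() : Double.NaN; // numeric and time cells
String str = v.isStr() ? v.getStr() : null; // string and categorical cells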