Search in sources :

Example 36 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

In class Model, method testJavaScoring.

/**
 * Cross-checks previously computed in-cluster predictions against the predictions produced by
 * the model's generated scorers (POJO and/or MOJO, depending on what the model builder reports).
 *
 * <p>Three passes may run, each sampling rows independently from the same random generator:
 * <ol>
 *   <li>the compiled POJO through the raw {@code score0(double[], double[])} call;</li>
 *   <li>the POJO through {@code EasyPredictModelWrapper};</li>
 *   <li>the MOJO (written to a temporary zip file and loaded back) through
 *       {@code EasyPredictModelWrapper}.</li>
 * </ol>
 * Mismatches are reported on {@code System.err}.
 *
 * @param data              frame that was scored; adapted to the training layout on a shallow copy
 * @param model_predictions predictions for {@code data}; must have the same number of rows
 * @param rel_epsilon       relative tolerance handed to {@code MathUtils.compare}
 * @param abs_epsilon       absolute tolerance handed to {@code MathUtils.compare}
 * @param fraction          a row is tested when a uniform random draw is below this value
 * @return {@code true} when no mismatch or prediction exception was counted
 */
public boolean testJavaScoring(Frame data, Frame model_predictions, double rel_epsilon, double abs_epsilon, double fraction) {
    ModelBuilder mb = ModelBuilder.make(_parms.algoName().toLowerCase(), null, null);
    boolean havePojo = mb.havePojo();
    boolean haveMojo = mb.haveMojo();
    // The generator is seeded from the byte size of the input frame.
    Random rnd = RandomUtils.getRNG(data.byteSize());
    assert data.numRows() == model_predictions.numRows();
    // Work on a shallow copy so that adaptation does not modify the caller's frame.
    Frame fr = new Frame(data);
    boolean computeMetrics = data.vec(_output.responseName()) != null && !data.vec(_output.responseName()).isBad();
    try {
        String[] warns = adaptTestForTrain(fr, true, computeMetrics);
        if (warns.length > 0)
            System.err.println(Arrays.toString(warns));
        // Output is in the model's domain, but needs to be mapped to the scored
        // dataset's domain.
        int[] omap = null;
        if (_output.isClassifier()) {
            Vec actual = fr.vec(_output.responseName());
            // Scored/test domain; can be null
            String[] sdomain = actual == null ? null : actual.domain();
            // Domain of predictions (union of test and train)
            String[] mdomain = model_predictions.vec(0).domain();
            if (sdomain != null && !Arrays.equals(mdomain, sdomain)) {
                // Map from model-domain to scoring-domain
                omap = CategoricalWrappedVec.computeMap(mdomain, sdomain);
            }
        }
        String modelName = JCodeGen.toJavaId(_key.toString());
        boolean preview = false;
        GenModel genmodel = null;
        Vec[] dvecs = fr.vecs();
        Vec[] pvecs = model_predictions.vecs();
        double[] features = null;
        int num_errors = 0;
        int num_total = 0;
        // First try internal POJO via fast double[] API
        if (havePojo) {
            try {
                String java_text = toJava(preview, true);
                Class clz = JCodeGen.compile(modelName, java_text);
                genmodel = (GenModel) clz.newInstance();
            } catch (Exception e) {
                e.printStackTrace();
                throw H2O.fail("Internal POJO compilation failed", e);
            }
            features = MemoryManager.malloc8d(genmodel._names.length);
            double[] predictions = MemoryManager.malloc8d(genmodel.nclasses() + 1);
            // Compare predictions, counting mis-predicts
            for (int row = 0; row < fr.numRows(); row++) {
                // For all rows, single-threaded
                if (rnd.nextDouble() >= fraction)
                    continue;
                num_total++;
                // Native Java API: build feature set
                for (int col = 0; col < features.length; col++) features[col] = dvecs[col].at(row);
                // POJO predictions
                genmodel.score0(features, predictions);
                // For classifiers column 0 (the label) is skipped; only the remaining columns are compared.
                for (int col = _output.isClassifier() ? 1 : 0; col < pvecs.length; col++) {
                    // Compare predictions
                    // Load internal scoring predictions
                    double d = pvecs[col].at(row);
                    // map categorical response to scoring domain
                    if (col == 0 && omap != null)
                        d = omap[(int) d];
                    if (!MathUtils.compare(predictions[col], d, abs_epsilon, rel_epsilon)) {
                        // Only the first ten mismatches of this pass are printed; all are counted.
                        if (num_errors++ < 10)
                            System.err.println("Predictions mismatch, row " + row + ", col " + model_predictions._names[col] + ", internal prediction=" + d + ", POJO prediction=" + predictions[col]);
                        // One mismatch per row is enough; move on to the next row.
                        break;
                    }
                }
            }
        }
        // EasyPredict API with POJO and/or MOJO: i == 0 is the POJO pass, i == 1 the MOJO pass.
        for (int i = 0; i < 2; ++i) {
            if (i == 0 && !havePojo)
                continue;
            if (i == 1 && !haveMojo)
                continue;
            if (i == 1) {
                // MOJO
                final String filename = modelName + ".zip";
                StreamingSchema ss = new StreamingSchema(getMojo(), filename);
                try {
                    FileOutputStream os = new FileOutputStream(ss.getFilename());
                    try {
                        ss.getStreamWriter().writeTo(os);
                    } finally {
                        // Always release the file handle, even when writing fails, so the
                        // temporary file can be deleted below.
                        os.close();
                    }
                    genmodel = MojoModel.load(filename);
                    features = MemoryManager.malloc8d(genmodel._names.length);
                } catch (IOException e1) {
                    e1.printStackTrace();
                    throw H2O.fail("Internal MOJO loading failed", e1);
                } finally {
                    boolean deleted = new File(filename).delete();
                    if (!deleted)
                        Log.warn("Failed to delete the file");
                }
            }
            EasyPredictModelWrapper epmw = new EasyPredictModelWrapper(new EasyPredictModelWrapper.Config().setModel(genmodel).setConvertUnknownCategoricalLevelsToNa(true));
            RowData rowData = new RowData();
            BufferedString bStr = new BufferedString();
            for (int row = 0; row < fr.numRows(); row++) {
                // For all rows, single-threaded
                if (rnd.nextDouble() >= fraction)
                    continue;
                // AutoEncoder models are not checked through EasyPredict here.
                if (genmodel.getModelCategory() == ModelCategory.AutoEncoder)
                    continue;
                // Generate input row
                for (int col = 0; col < features.length; col++) {
                    if (dvecs[col].isString()) {
                        rowData.put(genmodel._names[col], dvecs[col].atStr(bStr, row).toString());
                    } else {
                        double val = dvecs[col].at(row);
                        // Numeric columns (no domain) are passed as Double.
                        // Missing categorical values are kept as NaN, the score0 logic passes it on to bitSetContains().
                        // Known categorical levels are passed by name; unseen levels are passed as "UnknownLevel".
                        rowData.put(genmodel._names[col], genmodel._domains[col] == null ? (Double) val :
                        Double.isNaN(val) ?
                        val :
                        (int) val < genmodel._domains[col].length ? genmodel._domains[col][(int) val] : "UnknownLevel");
                    }
                }
                // Make a prediction
                AbstractPrediction p;
                try {
                    p = epmw.predict(rowData);
                } catch (PredictException e) {
                    num_errors++;
                    if (num_errors < 20) {
                        System.err.println("EasyPredict threw an exception when predicting row " + rowData);
                        e.printStackTrace();
                    }
                    continue;
                }
                // Convert model predictions and "internal" predictions into the same shape
                double[] expected_preds = new double[pvecs.length];
                double[] actual_preds = new double[pvecs.length];
                for (int col = 0; col < pvecs.length; col++) {
                    // Compare predictions
                    // Load internal scoring predictions
                    double d = pvecs[col].at(row);
                    // map categorical response to scoring domain
                    if (col == 0 && omap != null)
                        d = omap[(int) d];
                    // Stays NaN for model categories not handled by the switch below.
                    double d2 = Double.NaN;
                    switch(genmodel.getModelCategory()) {
                        case Clustering:
                            d2 = ((ClusteringModelPrediction) p).cluster;
                            break;
                        case Regression:
                            d2 = ((RegressionModelPrediction) p).value;
                            break;
                        case Binomial:
                            BinomialModelPrediction bmp = (BinomialModelPrediction) p;
                            d2 = (col == 0) ? bmp.labelIndex : bmp.classProbabilities[col - 1];
                            break;
                        case Multinomial:
                            MultinomialModelPrediction mmp = (MultinomialModelPrediction) p;
                            d2 = (col == 0) ? mmp.labelIndex : mmp.classProbabilities[col - 1];
                            break;
                        case DimReduction:
                            d2 = ((DimReductionModelPrediction) p).dimensions[col];
                            break;
                    }
                    expected_preds[col] = d;
                    actual_preds[col] = d2;
                }
                // Verify the correctness of the prediction
                num_total++;
                for (int col = genmodel.isClassifier() ? 1 : 0; col < pvecs.length; col++) {
                    if (!MathUtils.compare(actual_preds[col], expected_preds[col], abs_epsilon, rel_epsilon)) {
                        num_errors++;
                        if (num_errors < 20) {
                            System.err.println((i == 0 ? "POJO" : "MOJO") + " EasyPredict Predictions mismatch for row " + rowData);
                            System.err.println("  Expected predictions: " + Arrays.toString(expected_preds));
                            System.err.println("  Actual predictions:   " + Arrays.toString(actual_preds));
                        }
                        break;
                    }
                }
            }
        }
        // NOTE(review): the printing thresholds above (10 and "< 20") do not exactly match the
        // "first 20" wording of this summary; left unchanged to keep the output stable.
        if (num_errors != 0)
            System.err.println("Number of errors: " + num_errors + (num_errors > 20 ? " (only first 20 are shown)" : "") + " out of " + num_total + " rows tested.");
        return num_errors == 0;
    } finally {
        // Remove temp keys.
        cleanup_adapt(fr, data);
    }
}
Also used : PredictException(hex.genmodel.easy.exception.PredictException) BufferedString(water.parser.BufferedString) EasyPredictModelWrapper(hex.genmodel.easy.EasyPredictModelWrapper) RowData(hex.genmodel.easy.RowData) BufferedString(water.parser.BufferedString) PredictException(hex.genmodel.easy.exception.PredictException) GenModel(hex.genmodel.GenModel) StreamingSchema(water.api.StreamingSchema)

Example 37 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

In class WordCountTaskTest, method testWordCountText8.

/**
 * Counts the words of the text8 corpus and spot-checks the vocabulary size and two word counts.
 * The test is skipped (via a JUnit assumption) when the data file cannot be located.
 */
@Test
public void testWordCountText8() {
    String fName = "bigdata/laptop/text8.gz";
    // only run if text8 is present
    assumeThat("text8 data available", locateFile(fName), is(notNullValue()));
    Frame fr = parse_test_file(fName, "NA", 0, new byte[] { Vec.T_STR });
    try {
        Map<BufferedString, IcedLong> counts = new WordCountTask().doAll(fr.vec(0))._counts;
        // Check for null before the map is dereferenced, so a missing result is reported
        // as an assertion failure rather than a NullPointerException.
        assertNotNull(counts);
        assertEquals(253854, counts.size());
        assertEquals(303L, counts.get(new BufferedString("anarchism"))._val);
        assertEquals(316376L, counts.get(new BufferedString("to"))._val);
    } finally {
        // Always release the parsed frame.
        fr.remove();
    }
}
Also used : Frame(water.fvec.Frame) IcedLong(water.util.IcedLong) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString) Test(org.junit.Test)

Example 38 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

In class WordCountTaskTest, method testWordCount.

/**
 * Builds a 10,000-row string column from a repeating ten-row pattern (3 x "A", 2 x "B", 5 x "C"),
 * spreads it over four chunks and verifies the per-word totals produced by WordCountTask.
 */
@Test
public void testWordCount() {
    final String[] words = new String[10000];
    for (int idx = 0; idx < words.length; idx++) {
        final int slot = idx % 10;
        // Slots 0-2 -> "A", slots 3-4 -> "B", slots 5-9 -> "C".
        words[idx] = slot < 3 ? "A" : (slot < 5 ? "B" : "C");
    }
    final Frame frame = new TestFrameBuilder()
            .withName("data")
            .withColNames("Str")
            .withVecTypes(Vec.T_STR)
            .withDataForCol(0, words)
            .withChunkLayout(100, 900, 5000, 4000)
            .build();
    try {
        final Map<BufferedString, IcedLong> tally = new WordCountTask().doAll(frame.vec(0))._counts;
        assertEquals(3, tally.size());
        assertEquals(3000L, tally.get(new BufferedString("A"))._val);
        assertEquals(2000L, tally.get(new BufferedString("B"))._val);
        assertEquals(5000L, tally.get(new BufferedString("C"))._val);
        System.out.println(tally);
    } finally {
        // Release the test frame whether or not the assertions passed.
        frame.remove();
    }
}
Also used : Frame(water.fvec.Frame) TestFrameBuilder(water.fvec.TestFrameBuilder) IcedLong(water.util.IcedLong) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString) Test(org.junit.Test)

Example 39 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

In class AstRecAssignTestUtils, method strVec2array.

/**
 * Reads every row of a vector as a string and returns the values as an array.
 * Rows for which the reader yields {@code null} are left as {@code null} in the result.
 *
 * @param v vector to read; its length must fit in an {@code int} (checked by assertion)
 * @return one entry per row of {@code v}
 */
static String[] strVec2array(Vec v) {
    final Vec.Reader reader = v.new Reader();
    final long rowCount = reader.length();
    assert rowCount < Integer.MAX_VALUE;
    final String[] result = new String[(int) rowCount];
    // A single scratch buffer is handed to every atStr call.
    final BufferedString scratch = new BufferedString();
    int row = 0;
    while (row < result.length) {
        final BufferedString cell = reader.atStr(scratch, row);
        result[row] = (cell == null) ? null : cell.toString();
        row++;
    }
    return result;
}
Also used : Vec(water.fvec.Vec) AppendableVec(water.fvec.AppendableVec) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString)

Example 40 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

In class Word2VecModel, method buildModelOutput.

/**
 * Fills the model output from the given model info: the vector size, the vectors, the
 * vocabulary map, and an array of words positioned by their vocabulary index.
 *
 * @param modelInfo supplies the vocabulary key (resolved through the DKV) and the vectors
 */
void buildModelOutput(Word2VecModelInfo modelInfo) {
    final Vocabulary vocabulary = (Vocabulary) DKV.getGet(modelInfo._vocabKey);
    final IcedHashMapGeneric<BufferedString, Integer> wordToIndex = vocabulary._data;
    // Invert the word -> index map into an index-addressed array.
    final BufferedString[] indexedWords = new BufferedString[wordToIndex.size()];
    for (BufferedString word : wordToIndex.keySet()) {
        final int position = wordToIndex.get(word);
        indexedWords[position] = word;
    }
    _output._vecSize = _parms._vec_size;
    _output._vecs = modelInfo._syn0;
    _output._words = indexedWords;
    _output._vocab = wordToIndex;
}
Also used : BufferedString(water.parser.BufferedString)

Aggregations

BufferedString (water.parser.BufferedString) 43; Frame (water.fvec.Frame) 12; Test (org.junit.Test) 9; MRTask (water.MRTask) 8; Vec (water.fvec.Vec) 8; Chunk (water.fvec.Chunk) 7; NewChunk (water.fvec.NewChunk) 6; ValFrame (water.rapids.vals.ValFrame) 5; IcedLong (water.util.IcedLong) 5; IOException (java.io.IOException) 2; ByteBuffer (java.nio.ByteBuffer) 2; Random (java.util.Random) 2; DateTimeFormatter (org.joda.time.format.DateTimeFormatter) 2; TestFrameBuilder (water.fvec.TestFrameBuilder) 2; BackendModel (deepwater.backends.BackendModel) 1; BackendParams (deepwater.backends.BackendParams) 1; RuntimeOptions (deepwater.backends.RuntimeOptions) 1; ImageDataSet (deepwater.datasets.ImageDataSet) 1; GenModel (hex.genmodel.GenModel) 1; EasyPredictModelWrapper (hex.genmodel.easy.EasyPredictModelWrapper) 1