Search in sources :

Example 76 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class ParserTest2 method testParsed.

/**
 * Verifies every cell of a parsed frame against an expected string table and
 * then deletes the frame.
 *
 * <p>Non-null expected cells are compared with the value obtained by looking
 * the stored index up in the column's domain; null expected cells must be NA.
 * The frame is deleted even when an assertion fails, so a failing test does
 * not leave the frame behind.
 *
 * @param fr       parsed frame to check; always deleted before returning
 * @param expected expected values indexed as {@code [row][column]}; a
 *                 {@code null} entry means the cell must be NA
 */
private static void testParsed(Frame fr, String[][] expected) {
    try {
        Assert.assertEquals(expected.length, fr.numRows());
        Assert.assertEquals(expected[0].length, fr.numCols());
        for (int j = 0; j < fr.numCols(); ++j) {
            Vec vec = fr.vecs()[j];
            for (int i = 0; i < expected.length; ++i) {
                String want = expected[i][j];
                if (want == null) {
                    Assert.assertTrue(i + " -- " + j, vec.isNA(i));
                } else {
                    // The stored value is an index into the column's domain.
                    String pval = vec.domain()[(int) vec.at8(i)];
                    Assert.assertEquals(i + " -- " + j, want, pval);
                }
            }
        }
    } finally {
        // Clean up regardless of the outcome of the assertions above.
        fr.delete();
    }
}
Also used : Vec(water.fvec.Vec) PrettyPrint(water.util.PrettyPrint)

Example 77 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class ParserTest2 method testNAs.

/**
 * Parses a nine-column CSV (eight columns declared Numeric, one declared Enum)
 * and checks NA handling: every cell in all but the last two rows must be a
 * real value, and every cell in the last two rows (blank fields and
 * {@code ?}/{@code NA} markers) must be NA.
 *
 * <p>The parsed frame is deleted in a {@code finally} block so that a failing
 * assertion does not leave it behind.
 */
@Test
public void testNAs() {
    String[] data = new String[] {
        "'C1Chunk',C1SChunk, 'C2Chunk', 'C2SChunk',  'C4Chunk',  'C4FChunk',  'C8Chunk',  'C8DChunk',   'Categorical'\n" + "0,       0.0,          0,           0,           0,          0 ,          0,   8.878979,           A \n",
        "1,       0.1,          1,         0.1,           1,          1 ,          1,   1.985934,           B \n",
        "2,       0.2,          2,         0.2,           2,          2 ,          2,   3.398018,           C \n",
        "3,       0.3,          3,         0.3,           3,          3 ,          3,   9.329589,           D \n",
        "4,       0.4,          4,           4,           4,          4 , 2147483649,   0.290184,           A \n",
        "0,       0.5,          0,           0,     -100000,    1.234e2 ,-2147483650,   1e-30,              B \n",
        "254,    0.25,       2550,      6553.4,      100000,    2.345e-2,          0,    1e30,              C \n",
        " ,          ,           ,            ,            ,            ,           ,        ,                \n",
        "?,        NA,          ?,           ?,           ?,           ?,          ?,       ?,                \n"
    };
    Key rkey = ParserTest.makeByteVec(data);
    String[] columnNames = new String[] { "'C1Chunk'", "C1SChunk", "'C2Chunk'", "'C2SChunk'", "'C4Chunk'", "'C4FChunk'", "'C8Chunk'", "'C8DChunk'", "'Categorical'" };
    String[] columnTypes = new String[] { "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Enum" };
    ParseSetup ps = new ParseSetup(CSV_INFO, (byte) ',', false, ParseSetup.HAS_HEADER, 9, columnNames, ParseSetup.strToColumnTypes(columnTypes), null, null, null);
    Frame fr = ParseDataset.parse(Key.make("na_test.hex"), new Key[] { rkey }, true, ps);
    try {
        int nlines = (int) fr.numRows();
        Assert.assertEquals(9, nlines);
        Assert.assertEquals(9, fr.numCols());
        // All rows except the last two must contain real (non-NA) values.
        for (int i = 0; i < nlines - 2; ++i) {
            for (Vec v : fr.vecs()) {
                Assert.assertTrue("error at line " + i + ", vec " + v.chunkForChunkIdx(0).getClass().getSimpleName(), !Double.isNaN(v.at(i)) && !v.isNA(i));
            }
        }
        // The last two rows must be NA in every column.
        for (int j = 0; j < fr.vecs().length; j++) {
            Vec v = fr.vecs()[j];
            for (int i = nlines - 2; i < nlines; ++i) {
                Assert.assertTrue(i + ", " + j + ":" + v.at(i) + ", " + v.isNA(i), Double.isNaN(v.at(i)) && v.isNA(i));
            }
        }
    } finally {
        // Clean up regardless of the outcome of the assertions above.
        fr.delete();
    }
}
Also used : Frame(water.fvec.Frame) Vec(water.fvec.Vec) Key(water.Key) PrettyPrint(water.util.PrettyPrint) Test(org.junit.Test)

Example 78 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class ParquetParser method parseChunk.

/**
 * Parses the Parquet row groups selected for one chunk and pushes the records
 * into {@code dout}.
 *
 * <p>The footer metadata is filtered by the byte range
 * {@code [chunk.start(), chunk.start() + chunk.len())}; when no block is
 * selected the writer is returned untouched.
 *
 * @param cidx index of the chunk being parsed (used for logging only)
 * @param din  parse reader; must be an {@code FVecParseReader}
 * @param dout writer receiving the parsed records
 * @return {@code dout}
 * @throws IllegalStateException if {@code din} is not an {@code FVecParseReader}
 * @throws RuntimeException      if reading records fails with an {@code IOException}
 */
@Override
protected final ParseWriter parseChunk(int cidx, ParseReader din, ParseWriter dout) {
    final boolean vecBacked = din instanceof FVecParseReader;
    if (!vecBacked) {
        // TODO: Should we modify the interface to expose the underlying chunk for non-streaming parsers?
        throw new IllegalStateException("We only accept parser readers backed by a Vec (no streaming support!).");
    }
    final FVecParseReader vecReader = (FVecParseReader) din;
    final Chunk source = vecReader.getChunk();
    final Vec backingVec = source.vec();

    // Restrict the footer metadata to the row groups whose centers fall in this chunk.
    final ParquetMetadataConverter.MetadataFilter rangeFilter =
        ParquetMetadataConverter.range(source.start(), source.start() + source.len());
    final ParquetMetadata footer = VecParquetReader.readFooter(_metadata, rangeFilter);
    if (footer.getBlocks().isEmpty()) {
        Log.trace("Chunk #", cidx, " doesn't contain any Parquet block center.");
        return dout;
    }
    Log.info("Processing ", footer.getBlocks().size(), " blocks of chunk #", cidx);

    final VecParquetReader records = new VecParquetReader(backingVec, footer, dout, _setup.getColumnTypes());
    try {
        // Drain the reader: read() is called until it yields null.
        for (Integer rec = records.read(); rec != null; rec = records.read()) {
            // Nothing to do here; the only effect used is the call to read().
        }
    } catch (IOException e) {
        throw new RuntimeException("Failed to parse records", e);
    }
    return dout;
}
Also used : ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter) ByteVec(water.fvec.ByteVec) Vec(water.fvec.Vec) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) VecParquetReader(org.apache.parquet.hadoop.VecParquetReader) IOException(java.io.IOException) Chunk(water.fvec.Chunk)

Example 79 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class KMeansDroplet method main.

/**
 * Demo entry point: parses {@code smalldata/glm_test/gaussian.csv}, drops the
 * first column, seeds 7 cluster centers from randomly chosen rows, runs 10
 * iterations of the {@code KMeans} task and prints the error after each
 * iteration followed by the final centers.
 *
 * <p>The process is terminated with {@code System.exit(0)} at the end.
 *
 * @param args command-line arguments (unused)
 * @throws Exception propagated from cloud start-up or parsing
 */
public static void main(String[] args) throws Exception {
    initCloud();
    // Load and parse a file. Data is distributed to other nodes in a round-robin way
    File f = new File("smalldata/glm_test/gaussian.csv");
    NFSFileVec nfs = NFSFileVec.make(f);
    Frame frame = water.parser.ParseDataset.parse(Key.make(), nfs._key);
    // Optionally create a frame with fewer columns, e.g. skip first
    frame.remove(0);
    // Column set is fixed from here on; look it up once instead of on every access.
    Vec[] vecs = frame.vecs();
    int ncols = vecs.length;
    // Create k centers as arrays of doubles
    int k = 7;
    double[][] centers = new double[k][ncols];
    // Initialize every cluster center to a random row. The row index must be
    // scaled by the number of ROWS; scaling by the number of columns would only
    // ever pick rows from the first few lines of the dataset.
    Random rand = new Random();
    long nrows = frame.numRows();
    for (int cluster = 0; cluster < centers.length; cluster++) {
        // nextDouble() is in [0, 1), so the result is in [0, nrows - 1].
        long row = (long) (rand.nextDouble() * nrows);
        for (int i = 0; i < ncols; i++) {
            centers[cluster][i] = vecs[i].at(row);
        }
    }
    // Iterate over the dataset and show error for each step
    int numIters = 10;
    for (int i = 0; i < numIters; i++) {
        KMeans task = new KMeans();
        task._centers = centers;
        task.doAll(frame);
        for (int c = 0; c < centers.length; c++) {
            // Clusters with no assigned rows keep their previous center.
            if (task._size[c] > 0) {
                for (int v = 0; v < ncols; v++) {
                    centers[c][v] = task._sums[c][v] / task._size[c];
                }
            }
        }
        System.out.println("Error is " + task._error);
    }
    System.out.println("Cluster Centers:");
    DecimalFormat df = new DecimalFormat("#.00");
    for (double[] center : centers) {
        for (int v = 0; v < ncols; v++) {
            System.out.print(df.format(center[v]) + ", ");
        }
        System.out.println("");
    }
    System.exit(0);
}
Also used : Frame(water.fvec.Frame) Random(java.util.Random) Vec(water.fvec.Vec) NFSFileVec(water.fvec.NFSFileVec) DecimalFormat(java.text.DecimalFormat) File(java.io.File)

Example 80 with Vec

use of water.fvec.Vec in project h2o-2 by h2oai.

the class Anomaly method execImpl.

/**
 * Scores {@code source} with the configured autoencoder model, stores the
 * per-row reconstruction error in a one-column frame under {@code dest()},
 * and logs the mean error plus every row whose error exceeds {@code thresh}.
 *
 * <p>When {@code thresh} is {@code -1} it is replaced by ten times the
 * model's MSE before rows are compared against it.
 *
 * @throws IllegalArgumentException if no model key is set, the model cannot
 *         be found, or the model's parameters do not have {@code autoencoder} set
 */
@Override
protected final void execImpl() {
    if (dl_autoencoder_model == null) {
        throw new IllegalArgumentException("Deep Learning Model must be specified.");
    }
    DeepLearningModel dlm = UKV.get(dl_autoencoder_model);
    if (dlm == null) {
        throw new IllegalArgumentException("Deep Learning Model not found.");
    }
    if (!dlm.get_params().autoencoder) {
        throw new IllegalArgumentException("Deep Learning Model must be build with autoencoder = true.");
    }
    if (thresh == -1) {
        // -1 acts as "not set": derive the threshold from the model's own MSE.
        Log.info("Mean reconstruction error (MSE) of model on training data: " + dlm.mse());
        thresh = 10 * dlm.mse();
        Log.info("Setting MSE threshold for anomaly to: " + thresh + ".");
    }
    StringBuilder sb = new StringBuilder();
    sb.append("\nFinding outliers in frame ").append(source._key.toString()).append(".\n");
    Frame mse = dlm.scoreAutoEncoder(source);
    sb.append("Storing the reconstruction error (MSE) for all rows under: ").append(dest()).append(".\n");
    Frame output = new Frame(dest(), new String[] { "Reconstruction.MSE" }, new Vec[] { mse.vecs()[0] });
    output.delete_and_lock(null);
    output.unlock(null);
    final Vec mse_test = mse.anyVec();
    sb.append("Mean reconstruction error (MSE): ").append(mse_test.mean()).append(".\n");
    // print stats and potential outliers
    sb.append("The following data points have a reconstruction error greater than ").append(thresh).append(":\n");
    final long nrows = mse_test.length();
    for (long i = 0; i < nrows; i++) {
        // Read each value once; it is used for both the comparison and the report.
        double err = mse_test.at(i);
        if (err > thresh) {
            sb.append(String.format("row %d : MSE = %5f\n", i, err));
        }
    }
    Log.info(sb);
}
Also used : Frame(water.fvec.Frame) Vec(water.fvec.Vec) DeepLearningModel(hex.deeplearning.DeepLearningModel) HashSet(java.util.HashSet)

Aggregations

Vec (water.fvec.Vec): 280, Frame (water.fvec.Frame): 213, Test (org.junit.Test): 82, NFSFileVec (water.fvec.NFSFileVec): 48, ValFrame (water.rapids.vals.ValFrame): 47, Chunk (water.fvec.Chunk): 30, Random (java.util.Random): 25, NewChunk (water.fvec.NewChunk): 23, DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters): 22, Key (water.Key): 21, MRTask (water.MRTask): 17, Val (water.rapids.Val): 14, File (java.io.File): 11, ArrayList (java.util.ArrayList): 11, Futures (water.Futures): 11, H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException): 11, ValNum (water.rapids.vals.ValNum): 11, ShuffleSplitFrame (hex.splitframe.ShuffleSplitFrame): 10, BufferedString (water.parser.BufferedString): 10, AppendableVec (water.fvec.AppendableVec): 9