Use of water.fvec.Vec in project h2o-3 by h2oai: class ParserTest2, method testParsed.
/**
 * Asserts that a parsed categorical Frame matches an expected table of values.
 * A {@code null} entry in {@code expected} means the corresponding cell must be NA;
 * any other entry must equal the domain level stored at that cell.
 * The frame is deleted after verification.
 */
private static void testParsed(Frame fr, String[][] expected) {
    Assert.assertEquals(expected.length, fr.numRows());
    Assert.assertEquals(expected[0].length, fr.numCols());
    for (int col = 0; col < fr.numCols(); ++col) {
        Vec column = fr.vecs()[col];
        for (int row = 0; row < expected.length; ++row) {
            String want = expected[row][col];
            if (want == null) {
                // Missing expectation: the parsed cell must be NA.
                Assert.assertTrue(row + " -- " + col, column.isNA(row));
                continue;
            }
            // Categorical cell: map the stored level index back to its domain string.
            String got = column.domain()[(int) column.at8(row)];
            Assert.assertTrue(want + " -- " + got, want.equals(got));
        }
    }
    fr.delete();
}
Use of water.fvec.Vec in project h2o-3 by h2oai: class ParserTest2, method testNAs.
@Test
public void testNAs() {
// Fixture: a 9-column CSV whose columns are named after the chunk compression types
// the parser is expected to choose, plus one categorical column. The last two data
// rows are all-missing: one uses empty fields, the other uses explicit NA markers
// ("?" and "NA").
String[] data = new String[] { "'C1Chunk',C1SChunk, 'C2Chunk', 'C2SChunk', 'C4Chunk', 'C4FChunk', 'C8Chunk', 'C8DChunk', 'Categorical'\n" + "0, 0.0, 0, 0, 0, 0 , 0, 8.878979, A \n", "1, 0.1, 1, 0.1, 1, 1 , 1, 1.985934, B \n", "2, 0.2, 2, 0.2, 2, 2 , 2, 3.398018, C \n", "3, 0.3, 3, 0.3, 3, 3 , 3, 9.329589, D \n", "4, 0.4, 4, 4, 4, 4 , 2147483649, 0.290184, A \n", "0, 0.5, 0, 0, -100000, 1.234e2 ,-2147483650, 1e-30, B \n", "254, 0.25, 2550, 6553.4, 100000, 2.345e-2, 0, 1e30, C \n", " , , , , , , , , \n", "?, NA, ?, ?, ?, ?, ?, ?, \n" };
Key rkey = ParserTest.makeByteVec(data);
// Explicit parse setup: comma-separated, header present, 9 columns, first 8 numeric
// and the last one categorical (Enum).
ParseSetup ps = new ParseSetup(CSV_INFO, (byte) ',', false, ParseSetup.HAS_HEADER, 9, new String[] { "'C1Chunk'", "C1SChunk", "'C2Chunk'", "'C2SChunk'", "'C4Chunk'", "'C4FChunk'", "'C8Chunk'", "'C8DChunk'", "'Categorical'" }, ParseSetup.strToColumnTypes(new String[] { "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Enum" }), null, null, null);
Frame fr = ParseDataset.parse(Key.make("na_test.hex"), new Key[] { rkey }, true, ps);
int nlines = (int) fr.numRows();
Assert.assertEquals(9, nlines);
Assert.assertEquals(9, fr.numCols());
// All rows except the last two must parse as non-missing values in every column.
for (int i = 0; i < nlines - 2; ++i) for (Vec v : fr.vecs()) Assert.assertTrue("error at line " + i + ", vec " + v.chunkForChunkIdx(0).getClass().getSimpleName(), !Double.isNaN(v.at(i)) && !v.isNA(i));
// The last two rows (blank fields and explicit NA markers) must be NA in every column.
for (int j = 0; j < fr.vecs().length; j++) {
Vec v = fr.vecs()[j];
for (int i = nlines - 2; i < nlines; ++i) Assert.assertTrue(i + ", " + j + ":" + v.at(i) + ", " + v.isNA(i), Double.isNaN(v.at(i)) && v.isNA(i));
}
fr.delete();
}
Use of water.fvec.Vec in project h2o-3 by h2oai: class ParquetParser, method parseChunk.
/**
 * Parses the Parquet row groups whose centers fall inside the given chunk.
 * Only Vec-backed readers are supported; streaming readers are rejected.
 */
@Override
protected final ParseWriter parseChunk(int cidx, ParseReader din, ParseWriter dout) {
    if (!(din instanceof FVecParseReader)) {
        // TODO: Should we modify the interface to expose the underlying chunk for non-streaming parsers?
        throw new IllegalStateException("We only accept parser readers backed by a Vec (no streaming support!).");
    }
    final Chunk chunk = ((FVecParseReader) din).getChunk();
    final Vec backingVec = chunk.vec();
    // Footer filter: keep only the row groups whose centers lie within this chunk's byte range.
    final ParquetMetadataConverter.MetadataFilter filter =
            ParquetMetadataConverter.range(chunk.start(), chunk.start() + chunk.len());
    final ParquetMetadata footer = VecParquetReader.readFooter(_metadata, filter);
    if (footer.getBlocks().isEmpty()) {
        Log.trace("Chunk #", cidx, " doesn't contain any Parquet block center.");
        return dout;
    }
    Log.info("Processing ", footer.getBlocks().size(), " blocks of chunk #", cidx);
    final VecParquetReader parquetReader =
            new VecParquetReader(backingVec, footer, dout, _setup.getColumnTypes());
    try {
        // Drain the reader; a null record number signals exhaustion.
        while (parquetReader.read() != null) {
            // intentionally empty — read() performs the work
        }
    } catch (IOException e) {
        throw new RuntimeException("Failed to parse records", e);
    }
    return dout;
}
Use of water.fvec.Vec in project h2o-3 by h2oai: class KMeansDroplet, method main.
/**
 * Standalone K-means demo: loads a CSV, initializes k cluster centers from random
 * data rows, then runs a fixed number of Lloyd iterations, printing the error per
 * step and the final centers.
 *
 * @param args unused command-line arguments
 * @throws Exception if cloud initialization or file parsing fails
 */
public static void main(String[] args) throws Exception {
    initCloud();
    // Load and parse a file. Data is distributed to other nodes in a round-robin way
    File f = new File("smalldata/glm_test/gaussian.csv");
    NFSFileVec nfs = NFSFileVec.make(f);
    Frame frame = water.parser.ParseDataset.parse(Key.make(), nfs._key);
    // Optionally create a frame with fewer columns, e.g. skip first
    frame.remove(0);
    // Create k centers as arrays of doubles
    int k = 7;
    int numCols = frame.vecs().length;
    double[][] centers = new double[k][numCols];
    // Initialize each cluster center to a randomly chosen data ROW.
    // BUG FIX: the random index must be drawn from the number of rows
    // (frame.numRows()), not the number of columns (frame.vecs().length) as
    // before — the old code only ever sampled rows 0..numCols-2.
    Random rand = new Random();
    for (int cluster = 0; cluster < centers.length; cluster++) {
        // nextDouble() < 1.0 keeps the product below numRows; clamp defensively
        // against floating-point rounding at the upper edge.
        long row = Math.min(frame.numRows() - 1, (long) (rand.nextDouble() * frame.numRows()));
        for (int i = 0; i < numCols; i++) {
            Vec v = frame.vecs()[i];
            centers[cluster][i] = v.at(row);
        }
    }
    // Iterate over the dataset and show error for each step
    int NUM_ITERS = 10;
    for (int i = 0; i < NUM_ITERS; i++) {
        KMeans task = new KMeans();
        task._centers = centers;
        task.doAll(frame);
        // Recompute each non-empty cluster's center as the mean of its assigned rows.
        for (int c = 0; c < centers.length; c++) {
            if (task._size[c] > 0) {
                for (int v = 0; v < numCols; v++) {
                    double value = task._sums[c][v] / task._size[c];
                    centers[c][v] = value;
                }
            }
        }
        System.out.println("Error is " + task._error);
    }
    System.out.println("Cluster Centers:");
    DecimalFormat df = new DecimalFormat("#.00");
    for (double[] center : centers) {
        for (int v = 0; v < numCols; v++) System.out.print(df.format(center[v]) + ", ");
        System.out.println("");
    }
    System.exit(0);
}
Use of water.fvec.Vec in project h2o-2 by h2oai: class Anomaly, method execImpl.
// Scores a frame against a trained Deep Learning autoencoder, stores the
// per-row reconstruction error (MSE) under the destination key, and logs any
// rows whose error exceeds the threshold as potential anomalies.
@Override
protected final void execImpl() {
// Validate preconditions: a model key must be given, must resolve, and the
// model must have been trained with autoencoder = true.
if (dl_autoencoder_model == null)
throw new IllegalArgumentException("Deep Learning Model must be specified.");
DeepLearningModel dlm = UKV.get(dl_autoencoder_model);
if (dlm == null)
throw new IllegalArgumentException("Deep Learning Model not found.");
if (!dlm.get_params().autoencoder)
throw new IllegalArgumentException("Deep Learning Model must be build with autoencoder = true.");
// Default threshold: 10x the model's training MSE when the user passed -1.
if (thresh == -1) {
Log.info("Mean reconstruction error (MSE) of model on training data: " + dlm.mse());
thresh = 10 * dlm.mse();
Log.info("Setting MSE threshold for anomaly to: " + thresh + ".");
}
StringBuilder sb = new StringBuilder();
sb.append("\nFinding outliers in frame " + source._key.toString() + ".\n");
// scoreAutoEncoder returns a single-column frame of per-row reconstruction MSE.
Frame mse = dlm.scoreAutoEncoder(source);
sb.append("Storing the reconstruction error (MSE) for all rows under: " + dest() + ".\n");
Frame output = new Frame(dest(), new String[] { "Reconstruction.MSE" }, new Vec[] { mse.vecs()[0] });
// Lock-then-unlock publishes the frame into the KV store; presumably the
// immediate unlock makes it visible to readers right away — TODO confirm
// against Lockable semantics in h2o-2.
output.delete_and_lock(null);
output.unlock(null);
final Vec mse_test = mse.anyVec();
sb.append("Mean reconstruction error (MSE): " + mse_test.mean() + ".\n");
// print stats and potential outliers
sb.append("The following data points have a reconstruction error greater than " + thresh + ":\n");
// NOTE(review): 'outliers' is populated but never read afterwards in this
// method — only the StringBuilder output is used. Possibly dead code.
HashSet<Long> outliers = new HashSet<Long>();
for (long i = 0; i < mse_test.length(); i++) {
if (mse_test.at(i) > thresh) {
outliers.add(i);
sb.append(String.format("row %d : MSE = %5f\n", i, mse_test.at(i)));
}
}
Log.info(sb);
}
Aggregations