Use of water.Key in project h2o-2 by h2oai.
In class FrameUtils, method parseFrame:
/** Parse given file into the form of frame represented by the given key.
*
* @param okey destination key for parsed frame
* @param files files to parse
* @return a new frame
*/
public static Frame parseFrame(Key okey, File... files) {
  assert files.length > 0 : "Oops. No files to parse!";
  for (File f : files)
    if (!f.exists()) throw new RuntimeException("File not found " + f);
  // Create output key if not specified
  if (okey == null) okey = Key.make(files[0].getName());
  Key[] fkeys = new Key[files.length];
  int cnt = 0;
  for (File f : files) fkeys[cnt++] = NFSFileVec.make(f);
  return parseFrame(okey, fkeys);
}
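A minimal usage sketch of the method above, assuming a running H2O-2 cloud; the file path and destination key name are placeholders, not part of the original source:
File f = new File("smalldata/iris.csv");                     // hypothetical local file
Frame fr = FrameUtils.parseFrame(Key.make("iris.hex"), f);   // parse into the DKV under "iris.hex"
System.out.println("rows=" + fr.numRows() + " cols=" + fr.numCols());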
Use of water.Key in project h2o-3 by h2oai.
In class AvroParserProvider, method createParserSetup:
@Override
public ParseSetup createParserSetup(Key[] inputs, ParseSetup requiredSetup) {
  // Also expect that files are not compressed
  assert inputs != null && inputs.length > 0 : "Inputs cannot be empty!";
  Key firstInput = inputs[0];
  Iced ice = DKV.getGet(firstInput);
  if (ice == null)
    throw new H2OIllegalArgumentException("Missing data", "Did not find any data under key " + firstInput);
  ByteVec bv = (ByteVec) (ice instanceof ByteVec ? ice : ((Frame) ice).vecs()[0]);
  byte[] bits = bv.getFirstBytes();
  try {
    AvroParser.AvroInfo avroInfo = AvroParser.extractAvroInfo(bits, requiredSetup);
    return new AvroParser.AvroParseSetup(requiredSetup, avroInfo.header, avroInfo.firstBlockSize, avroInfo.domains);
  } catch (Throwable e) {
    throw new H2OIllegalArgumentException("Wrong data", "Cannot find Avro header in input file: " + firstInput, e);
  }
}
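For context, a hedged sketch of the kind of magic-byte check a routine like extractAvroInfo must start with: Avro object-container files open with the four bytes 'O', 'b', 'j', 0x01. This helper is illustrative only, not the actual h2o-3 implementation:
static boolean looksLikeAvro(byte[] bits) {
  // Avro container magic: 'O','b','j' followed by the format version byte 1
  return bits != null && bits.length >= 4
      && bits[0] == 'O' && bits[1] == 'b' && bits[2] == 'j' && bits[3] == 1;
}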
Use of water.Key in project h2o-3 by h2oai.
In class PersistHdfs, method load:
/** InputStream from a HDFS-based Key */
/*public static InputStream openStream(Key k, Job pmon) throws IOException {
H2OHdfsInputStream res = null;
Path p = new Path(k.toString());
try {
res = new H2OHdfsInputStream(p, 0, pmon);
} catch( IOException e ) {
try {
Thread.sleep(1000);
} catch( Exception ex ) {}
Log.warn("Error while opening HDFS key " + k.toString() + ", will wait and retry.");
res = new H2OHdfsInputStream(p, 0, pmon);
}
return res;
}*/
@Override
public byte[] load(final Value v) {
//
// !!! WARNING !!!
//
// tomk: Sun Apr 19 13:11:51 PDT 2015
//
//
// This load implementation behaved *HORRIBLY* with S3 when the libraries were updated.
// Behaves well (and is the same set of libraries as H2O-1):
// org.apache.hadoop:hadoop-client:2.0.0-cdh4.3.0
// net.java.dev.jets3t:jets3t:0.6.1
//
// Behaves abysmally:
// org.apache.hadoop:hadoop-client:2.5.0-cdh5.2.0
// net.java.dev.jets3t:jets3t:0.9.2
//
//
// I did some debugging.
//
// What happens in the new libraries is the connection type is a streaming connection, and
// the entire file gets read on close() even if you only wanted to read a chunk. The result
// is the same data gets read over and over again by the underlying transport layer even
// though H2O only thinks it's asking for (and receiving) each piece of data once.
//
// I suspect this has something to do with the 'Range' HTTP header on the GET, but I'm not
// entirely sure. Many layers of library need to be fought through to really figure it out.
//
// Anyway, this will need to be rewritten from the perspective of how to properly use the
// new library version. It might make sense to go straight to 's3a', which is a replacement
// for 's3n'.
//
long end, start = System.currentTimeMillis();
final byte[] b = MemoryManager.malloc1(v._max);
Key k = v._key;
long skip = k.isChunkKey() ? water.fvec.NFSFileVec.chunkOffset(k) : 0;
final Path p = _iceRoot == null ? new Path(getPathForKey(k)) : new Path(_iceRoot, getIceName(v));
final long skip_ = skip;
run(new Callable() {
  @Override
  public Object call() throws Exception {
    FileSystem fs = FileSystem.get(p.toUri(), CONF);
    FSDataInputStream s = null;
    try {
      // fs.getDefaultBlockSize(p);
      s = fs.open(p);
      // System.out.println(Arrays.toString(bs));
      if (p.toString().toLowerCase().startsWith("maprfs:")) {
        // MapR behaves really horribly with the google ByteStreams code below.
        // Instead of skipping by seeking, it skips by reading and dropping. Very bad.
        // Use the HDFS API here directly instead.
        s.seek(skip_);
        s.readFully(b);
      } else {
        // NOTE:
        // The following call degrades performance of HDFS load from the S3 API:
        //   s.readFully(skip, b, 0, b.length);
        // A simple seek has much better performance: loading a 300MB file takes
        // ~14sec via seek+readFully vs ~5min via the positioned readFully,
        // under the same conditions.
        // ByteStreams.skipFully(s, skip_);
        // ByteStreams.readFully(s, b);
        s.seek(skip_);
        s.readFully(b);
      }
      assert v.isPersisted();
    } finally {
      if (s != null) // fs.open may have thrown; guard against an NPE that would mask it
        s.getWrappedStream().close();
      FileUtils.close(s);
    }
    return null;
  }
}, true, v._max);
end = System.currentTimeMillis();
// Only log reads that took over 1 second to complete
if (end - start > 1000)
  Log.debug("Slow Read: " + (end - start) + " millis to get bytes " + skip_ + "-" + (skip_ + b.length) + " in HDFS read.");
return b;
}
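The pattern the comments above settle on, seek then readFully, isolated as a standalone sketch; the path, offset, and buffer size are placeholders, and a configured Hadoop classpath is assumed:
// Requires org.apache.hadoop.conf.Configuration and org.apache.hadoop.fs.*
Path p = new Path("hdfs:///data/big.csv");       // placeholder path
FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
byte[] b = new byte[4 * 1024 * 1024];            // one 4MB chunk (example size)
long skip = 0L;                                  // chunk offset within the file (example)
FSDataInputStream s = fs.open(p);
try {
  s.seek(skip);     // position the stream without reading and discarding bytes
  s.readFully(b);   // fill the whole buffer from that offset
} finally {
  s.close();
}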
Use of water.Key in project h2o-3 by h2oai.
In class GridsHandler, method list:
/**
* Return all the grids.
*/
// called through reflection by RequestServer
@SuppressWarnings("unused")
public GridsV99 list(int version, GridsV99 s) {
  final Key[] gridKeys = KeySnapshot.globalSnapshot().filter(new KeySnapshot.KVFilter() {
    @Override
    public boolean filter(KeySnapshot.KeyInfo k) {
      return Value.isSubclassOf(k._type, Grid.class);
    }
  }).keys();
  s.grids = new GridSchemaV99[gridKeys.length];
  for (int i = 0; i < gridKeys.length; i++) {
    s.grids[i] = new GridSchemaV99();
    s.grids[i].fillFromImpl(getFromDKV("(none)", gridKeys[i], Grid.class));
  }
  return s;
}
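The same snapshot-and-filter idiom generalizes to any DKV value type; a sketch that collects Frame keys instead (illustrative, not part of GridsHandler):
Key[] frameKeys = KeySnapshot.globalSnapshot().filter(new KeySnapshot.KVFilter() {
  @Override
  public boolean filter(KeySnapshot.KeyInfo k) {
    return Value.isSubclassOf(k._type, Frame.class);  // keep only Frame-typed values
  }
}).keys();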
Use of water.Key in project h2o-3 by h2oai.
In class ParseHandler, method parse:
// Entry point for parsing.
// called through reflection by RequestServer
@SuppressWarnings("unused")
public ParseV3 parse(int version, ParseV3 parse) {
  ParserInfo parserInfo = ParserService.INSTANCE.getByName(parse.parse_type).info();
  ParseSetup setup = new ParseSetup(parserInfo, parse.separator, parse.single_quotes, parse.check_header,
      parse.number_columns, delNulls(parse.column_names), ParseSetup.strToColumnTypes(parse.column_types),
      parse.domains, parse.na_strings, null, new ParseWriter.ParseErr[0], parse.chunk_size);
  if (parse.source_frames == null)
    throw new H2OIllegalArgumentException("Data for Frame '" + parse.destination_frame.name
        + "' is not available. Please check that the path is valid (for all H2O nodes).");
  Key[] srcs = new Key[parse.source_frames.length];
  for (int i = 0; i < parse.source_frames.length; i++)
    srcs[i] = parse.source_frames[i].key();
  parse.job = new JobV3(ParseDataset.parse(parse.destination_frame.key(), srcs, parse.delete_on_done, setup, parse.blocking)._job);
  if (parse.blocking) {
    Frame fr = DKV.getGet(parse.destination_frame.key());
    parse.rows = fr.numRows();
  }
  return parse;
}
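For comparison, a minimal sketch of the blocking call this handler ultimately delegates to; here 'fileKey' stands for raw file bytes already imported into the DKV (e.g. via NFSFileVec.make), and the destination key name is a placeholder:
Key<Frame> dest = Key.make("parsed.hex");
Frame fr = ParseDataset.parse(dest, fileKey);  // guesses the ParseSetup and blocks until done
System.out.println("Parsed " + fr.numRows() + " rows into " + dest);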