Use of org.bson.LazyBSONObject in the mongo-hadoop project by mongodb.
From the class BSONSplitter, method splitFile.
/**
 * Calculate the splits for a given input file according to the settings
 * for split size only. This method does not respect options like
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
 *
 * <p>The file is scanned one BSON document at a time. The current split is
 * closed just before the document that would make it reach the configured
 * split size, so every split's length is a sum of whole document sizes.
 * A document that reaches the split size on its own is placed in a split
 * by itself; zero-length splits are never emitted.
 *
 * <p>If an IOException occurs while scanning the documents, it is logged as
 * a warning and only the splits completed up to that point are returned.
 *
 * @param file the FileStatus for which to calculate splits.
 * @return a List of the calculated splits.
 *
 * @throws IOException when an error occurs outside the document scan itself,
 *                     e.g. while opening the file or closing its stream
 */
protected List<BSONFileSplit> splitFile(final FileStatus file) throws IOException {
    Path path = file.getPath();
    List<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
    FileSystem fs = path.getFileSystem(getConf());
    long length = file.getLen();
    int numDocsRead = 0;
    long splitSize = getSplitSize(getConf(), file);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
    }
    FSDataInputStream fsDataStream = fs.open(path);
    long curSplitLen = 0;
    long curSplitStart = 0;
    try {
        while (fsDataStream.getPos() + 1 < length) {
            lazyCallback.reset();
            lazyDec.decode(fsDataStream, lazyCallback);
            LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
            int bsonDocSize = bo.getBSONSize();
            // Close the current split before this document if adding it would
            // reach the split size. The curSplitLen > 0 guard prevents emitting
            // an empty split when a single document is itself >= splitSize.
            if (curSplitLen > 0 && curSplitLen + bsonDocSize >= splitSize) {
                BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
                }
                // The stream is now positioned after the document just decoded,
                // so the next split starts bsonDocSize bytes earlier.
                curSplitStart = fsDataStream.getPos() - bsonDocSize;
                curSplitLen = 0;
            }
            curSplitLen += bsonDocSize;
            numDocsRead++;
            // Report progress every 1000 documents; the percentage is only
            // computed when it will actually be logged.
            if (numDocsRead % 1000 == 0 && LOG.isDebugEnabled()) {
                float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
                LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.",
                        numDocsRead, file.getPath(), splitProgress));
            }
        }
        // Emit whatever remains after the last full split.
        if (curSplitLen > 0) {
            BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Completed splits calculation for " + file.getPath());
        }
    } catch (IOException e) {
        // Best effort: keep the splits completed so far, but log the full
        // exception (with stack trace) and the file it occurred on.
        LOG.warn("IOException while calculating splits for " + path
                + "; returning the splits calculated so far", e);
    } finally {
        fsDataStream.close();
    }
    return splits;
}
Aggregations