Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONSplitter, method readSplits:
/**
 * Calculate splits for each file in the input path, sensitive to options such
 * as {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
 * This method always re-calculates the splits and will try to write the
 * splits file.
 *
 * @see #readSplitsForFile
 *
 * @throws IOException when an error occurs reading from the file
 * @throws IllegalStateException if no input path has been set
 */
public void readSplits() throws IOException {
    // Validate before touching state, so a call made without an input path
    // does not clobber a previously computed splits list.
    if (inputPath == null) {
        throw new IllegalStateException("Input path has not been set.");
    }
    splitsList = new ArrayList<BSONFileSplit>();
    FileSystem fs = inputPath.getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(inputPath);
    readSplitsForFile(file);
}
Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONSplitter, method splitFile:
/**
 * Calculate the splits for a given input file according to the settings
 * for split size only. This method does not respect options like
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
 *
 * @param file the FileStatus for which to calculate splits.
 * @return a List of the calculated splits; may contain only the splits
 *         computed so far if an IOException interrupts the scan (the
 *         error is logged, not rethrown).
 *
 * @throws IOException when an error occurs reading from the FileSystem
 */
protected List<BSONFileSplit> splitFile(final FileStatus file) throws IOException {
    Path path = file.getPath();
    ArrayList<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
    FileSystem fs = path.getFileSystem(getConf());
    long length = file.getLen();
    int numDocsRead = 0;
    long splitSize = getSplitSize(getConf(), file);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
    }
    FSDataInputStream fsDataStream = fs.open(path);
    long curSplitLen = 0;
    long curSplitStart = 0;
    try {
        // Scan the file one BSON document at a time, accumulating documents
        // into the current split until appending one would reach splitSize.
        while (fsDataStream.getPos() + 1 < length) {
            lazyCallback.reset();
            lazyDec.decode(fsDataStream, lazyCallback);
            LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
            int bsonDocSize = bo.getBSONSize();
            if (curSplitLen + bsonDocSize >= splitSize) {
                // Close out the current split. The document just decoded
                // becomes the first document of the NEXT split, so that
                // split starts where this document began (pos - docSize).
                BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
                }
                curSplitStart = fsDataStream.getPos() - bsonDocSize;
                curSplitLen = 0;
            }
            curSplitLen += bsonDocSize;
            numDocsRead++;
            if (numDocsRead % 1000 == 0) {
                float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.", numDocsRead, file.getPath(), splitProgress));
                }
            }
        }
        // Emit the trailing split covering documents read since the last cut.
        if (curSplitLen > 0) {
            BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Completed splits calculation for " + file.getPath());
        }
    } catch (IOException e) {
        // Best-effort: keep the splits computed so far. Log with the
        // throwable as cause so the stack trace is preserved (the previous
        // message-only form discarded it).
        LOG.warn("IOException while calculating splits for " + path, e);
    } finally {
        fsDataStream.close();
    }
    return splits;
}
Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONSplitter, method readSplitsForFile:
/**
 * Calculate the splits for a given input file, sensitive to options such
 * as {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
 * This method always re-calculates the splits and will try to write the
 * splits file.
 *
 * @param file the FileStatus for which to calculate splits.
 * @throws IOException when an error occurs reading from the FileSystem
 *
 * @see #readSplits
 */
public void readSplitsForFile(final FileStatus file) throws IOException {
    long length = file.getLen();
    // Honor the read-splits toggle: when disabled, the whole file becomes a
    // single split and no splits file is written.
    if (!MongoConfigUtil.getBSONReadSplits(getConf())) {
        LOG.info("Reading splits is disabled - constructing single split for " + file);
        FileSystem fs = file.getPath().getFileSystem(getConf());
        BSONFileSplit onesplit = createFileSplit(file, fs, 0, length);
        ArrayList<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
        splits.add(onesplit);
        splitsList = splits;
        return;
    }
    if (length != 0) {
        // Copy into a fresh ArrayList instead of downcasting the List
        // returned by splitFile(); the former (ArrayList) cast was brittle
        // against changes to splitFile's concrete return type.
        splitsList = new ArrayList<BSONFileSplit>(splitFile(file));
        writeSplits();
    } else {
        // NOTE: a zero-length file leaves any previously computed splitsList
        // untouched; only a warning is emitted.
        LOG.warn("Zero-length file, skipping split calculation.");
    }
}
Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONFileRecordReaderTest, method testGetCurrentKey:
/**
 * Verify that BSONFileRecordReader exposes the configured key field as the
 * current key: the default "_id", a dotted nested field, and an array index.
 */
@Test
public void testGetCurrentKey() throws Exception {
    URI path = BSONFileRecordReaderTest.class.getResource("/bookstore-dump/inventory.bson").toURI();
    File file = new File(path);
    JobConf conf = new JobConf();

    // Default case: "_id" is used as inputKey.
    BSONFileRecordReader reader = new BSONFileRecordReader();
    BSONFileSplit split = new BSONFileSplit(new Path(path), 0, file.length(), new String[0]);
    reader.init(split, conf);
    assertTrue(reader.nextKeyValue());
    // JUnit convention: expected value first, actual second (the original
    // order was reversed, which garbles failure messages).
    assertEquals(new ObjectId("4d2a6084c6237b412fcd5597"), reader.getCurrentKey());
    reader.close();

    // Use a nested field as inputKey.
    reader = new BSONFileRecordReader();
    split = new BSONFileSplit(new Path(path), 0, file.length(), new String[0]);
    split.setKeyField("price.msrp");
    reader.init(split, conf);
    assertTrue(reader.nextKeyValue());
    assertEquals(33, reader.getCurrentKey());
    reader.close();

    // Use a key within an array as the inputKey.
    reader = new BSONFileRecordReader();
    split = new BSONFileSplit(new Path(path), 0, file.length(), new String[0]);
    split.setKeyField("tags.0");
    reader.init(split, conf);
    assertTrue(reader.nextKeyValue());
    assertEquals("html5", reader.getCurrentKey());
    // Close each reader so the test does not leak file handles.
    reader.close();
}
Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONSplitterTest, method testCreateFileSplitFromBSON:
/**
 * A split spec document stores the start offset under "s" and the length
 * under "l". Build one covering the whole test file and verify the splitter
 * reconstructs the expected single split from it.
 */
@Test
public void testCreateFileSplitFromBSON() throws IOException {
    BSONObject splitSpec = new BasicBSONObject("s", 0L).append("l", file.getLen());
    BSONFileSplit splitResult = SPLITTER.createFileSplitFromBSON(splitSpec, fs, file);
    assertOneSplit(splitResult);
}
Aggregations