Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONSplitter, method run().
/**
 * When run as a Tool, BSONSplitter can be used to pre-split and compress
 * BSON files. This can be especially useful before uploading large BSON
 * files to HDFS to save time. The compressed splits are written to the
 * given output path or, if the output path is unspecified, to the
 * directory containing the input file. A ".splits" file is not generated,
 * since each output file is expected to be its own split.
 *
 * @param args command-line arguments. Run with zero arguments to see usage.
 * @return exit status: 0 on success, 1 if the arguments are invalid.
 * @throws Exception if the codec class cannot be loaded, the input file
 *                   cannot be read, or a compressed split cannot be written.
 */
@Override
public int run(final String[] args) throws Exception {
    if (args.length < 1) {
        printUsage();
        return 1;
    }
    // Parse command-line arguments.
    Path filePath = new Path(args[0]);
    String compressorName = null, outputDirectoryStr = null;
    Path outputDirectory;
    CompressionCodec codec;
    Compressor compressor;
    for (int i = 1; i < args.length; ++i) {
        if ("-c".equals(args[i]) && args.length > i + 1) {
            compressorName = args[++i];
        } else if ("-o".equals(args[i]) && args.length > i + 1) {
            outputDirectoryStr = args[++i];
        } else {
            // CHECKSTYLE:OFF
            System.err.println("unrecognized option: " + args[i]);
            // CHECKSTYLE:ON
            printUsage();
            return 1;
        }
    }
    // Supply default values for unspecified arguments.
    if (null == outputDirectoryStr) {
        outputDirectory = filePath.getParent();
    } else {
        outputDirectory = new Path(outputDirectoryStr);
    }
    if (null == compressorName) {
        codec = new DefaultCodec();
    } else {
        Class<?> codecClass = Class.forName(compressorName);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, getConf());
    }
    if (codec instanceof Configurable) {
        ((Configurable) codec).setConf(getConf());
    }
    // Do not write a .splits file so as not to confuse BSONSplitter.
    // Each compressed file will be its own split.
    MongoConfigUtil.setBSONWriteSplits(getConf(), false);
    // Open the file.
    FileSystem inputFS = FileSystem.get(filePath.toUri(), getConf());
    FileSystem outputFS = FileSystem.get(outputDirectory.toUri(), getConf());
    FSDataInputStream inputStream = inputFS.open(filePath);
    // Use BSONSplitter to split the file.
    Path splitFilePath = getSplitsFilePath(filePath, getConf());
    try {
        loadSplitsFromSplitFile(inputFS.getFileStatus(filePath), splitFilePath);
    } catch (NoSplitFileException e) {
        LOG.info("did not find .splits file in " + splitFilePath.toUri());
        setInputPath(filePath);
        readSplits();
    }
    List<BSONFileSplit> splits = getAllSplits();
    LOG.info("compressing " + splits.size() + " splits.");
    byte[] buf = new byte[1024 * 1024];
    for (int i = 0; i < splits.size(); ++i) {
        // e.g., hdfs:///user/hive/warehouse/mongo/OutputFile-42.bz2
        Path splitOutputPath = new Path(outputDirectory, filePath.getName() + "-" + i + codec.getDefaultExtension());
        // Compress the split into a new file.
        compressor = CodecPool.getCompressor(codec);
        CompressionOutputStream compressionOutputStream = null;
        try {
            compressionOutputStream = codec.createOutputStream(outputFS.create(splitOutputPath), compressor);
            long totalBytes = 0;
            int bytesRead = 0;
            BSONFileSplit split = splits.get(i);
            inputStream.seek(split.getStart());
            LOG.info("writing " + splitOutputPath.toUri() + ".");
            while (totalBytes < split.getLength() && bytesRead >= 0) {
                bytesRead = inputStream.read(buf, 0, (int) Math.min(buf.length, split.getLength() - totalBytes));
                if (bytesRead > 0) {
                    compressionOutputStream.write(buf, 0, bytesRead);
                    totalBytes += bytesRead;
                }
            }
        } finally {
            if (compressionOutputStream != null) {
                compressionOutputStream.close();
            }
            CodecPool.returnCompressor(compressor);
        }
    }
    inputStream.close();
    LOG.info("done.");
    return 0;
}
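
For context, here is a minimal sketch of how this Tool might be invoked programmatically. It assumes BSONSplitter lives in com.mongodb.hadoop.splitter and implements org.apache.hadoop.util.Tool, as the @Override on run() suggests; the input and output paths and the wrapper class name are illustrative only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import com.mongodb.hadoop.splitter.BSONSplitter;

public class PreSplitBsonDump {
    public static void main(final String[] rawArgs) throws Exception {
        // "-c" selects the compression codec class and "-o" the output directory;
        // both flags are optional, as run() above shows. Paths are hypothetical.
        String[] toolArgs = {
            "hdfs:///user/hadoop/dump/mycollection.bson",
            "-c", "org.apache.hadoop.io.compress.BZip2Codec",
            "-o", "hdfs:///user/hadoop/mycollection-splits"
        };
        int exitCode = ToolRunner.run(new Configuration(), new BSONSplitter(), toolArgs);
        System.exit(exitCode);
    }
}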
Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONSplitter, method createFileSplit().
public BSONFileSplit createFileSplit(final FileStatus inFile, final FileSystem fs, final long splitStart, final long splitLen) {
    BSONFileSplit split;
    try {
        BlockLocation[] blkLocations;
        // This code is based off of org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits()
        boolean isLocatedFileStatus = CompatUtils.isInstance(
            inFile, "org.apache.hadoop.fs.LocatedFileStatus", getConf(), FileStatus.class);
        if (isLocatedFileStatus) {
            blkLocations = (BlockLocation[]) CompatUtils.invokeMethod(
                FileStatus.class, inFile, "getBlockLocations", new Object[]{}, new Class[]{});
        } else {
            blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
        }
        int blockIndex = getBlockIndex(blkLocations, splitStart);
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn("Couldn't find block locations when constructing input split from byte offset. "
                 + "Using non-block-aware input split; " + e.getMessage());
        split = new BSONFileSplit(inFile.getPath(), splitStart, splitLen, null);
    }
    split.setKeyField(MongoConfigUtil.getInputKey(getConf()));
    return split;
}
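
As a rough usage sketch (not taken from the project), createFileSplit could be called directly to build a single document-aligned split covering an entire file. The path and wrapper class are hypothetical; the setConf() call mirrors the Configurable usage shown in createRecordReader() below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.mongodb.hadoop.input.BSONFileSplit;
import com.mongodb.hadoop.splitter.BSONSplitter;

public class WholeFileSplitExample {
    public static BSONFileSplit splitWholeFile(final Path bsonPath) throws Exception {
        Configuration conf = new Configuration();
        BSONSplitter splitter = new BSONSplitter();
        splitter.setConf(conf);
        FileSystem fs = bsonPath.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(bsonPath);
        // One split spanning the whole file, with block-location hints when available.
        return splitter.createFileSplit(status, fs, 0L, status.getLen());
    }
}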
Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONSplitter, method getStartingPositionForSplit().
/**
 * Get the position at which the BSONFileRecordReader should begin
 * iterating the given split. This may not be at the beginning of the split
 * if the splits were not calculated by BSONSplitter.
 *
 * @param split the FileSplit for which to find the starting position.
 * @return the position of the first complete document within the split.
 * @throws IOException when an error occurs while reading the file.
 */
public synchronized long getStartingPositionForSplit(final FileSplit split) throws IOException {
    FileSystem fs = split.getPath().getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(split.getPath());
    ArrayList<BSONFileSplit> splits;
    BSONFileSplit[] splitsArr;
    // Get splits calculated on document boundaries.
    if (MongoConfigUtil.getBSONReadSplits(getConf())) {
        // Use the splits file to load splits on document boundaries.
        try {
            // Try to use the existing splits file.
            loadSplitsFromSplitFile(file, getSplitsFilePath(file.getPath(), getConf()));
        } catch (NoSplitFileException e) {
            // Create a splits file from scratch.
            readSplitsForFile(file);
        }
        splits = getAllSplits();
    } else {
        // Can't use a splits file, so create splits from scratch.
        splits = (ArrayList<BSONFileSplit>) splitFile(file);
    }
    splitsArr = new BSONFileSplit[splits.size()];
    splits.toArray(splitsArr);
    // Get the first pre-calculated split occurring before the start of
    // the given split.
    long previousStart = split.getStart();
    long startIterating = 0;
    for (BSONFileSplit bfs : splitsArr) {
        if (bfs.getStart() >= split.getStart()) {
            startIterating = previousStart;
            break;
        }
        previousStart = bfs.getStart();
    }
    // Beginning at 'startIterating', jump to the first document that begins
    // at or beyond the given split.
    FSDataInputStream fsDataStream = null;
    long pos = startIterating;
    try {
        fsDataStream = fs.open(split.getPath());
        fsDataStream.seek(pos);
        while (pos < split.getStart()) {
            callback.reset();
            bsonDec.decode(fsDataStream, callback);
            pos = fsDataStream.getPos();
        }
    } finally {
        if (null != fsDataStream) {
            fsDataStream.close();
        }
    }
    return pos;
}
Use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.
The class BSONFileInputFormat, method createRecordReader().
@Override
public RecordReader createRecordReader(final InputSplit split, final TaskAttemptContext context)
        throws IOException, InterruptedException {
    if (split instanceof BSONFileSplit) {
        // Split was created by BSONSplitter and starts at a whole document.
        return new BSONFileRecordReader();
    }
    // Split was not created by BSONSplitter, and we need to find the
    // first document to begin iterating.
    FileSplit fileSplit = (FileSplit) split;
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(context.getConfiguration());
    splitter.setInputPath(fileSplit.getPath());
    return new BSONFileRecordReader(splitter.getStartingPositionForSplit(fileSplit));
}
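
To tie the pieces together, a MapReduce job could be configured roughly as follows. This is a hedged sketch: the new-API input format is assumed to be com.mongodb.hadoop.BSONFileInputFormat, MongoConfigUtil.setBSONReadSplits is assumed to mirror the getBSONReadSplits() call used above, and the input path is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.mongodb.hadoop.BSONFileInputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class BsonJobSetupExample {
    public static void main(final String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed setter mirroring getBSONReadSplits(): honor pre-calculated
        // ".splits" files where they exist instead of re-scanning the BSON file.
        MongoConfigUtil.setBSONReadSplits(conf, true);

        Job job = Job.getInstance(conf, "bson-ingest");
        job.setInputFormatClass(BSONFileInputFormat.class);
        // Illustrative input path; point this at the directory of BSON (or
        // pre-compressed split) files.
        FileInputFormat.addInputPath(job, new Path("hdfs:///user/hadoop/mycollection-splits"));
        // Set the mapper, reducer, and output formats as usual, then submit.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}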