Use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.
The class BSONFileInputFormat, method getRecordReader.
@Override
public RecordReader<NullWritable, BSONWritable> getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
    if (split instanceof BSONFileSplit || !isSplitable(fs, fileSplit.getPath())) {
        BSONFileRecordReader reader = new BSONFileRecordReader();
        reader.initialize(split, job);
        return reader;
    }
    // Split was not created by BSONSplitter.
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    splitter.setInputPath(fileSplit.getPath());
    org.apache.hadoop.mapreduce.lib.input.FileSplit newStyleFileSplit =
        new org.apache.hadoop.mapreduce.lib.input.FileSplit(
            fileSplit.getPath(), fileSplit.getStart(), fileSplit.getLength(), fileSplit.getLocations());
    long start = splitter.getStartingPositionForSplit(newStyleFileSplit);
    BSONFileRecordReader reader = new BSONFileRecordReader(start);
    reader.initialize(fileSplit, job);
    return reader;
}
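The method above bridges the old and new Hadoop APIs: both FileSplit classes carry the same (path, start, length, locations) tuple, so converting is just a matter of copying those fields. A minimal sketch of that bridge as a stand-alone helper (the class and method names are hypothetical, not part of mongo-hadoop):

import java.io.IOException;
import org.apache.hadoop.mapred.FileSplit;

// Hypothetical helper illustrating the old-API / new-API split conversion used above.
public final class FileSplitBridge {
    private FileSplitBridge() {
    }

    // Wrap a mapred (old-API) split as a mapreduce (new-API) split.
    public static org.apache.hadoop.mapreduce.lib.input.FileSplit toNewApi(FileSplit oldSplit) throws IOException {
        return new org.apache.hadoop.mapreduce.lib.input.FileSplit(
            oldSplit.getPath(), oldSplit.getStart(), oldSplit.getLength(), oldSplit.getLocations());
    }

    // Wrap a mapreduce (new-API) split back into a mapred (old-API) split.
    public static FileSplit toOldApi(org.apache.hadoop.mapreduce.lib.input.FileSplit newSplit) throws IOException, InterruptedException {
        return new FileSplit(
            newSplit.getPath(), newSplit.getStart(), newSplit.getLength(), newSplit.getLocations());
    }
}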
Use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.
The class BSONFileInputFormatTest, method enronEmails.
@Test
public void enronEmails() throws IOException {
    BSONFileInputFormat inputFormat = new BSONFileInputFormat();
    JobConf job = new JobConf();
    String inputDirectory =
        new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson").getAbsoluteFile().toURI().toString();
    // Hadoop 2.X
    job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory);
    // Hadoop 1.2.X
    job.set("mapred.input.dir", inputDirectory);
    FileSplit[] splits = inputFormat.getSplits(job, 5);
    int count = 0;
    BSONWritable writable = new BSONWritable();
    for (FileSplit split : splits) {
        RecordReader<NullWritable, BSONWritable> recordReader = inputFormat.getRecordReader(split, job, null);
        while (recordReader.next(null, writable)) {
            count++;
        }
    }
    assertEquals("There are 501513 messages in the enron corpus", 501513, count);
}
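The test passes null as the key because the key is apparently not used by this record reader. A reader-agnostic loop would instead allocate the key and value through the old-API RecordReader contract and close the reader when done; a minimal sketch under that assumption (the countRecords helper is hypothetical):

// Hypothetical generic read loop for an old-API RecordReader: allocate the key
// and value via createKey()/createValue(), count records, and close the reader.
long countRecords(RecordReader<NullWritable, BSONWritable> reader) throws IOException {
    NullWritable key = reader.createKey();
    BSONWritable value = reader.createValue();
    long count = 0;
    try {
        while (reader.next(key, value)) {
            count++;
        }
    } finally {
        reader.close();
    }
    return count;
}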
Use of org.apache.hadoop.mapred.FileSplit in project hadoop by apache.
The class AutoInputFormat, method getRecordReader.
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
    FSDataInputStream is = fs.open(fileSplit.getPath());
    byte[] header = new byte[3];
    RecordReader reader = null;
    try {
        is.readFully(header);
    } catch (EOFException eof) {
        // File is shorter than the header: fall back to the text reader.
        reader = textInputFormat.getRecordReader(split, job, reporter);
    } finally {
        is.close();
    }
    if (reader == null) {
        if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
            reader = seqFileInputFormat.getRecordReader(split, job, reporter);
        } else {
            reader = textInputFormat.getRecordReader(split, job, reporter);
        }
    }
    return reader;
}
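The three-byte check works because every SequenceFile begins with the magic bytes 'S', 'E', 'Q'. A minimal, self-contained sketch of the same probe against a local file using plain java.io (the class and method names are hypothetical):

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;

// Hypothetical stand-alone probe mirroring the header check above.
public final class SequenceFileProbe {
    public static boolean looksLikeSequenceFile(String path) throws IOException {
        byte[] header = new byte[3];
        try (DataInputStream in = new DataInputStream(new FileInputStream(path))) {
            in.readFully(header);
        } catch (EOFException eof) {
            return false; // shorter than three bytes: cannot be a SequenceFile
        }
        return header[0] == 'S' && header[1] == 'E' && header[2] == 'Q';
    }
}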
Use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.
The class AbstractTestHiveFileFormats, method createTestFile.
public static FileSplit createTestFile(String filePath, HiveStorageFormat storageFormat, HiveCompressionCodec compressionCodec, List<TestColumn> testColumns, ConnectorSession session, int numRows, HiveFileWriterFactory fileWriterFactory) throws Exception {
    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));
    List<Type> types = testColumns.stream()
            .map(TestColumn::getType)
            .map(HiveType::valueOf)
            .map(type -> type.getType(TYPE_MANAGER))
            .collect(toList());
    PageBuilder pageBuilder = new PageBuilder(types);
    for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
        pageBuilder.declarePosition();
        for (int columnNumber = 0; columnNumber < testColumns.size(); columnNumber++) {
            serializeObject(
                    types.get(columnNumber),
                    pageBuilder.getBlockBuilder(columnNumber),
                    testColumns.get(columnNumber).getWriteValue(),
                    testColumns.get(columnNumber).getObjectInspector(),
                    false);
        }
    }
    Page page = pageBuilder.build();
    JobConf jobConf = new JobConf();
    configureCompression(jobConf, compressionCodec);
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    Optional<HiveFileWriter> fileWriter = fileWriterFactory.createFileWriter(
            new Path(filePath),
            testColumns.stream().map(TestColumn::getName).collect(toList()),
            StorageFormat.fromHiveStorageFormat(storageFormat),
            tableProperties,
            jobConf,
            session);
    HiveFileWriter hiveFileWriter = fileWriter.orElseThrow(() -> new IllegalArgumentException("fileWriterFactory"));
    hiveFileWriter.appendRows(page);
    hiveFileWriter.commit();
    return new FileSplit(new Path(filePath), 0, new File(filePath).length(), new String[0]);
}
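The returned split covers the whole written file: offset 0, length equal to the file size on disk, and no preferred hosts. A hypothetical follow-up in a calling test (variable names such as path, storageFormat, and fileWriterFactory are assumed, not taken from the source) might verify those bounds directly:

// Hypothetical consumption of the split returned by createTestFile above.
FileSplit split = createTestFile(path, storageFormat, compressionCodec, testColumns, session, 1000, fileWriterFactory);
assertEquals(0, split.getStart());
assertEquals(new File(path).length(), split.getLength());
assertEquals(new Path(path), split.getPath());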