Use of org.apache.hadoop.mapreduce.RecordReader in project hadoop by apache.
The class TeraInputFormat, method writePartitionFile.
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
* @param job the job to sample
* @param partFile where to write the output file to
* @throws Throwable if something goes wrong
*/
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
  long t1 = System.currentTimeMillis();
  Configuration conf = job.getConfiguration();
  final TeraInputFormat inFormat = new TeraInputFormat();
  final TextSampler sampler = new TextSampler();
  int partitions = job.getNumReduceTasks();
  long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
      TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
  final List<InputSplit> splits = inFormat.getSplits(job);
  long t2 = System.currentTimeMillis();
  System.out.println("Computing input splits took " + (t2 - t1) + "ms");
  int samples = Math.min(
      conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
      splits.size());
  System.out.println("Sampling " + samples + " splits of " + splits.size());
  final long recordsPerSample = sampleSize / samples;
  final int sampleStep = splits.size() / samples;
  Thread[] samplerReader = new Thread[samples];
  SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    final int idx = i;
    samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
      {
        setDaemon(true);
      }

      public void run() {
        long records = 0;
        try {
          TaskAttemptContext context =
              new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
          RecordReader<Text, Text> reader =
              inFormat.createRecordReader(splits.get(sampleStep * idx), context);
          reader.initialize(splits.get(sampleStep * idx), context);
          while (reader.nextKeyValue()) {
            sampler.addKey(new Text(reader.getCurrentKey()));
            records += 1;
            if (recordsPerSample <= records) {
              break;
            }
          }
        } catch (IOException ie) {
          System.err.println("Got an exception while reading splits " +
              StringUtils.stringifyException(ie));
          throw new RuntimeException(ie);
        } catch (InterruptedException e) {
          // interrupted while sampling: stop this reader early
        }
      }
    };
    samplerReader[i].start();
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
      outFs.getDefaultBlockSize(partFile));
  for (int i = 0; i < samples; i++) {
    try {
      samplerReader[i].join();
      if (threadGroup.getThrowable() != null) {
        throw threadGroup.getThrowable();
      }
    } catch (InterruptedException e) {
      // interrupted while waiting for a sampler thread; move on to the next one
    }
  }
  for (Text split : sampler.createPartitions(partitions)) {
    split.write(writer);
  }
  writer.close();
  long t3 = System.currentTimeMillis();
  System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
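For context, a hedged sketch of how a driver might call writePartitionFile before submitting the job and then publish the partition file to the tasks through the distributed cache, so a total-order partitioner can load the sampled cut points. The paths, the _partition.lst file name, and the cache fragment name are assumptions for illustration, and the sketch assumes TeraInputFormat is importable from the Hadoop examples jar.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
// assumed location of the class shown above
import org.apache.hadoop.examples.terasort.TeraInputFormat;

public class PartitionFileDriverSketch {
  public static void main(String[] args) throws Throwable {
    Job job = Job.getInstance(new Configuration(), "partition-sampling-sketch");
    Path input = new Path(args[0]);   // assumed: directory of input records
    Path output = new Path(args[1]);  // assumed: job output directory
    TeraInputFormat.setInputPaths(job, input); // static helper inherited from FileInputFormat
    Path partitionFile = new Path(output, "_partition.lst"); // hypothetical file name
    TeraInputFormat.writePartitionFile(job, partitionFile);
    // Ship the sampled cut points to every task via the distributed cache.
    job.addCacheFile(new URI(partitionFile + "#_partition.lst"));
  }
}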
Use of org.apache.hadoop.mapreduce.RecordReader in project carbondata by apache.
The class CarbonReaderBuilder, method build.
public <T> CarbonReader<T> build() throws IOException, InterruptedException {
  CarbonTable table = CarbonTable.buildFromTablePath("_temp", tablePath);
  final CarbonFileInputFormat format = new CarbonFileInputFormat();
  final Job job = new Job(new Configuration());
  format.setTableInfo(job.getConfiguration(), table.getTableInfo());
  format.setTablePath(job.getConfiguration(), table.getTablePath());
  format.setTableName(job.getConfiguration(), table.getTableName());
  format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
  if (filterExpression != null) {
    format.setFilterPredicates(job.getConfiguration(), filterExpression);
  }
  if (projectionColumns != null) {
    format.setColumnProjection(job.getConfiguration(), new CarbonProjection(projectionColumns));
  }
  final List<InputSplit> splits =
      format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
  List<RecordReader<Void, T>> readers = new ArrayList<>(splits.size());
  for (InputSplit split : splits) {
    TaskAttemptContextImpl attempt =
        new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = format.createRecordReader(split, attempt);
    reader.initialize(split, attempt);
    readers.add(reader);
  }
  return new CarbonReader<>(readers);
}
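Once build() returns, the wrapped readers are typically consumed row by row through the SDK's CarbonReader iteration methods. A hedged usage sketch, assuming the org.apache.carbondata.sdk.file.CarbonReader builder entry point with hasNext()/readNextRow()/close(); the table path, table name, and projected column names below are hypothetical.

// Hedged sketch, not taken from the project source.
public static void readAllRows() throws IOException, InterruptedException {
  CarbonReader<Object[]> reader = CarbonReader
      .builder("/tmp/carbon/store/person", "_temp")   // hypothetical path and table name
      .projection(new String[] { "name", "age" })     // hypothetical projection
      .build();
  try {
    while (reader.hasNext()) {
      Object[] row = reader.readNextRow();
      System.out.println(row[0] + "," + row[1]);      // projected columns of one row
    }
  } finally {
    reader.close();
  }
}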
Use of org.apache.hadoop.mapreduce.RecordReader in project carbondata by apache.
The class CarbonStreamInputFormatTest, method testCreateRecordReader.
@Test
public void testCreateRecordReader() {
  try {
    InputSplit inputSplit = buildInputSplit();
    CarbonStreamInputFormat inputFormat = new CarbonStreamInputFormat();
    RecordReader recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
    Assert.assertNotNull("Failed to create record reader", recordReader);
  } catch (Exception e) {
    e.printStackTrace();
    Assert.assertTrue(e.getMessage(), false);
  }
}
Use of org.apache.hadoop.mapreduce.RecordReader in project hbase by apache.
The class MultiTableInputFormatBase, method createRecordReader.
/**
* Builds a TableRecordReader. If no TableRecordReader was provided, uses the
* default.
*
* @param split The split to work with.
* @param context The current context.
* @return The newly created record reader.
* @throws IOException When creating the reader fails.
* @throws InterruptedException when record reader initialization fails
* @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
* org.apache.hadoop.mapreduce.InputSplit,
* org.apache.hadoop.mapreduce.TaskAttemptContext)
*/
@Override
public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException, InterruptedException {
  TableSplit tSplit = (TableSplit) split;
  LOG.info(MessageFormat.format("Input split length: {0} bytes.", tSplit.getLength()));
  if (tSplit.getTable() == null) {
    throw new IOException("Cannot create a record reader because of a"
        + " previous error. Please look at the previous log lines from"
        + " the task's full log for more details.");
  }
  final Connection connection = ConnectionFactory.createConnection(context.getConfiguration());
  Table table = connection.getTable(tSplit.getTable());
  if (this.tableRecordReader == null) {
    this.tableRecordReader = new TableRecordReader();
  }
  final TableRecordReader trr = this.tableRecordReader;
  try {
    Scan sc = tSplit.getScan();
    sc.withStartRow(tSplit.getStartRow());
    sc.withStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setTable(table);
    return new RecordReader<ImmutableBytesWritable, Result>() {

      @Override
      public void close() throws IOException {
        trr.close();
        connection.close();
      }

      @Override
      public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
        return trr.getCurrentKey();
      }

      @Override
      public Result getCurrentValue() throws IOException, InterruptedException {
        return trr.getCurrentValue();
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return trr.getProgress();
      }

      @Override
      public void initialize(InputSplit inputsplit, TaskAttemptContext context)
          throws IOException, InterruptedException {
        trr.initialize(inputsplit, context);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return trr.nextKeyValue();
      }
    };
  } catch (IOException ioe) {
    // If there is an exception make sure that all
    // resources are closed and released.
    trr.close();
    connection.close();
    throw ioe;
  }
}
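On the driver side, this record reader is reached by configuring one Scan per table and handing the list to TableMapReduceUtil.initMultiTableMapperJob. A hedged sketch of that setup; the table names are hypothetical, and IdentityTableMapper stands in for whatever TableMapper the job actually uses.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.IdentityTableMapper;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;

public class MultiTableJobSketch {
  public static Job setUpMultiTableJob() throws IOException {
    List<Scan> scans = new ArrayList<>();
    for (String tableName : new String[] { "table-a", "table-b" }) { // hypothetical tables
      Scan scan = new Scan();
      // MultiTableInputFormat resolves the target table from this scan attribute.
      scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
      scans.add(scan);
    }
    Job job = Job.getInstance(HBaseConfiguration.create(), "multi-table-scan-sketch");
    TableMapReduceUtil.initMultiTableMapperJob(scans, IdentityTableMapper.class,
        ImmutableBytesWritable.class, Result.class, job);
    return job;
  }
}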
Use of org.apache.hadoop.mapreduce.RecordReader in project druid by druid-io.
The class BaseParquetInputTest, method getAllRows.
static List<InputRow> getAllRows(String parserType, HadoopDruidIndexerConfig config)
    throws IOException, InterruptedException {
  Job job = Job.getInstance(new Configuration());
  config.intoConfiguration(job);
  File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
  Path path = new Path(testFile.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile.length(), null);
  InputFormat inputFormat =
      ReflectionUtils.newInstance(INPUT_FORMAT_CLASSES.get(parserType), job.getConfiguration());
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
    List<InputRow> records = new ArrayList<>();
    InputRowParser parser = config.getParser();
    reader.initialize(split, context);
    while (reader.nextKeyValue()) {
      // nextKeyValue() in the loop condition has already advanced the reader;
      // calling it again here would silently drop every other record.
      Object data = reader.getCurrentValue();
      records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
    }
    return records;
  }
}
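Taken together, these usages follow the same RecordReader lifecycle: create a reader for a split, initialize it with that split and a task context, loop on nextKeyValue() while pulling getCurrentKey()/getCurrentValue(), then close. A minimal standalone sketch of that contract against a plain TextInputFormat; the class name is illustrative and the input path is supplied on the command line.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class RecordReaderLifecycleSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    FileInputFormat.addInputPath(job, new Path(args[0])); // input path from the command line
    TextInputFormat format = new TextInputFormat();
    TaskAttemptContext context =
        new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    for (InputSplit split : format.getSplits(job)) {
      // Same lifecycle as in the snippets above: create, initialize, iterate, close.
      try (RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context)) {
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
          System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
        }
      }
    }
  }
}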