Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class CompositeInputSplit, method readFields.
/**
 * {@inheritDoc}
 * @throws IOException If the child InputSplit cannot be read, typically
 *         for failing access checks.
 */
// Generic array assignment
@SuppressWarnings("unchecked")
public void readFields(DataInput in) throws IOException {
  int card = WritableUtils.readVInt(in);
  if (splits == null || splits.length != card) {
    splits = new InputSplit[card];
  }
  Class<? extends InputSplit>[] cls = new Class[card];
  try {
    // Read the class name of every child split first...
    for (int i = 0; i < card; ++i) {
      cls[i] = Class.forName(Text.readString(in)).asSubclass(InputSplit.class);
    }
    // ...then deserialize each child split with its registered serialization.
    for (int i = 0; i < card; ++i) {
      splits[i] = ReflectionUtils.newInstance(cls[i], null);
      SerializationFactory factory = new SerializationFactory(conf);
      Deserializer deserializer = factory.getDeserializer(cls[i]);
      deserializer.open((DataInputStream) in);
      splits[i] = (InputSplit) deserializer.deserialize(splits[i]);
    }
  } catch (ClassNotFoundException e) {
    throw new IOException("Failed split init", e);
  }
}
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class CompositeInputSplit, method write.
/**
 * Write splits in the following format.
 * {@code
 * <count><class1><class2>...<classn><split1><split2>...<splitn>
 * }
 */
@SuppressWarnings("unchecked")
public void write(DataOutput out) throws IOException {
  WritableUtils.writeVInt(out, splits.length);
  // Write the class name of every child split first...
  for (InputSplit s : splits) {
    Text.writeString(out, s.getClass().getName());
  }
  // ...then each child split, using its registered serialization.
  for (InputSplit s : splits) {
    SerializationFactory factory = new SerializationFactory(conf);
    Serializer serializer = factory.getSerializer(s.getClass());
    serializer.open((DataOutputStream) out);
    serializer.serialize(s);
  }
}
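Both methods above delegate the per-split encoding to Hadoop's pluggable serialization layer: the SerializationFactory resolves a Serializer or Deserializer for each child split's class (WritableSerialization handles Writable splits such as FileSplit). The following is a minimal round-trip sketch of that mechanism in isolation; the in-memory byte streams, the FileSplit instance, and the path are illustrative assumptions, not part of CompositeInputSplit.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitSerializationRoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    SerializationFactory factory = new SerializationFactory(conf);

    // Serialize one child split, as CompositeInputSplit.write() does per element.
    // The path and length are placeholder values.
    FileSplit original = new FileSplit(new Path("/tmp/part-0"), 0L, 1024L, null);
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    Serializer<FileSplit> serializer = factory.getSerializer(FileSplit.class);
    serializer.open(bytes);
    serializer.serialize(original);
    serializer.close();

    // Deserialize it again, as CompositeInputSplit.readFields() does per element.
    FileSplit copy = new FileSplit();
    Deserializer<FileSplit> deserializer = factory.getDeserializer(FileSplit.class);
    deserializer.open(new ByteArrayInputStream(bytes.toByteArray()));
    copy = deserializer.deserialize(copy);
    deserializer.close();

    System.out.println(copy.getPath() + " length=" + copy.getLength());
  }
}

CompositeInputSplit adds the split count and the class names in front of this per-split encoding, which is what lets readFields instantiate the right Deserializer for each element.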
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TeraInputFormat, method writePartitionFile.
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile)
    throws Throwable {
  long t1 = System.currentTimeMillis();
  Configuration conf = job.getConfiguration();
  final TeraInputFormat inFormat = new TeraInputFormat();
  final TextSampler sampler = new TextSampler();
  int partitions = job.getNumReduceTasks();
  long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
      TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
  final List<InputSplit> splits = inFormat.getSplits(job);
  long t2 = System.currentTimeMillis();
  System.out.println("Computing input splits took " + (t2 - t1) + "ms");
  int samples = Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(),
      TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), splits.size());
  System.out.println("Sampling " + samples + " splits of " + splits.size());
  final long recordsPerSample = sampleSize / samples;
  final int sampleStep = splits.size() / samples;
  Thread[] samplerReader = new Thread[samples];
  SamplerThreadGroup threadGroup =
      new SamplerThreadGroup("Sampler Reader Thread Group");
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    final int idx = i;
    samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
      {
        setDaemon(true);
      }
      public void run() {
        long records = 0;
        try {
          TaskAttemptContext context = new TaskAttemptContextImpl(
              job.getConfiguration(), new TaskAttemptID());
          RecordReader<Text, Text> reader =
              inFormat.createRecordReader(splits.get(sampleStep * idx), context);
          reader.initialize(splits.get(sampleStep * idx), context);
          while (reader.nextKeyValue()) {
            sampler.addKey(new Text(reader.getCurrentKey()));
            records += 1;
            if (recordsPerSample <= records) {
              break;
            }
          }
        } catch (IOException ie) {
          System.err.println("Got an exception while reading splits " +
              StringUtils.stringifyException(ie));
          throw new RuntimeException(ie);
        } catch (InterruptedException e) {
          // interrupted while sampling; let the thread exit
        }
      }
    };
    samplerReader[i].start();
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
      outFs.getDefaultBlockSize(partFile));
  // wait for every sampler thread and surface the first failure, if any
  for (int i = 0; i < samples; i++) {
    try {
      samplerReader[i].join();
      if (threadGroup.getThrowable() != null) {
        throw threadGroup.getThrowable();
      }
    } catch (InterruptedException e) {
    }
  }
  // write the N-1 partition boundary keys to the partition file
  for (Text split : sampler.createPartitions(partitions)) {
    split.write(writer);
  }
  writer.close();
  long t3 = System.currentTimeMillis();
  System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
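Easy to miss in the code above is how the configured values interact: the number of sampler threads is the configured partition count capped by the number of splits, the overall sample budget is divided evenly among those threads, and each thread reads from an evenly spaced split. A minimal sketch of that arithmetic, using the defaults named in the javadoc and a hypothetical split count:

public class TeraSampleMath {
  public static void main(String[] args) {
    long sampleSize = 100000;   // default sample size named in the javadoc
    int numPartitions = 10;     // default number of sample locations
    int splitCount = 400;       // hypothetical number of input splits

    int samples = Math.min(numPartitions, splitCount);  // 10 reader threads
    long recordsPerSample = sampleSize / samples;       // 10000 keys per thread
    int sampleStep = splitCount / samples;              // read every 40th split

    System.out.println(samples + " readers, " + recordsPerSample
        + " keys each, one split read every " + sampleStep + " splits");
  }
}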
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestDbClasses, method testDataDrivenDBInputFormat.
@Test(timeout = 10000)
public void testDataDrivenDBInputFormat() throws Exception {
  JobContext jobContext = mock(JobContext.class);
  Configuration configuration = new Configuration();
  configuration.setInt(MRJobConfig.NUM_MAPS, 1);
  when(jobContext.getConfiguration()).thenReturn(configuration);
  DataDrivenDBInputFormat<NullDBWritable> format =
      new DataDrivenDBInputFormat<NullDBWritable>();
  List<InputSplit> splits = format.getSplits(jobContext);
  assertEquals(1, splits.size());
  DataDrivenDBInputSplit split = (DataDrivenDBInputSplit) splits.get(0);
  assertEquals("1=1", split.getLowerClause());
  assertEquals("1=1", split.getUpperClause());
  // second part: with two map tasks, set an explicit bounding query
  configuration.setInt(MRJobConfig.NUM_MAPS, 2);
  DataDrivenDBInputFormat.setBoundingQuery(configuration, "query");
  assertEquals("query",
      configuration.get(DBConfiguration.INPUT_BOUNDING_QUERY));
  Job job = mock(Job.class);
  when(job.getConfiguration()).thenReturn(configuration);
  DataDrivenDBInputFormat.setInput(job, NullDBWritable.class, "query",
      "Bounding Query");
  assertEquals("Bounding Query",
      configuration.get(DBConfiguration.INPUT_BOUNDING_QUERY));
}
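The test drives getSplits and setInput directly against mocks; in a real job the same input format is usually wired up through DBConfiguration and the Job API. A hedged sketch of that setup follows: the JDBC driver, URL, credentials, table and column names are placeholders, and recordClass stands in for an application-specific DBWritable implementation.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.db.DataDrivenDBInputFormat;

public class DataDrivenDbJobSetup {
  public static Job configure(Class<? extends DBWritable> recordClass)
      throws Exception {
    Configuration conf = new Configuration();
    // Driver class, JDBC URL and credentials are placeholder values.
    DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/mydb", "user", "password");
    Job job = Job.getInstance(conf, "db-import");
    // $CONDITIONS is replaced per split with the generated bounding clauses,
    // so each map task reads a disjoint range of the ordering column.
    DataDrivenDBInputFormat.setInput(job, recordClass,
        "SELECT id, name FROM users WHERE $CONDITIONS",
        "SELECT MIN(id), MAX(id) FROM users");
    // Select the data-driven format last, after setInput has filled in the
    // query configuration, so it is the format class that actually runs.
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
    return job;
  }
}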
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestSplitters, method testBooleanSplitter.
@Test(timeout = 2000)
public void testBooleanSplitter() throws Exception {
  BooleanSplitter splitter = new BooleanSplitter();
  ResultSet result = mock(ResultSet.class);
  when(result.getString(1)).thenReturn("result1");
  List<InputSplit> splits = splitter.split(configuration, result, "column");
  assertSplits(new String[] { "column = FALSE column = FALSE",
      "column IS NULL column IS NULL" }, splits);
  when(result.getString(1)).thenReturn("result1");
  when(result.getString(2)).thenReturn("result2");
  when(result.getBoolean(1)).thenReturn(true);
  when(result.getBoolean(2)).thenReturn(false);
  splits = splitter.split(configuration, result, "column");
  assertEquals(0, splits.size());
  when(result.getString(1)).thenReturn("result1");
  when(result.getString(2)).thenReturn("result2");
  when(result.getBoolean(1)).thenReturn(false);
  when(result.getBoolean(2)).thenReturn(true);
  splits = splitter.split(configuration, result, "column");
  assertSplits(new String[] { "column = FALSE column = FALSE",
      ".*column = TRUE" }, splits);
}
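For a boolean split column there are only three possible predicates, so the cases above cover the whole space: the minimum and maximum values read from the ResultSet decide which of column = FALSE, column = TRUE, and column IS NULL are emitted, and an inverted (TRUE, FALSE) pair produces no splits at all. The sketch below restates that decision logic in plain Java as a reading aid; it is an illustrative paraphrase of the behaviour the assertions check, not the BooleanSplitter source.

import java.util.ArrayList;
import java.util.List;

public class BooleanSplitSketch {
  // Paraphrase of the behaviour exercised by the test cases above;
  // null stands for a NULL bound in the min/max result set.
  static List<String> booleanConditions(Boolean min, Boolean max, String col) {
    List<String> conditions = new ArrayList<>();
    if (min == null && max == null) {     // the column holds only NULLs
      conditions.add(col + " IS NULL");
      return conditions;
    }
    if (Boolean.FALSE.equals(min)) {      // range starts at FALSE
      conditions.add(col + " = FALSE");
    }
    if (Boolean.TRUE.equals(max)) {       // range ends at TRUE
      conditions.add(col + " = TRUE");
    }
    if (min == null || max == null) {     // one bound was NULL
      conditions.add(col + " IS NULL");
    }
    return conditions;                    // (TRUE, FALSE) yields an empty list
  }

  public static void main(String[] args) {
    System.out.println(booleanConditions(false, null, "column"));
    // [column = FALSE, column IS NULL]
    System.out.println(booleanConditions(true, false, "column"));
    // []
    System.out.println(booleanConditions(false, true, "column"));
    // [column = FALSE, column = TRUE]
  }
}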