use of org.apache.hadoop.mapreduce.lib.input.CombineFileSplit in project hadoop by apache.
the class InputStriper method splitFor.
/**
 * @param inputDir Pool used to resolve block locations.
 * @param bytes Target byte count
 * @param nLocs Number of block locations per split.
 * @return A set of files satisfying the byte count, with locations weighted
 *         to the dominating proportion of input bytes.
 */
CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs) throws IOException {
  final ArrayList<Path> paths = new ArrayList<Path>();
  final ArrayList<Long> start = new ArrayList<Long>();
  final ArrayList<Long> length = new ArrayList<Long>();
  final HashMap<String, Double> sb = new HashMap<String, Double>();
  do {
    paths.add(current.getPath());
    start.add(currentStart);
    final long fromFile = Math.min(bytes, current.getLen() - currentStart);
    length.add(fromFile);
    for (BlockLocation loc : inputDir.locationsFor(current, currentStart, fromFile)) {
      final double tedium = loc.getLength() / (1.0 * bytes);
      for (String l : loc.getHosts()) {
        Double j = sb.get(l);
        if (null == j) {
          sb.put(l, tedium);
        } else {
          sb.put(l, j.doubleValue() + tedium);
        }
      }
    }
    currentStart += fromFile;
    bytes -= fromFile;
    // Switch to a new file if
    // - the current file is uncompressed and completely used
    // - the current file is compressed
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
    if (current.getLen() - currentStart == 0 || codec != null) {
      current = files.get(++idx % files.size());
      currentStart = 0;
    }
  } while (bytes > 0);
  final ArrayList<Entry<String, Double>> sort = new ArrayList<Entry<String, Double>>(sb.entrySet());
  Collections.sort(sort, hostRank);
  final String[] hosts = new String[Math.min(nLocs, sort.size())];
  for (int i = 0; i < nLocs && i < sort.size(); ++i) {
    hosts[i] = sort.get(i).getKey();
  }
  return new CombineFileSplit(paths.toArray(new Path[0]), toLongArray(start), toLongArray(length), hosts);
}
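
The toLongArray helper used in the return statement is not part of this excerpt. A minimal sketch of what such an unboxing helper could look like (hypothetical; the actual InputStriper implementation may differ):

import java.util.List;

final class LongArrays {
  private LongArrays() {
  }

  // Hypothetical helper: unbox a collected list of offsets/lengths into a long[].
  static long[] toLongArray(List<Long> values) {
    final long[] out = new long[values.size()];
    for (int i = 0; i < out.length; i++) {
      out[i] = values.get(i);
    }
    return out;
  }
}

With something like this in place, toLongArray(start) and toLongArray(length) turn the accumulated boxed values into the primitive arrays the CombineFileSplit constructor expects.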
use of org.apache.hadoop.mapreduce.lib.input.CombineFileSplit in project hadoop by apache.
the class TestCompressionEmulationUtils method testFileQueueDecompression.
/**
 * Test whether {@link FileQueue} can identify a compressed file and provide
 * readers that extract uncompressed data, but only when input-compression
 * emulation is enabled.
 */
@Test
public void testFileQueueDecompression() throws IOException {
  JobConf conf = new JobConf();
  FileSystem lfs = FileSystem.getLocal(conf);
  String inputLine = "Hi Hello!";
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
  org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
  org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
  // define the test's root temp directory
  Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"))
      .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());
  Path tempDir = new Path(rootTempDir, "TestFileQueueDecompression");
  lfs.delete(tempDir, true);
  // create a compressed file
  Path compressedFile = new Path(tempDir, "test");
  OutputStream out = CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile, conf);
  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
  writer.write(inputLine);
  writer.close();
  compressedFile = compressedFile.suffix(".gz");
  // now read back the data from the compressed stream using FileQueue
  long fileSize = lfs.listStatus(compressedFile)[0].getLen();
  CombineFileSplit split = new CombineFileSplit(new Path[] { compressedFile }, new long[] { fileSize });
  FileQueue queue = new FileQueue(split, conf);
  byte[] bytes = new byte[inputLine.getBytes().length];
  queue.read(bytes);
  queue.close();
  String readLine = new String(bytes);
  assertEquals("Compression/Decompression error", inputLine, readLine);
}
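
As a companion to the test above, here is a hedged sketch (not from the Hadoop sources) of draining several compressed inputs through the same FileQueue API. It assumes the class lives in the Gridmix test package so that FileQueue and CompressionEmulationUtil are visible, and the input paths are placeholders:

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

// Hedged sketch; assumed to sit alongside the Gridmix classes so that
// FileQueue and CompressionEmulationUtil resolve without extra imports.
public class FileQueueDrainSketch {
  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf();
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
    FileSystem lfs = FileSystem.getLocal(conf);
    // Placeholder inputs; each split length is taken from the file status.
    Path[] inputs = { new Path("part-0.gz"), new Path("part-1.gz") };
    long[] lengths = new long[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
      lengths[i] = lfs.getFileStatus(inputs[i]).getLen();
    }
    CombineFileSplit split = new CombineFileSplit(inputs, lengths);
    FileQueue queue = new FileQueue(split, conf);
    byte[] buf = new byte[4096];
    // The queue hands back decompressed bytes when input-compression emulation is on.
    queue.read(buf);
    queue.close();
  }
}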
use of org.apache.hadoop.mapreduce.lib.input.CombineFileSplit in project hadoop by apache.
the class TestFileQueue method testEmpty.
@Test
public void testEmpty() throws Exception {
  final Configuration conf = new Configuration();
  // verify OK if unused
  final FileQueue q = new FileQueue(
      new CombineFileSplit(new Path[0], new long[0], new long[0], new String[0]), conf);
}
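
A small hedged variation on the same idea (not from the Hadoop sources): construct the empty queue and also close it, on the assumption that close() on a never-read FileQueue is likewise a no-op.

@Test
public void testEmptyThenClose() throws Exception {
  final Configuration conf = new Configuration();
  final FileQueue q = new FileQueue(
      new CombineFileSplit(new Path[0], new long[0], new long[0], new String[0]), conf);
  // Assumption: closing a queue that was never read should also succeed without error.
  q.close();
}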
use of org.apache.hadoop.mapreduce.lib.input.CombineFileSplit in project hadoop by apache.
the class TestGridMixClasses method testGridmixSplit.
/*
 * Simple test of GridmixSplit (copy, getters, write, read, ...).
 */
@Test(timeout = 1000)
public void testGridmixSplit() throws Exception {
  Path[] files = { new Path("one"), new Path("two") };
  long[] start = { 1, 2 };
  long[] lengths = { 100, 200 };
  String[] locations = { "locOne", "loctwo" };
  CombineFileSplit cfSplit = new CombineFileSplit(files, start, lengths, locations);
  ResourceUsageMetrics metrics = new ResourceUsageMetrics();
  metrics.setCumulativeCpuUsage(200);
  double[] reduceBytes = { 8.1d, 8.2d };
  double[] reduceRecords = { 9.1d, 9.2d };
  long[] reduceOutputBytes = { 101L, 102L };
  long[] reduceOutputRecords = { 111L, 112L };
  GridmixSplit test = new GridmixSplit(cfSplit, 2, 3, 4L, 5L, 6L, 7L,
      reduceBytes, reduceRecords, reduceOutputBytes, reduceOutputRecords);
  ByteArrayOutputStream data = new ByteArrayOutputStream();
  DataOutputStream out = new DataOutputStream(data);
  test.write(out);
  GridmixSplit copy = new GridmixSplit();
  copy.readFields(new DataInputStream(new ByteArrayInputStream(data.toByteArray())));
  // data should be the same
  assertEquals(test.getId(), copy.getId());
  assertEquals(test.getMapCount(), copy.getMapCount());
  assertEquals(test.getInputRecords(), copy.getInputRecords());
  assertEquals(test.getOutputBytes()[0], copy.getOutputBytes()[0]);
  assertEquals(test.getOutputRecords()[0], copy.getOutputRecords()[0]);
  assertEquals(test.getReduceBytes(0), copy.getReduceBytes(0));
  assertEquals(test.getReduceRecords(0), copy.getReduceRecords(0));
}
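
The write/readFields round trip above is a common pattern in these tests. A hedged generic helper (not part of the Hadoop test) that factors it out might look like this:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

final class WritableRoundTrip {
  private WritableRoundTrip() {
  }

  // Serialize src with write(), then populate dst from the resulting bytes via readFields().
  static <T extends Writable> T roundTrip(Writable src, T dst) throws IOException {
    ByteArrayOutputStream data = new ByteArrayOutputStream();
    src.write(new DataOutputStream(data));
    dst.readFields(new DataInputStream(new ByteArrayInputStream(data.toByteArray())));
    return dst;
  }
}

With such a helper, the copy in the test could be obtained as WritableRoundTrip.roundTrip(test, new GridmixSplit()); this relies on GridmixSplit exposing Writable semantics, which the write/readFields calls above already demonstrate.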
use of org.apache.hadoop.mapreduce.lib.input.CombineFileSplit in project hadoop by apache.
the class TestGridMixClasses method testLoadJobLoadRecordReader.
/*
 * Test LoadRecordReader. This reader reads data from a set of files.
 */
@Test(timeout = 3000)
public void testLoadJobLoadRecordReader() throws Exception {
  LoadJob.LoadRecordReader test = new LoadJob.LoadRecordReader();
  Configuration conf = new Configuration();
  FileSystem fs1 = mock(FileSystem.class);
  when(fs1.open((Path) anyObject())).thenReturn(new FakeFSDataInputStream(new FakeInputStream()));
  Path p1 = mock(Path.class);
  when(p1.getFileSystem((JobConf) anyObject())).thenReturn(fs1);
  FileSystem fs2 = mock(FileSystem.class);
  when(fs2.open((Path) anyObject())).thenReturn(new FakeFSDataInputStream(new FakeInputStream()));
  Path p2 = mock(Path.class);
  when(p2.getFileSystem((JobConf) anyObject())).thenReturn(fs2);
  Path[] paths = { p1, p2 };
  long[] start = { 0, 0 };
  long[] lengths = { 1000, 1000 };
  String[] locations = { "temp1", "temp2" };
  CombineFileSplit cfsplit = new CombineFileSplit(paths, start, lengths, locations);
  double[] reduceBytes = { 100, 100 };
  double[] reduceRecords = { 2, 2 };
  long[] reduceOutputBytes = { 500, 500 };
  long[] reduceOutputRecords = { 2, 2 };
  ResourceUsageMetrics metrics = new ResourceUsageMetrics();
  ResourceUsageMetrics[] rMetrics = { new ResourceUsageMetrics(), new ResourceUsageMetrics() };
  LoadSplit input = new LoadSplit(cfsplit, 2, 3, 1500L, 2L, 3000L, 2L,
      reduceBytes, reduceRecords, reduceOutputBytes, reduceOutputRecords, metrics, rMetrics);
  TaskAttemptID taskId = new TaskAttemptID();
  TaskAttemptContext ctx = new TaskAttemptContextImpl(conf, taskId);
  test.initialize(input, ctx);
  GridmixRecord gr = test.getCurrentValue();
  int counter = 0;
  while (test.nextKeyValue()) {
    gr = test.getCurrentValue();
    if (counter == 0) {
      // read first file
      assertEquals(0.5, test.getProgress(), 0.001);
    } else if (counter == 1) {
      // read second file
      assertEquals(1.0, test.getProgress(), 0.001);
    }
    assertEquals(1000, gr.getSize());
    counter++;
  }
  assertEquals(1000, gr.getSize());
  // Two files have been read
  assertEquals(2, counter);
  test.close();
}
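
The FakeFSDataInputStream and FakeInputStream helpers used above are not shown in this listing. A minimal hedged sketch of a comparable stub, assuming all it must do is satisfy the Seekable and PositionedReadable contracts that FSDataInputStream requires of its wrapped stream:

import java.io.InputStream;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;

// Hypothetical stand-in for the test's FakeInputStream: an endless stream of
// zero bytes that tracks a position so FSDataInputStream will accept it.
class StubInputStream extends InputStream implements Seekable, PositionedReadable {
  private long pos = 0;

  @Override
  public int read() {
    pos++;
    return 0;
  }

  @Override
  public void seek(long newPos) {
    pos = newPos;
  }

  @Override
  public long getPos() {
    return pos;
  }

  @Override
  public boolean seekToNewSource(long targetPos) {
    return false;
  }

  @Override
  public int read(long position, byte[] buffer, int offset, int length) {
    return length;
  }

  @Override
  public void readFully(long position, byte[] buffer, int offset, int length) {
  }

  @Override
  public void readFully(long position, byte[] buffer) {
  }
}

Wrapping such a stub as new org.apache.hadoop.fs.FSDataInputStream(new StubInputStream()) gives the mocked file systems something to return from open(), which is essentially the role FakeFSDataInputStream plays in the test.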