Use of org.apache.hadoop.util.LineReader in project hadoop by apache.
From the class TestLineReader, method testCustomDelimiter.
@Test
public void testCustomDelimiter() throws Exception {
/* TEST_1
 * Scenario: the tail of the current buffer equals the starting
 * character(s) of the delimiter.
 *
 * The test data is arranged so that:
 *
 * 1) "</entity>" is the delimiter.
 *
 * 2) The tail of the current buffer is "</",
 * which matches the starting character sequence of the delimiter.
 *
 * 3) The head of the next buffer is "id>",
 * which does NOT match the remaining characters of the delimiter.
 *
 * 4) The input data is prefixed with the char 'a' repeated
 * numberOfCharToFillTheBuffer times, so that the first buffered
 * read of the input ends exactly at "</", i.e. the first two
 * characters of the delimiter.
 *
 * 5) The buffer size is 64 * 1024.
 *
 * Check condition:
 * the value of the second key-value pair should contain
 * "</" from the current buffer and
 * "id>" from the next buffer.
 */
Delimiter = "</entity>";
// Ending part of the input data buffer; it contains "</",
// i.e. the first characters of the delimiter.
String CurrentBufferTailToken = "</entity><entity><id>Gelesh</";
// Assumed head of the next buffer.
String NextBufferHeadToken = "id><name>Omathil</name></entity>";
String Expected = (CurrentBufferTailToken + NextBufferHeadToken).replace(Delimiter, "");
// Expected must capture data from both buffers, excluding the delimiter.
String TestPartOfInput = CurrentBufferTailToken + NextBufferHeadToken;
int BufferSize = 64 * 1024;
int numberOfCharToFillTheBuffer = BufferSize - CurrentBufferTailToken.length();
StringBuilder fillerString = new StringBuilder();
for (int i = 0; i < numberOfCharToFillTheBuffer; i++) {
// char 'a' as a filler for the test string
fillerString.append('a');
}
TestData = fillerString + TestPartOfInput;
lineReader = new LineReader(new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
line = new Text();
lineReader.readLine(line);
Assert.assertEquals(fillerString.toString(), line.toString());
lineReader.readLine(line);
Assert.assertEquals(Expected, line.toString());
/* TEST_2
 * Scenario: the character(s) immediately preceding the delimiter
 * equal the starting character(s) of the delimiter.
 */
Delimiter = "record";
StringBuilder TestStringBuilder = new StringBuilder();
TestStringBuilder.append(Delimiter + "Kerala ");
TestStringBuilder.append(Delimiter + "Bangalore");
TestStringBuilder.append(Delimiter + " North Korea");
TestStringBuilder.append(Delimiter + Delimiter + "Guantanamo");
TestStringBuilder.append(Delimiter + "ecord" + "recor" + //~EOF with 're'
"core");
TestData = TestStringBuilder.toString();
lineReader = new LineReader(new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
lineReader.readLine(line);
Assert.assertEquals("", line.toString());
lineReader.readLine(line);
Assert.assertEquals("Kerala ", line.toString());
lineReader.readLine(line);
Assert.assertEquals("Bangalore", line.toString());
lineReader.readLine(line);
Assert.assertEquals(" North Korea", line.toString());
lineReader.readLine(line);
Assert.assertEquals("", line.toString());
lineReader.readLine(line);
Assert.assertEquals("Guantanamo", line.toString());
lineReader.readLine(line);
Assert.assertEquals(("ecord" + "recor" + "core"), line.toString());
// TEST_3
// Scenario: overlapping partial matches of the delimiter;
// "aaaabccc" split by "aaab" should yield "a" and "ccc".
TestData = "aaaabccc";
Delimiter = "aaab";
lineReader = new LineReader(new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
lineReader.readLine(line);
Assert.assertEquals("a", line.toString());
lineReader.readLine(line);
Assert.assertEquals("ccc", line.toString());
}
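For reference, the core pattern these assertions exercise can be reduced to a few lines. The following is a minimal standalone sketch (not part of the Hadoop test itself) that tokenizes a byte stream on a multi-character delimiter; readLine fills the Text with the bytes preceding the delimiter and returns the number of bytes consumed, returning 0 at end of stream.

import java.io.ByteArrayInputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class CustomDelimiterExample {
    public static void main(String[] args) throws Exception {
        byte[] delimiter = "</entity>".getBytes();
        String data = "<entity>a</entity><entity>b</entity>";
        LineReader reader =
            new LineReader(new ByteArrayInputStream(data.getBytes()), delimiter);
        Text record = new Text();
        // readLine strips the delimiter and returns 0 once the stream is exhausted.
        while (reader.readLine(record) > 0) {
            System.out.println("record: " + record);
        }
        reader.close();
    }
}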
Use of org.apache.hadoop.util.LineReader in project hadoop by apache.
From the class TestCodec, method testSplitableCodec.
private void testSplitableCodec(Class<? extends SplittableCompressionCodec> codecClass) throws IOException {
final long DEFLBYTES = 2 * 1024 * 1024;
final Configuration conf = new Configuration();
final Random rand = new Random();
final long seed = rand.nextLong();
LOG.info("seed: " + seed);
rand.setSeed(seed);
SplittableCompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
final FileSystem fs = FileSystem.getLocal(conf);
final FileStatus infile = fs.getFileStatus(writeSplitTestFile(fs, rand, codec, DEFLBYTES));
if (infile.getLen() > Integer.MAX_VALUE) {
fail("Unexpected compression: " + DEFLBYTES + " -> " + infile.getLen());
}
final int flen = (int) infile.getLen();
final Text line = new Text();
final Decompressor dcmp = CodecPool.getDecompressor(codec);
try {
for (int pos = 0; pos < infile.getLen(); pos += rand.nextInt(flen / 8)) {
// read from random positions, verifying that there exist two sequential
// lines as written in writeSplitTestFile
final SplitCompressionInputStream in = codec.createInputStream(fs.open(infile.getPath()), dcmp, pos, flen, SplittableCompressionCodec.READ_MODE.BYBLOCK);
if (in.getAdjustedStart() >= flen) {
break;
}
LOG.info("SAMPLE " + in.getAdjustedStart() + "," + in.getAdjustedEnd());
final LineReader lreader = new LineReader(in);
// Read and discard the first line; at a split boundary it is
// likely a partial record.
lreader.readLine(line);
if (in.getPos() >= flen) {
break;
}
lreader.readLine(line);
final int seq1 = readLeadingInt(line);
lreader.readLine(line);
if (in.getPos() >= flen) {
break;
}
final int seq2 = readLeadingInt(line);
assertEquals("Mismatched lines", seq1 + 1, seq2);
}
} finally {
CodecPool.returnDecompressor(dcmp);
}
// remove on success
fs.delete(infile.getPath().getParent(), true);
}
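The helpers writeSplitTestFile and readLeadingInt are defined elsewhere in TestCodec and are not shown in this snippet. As a rough illustration only, a readLeadingInt-style helper would parse the sequence number at the start of a line so that consecutive lines can be compared; this is an assumed reconstruction, and the real helper (and the line format it expects) may differ.

// Assumed sketch, not the actual Hadoop source: parse the leading
// decimal integer of a line that is known to start with digits.
private static int readLeadingInt(Text line) {
    String s = line.toString();
    int end = 0;
    while (end < s.length() && Character.isDigit(s.charAt(end))) {
        end++;
    }
    return Integer.parseInt(s.substring(0, end));
}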
Use of org.apache.hadoop.util.LineReader in project hadoop by apache.
From the class HadoopLogsAnalyzer, method maybeUncompressedPath.
private LineReader maybeUncompressedPath(Path p) throws FileNotFoundException, IOException {
CompressionCodecFactory codecs = new CompressionCodecFactory(getConf());
inputCodec = codecs.getCodec(p);
FileSystem fs = p.getFileSystem(getConf());
FSDataInputStream fileIn = fs.open(p);
if (inputCodec == null) {
return new LineReader(fileIn, getConf());
} else {
inputDecompressor = CodecPool.getDecompressor(inputCodec);
return new LineReader(inputCodec.createInputStream(fileIn, inputDecompressor), getConf());
}
}
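A typical call site for this method (a hedged sketch, not taken from HadoopLogsAnalyzer itself) would read the file line by line and, because inputDecompressor was borrowed from CodecPool, return it when finished:

// Hypothetical usage; inputCodec and inputDecompressor are the fields
// set by maybeUncompressedPath above, and process(...) and the path
// are placeholders.
LineReader reader = maybeUncompressedPath(new Path("/logs/job-history.log.gz"));
Text line = new Text();
try {
    while (reader.readLine(line) > 0) {
        process(line.toString());
    }
} finally {
    reader.close();
    if (inputDecompressor != null) {
        CodecPool.returnDecompressor(inputDecompressor);
    }
}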
Use of org.apache.hadoop.util.LineReader in project hadoop by apache.
From the class TestTextOutputFormat, method testCompress.
/**
 * Test that a compressed output file can be read back correctly.
 * @throws IOException
 */
@Test
public void testCompress() throws IOException {
JobConf job = new JobConf();
job.set(JobContext.TASK_ATTEMPT_ID, attempt);
job.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS, "true");
FileOutputFormat.setOutputPath(job, workDir.getParent().getParent());
FileOutputFormat.setWorkOutputPath(job, workDir);
FileSystem fs = workDir.getFileSystem(job);
if (!fs.mkdirs(workDir)) {
fail("Failed to create output directory");
}
String file = "test_compress.txt";
// A reporter that does nothing
Reporter reporter = Reporter.NULL;
TextOutputFormat<Object, Object> theOutputFormat = new TextOutputFormat<Object, Object>();
RecordWriter<Object, Object> theRecordWriter = theOutputFormat.getRecordWriter(localFs, job, file, reporter);
Text key1 = new Text("key1");
Text key2 = new Text("key2");
Text val1 = new Text("val1");
Text val2 = new Text("val2");
NullWritable nullWritable = NullWritable.get();
try {
theRecordWriter.write(key1, val1);
theRecordWriter.write(null, nullWritable);
theRecordWriter.write(null, val1);
theRecordWriter.write(nullWritable, val2);
theRecordWriter.write(key2, nullWritable);
theRecordWriter.write(key1, null);
theRecordWriter.write(null, null);
theRecordWriter.write(key2, val2);
} finally {
theRecordWriter.close(reporter);
}
StringBuffer expectedOutput = new StringBuffer();
expectedOutput.append(key1).append("\t").append(val1).append("\n");
expectedOutput.append(val1).append("\n");
expectedOutput.append(val2).append("\n");
expectedOutput.append(key2).append("\n");
expectedOutput.append(key1).append("\n");
expectedOutput.append(key2).append("\t").append(val2).append("\n");
DefaultCodec codec = new DefaultCodec();
codec.setConf(job);
Path expectedFile = new Path(workDir, file + codec.getDefaultExtension());
final FileInputStream istream = new FileInputStream(expectedFile.toString());
CompressionInputStream cistream = codec.createInputStream(istream);
LineReader reader = new LineReader(cistream);
String output = "";
Text out = new Text();
while (reader.readLine(out) > 0) {
output += out;
output += "\n";
}
reader.close();
assertEquals(expectedOutput.toString(), output);
}
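The shape of expectedOutput follows from how TextOutputFormat's line record writer treats null and NullWritable. Paraphrased (this is a sketch of the writer's logic, not the verbatim Hadoop source): a null or NullWritable key or value is skipped, the tab separator appears only when both sides are present, and a record with both sides null/NullWritable produces no line at all.

// Paraphrased sketch of TextOutputFormat.LineRecordWriter.write(key, value).
boolean nullKey = key == null || key instanceof NullWritable;
boolean nullValue = value == null || value instanceof NullWritable;
if (nullKey && nullValue) {
    return;                       // (null, null): nothing is written
}
if (!nullKey) {
    writeObject(key);             // emit the key
}
if (!nullKey && !nullValue) {
    out.write(keyValueSeparator); // "\t" only when both are present
}
if (!nullValue) {
    writeObject(value);           // emit the value
}
out.write(newline);               // every emitted record ends with "\n"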
Use of org.apache.hadoop.util.LineReader in project hadoop by apache.
From the class TestTextInputFormat, method testUTF8.
@Test(timeout = 5000)
public void testUTF8() throws Exception {
LineReader in = makeStream("abcd€bdcd€");
Text line = new Text();
in.readLine(line);
assertEquals("readLine changed utf8 characters", "abcd€bdcd€", line.toString());
in = makeStream("abc xyz");
in.readLine(line);
assertEquals("split on fake newline", "abc xyz", line.toString());
}
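makeStream is a helper defined elsewhere in TestTextInputFormat; presumably it wraps the string's UTF-8 bytes in a LineReader, along the lines of this assumed sketch (the actual helper may differ):

// Assumed reconstruction of the test helper, not the verbatim source.
private static LineReader makeStream(String str) throws IOException {
    return new LineReader(
        new ByteArrayInputStream(str.getBytes("UTF-8")),
        4096); // modest buffer size; any reasonable value works for the test
}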