Search in sources :

Example 1 with LineReader

use of org.apache.hadoop.util.LineReader in project hadoop by apache.

the class TestLineReader method testCustomDelimiter.

public void testCustomDelimiter() throws Exception {
    /* TEST_1
     * The test scenario is the tail of the buffer
     * equals the starting character/s of delimiter
     * The Test Data is such that,
     * 1) we will have "</entity>" as delimiter  
     * 2) The tail of the current buffer would be "</"
     *    which matches with the starting character sequence of delimiter.
     * 3) The Head of the next buffer would be   "id>" 
     *    which does NOT match with the remaining characters of delimiter.
     * 4) Input data would be prefixed by char 'a' 
     *    about numberOfCharToFillTheBuffer times.
     *    So that, one iteration to buffer the input data,
     *    would end at '</' ie equals starting 2 char of delimiter  
     * 5) For this we would take BufferSize as 64 * 1024;
     * Check Condition
     *  In the second key value pair, the value should contain 
     *  "</"  from currentToken and
     *  "id>" from next token
    Delimiter = "</entity>";
    String CurrentBufferTailToken = "</entity><entity><id>Gelesh</";
    // Ending part of Input Data Buffer
    // It contains '</' ie delimiter character 
    String NextBufferHeadToken = "id><name>Omathil</name></entity>";
    // Supposing the start of next buffer is this
    String Expected = (CurrentBufferTailToken + NextBufferHeadToken).replace(Delimiter, "");
    // Expected ,must capture from both the buffer, excluding Delimiter
    String TestPartOfInput = CurrentBufferTailToken + NextBufferHeadToken;
    int BufferSize = 64 * 1024;
    int numberOfCharToFillTheBuffer = BufferSize - CurrentBufferTailToken.length();
    StringBuilder fillerString = new StringBuilder();
    for (int i = 0; i < numberOfCharToFillTheBuffer; i++) {
        // char 'a' as a filler for the test string
    TestData = fillerString + TestPartOfInput;
    lineReader = new LineReader(new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
    line = new Text();
    Assert.assertEquals(fillerString.toString(), line.toString());
    Assert.assertEquals(Expected, line.toString());
     * The test scenario is such that,
     * the character/s preceding the delimiter,
     * equals the starting character/s of delimiter
    Delimiter = "record";
    StringBuilder TestStringBuilder = new StringBuilder();
    TestStringBuilder.append(Delimiter + "Kerala ");
    TestStringBuilder.append(Delimiter + "Bangalore");
    TestStringBuilder.append(Delimiter + " North Korea");
    TestStringBuilder.append(Delimiter + Delimiter + "Guantanamo");
    TestStringBuilder.append(Delimiter + "ecord" + "recor" + //~EOF with 're'
    TestData = TestStringBuilder.toString();
    lineReader = new LineReader(new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
    Assert.assertEquals("", line.toString());
    Assert.assertEquals("Kerala ", line.toString());
    Assert.assertEquals("Bangalore", line.toString());
    Assert.assertEquals(" North Korea", line.toString());
    Assert.assertEquals("", line.toString());
    Assert.assertEquals("Guantanamo", line.toString());
    Assert.assertEquals(("ecord" + "recor" + "core"), line.toString());
    // Test 3
    // The test scenario is such that,
    // aaaabccc split by aaab
    TestData = "aaaabccc";
    Delimiter = "aaab";
    lineReader = new LineReader(new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
    Assert.assertEquals("a", line.toString());
    Assert.assertEquals("ccc", line.toString());
Also used: ByteArrayInputStream (java.io.ByteArrayInputStream), LineReader (org.apache.hadoop.util.LineReader), Text (org.apache.hadoop.io.Text), Test (org.junit.Test)

Example 2 with LineReader

use of org.apache.hadoop.util.LineReader in project hadoop by apache.

the class TestCodec method testSplitableCodec.

private void testSplitableCodec(Class<? extends SplittableCompressionCodec> codecClass) throws IOException {
    final long DEFLBYTES = 2 * 1024 * 1024;
    final Configuration conf = new Configuration();
    final Random rand = new Random();
    final long seed = rand.nextLong();"seed: " + seed);
    SplittableCompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
    final FileSystem fs = FileSystem.getLocal(conf);
    final FileStatus infile = fs.getFileStatus(writeSplitTestFile(fs, rand, codec, DEFLBYTES));
    if (infile.getLen() > Integer.MAX_VALUE) {
        fail("Unexpected compression: " + DEFLBYTES + " -> " + infile.getLen());
    final int flen = (int) infile.getLen();
    final Text line = new Text();
    final Decompressor dcmp = CodecPool.getDecompressor(codec);
    try {
        for (int pos = 0; pos < infile.getLen(); pos += rand.nextInt(flen / 8)) {
            // read from random positions, verifying that there exist two sequential
            // lines as written in writeSplitTestFile
            final SplitCompressionInputStream in = codec.createInputStream(, dcmp, pos, flen, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (in.getAdjustedStart() >= flen) {
  "SAMPLE " + in.getAdjustedStart() + "," + in.getAdjustedEnd());
            final LineReader lreader = new LineReader(in);
            // ignore; likely partial
            if (in.getPos() >= flen) {
            final int seq1 = readLeadingInt(line);
            if (in.getPos() >= flen) {
            final int seq2 = readLeadingInt(line);
            assertEquals("Mismatched lines", seq1 + 1, seq2);
    } finally {
    // remove on success
    fs.delete(infile.getPath().getParent(), true);
Also used: FileStatus (org.apache.hadoop.fs.FileStatus), BuiltInGzipDecompressor (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor), Configuration (org.apache.hadoop.conf.Configuration), Random (java.util.Random), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), Text (org.apache.hadoop.io.Text)

Example 3 with LineReader

use of org.apache.hadoop.util.LineReader in project hadoop by apache.

the class HadoopLogsAnalyzer method maybeUncompressedPath.

private LineReader maybeUncompressedPath(Path p) throws FileNotFoundException, IOException {
    CompressionCodecFactory codecs = new CompressionCodecFactory(getConf());
    inputCodec = codecs.getCodec(p);
    FileSystem fs = p.getFileSystem(getConf());
    FSDataInputStream fileIn =;
    if (inputCodec == null) {
        return new LineReader(fileIn, getConf());
    } else {
        inputDecompressor = CodecPool.getDecompressor(inputCodec);
        return new LineReader(inputCodec.createInputStream(fileIn, inputDecompressor), getConf());
Also used: CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)

Example 4 with LineReader

use of org.apache.hadoop.util.LineReader in project hadoop by apache.

the class TestTextOutputFormat method testCompress.

   * test compressed file
   * @throws IOException
public void testCompress() throws IOException {
    JobConf job = new JobConf();
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    job.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS, "true");
    FileOutputFormat.setOutputPath(job, workDir.getParent().getParent());
    FileOutputFormat.setWorkOutputPath(job, workDir);
    FileSystem fs = workDir.getFileSystem(job);
    if (!fs.mkdirs(workDir)) {
        fail("Failed to create output directory");
    String file = "test_compress.txt";
    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;
    TextOutputFormat<Object, Object> theOutputFormat = new TextOutputFormat<Object, Object>();
    RecordWriter<Object, Object> theRecordWriter = theOutputFormat.getRecordWriter(localFs, job, file, reporter);
    Text key1 = new Text("key1");
    Text key2 = new Text("key2");
    Text val1 = new Text("val1");
    Text val2 = new Text("val2");
    NullWritable nullWritable = NullWritable.get();
    try {
        theRecordWriter.write(key1, val1);
        theRecordWriter.write(null, nullWritable);
        theRecordWriter.write(null, val1);
        theRecordWriter.write(nullWritable, val2);
        theRecordWriter.write(key2, nullWritable);
        theRecordWriter.write(key1, null);
        theRecordWriter.write(null, null);
        theRecordWriter.write(key2, val2);
    } finally {
    StringBuffer expectedOutput = new StringBuffer();
    DefaultCodec codec = new DefaultCodec();
    Path expectedFile = new Path(workDir, file + codec.getDefaultExtension());
    final FileInputStream istream = new FileInputStream(expectedFile.toString());
    CompressionInputStream cistream = codec.createInputStream(istream);
    LineReader reader = new LineReader(cistream);
    String output = "";
    Text out = new Text();
    while (reader.readLine(out) > 0) {
        output += out;
        output += "\n";
    assertEquals(expectedOutput.toString(), output);
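Also used: Path (org.apache.hadoop.fs.Path), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec), Text (org.apache.hadoop.io.Text), NullWritable (org.apache.hadoop.io.NullWritable), FileInputStream (java.io.FileInputStream), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), Test (org.junit.Test)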

Example 5 with LineReader

use of org.apache.hadoop.util.LineReader in project hadoop by apache.

the class TestTextInputFormat method testUTF8.

@Test(timeout = 5000)
public void testUTF8() throws Exception {
    LineReader in = makeStream("abcd€bdcd€");
    Text line = new Text();
    assertEquals("readLine changed utf8 characters", "abcd€bdcd€", line.toString());
    in = makeStream("abc xyz");
    assertEquals("split on fake newline", "abc xyz", line.toString());
Also used: LineReader (org.apache.hadoop.util.LineReader), Text (org.apache.hadoop.io.Text), Test (org.junit.Test)


LineReader (org.apache.hadoop.util.LineReader)36 Text ( Path (org.apache.hadoop.fs.Path)15 FileSystem (org.apache.hadoop.fs.FileSystem)14 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)11 Test (org.junit.Test)10 Configuration (org.apache.hadoop.conf.Configuration)5 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)4 Counters (org.apache.hadoop.mapred.Counters)4 JobConf (org.apache.hadoop.mapred.JobConf)4 RunningJob (org.apache.hadoop.mapred.RunningJob)4 IOException ( ArrayList (java.util.ArrayList)3 CommandLine (org.apache.commons.cli.CommandLine)3 CommandLineParser (org.apache.commons.cli.CommandLineParser)3 GnuParser (org.apache.commons.cli.GnuParser)3 HelpFormatter (org.apache.commons.cli.HelpFormatter)3 Options (org.apache.commons.cli.Options)3 ParseException (org.apache.commons.cli.ParseException)3 CompressionCodec (