
Example 11 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class RCFileGenerator, method genData.

private static void genData(String format, int numRows, String output, String plainOutput) throws Exception {
    int numFields = 0;
    if (format.equals("student")) {
        rand = new Random(numRows);
        numFields = 3;
    } else if (format.equals("voter")) {
        rand = new Random(1000000000 + numRows);
        numFields = 4;
    } else if (format.equals("alltypes")) {
        rand = new Random(2000000000L + numRows);
        numFields = 10;
    }
    RCFileOutputFormat.setColumnNumber(conf, numFields);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, getFile(output), null, new DefaultCodec());
    PrintWriter pw = new PrintWriter(new FileWriter(plainOutput));
    for (int j = 0; j < numRows; j++) {
        BytesRefArrayWritable row = new BytesRefArrayWritable(numFields);
        byte[][] fields = null;
        if (format.equals("student")) {
            byte[][] f = { randomName().getBytes("UTF-8"), Integer.valueOf(randomAge()).toString().getBytes("UTF-8"), Double.valueOf(randomGpa()).toString().getBytes("UTF-8") };
            fields = f;
        } else if (format.equals("voter")) {
            byte[][] f = { randomName().getBytes("UTF-8"), Integer.valueOf(randomAge()).toString().getBytes("UTF-8"), randomRegistration().getBytes("UTF-8"), Double.valueOf(randomContribution()).toString().getBytes("UTF-8") };
            fields = f;
        } else if (format.equals("alltypes")) {
            byte[][] f = { Integer.valueOf(rand.nextInt(Byte.MAX_VALUE)).toString().getBytes("UTF-8"), Integer.valueOf(rand.nextInt(Short.MAX_VALUE)).toString().getBytes("UTF-8"), Integer.valueOf(rand.nextInt()).toString().getBytes("UTF-8"), Long.valueOf(rand.nextLong()).toString().getBytes("UTF-8"), Float.valueOf(rand.nextFloat() * 1000).toString().getBytes("UTF-8"), Double.valueOf(rand.nextDouble() * 1000000).toString().getBytes("UTF-8"), randomName().getBytes("UTF-8"), randomMap(), randomArray() };
            fields = f;
        }
        for (int i = 0; i < fields.length; i++) {
            BytesRefWritable field = new BytesRefWritable(fields[i], 0, fields[i].length);
            row.set(i, field);
            pw.print(new String(fields[i]));
            if (i != fields.length - 1)
                pw.print("\t");
            else
                pw.println();
        }
        writer.append(row);
    }
    writer.close();
    pw.close();
}
Also used : RCFile(org.apache.hadoop.hive.ql.io.RCFile) Random(java.util.Random) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) FileWriter(java.io.FileWriter) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) PrintWriter(java.io.PrintWriter) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
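For completeness, here is a minimal read-back sketch. It is not part of RCFileGenerator; the helper name dumpRCFile is hypothetical, and it assumes the same fs and conf used above plus imports for Configuration, FileSystem, Path and LongWritable in addition to those listed.

private static void dumpRCFile(FileSystem fs, Configuration conf, Path path) throws IOException {
    // Hypothetical helper, not part of the Hive source: reads back an RCFile
    // written with DefaultCodec and prints each row tab-separated, mirroring
    // the plain-text file that genData writes alongside the RCFile.
    RCFile.Reader reader = new RCFile.Reader(fs, path, conf);
    try {
        LongWritable rowID = new LongWritable();
        BytesRefArrayWritable cols = new BytesRefArrayWritable();
        while (reader.next(rowID)) {
            reader.getCurrentRow(cols);
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < cols.size(); i++) {
                BytesRefWritable col = cols.get(i);
                sb.append(new String(col.getData(), col.getStart(), col.getLength(), "UTF-8"));
                if (i != cols.size() - 1) {
                    sb.append('\t');
                }
            }
            System.out.println(sb);
        }
    } finally {
        reader.close();
    }
}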

Example 12 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class TestRCFileCat, method testRCFileCat.

/**
   * Tests parsing an RCFile with RCFileCat.
   */
@Test
public void testRCFileCat() throws Exception {
    File template = File.createTempFile("hive", "tmpTest");
    Configuration configuration = new Configuration();
    byte[][] record_1 = { Bytes.toBytes("123"), Bytes.toBytes("456"), Bytes.toBytes("789"), Bytes.toBytes("1000"), Bytes.toBytes("5.3"), Bytes.toBytes("hive and hadoop"), new byte[0], Bytes.toBytes("NULL") };
    byte[][] record_2 = { Bytes.toBytes("100"), Bytes.toBytes("200"), Bytes.toBytes("123"), Bytes.toBytes("1000"), Bytes.toBytes("5.3"), Bytes.toBytes("hive and hadoop"), new byte[0], Bytes.toBytes("NULL") };
    byte[][] record_3 = { Bytes.toBytes("200"), Bytes.toBytes("400"), Bytes.toBytes("678"), Bytes.toBytes("1000"), Bytes.toBytes("4.8"), Bytes.toBytes("hive and hadoop"), new byte[0], Bytes.toBytes("TEST") };
    RCFileOutputFormat.setColumnNumber(configuration, 8);
    Path file = new Path(template.getAbsolutePath());
    FileSystem fs = FileSystem.getLocal(configuration);
    RCFile.Writer writer = new RCFile.Writer(fs, configuration, file, null, RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")), new DefaultCodec());
    write(writer, record_1);
    write(writer, record_2);
    write(writer, record_3);
    writer.close();
    RCFileCat fileCat = new RCFileCat();
    fileCat.test = true;
    fileCat.setConf(new Configuration());
    // redirect stdout and stderr to in-memory streams so the tool output can be checked
    PrintStream oldOutPrintStream = System.out;
    PrintStream oldErrPrintStream = System.err;
    ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
    ByteArrayOutputStream dataErr = new ByteArrayOutputStream();
    System.setOut(new PrintStream(dataOut));
    System.setErr(new PrintStream(dataErr));
    try {
        String[] params = { "--verbose", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("123\t456\t789\t1000\t5.3\thive and hadoop\t\tNULL"));
        assertTrue(dataOut.toString().contains("100\t200\t123\t1000\t5.3\thive and hadoop\t\tNULL"));
        assertTrue(dataOut.toString().contains("200\t400\t678\t1000\t4.8\thive and hadoop\t\tTEST"));
        dataOut.reset();
        params = new String[] { "--start=-10", "--file-sizes", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("File size (uncompressed): 105. File size (compressed): 134. Number of rows: 3."));
        dataOut.reset();
        params = new String[] { "--start=0", "--column-sizes", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("0\t9\t17"));
        assertTrue(dataOut.toString().contains("1\t9\t17"));
        assertTrue(dataOut.toString().contains("2\t9\t17"));
        assertTrue(dataOut.toString().contains("3\t12\t14"));
        assertTrue(dataOut.toString().contains("4\t9\t17"));
        assertTrue(dataOut.toString().contains("5\t45\t26"));
        dataOut.reset();
        params = new String[] { "--start=0", "--column-sizes-pretty", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("Column 0: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 1: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 2: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 3: Uncompressed size: 12 Compressed size: 14"));
        assertTrue(dataOut.toString().contains("Column 4: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 5: Uncompressed size: 45 Compressed size: 26"));
        params = new String[] {};
        assertEquals(-1, fileCat.run(params));
        assertTrue(dataErr.toString().contains("RCFileCat [--start=start_offet] [--length=len] [--verbose] " + "[--column-sizes | --column-sizes-pretty] [--file-sizes] fileName"));
        dataErr.reset();
        params = new String[] { "--fakeParameter", "file://" + template.toURI().getPath() };
        assertEquals(-1, fileCat.run(params));
        assertTrue(dataErr.toString().contains("RCFileCat [--start=start_offet] [--length=len] [--verbose] " + "[--column-sizes | --column-sizes-pretty] [--file-sizes] fileName"));
    } finally {
        // restore the original output and error streams
        System.setOut(oldOutPrintStream);
        System.setErr(oldErrPrintStream);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PrintStream(java.io.PrintStream) Configuration(org.apache.hadoop.conf.Configuration) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Text(org.apache.hadoop.io.Text) ByteArrayOutputStream(java.io.ByteArrayOutputStream) RCFile(org.apache.hadoop.hive.ql.io.RCFile) FileSystem(org.apache.hadoop.fs.FileSystem) File(java.io.File) Test(org.junit.Test)
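The test above calls a write(...) helper that is not shown on this page. A plausible sketch of such a helper (a hypothetical reconstruction, not copied from TestRCFileCat) simply wraps one byte[][] record in a BytesRefArrayWritable and appends it to the writer:

private static void write(RCFile.Writer writer, byte[][] record) throws IOException {
    // Wrap each field of the record in a BytesRefWritable, then append the row.
    BytesRefArrayWritable row = new BytesRefArrayWritable(record.length);
    for (int i = 0; i < record.length; i++) {
        row.set(i, new BytesRefWritable(record[i], 0, record[i].length));
    }
    writer.append(row);
}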

Example 13 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class TestRCFile, method testGetColumn.

/**
   * Tests the {@link RCFile.Reader#getColumn(int, BytesRefArrayWritable)} method.
   * @throws IOException
   */
@Test
public void testGetColumn() throws IOException {
    cleanup();
    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")), new DefaultCodec());
    byte[][] record_1 = { "123".getBytes("UTF-8"), "456".getBytes("UTF-8"), "789".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8") };
    byte[][] record_2 = { "100".getBytes("UTF-8"), "200".getBytes("UTF-8"), "123".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8") };
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
    for (int i = 0; i < record_1.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    bytes.clear();
    for (int i = 0; i < record_2.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    writer.close();
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    assertTrue(reader.next(rowID));
    assertEquals(rowID.get(), 0L);
    assertTrue(reader.next(rowID));
    assertEquals(rowID.get(), 1L);
    BytesRefArrayWritable result = null;
    BytesRefWritable brw;
    for (int col = 0; col < 8; col++) {
        BytesRefArrayWritable result2 = reader.getColumn(col, result);
        if (result == null) {
            assertNotNull(result2);
            result = result2;
        } else {
            // on subsequent calls, getColumn should return the same instance that was passed in:
            assertSame(result2, result);
        }
        // each column has height of 2:
        assertEquals(2, result.size());
        for (int row = 0; row < result.size(); row++) {
            brw = result.get(row);
            int start = brw.getStart();
            int len = brw.getLength();
            byte[] actualData = Arrays.copyOfRange(brw.getData(), start, start + len);
            byte[] expectedData = (row == 0) ? record_1[col] : record_2[col];
            assertArrayEquals("col=" + col + " : row=" + row, expectedData, actualData);
        }
        result.clear();
    }
    reader.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) RecordReader(org.apache.hadoop.mapred.RecordReader) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)
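As a companion to the assertions above, a small hypothetical helper (not part of TestRCFile; the name columnAsStrings and the java.util.List/ArrayList imports are additions) could collect one column's values as Strings using the same Reader#getColumn pattern. It assumes the reader has already been advanced into a row group via next(rowID), as in the test:

private static List<String> columnAsStrings(RCFile.Reader reader, int col) throws IOException {
    // Fetch the whole column for the current row group; passing null lets the
    // reader allocate the BytesRefArrayWritable, as the test above verifies.
    BytesRefArrayWritable column = reader.getColumn(col, null);
    List<String> values = new ArrayList<String>(column.size());
    for (int row = 0; row < column.size(); row++) {
        BytesRefWritable brw = column.get(row);
        values.add(new String(brw.getData(), brw.getStart(), brw.getLength(), "UTF-8"));
    }
    return values;
}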

Example 14 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class TestRCFile, method testReadCorruptFile.

@Test
public void testReadCorruptFile() throws IOException, SerDeException {
    cleanup();
    byte[][] record = { null, null, null, null, null, null, null, null };
    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record.length);
    final int recCount = 100;
    Random rand = new Random();
    for (int recIdx = 0; recIdx < recCount; recIdx++) {
        for (int i = 0; i < record.length; i++) {
            record[i] = new Integer(rand.nextInt()).toString().getBytes("UTF-8");
        }
        for (int i = 0; i < record.length; i++) {
            BytesRefWritable cu = new BytesRefWritable(record[i], 0, record[i].length);
            bytes.set(i, cu);
        }
        writer.append(bytes);
        bytes.clear();
    }
    writer.close();
    // Insert junk in middle of file. Assumes file is on local disk.
    RandomAccessFile raf = new RandomAccessFile(file.toUri().getPath(), "rw");
    long corruptOffset = raf.length() / 2;
    LOG.info("corrupting " + raf + " at offset " + corruptOffset);
    raf.seek(corruptOffset);
    raf.writeBytes("junkjunkjunkjunkjunkjunkjunkjunk");
    raf.close();
    // Set the option for tolerating corruptions. The read should succeed.
    Configuration tmpConf = new Configuration(conf);
    tmpConf.setBoolean("hive.io.rcfile.tolerate.corruptions", true);
    RCFile.Reader reader = new RCFile.Reader(fs, file, tmpConf);
    LongWritable rowID = new LongWritable();
    while (true) {
        boolean more = reader.next(rowID);
        if (!more) {
            break;
        }
        BytesRefArrayWritable cols = new BytesRefArrayWritable();
        reader.getCurrentRow(cols);
        cols.resetValid(8);
    }
    reader.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) Configuration(org.apache.hadoop.conf.Configuration) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) RecordReader(org.apache.hadoop.mapred.RecordReader) Random(java.util.Random) RandomAccessFile(java.io.RandomAccessFile) LongWritable(org.apache.hadoop.io.LongWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)
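The test above only exercises the tolerant read path. A hypothetical negative-test sketch (not shown in TestRCFile on this page) illustrates the expectation that the same corrupted file fails to read when hive.io.rcfile.tolerate.corruptions is left at its default; the concrete exception type is an assumption and may need adjusting:

@Test
public void testReadCorruptFileWithoutTolerance() throws IOException {
    // Hypothetical companion sketch: write and corrupt the file exactly as in
    // testReadCorruptFile above, then read it back WITHOUT enabling
    // hive.io.rcfile.tolerate.corruptions.
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    try {
        while (reader.next(rowID)) {
            BytesRefArrayWritable cols = new BytesRefArrayWritable();
            reader.getCurrentRow(cols);
        }
        fail("Expected the read of a corrupted RCFile to fail");
    } catch (IOException expected) {
        // Expected: the corrupted region cannot be read when tolerance is disabled.
        // (IOException is an assumption about the failure mode.)
    } finally {
        reader.close();
    }
}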

Example 15 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hadoop by apache.

The class TestSequenceFileAppend, method testAppend.

@Test(timeout = 30000)
public void testAppend() throws Exception {
    Path file = new Path(ROOT_PATH, "testseqappend.seq");
    fs.delete(file, true);
    Text key1 = new Text("Key1");
    Text value1 = new Text("Value1");
    Text value2 = new Text("Updated");
    SequenceFile.Metadata metadata = new SequenceFile.Metadata();
    metadata.set(key1, value1);
    Writer.Option metadataOption = Writer.metadata(metadata);
    Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), metadataOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    verify2Values(file);
    metadata.set(key1, value2);
    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), metadataOption);
    // Verify the metadata is not changed when reopening for append
    assertEquals(value1, writer.metadata.get(key1));
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    verifyAll4Values(file);
    // Verify the metadata is readable after append
    Reader reader = new Reader(conf, Reader.file(file));
    assertEquals(value1, reader.getMetadata().get(key1));
    reader.close();
    // Verify failure if the compression details are different
    try {
        Option wrongCompressOption = Writer.compression(CompressionType.RECORD, new GzipCodec());
        writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException IAE) {
        // Expected exception. Ignore it
    }
    try {
        Option wrongCompressOption = Writer.compression(CompressionType.BLOCK, new DefaultCodec());
        writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException IAE) {
        // Expected exception. Ignore it
    }
    fs.deleteOnExit(file);
}
Also used : Path(org.apache.hadoop.fs.Path) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) Reader(org.apache.hadoop.io.SequenceFile.Reader) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Option(org.apache.hadoop.io.SequenceFile.Writer.Option) Writer(org.apache.hadoop.io.SequenceFile.Writer) Test(org.junit.Test)
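For contrast with the two failure cases, here is a hypothetical companion sketch built from the same Writer options API. The test name is invented, and it assumes the test class's conf, fs, and ROOT_PATH, including whatever serialization setup the class provides for Long keys and String values. It creates a block-compressed SequenceFile with DefaultCodec and then reopens it for append with the matching compression option, which is expected to pass the compatibility check:

@Test(timeout = 30000)
public void testAppendWithMatchingDefaultCodec() throws Exception {
    Path file = new Path(ROOT_PATH, "testseqappend.block.seq");
    fs.delete(file, true);
    // Create the file block-compressed with DefaultCodec.
    Option compressOption = Writer.compression(CompressionType.BLOCK, new DefaultCodec());
    Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), compressOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    // Reopen for append with the same compression details; unlike the mismatched
    // options exercised in testAppend, this is expected to be accepted.
    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), compressOption);
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    fs.deleteOnExit(file);
}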

Aggregations

DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec) 21
Test (org.junit.Test) 15
Path (org.apache.hadoop.fs.Path) 10
GzipCodec (org.apache.hadoop.io.compress.GzipCodec) 7
Configuration (org.apache.hadoop.conf.Configuration) 6
FileSystem (org.apache.hadoop.fs.FileSystem) 6
Text (org.apache.hadoop.io.Text) 6
BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) 5
BytesRefWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) 5
Writer (org.apache.hadoop.io.SequenceFile.Writer) 4
Random (java.util.Random) 3
LongWritable (org.apache.hadoop.io.LongWritable) 3
Option (org.apache.hadoop.io.SequenceFile.Writer.Option) 3
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec) 3
RecordReader (org.apache.hadoop.mapred.RecordReader) 3
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream) 2
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream) 2
LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem) 2
KeyValueCodec (org.apache.hadoop.hbase.codec.KeyValueCodec) 2
RCFile (org.apache.hadoop.hive.ql.io.RCFile) 2