
Example 11 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class RCFileGenerator, method genData.

private static void genData(String format, int numRows, String output, String plainOutput) throws Exception {
    int numFields = 0;
    if (format.equals("student")) {
        rand = new Random(numRows);
        numFields = 3;
    } else if (format.equals("voter")) {
        rand = new Random(1000000000 + numRows);
        numFields = 4;
    } else if (format.equals("alltypes")) {
        rand = new Random(2000000000L + numRows);
        numFields = 10;
    }
    RCFileOutputFormat.setColumnNumber(conf, numFields);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, getFile(output), null, new DefaultCodec());
    PrintWriter pw = new PrintWriter(new FileWriter(plainOutput));
    for (int j = 0; j < numRows; j++) {
        BytesRefArrayWritable row = new BytesRefArrayWritable(numFields);
        byte[][] fields = null;
        if (format.equals("student")) {
            byte[][] f = { randomName().getBytes("UTF-8"), Integer.valueOf(randomAge()).toString().getBytes("UTF-8"), Double.valueOf(randomGpa()).toString().getBytes("UTF-8") };
            fields = f;
        } else if (format.equals("voter")) {
            byte[][] f = { randomName().getBytes("UTF-8"), Integer.valueOf(randomAge()).toString().getBytes("UTF-8"), randomRegistration().getBytes("UTF-8"), Double.valueOf(randomContribution()).toString().getBytes("UTF-8") };
            fields = f;
        } else if (format.equals("alltypes")) {
            byte[][] f = { Integer.valueOf(rand.nextInt(Byte.MAX_VALUE)).toString().getBytes("UTF-8"), Integer.valueOf(rand.nextInt(Short.MAX_VALUE)).toString().getBytes("UTF-8"), Integer.valueOf(rand.nextInt()).toString().getBytes("UTF-8"), Long.valueOf(rand.nextLong()).toString().getBytes("UTF-8"), Float.valueOf(rand.nextFloat() * 1000).toString().getBytes("UTF-8"), Double.valueOf(rand.nextDouble() * 1000000).toString().getBytes("UTF-8"), randomName().getBytes("UTF-8"), randomMap(), randomArray() };
            fields = f;
        }
        for (int i = 0; i < fields.length; i++) {
            BytesRefWritable field = new BytesRefWritable(fields[i], 0, fields[i].length);
            row.set(i, field);
            pw.print(new String(fields[i]));
            if (i != fields.length - 1)
                pw.print("\t");
            else
                pw.println();
        }
        writer.append(row);
    }
    writer.close();
    pw.close();
}
Also used : RCFile(org.apache.hadoop.hive.ql.io.RCFile) Random(java.util.Random) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) FileWriter(java.io.FileWriter) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) PrintWriter(java.io.PrintWriter) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
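For completeness, here is a minimal read-back sketch. It is not part of RCFileGenerator; the helper name dumpRCFile is hypothetical, and it assumes the same fs and conf used above plus imports for Configuration, FileSystem, Path and LongWritable in addition to those listed.

private static void dumpRCFile(FileSystem fs, Configuration conf, Path path) throws IOException {
    // Hypothetical helper, not part of the Hive source: reads back an RCFile
    // written with DefaultCodec and prints each row tab-separated, mirroring
    // the plain-text file that genData writes alongside the RCFile.
    RCFile.Reader reader = new RCFile.Reader(fs, path, conf);
    try {
        LongWritable rowID = new LongWritable();
        BytesRefArrayWritable cols = new BytesRefArrayWritable();
        while (reader.next(rowID)) {
            reader.getCurrentRow(cols);
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < cols.size(); i++) {
                BytesRefWritable col = cols.get(i);
                sb.append(new String(col.getData(), col.getStart(), col.getLength(), "UTF-8"));
                if (i != cols.size() - 1) {
                    sb.append('\t');
                }
            }
            System.out.println(sb);
        }
    } finally {
        reader.close();
    }
}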

Example 12 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class TestRCFileCat, method testRCFileCat.

/**
   * Tests parsing an RCFile with RCFileCat.
   */
@Test
public void testRCFileCat() throws Exception {
    File template = File.createTempFile("hive", "tmpTest");
    Configuration configuration = new Configuration();
    byte[][] record_1 = { Bytes.toBytes("123"), Bytes.toBytes("456"), Bytes.toBytes("789"), Bytes.toBytes("1000"), Bytes.toBytes("5.3"), Bytes.toBytes("hive and hadoop"), new byte[0], Bytes.toBytes("NULL") };
    byte[][] record_2 = { Bytes.toBytes("100"), Bytes.toBytes("200"), Bytes.toBytes("123"), Bytes.toBytes("1000"), Bytes.toBytes("5.3"), Bytes.toBytes("hive and hadoop"), new byte[0], Bytes.toBytes("NULL") };
    byte[][] record_3 = { Bytes.toBytes("200"), Bytes.toBytes("400"), Bytes.toBytes("678"), Bytes.toBytes("1000"), Bytes.toBytes("4.8"), Bytes.toBytes("hive and hadoop"), new byte[0], Bytes.toBytes("TEST") };
    RCFileOutputFormat.setColumnNumber(configuration, 8);
    Path file = new Path(template.getAbsolutePath());
    FileSystem fs = FileSystem.getLocal(configuration);
    RCFile.Writer writer = new RCFile.Writer(fs, configuration, file, null, RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")), new DefaultCodec());
    write(writer, record_1);
    write(writer, record_2);
    write(writer, record_3);
    writer.close();
    RCFileCat fileCat = new RCFileCat();
    fileCat.test = true;
    fileCat.setConf(new Configuration());
    // redirect stdout and stderr to in-memory streams so the tool output can be checked
    PrintStream oldOutPrintStream = System.out;
    PrintStream oldErrPrintStream = System.err;
    ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
    ByteArrayOutputStream dataErr = new ByteArrayOutputStream();
    System.setOut(new PrintStream(dataOut));
    System.setErr(new PrintStream(dataErr));
    try {
        String[] params = { "--verbose", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("123\t456\t789\t1000\t5.3\thive and hadoop\t\tNULL"));
        assertTrue(dataOut.toString().contains("100\t200\t123\t1000\t5.3\thive and hadoop\t\tNULL"));
        assertTrue(dataOut.toString().contains("200\t400\t678\t1000\t4.8\thive and hadoop\t\tTEST"));
        dataOut.reset();
        params = new String[] { "--start=-10", "--file-sizes", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("File size (uncompressed): 105. File size (compressed): 134. Number of rows: 3."));
        dataOut.reset();
        params = new String[] { "--start=0", "--column-sizes", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("0\t9\t17"));
        assertTrue(dataOut.toString().contains("1\t9\t17"));
        assertTrue(dataOut.toString().contains("2\t9\t17"));
        assertTrue(dataOut.toString().contains("3\t12\t14"));
        assertTrue(dataOut.toString().contains("4\t9\t17"));
        assertTrue(dataOut.toString().contains("5\t45\t26"));
        dataOut.reset();
        params = new String[] { "--start=0", "--column-sizes-pretty", "file://" + template.toURI().getPath() };
        assertEquals(0, fileCat.run(params));
        assertTrue(dataOut.toString().contains("Column 0: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 1: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 2: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 3: Uncompressed size: 12 Compressed size: 14"));
        assertTrue(dataOut.toString().contains("Column 4: Uncompressed size: 9 Compressed size: 17"));
        assertTrue(dataOut.toString().contains("Column 5: Uncompressed size: 45 Compressed size: 26"));
        params = new String[] {};
        assertEquals(-1, fileCat.run(params));
        assertTrue(dataErr.toString().contains("RCFileCat [--start=start_offet] [--length=len] [--verbose] " + "[--column-sizes | --column-sizes-pretty] [--file-sizes] fileName"));
        dataErr.reset();
        params = new String[] { "--fakeParameter", "file://" + template.toURI().getPath() };
        assertEquals(-1, fileCat.run(params));
        assertTrue(dataErr.toString().contains("RCFileCat [--start=start_offet] [--length=len] [--verbose] " + "[--column-sizes | --column-sizes-pretty] [--file-sizes] fileName"));
    } finally {
        // restore the original output and error streams
        System.setOut(oldOutPrintStream);
        System.setErr(oldErrPrintStream);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PrintStream(java.io.PrintStream) Configuration(org.apache.hadoop.conf.Configuration) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Text(org.apache.hadoop.io.Text) ByteArrayOutputStream(java.io.ByteArrayOutputStream) RCFile(org.apache.hadoop.hive.ql.io.RCFile) FileSystem(org.apache.hadoop.fs.FileSystem) File(java.io.File) Test(org.junit.Test)
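The test above calls a write(...) helper that is not shown on this page. A plausible sketch of such a helper (a hypothetical reconstruction, not copied from TestRCFileCat) simply wraps one byte[][] record in a BytesRefArrayWritable and appends it to the writer:

private static void write(RCFile.Writer writer, byte[][] record) throws IOException {
    // Wrap each field of the record in a BytesRefWritable, then append the row.
    BytesRefArrayWritable row = new BytesRefArrayWritable(record.length);
    for (int i = 0; i < record.length; i++) {
        row.set(i, new BytesRefWritable(record[i], 0, record[i].length));
    }
    writer.append(row);
}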

Example 13 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class TestRCFile, method testGetColumn.

/**
   * Tests the {@link RCFile.Reader#getColumn(int, BytesRefArrayWritable)} method.
   * @throws IOException
   */
@Test
public void testGetColumn() throws IOException {
    cleanup();
    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")), new DefaultCodec());
    byte[][] record_1 = { "123".getBytes("UTF-8"), "456".getBytes("UTF-8"), "789".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8") };
    byte[][] record_2 = { "100".getBytes("UTF-8"), "200".getBytes("UTF-8"), "123".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8") };
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
    for (int i = 0; i < record_1.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    bytes.clear();
    for (int i = 0; i < record_2.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    writer.close();
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    assertTrue(reader.next(rowID));
    assertEquals(rowID.get(), 0L);
    assertTrue(reader.next(rowID));
    assertEquals(rowID.get(), 1L);
    BytesRefArrayWritable result = null;
    BytesRefWritable brw;
    for (int col = 0; col < 8; col++) {
        BytesRefArrayWritable result2 = reader.getColumn(col, result);
        if (result == null) {
            assertNotNull(result2);
            result = result2;
        } else {
            // on subsequent calls, getColumn should return the same instance that was passed in:
            assertSame(result2, result);
        }
        // each column has height of 2:
        assertEquals(2, result.size());
        for (int row = 0; row < result.size(); row++) {
            brw = result.get(row);
            int start = brw.getStart();
            int len = brw.getLength();
            byte[] actualData = Arrays.copyOfRange(brw.getData(), start, start + len);
            byte[] expectedData = (row == 0) ? record_1[col] : record_2[col];
            assertArrayEquals("col=" + col + " : row=" + row, expectedData, actualData);
        }
        result.clear();
    }
    reader.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) RecordReader(org.apache.hadoop.mapred.RecordReader) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)
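As a companion to the assertions above, a small hypothetical helper (not part of TestRCFile; the name columnAsStrings and the java.util.List/ArrayList imports are additions) could collect one column's values as Strings using the same Reader#getColumn pattern. It assumes the reader has already been advanced into a row group via next(rowID), as in the test:

private static List<String> columnAsStrings(RCFile.Reader reader, int col) throws IOException {
    // Fetch the whole column for the current row group; passing null lets the
    // reader allocate the BytesRefArrayWritable, as the test above verifies.
    BytesRefArrayWritable column = reader.getColumn(col, null);
    List<String> values = new ArrayList<String>(column.size());
    for (int row = 0; row < column.size(); row++) {
        BytesRefWritable brw = column.get(row);
        values.add(new String(brw.getData(), brw.getStart(), brw.getLength(), "UTF-8"));
    }
    return values;
}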

Example 14 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.

The class TestRCFile, method testReadCorruptFile.

@Test
public void testReadCorruptFile() throws IOException, SerDeException {
    cleanup();
    byte[][] record = { null, null, null, null, null, null, null, null };
    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record.length);
    final int recCount = 100;
    Random rand = new Random();
    for (int recIdx = 0; recIdx < recCount; recIdx++) {
        for (int i = 0; i < record.length; i++) {
            record[i] = new Integer(rand.nextInt()).toString().getBytes("UTF-8");
        }
        for (int i = 0; i < record.length; i++) {
            BytesRefWritable cu = new BytesRefWritable(record[i], 0, record[i].length);
            bytes.set(i, cu);
        }
        writer.append(bytes);
        bytes.clear();
    }
    writer.close();
    // Insert junk in middle of file. Assumes file is on local disk.
    RandomAccessFile raf = new RandomAccessFile(file.toUri().getPath(), "rw");
    long corruptOffset = raf.length() / 2;
    LOG.info("corrupting " + raf + " at offset " + corruptOffset);
    raf.seek(corruptOffset);
    raf.writeBytes("junkjunkjunkjunkjunkjunkjunkjunk");
    raf.close();
    // Set the option for tolerating corruptions. The read should succeed.
    Configuration tmpConf = new Configuration(conf);
    tmpConf.setBoolean("hive.io.rcfile.tolerate.corruptions", true);
    RCFile.Reader reader = new RCFile.Reader(fs, file, tmpConf);
    LongWritable rowID = new LongWritable();
    while (true) {
        boolean more = reader.next(rowID);
        if (!more) {
            break;
        }
        BytesRefArrayWritable cols = new BytesRefArrayWritable();
        reader.getCurrentRow(cols);
        cols.resetValid(8);
    }
    reader.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) Configuration(org.apache.hadoop.conf.Configuration) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) RecordReader(org.apache.hadoop.mapred.RecordReader) Random(java.util.Random) RandomAccessFile(java.io.RandomAccessFile) LongWritable(org.apache.hadoop.io.LongWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)
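The test above only exercises the tolerant read path. A hypothetical negative-test sketch (not shown in TestRCFile on this page) illustrates the expectation that the same corrupted file fails to read when hive.io.rcfile.tolerate.corruptions is left at its default; the concrete exception type is an assumption and may need adjusting:

@Test
public void testReadCorruptFileWithoutTolerance() throws IOException {
    // Hypothetical companion sketch: write and corrupt the file exactly as in
    // testReadCorruptFile above, then read it back WITHOUT enabling
    // hive.io.rcfile.tolerate.corruptions.
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    try {
        while (reader.next(rowID)) {
            BytesRefArrayWritable cols = new BytesRefArrayWritable();
            reader.getCurrentRow(cols);
        }
        fail("Expected the read of a corrupted RCFile to fail");
    } catch (IOException expected) {
        // Expected: the corrupted region cannot be read when tolerance is disabled.
        // (IOException is an assumption about the failure mode.)
    } finally {
        reader.close();
    }
}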

Example 15 with DefaultCodec

Use of org.apache.hadoop.io.compress.DefaultCodec in project hadoop by apache.

The class TestSequenceFileAppend, method testAppend.

@Test(timeout = 30000)
public void testAppend() throws Exception {
    Path file = new Path(ROOT_PATH, "testseqappend.seq");
    fs.delete(file, true);
    Text key1 = new Text("Key1");
    Text value1 = new Text("Value1");
    Text value2 = new Text("Updated");
    SequenceFile.Metadata metadata = new SequenceFile.Metadata();
    metadata.set(key1, value1);
    Writer.Option metadataOption = Writer.metadata(metadata);
    Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), metadataOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    verify2Values(file);
    metadata.set(key1, value2);
    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), metadataOption);
    // Verify the metadata is not changed when reopening for append
    assertEquals(value1, writer.metadata.get(key1));
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    verifyAll4Values(file);
    // Verify the metadata is readable after append
    Reader reader = new Reader(conf, Reader.file(file));
    assertEquals(value1, reader.getMetadata().get(key1));
    reader.close();
    // Verify failure if the compression details are different
    try {
        Option wrongCompressOption = Writer.compression(CompressionType.RECORD, new GzipCodec());
        writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException IAE) {
        // Expected exception. Ignore it
    }
    try {
        Option wrongCompressOption = Writer.compression(CompressionType.BLOCK, new DefaultCodec());
        writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException IAE) {
        // Expected exception. Ignore it
    }
    fs.deleteOnExit(file);
}
Also used : Path(org.apache.hadoop.fs.Path) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) Reader(org.apache.hadoop.io.SequenceFile.Reader) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Option(org.apache.hadoop.io.SequenceFile.Writer.Option) Writer(org.apache.hadoop.io.SequenceFile.Writer) Test(org.junit.Test)
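For contrast with the two failure cases, here is a hypothetical companion sketch built from the same Writer options API. The test name is invented, and it assumes the test class's conf, fs, and ROOT_PATH, including whatever serialization setup the class provides for Long keys and String values. It creates a block-compressed SequenceFile with DefaultCodec and then reopens it for append with the matching compression option, which is expected to pass the compatibility check:

@Test(timeout = 30000)
public void testAppendWithMatchingDefaultCodec() throws Exception {
    Path file = new Path(ROOT_PATH, "testseqappend.block.seq");
    fs.delete(file, true);
    // Create the file block-compressed with DefaultCodec.
    Option compressOption = Writer.compression(CompressionType.BLOCK, new DefaultCodec());
    Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), compressOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    // Reopen for append with the same compression details; unlike the mismatched
    // options exercised in testAppend, this is expected to be accepted.
    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file), SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class), SequenceFile.Writer.appendIfExists(true), compressOption);
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    fs.deleteOnExit(file);
}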

Aggregations

DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec) 21
Test (org.junit.Test) 15
Path (org.apache.hadoop.fs.Path) 10
GzipCodec (org.apache.hadoop.io.compress.GzipCodec) 7
Configuration (org.apache.hadoop.conf.Configuration) 6
FileSystem (org.apache.hadoop.fs.FileSystem) 6
Text (org.apache.hadoop.io.Text) 6
BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) 5
BytesRefWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) 5
Writer (org.apache.hadoop.io.SequenceFile.Writer) 4
Random (java.util.Random) 3
LongWritable (org.apache.hadoop.io.LongWritable) 3
Option (org.apache.hadoop.io.SequenceFile.Writer.Option) 3
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec) 3
RecordReader (org.apache.hadoop.mapred.RecordReader) 3
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream) 2
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream) 2
LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem) 2
KeyValueCodec (org.apache.hadoop.hbase.codec.KeyValueCodec) 2
RCFile (org.apache.hadoop.hive.ql.io.RCFile) 2