Example 21 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project hive by apache.

the class TestRCFile method testGetColumn.

/**
   * Tests {@link RCFile.Reader#getColumn(int, BytesRefArrayWritable) } method.
   * @throws IOException
   */
@Test
public void testGetColumn() throws IOException {
    cleanup();
    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")), new DefaultCodec());
    byte[][] record_1 = { "123".getBytes("UTF-8"), "456".getBytes("UTF-8"), "789".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8") };
    byte[][] record_2 = { "100".getBytes("UTF-8"), "200".getBytes("UTF-8"), "123".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8") };
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
    for (int i = 0; i < record_1.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    bytes.clear();
    for (int i = 0; i < record_2.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    writer.close();
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    assertTrue(reader.next(rowID));
    assertEquals(rowID.get(), 0L);
    assertTrue(reader.next(rowID));
    assertEquals(rowID.get(), 1L);
    BytesRefArrayWritable result = null;
    BytesRefWritable brw;
    for (int col = 0; col < 8; col++) {
        BytesRefArrayWritable result2 = reader.getColumn(col, result);
        if (result == null) {
            assertNotNull(result2);
            result = result2;
        } else {
            // the two-argument getColumn(int, BytesRefArrayWritable) should return the instance passed in:
            assertSame(result2, result);
        }
        // each column has height of 2:
        assertEquals(2, result.size());
        for (int row = 0; row < result.size(); row++) {
            brw = result.get(row);
            int start = brw.getStart();
            int len = brw.getLength();
            byte[] actualData = Arrays.copyOfRange(brw.getData(), start, start + len);
            byte[] expectedData = (row == 0) ? record_1[col] : record_2[col];
            assertArrayEquals("col=" + col + " : row=" + row, expectedData, actualData);
        }
        result.clear();
    }
    reader.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) RecordReader(org.apache.hadoop.mapred.RecordReader) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)

Example 22 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project hive by apache.

the class TestRCFile method testReadCorruptFile.

@Test
public void testReadCorruptFile() throws IOException, SerDeException {
    cleanup();
    byte[][] record = { null, null, null, null, null, null, null, null };
    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record.length);
    final int recCount = 100;
    Random rand = new Random();
    for (int recIdx = 0; recIdx < recCount; recIdx++) {
        for (int i = 0; i < record.length; i++) {
            record[i] = Integer.toString(rand.nextInt()).getBytes("UTF-8");
        }
        for (int i = 0; i < record.length; i++) {
            BytesRefWritable cu = new BytesRefWritable(record[i], 0, record[i].length);
            bytes.set(i, cu);
        }
        writer.append(bytes);
        bytes.clear();
    }
    writer.close();
    // Insert junk in middle of file. Assumes file is on local disk.
    RandomAccessFile raf = new RandomAccessFile(file.toUri().getPath(), "rw");
    long corruptOffset = raf.length() / 2;
    LOG.info("corrupting " + raf + " at offset " + corruptOffset);
    raf.seek(corruptOffset);
    raf.writeBytes("junkjunkjunkjunkjunkjunkjunkjunk");
    raf.close();
    // Set the option for tolerating corruptions. The read should succeed.
    Configuration tmpConf = new Configuration(conf);
    tmpConf.setBoolean("hive.io.rcfile.tolerate.corruptions", true);
    RCFile.Reader reader = new RCFile.Reader(fs, file, tmpConf);
    LongWritable rowID = new LongWritable();
    while (true) {
        boolean more = reader.next(rowID);
        if (!more) {
            break;
        }
        BytesRefArrayWritable cols = new BytesRefArrayWritable();
        reader.getCurrentRow(cols);
        cols.resetValid(8);
    }
    reader.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) Configuration(org.apache.hadoop.conf.Configuration) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) RecordReader(org.apache.hadoop.mapred.RecordReader) Random(java.util.Random) RandomAccessFile(java.io.RandomAccessFile) LongWritable(org.apache.hadoop.io.LongWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)

Example 23 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project hive by apache.

the class PerformTestRCFileAndSeqFile method writeSeqenceFileTest.

private void writeSeqenceFileTest(FileSystem fs, int rowCount, Path file, int columnNum, CompressionCodec codec) throws IOException {
    byte[][] columnRandom;
    resetRandomGenerators();
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
    columnRandom = new byte[columnNum][];
    for (int i = 0; i < columnNum; i++) {
        BytesRefWritable cu = new BytesRefWritable();
        bytes.set(i, cu);
    }
    // zero length key is not allowed by block compress writer, so we use a byte
    // writable
    ByteWritable key = new ByteWritable();
    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, file, ByteWritable.class, BytesRefArrayWritable.class, CompressionType.BLOCK, codec);
    for (int i = 0; i < rowCount; i++) {
        nextRandomRow(columnRandom, bytes);
        seqWriter.append(key, bytes);
    }
    seqWriter.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) SequenceFile(org.apache.hadoop.io.SequenceFile) ByteWritable(org.apache.hadoop.hive.serde2.io.ByteWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
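
The writer above pairs each BytesRefArrayWritable row with a throwaway ByteWritable key because the block-compress SequenceFile writer rejects zero-length keys. The Hive benchmark does not include a matching reader; the hedged sketch below is not project code, but shows what one could look like, reusing the same fs, conf, and file values, the imports listed above, and the standard SequenceFile.Reader API (the method name readSeqenceFileTest is made up to mirror the writer):

private void readSeqenceFileTest(FileSystem fs, Path file) throws IOException {
    // Hypothetical counterpart to writeSeqenceFileTest, not part of
    // PerformTestRCFileAndSeqFile; it reuses the surrounding conf field.
    SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, file, conf);
    try {
        ByteWritable key = new ByteWritable();
        BytesRefArrayWritable row = new BytesRefArrayWritable();
        int rowCount = 0;
        while (seqReader.next(key, row)) {
            // Each value holds one row of BytesRefWritable columns.
            rowCount += 1;
        }
        System.out.println("read " + rowCount + " rows from " + file);
    } finally {
        seqReader.close();
    }
}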

Example 24 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project hive by apache.

the class TestRCFileCat method write.

private void write(RCFile.Writer writer, byte[][] record) throws IOException {
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record.length);
    for (int i = 0; i < record.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record[i], 0, record[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)

Example 25 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project presto by prestodb.

the class ColumnarBinaryHiveRecordCursor method parseStringColumn.

private void parseStringColumn(int column) {
    loaded[column] = true;
    if (hiveColumnIndexes[column] >= value.size()) {
        // this partition may contain fewer fields than what's declared in the schema
        // this happens when additional columns are added to the hive table after a partition has been created
        nulls[column] = true;
    } else {
        BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
        byte[] bytes;
        try {
            bytes = fieldData.getData();
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
        int start = fieldData.getStart();
        int length = fieldData.getLength();
        parseStringColumn(column, bytes, start, length);
    }
}
Also used : IOException(java.io.IOException) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
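
Taken together, the examples follow one pattern: each column value is wrapped in a BytesRefWritable that merely references a (data, start, length) range, the columns are collected into a BytesRefArrayWritable row, and readers copy the referenced range back out with getData(), getStart(), and getLength(). The minimal, file-free sketch below illustrates that round trip; it uses only calls that appear in the snippets above, and the class name is invented for illustration:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;

// Hypothetical demo class, not part of Hive or Presto.
public class BytesRefWritableRoundTrip {

    public static void main(String[] args) throws Exception {
        byte[][] record = { "123".getBytes(StandardCharsets.UTF_8),
                "hive and hadoop".getBytes(StandardCharsets.UTF_8),
                // empty column values are legal, as in the RCFile tests above
                new byte[0] };

        // Build one row: each element references its byte[] range without copying.
        BytesRefArrayWritable row = new BytesRefArrayWritable(record.length);
        for (int i = 0; i < record.length; i++) {
            row.set(i, new BytesRefWritable(record[i], 0, record[i].length));
        }

        // Read the columns back; copy the referenced range before reusing the row,
        // because BytesRefWritable only points into the shared buffer.
        for (int i = 0; i < row.size(); i++) {
            BytesRefWritable field = row.get(i);
            // getData() may throw IOException (see the Presto example above).
            byte[] copy = Arrays.copyOfRange(field.getData(), field.getStart(),
                    field.getStart() + field.getLength());
            System.out.println(i + " -> \"" + new String(copy, StandardCharsets.UTF_8) + "\"");
        }
    }
}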

Aggregations

BytesRefWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefWritable): 28 usages
IOException (java.io.IOException): 14 usages
BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable): 14 usages
Configuration (org.apache.hadoop.conf.Configuration): 6 usages
Path (org.apache.hadoop.fs.Path): 5 usages
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 5 usages
LongWritable (org.apache.hadoop.io.LongWritable): 4 usages
RecordReader (org.apache.hadoop.mapred.RecordReader): 4 usages
Test (org.junit.Test): 4 usages
Random (java.util.Random): 2 usages
RCFile (org.apache.hadoop.hive.ql.io.RCFile): 2 usages
ColumnarSerDe (org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe): 2 usages
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 2 usages
Text (org.apache.hadoop.io.Text): 2 usages
InputSplit (org.apache.hadoop.mapred.InputSplit): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 2 usages
ColumnEntry (com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry): 1 usage
PrestoException (com.facebook.presto.spi.PrestoException): 1 usage
FileWriter (java.io.FileWriter): 1 usage
PrintWriter (java.io.PrintWriter): 1 usage