Example 21 with BytesRefArrayWritable

Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.

From the class TestRCFile, method testGetColumn.

/**
 * Tests {@link RCFile.Reader#getColumn(int, BytesRefArrayWritable) } method.
 * @throws IOException
 */
@Test
public void testGetColumn() throws IOException {
    cleanup();
    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")), new DefaultCodec());
    byte[][] record_1 = { "123".getBytes(StandardCharsets.UTF_8), "456".getBytes(StandardCharsets.UTF_8), "789".getBytes(StandardCharsets.UTF_8), "1000".getBytes(StandardCharsets.UTF_8), "5.3".getBytes(StandardCharsets.UTF_8), "hive and hadoop".getBytes(StandardCharsets.UTF_8), new byte[0], "NULL".getBytes(StandardCharsets.UTF_8) };
    byte[][] record_2 = { "100".getBytes(StandardCharsets.UTF_8), "200".getBytes(StandardCharsets.UTF_8), "123".getBytes(StandardCharsets.UTF_8), "1000".getBytes(StandardCharsets.UTF_8), "5.3".getBytes(StandardCharsets.UTF_8), "hive and hadoop".getBytes(StandardCharsets.UTF_8), new byte[0], "NULL".getBytes(StandardCharsets.UTF_8) };
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
    for (int i = 0; i < record_1.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    bytes.clear();
    for (int i = 0; i < record_2.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
        bytes.set(i, cu);
    }
    writer.append(bytes);
    writer.close();
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    assertTrue(reader.next(rowID));
    assertEquals(0L, rowID.get());
    assertTrue(reader.next(rowID));
    assertEquals(1L, rowID.get());
    BytesRefArrayWritable result = null;
    BytesRefWritable brw;
    for (int col = 0; col < 8; col++) {
        BytesRefArrayWritable result2 = reader.getColumn(col, result);
        if (result == null) {
            assertNotNull(result2);
            result = result2;
        } else {
            // getColumn(int, BytesRefArrayWritable) should return the instance passed in:
            assertSame(result2, result);
        }
        // each column has height of 2:
        assertEquals(2, result.size());
        for (int row = 0; row < result.size(); row++) {
            brw = result.get(row);
            int start = brw.getStart();
            int len = brw.getLength();
            byte[] actualData = Arrays.copyOfRange(brw.getData(), start, start + len);
            byte[] expectedData = (row == 0) ? record_1[col] : record_2[col];
            assertArrayEquals("col=" + col + " : row=" + row, expectedData, actualData);
        }
        result.clear();
    }
    reader.close();
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) RecordReader(org.apache.hadoop.mapred.RecordReader) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)
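
The loop above shows the pattern every consumer of BytesRefWritable has to follow: the object is only a (data, start, length) reference into a shared backing buffer, so the referenced range must be copied out using getStart()/getLength() rather than taking getData() wholesale. A minimal standalone sketch of that pattern (the class and helper names below are illustrative, not part of the Hive test):

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;

public class BytesRefCopyExample {

    // Copy the referenced range into a standalone array; getData() returns the whole
    // backing buffer, so start and length must be honored.
    static byte[] copyBytes(BytesRefWritable ref) throws IOException {
        int start = ref.getStart();
        return Arrays.copyOfRange(ref.getData(), start, start + ref.getLength());
    }

    public static void main(String[] args) throws IOException {
        byte[] c0 = "123".getBytes(StandardCharsets.UTF_8);
        byte[] c1 = "hive and hadoop".getBytes(StandardCharsets.UTF_8);
        BytesRefArrayWritable row = new BytesRefArrayWritable(2);
        row.set(0, new BytesRefWritable(c0, 0, c0.length));
        row.set(1, new BytesRefWritable(c1, 0, c1.length));
        for (int i = 0; i < row.size(); i++) {
            System.out.println(new String(copyBytes(row.get(i)), StandardCharsets.UTF_8));
        }
    }
}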

Example 22 with BytesRefArrayWritable

Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.

From the class TestRCFile, method testSync.

@Test
public void testSync() throws IOException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsync");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    int intervalRecordCount = 500;
    CompressionCodec codec = null;
    int writeCount = 2500;
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        writer.append(bytes);
    }
    writer.close();
    long fileLen = fs.getFileStatus(testFile).getLen();
    RCFileInputFormat inputFormat = new RCFileInputFormat();
    JobConf jobconf = new JobConf(cloneConf);
    jobconf.set("mapred.input.dir", testDir.toString());
    HiveConf.setLongVar(jobconf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, fileLen);
    InputSplit[] splits = inputFormat.getSplits(jobconf, 1);
    RCFileRecordReader rr = new RCFileRecordReader(jobconf, (FileSplit) splits[0]);
    long lastSync = 0;
    for (int i = 0; i < 2500; i++) {
        rr.sync(i);
        if (rr.getPos() < lastSync) {
            String reason = String.format("Sync at offset %d skipped sync block at location %d (returned %d instead)", i - 1, rr.getPos(), lastSync);
            System.out.println(reason);
            fail(reason);
        }
        lastSync = rr.getPos();
    }
    rr.close();
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable) Test(org.junit.Test)
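
In the test above, rr.sync(offset) positions the reader at the first sync marker at or after the given byte offset and rr.getPos() reports where it landed; the invariant being checked is simply that this position never moves backwards as the offset grows. A hedged sketch of the same machinery used for its usual purpose, resuming a scan from an arbitrary offset (the class and helper names are illustrative; with the testSync setup it could be called as countRowsAfter(jobconf, (FileSplit) splits[0], fileLen / 2)):

import java.io.IOException;

import org.apache.hadoop.hive.ql.io.RCFileRecordReader;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;

public class RCFileSyncScan {

    // Count the rows that remain after seeking to the first sync marker at or past
    // the given byte offset within the split.
    static int countRowsAfter(JobConf jobConf, FileSplit split, long offset) throws IOException {
        RCFileRecordReader reader = new RCFileRecordReader(jobConf, split);
        try {
            // jump to the next sync point at or after 'offset'
            reader.sync(offset);
            LongWritable key = reader.createKey();
            BytesRefArrayWritable value = reader.createValue();
            int rows = 0;
            while (reader.next(key, value)) {
                rows++;
            }
            return rows;
        } finally {
            reader.close();
        }
    }
}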

Example 23 with BytesRefArrayWritable

Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.

From the class TestRCFile, method writeThenReadByRecordReader.

private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long minSplitSize, CompressionCodec codec) throws IOException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        if (i == intervalRecordCount) {
            System.out.println("write position:" + writer.getLength());
        }
        writer.append(bytes);
    }
    writer.close();
    RCFileInputFormat inputFormat = new RCFileInputFormat();
    JobConf jobconf = new JobConf(cloneConf);
    jobconf.set("mapred.input.dir", testDir.toString());
    HiveConf.setLongVar(jobconf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, minSplitSize);
    InputSplit[] splits = inputFormat.getSplits(jobconf, splitNumber);
    assertEquals("splits length should be " + splitNumber, splitNumber, splits.length);
    int readCount = 0;
    for (int i = 0; i < splits.length; i++) {
        int previousReadCount = readCount;
        RecordReader rr = inputFormat.getRecordReader(splits[i], jobconf, Reporter.NULL);
        Object key = rr.createKey();
        Object value = rr.createValue();
        while (rr.next(key, value)) {
            readCount++;
        }
        rr.close();
        System.out.println("The " + i + "th split read " + (readCount - previousReadCount));
    }
    assertEquals("readCount should be equal to writeCount", writeCount, readCount);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) RecordReader(org.apache.hadoop.mapred.RecordReader) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
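
The helper is parameterized so callers can vary the sync interval, the number of rows written, the expected split count, the minimum split size, and the codec; the final assertion verifies that splitting the file never loses or duplicates rows. Illustrative invocations (the argument values are examples chosen here, not the ones the Hive suite necessarily uses):

// Sync marker every 500 rows, 2500 rows written, a single expected split,
// minimum split size of 1 byte, no compression.
writeThenReadByRecordReader(500, 2500, 1, 1, null);

// Same data compressed with DefaultCodec; the read-back count must still equal 2500.
writeThenReadByRecordReader(500, 2500, 1, 1, new DefaultCodec());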

Example 24 with BytesRefArrayWritable

Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.

From the class TestRCFile, method fullyReadTest.

public void fullyReadTest(FileSystem fs, int count, Path file) throws IOException, SerDeException {
    LOG.debug("reading " + count + " records");
    long start = System.currentTimeMillis();
    ColumnProjectionUtils.setReadAllColumns(conf);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    int actualRead = 0;
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
        reader.getCurrentRow(cols);
        cols.resetValid(8);
        Object row = serDe.deserialize(cols);
        StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
        assertEquals("Field size should be 8", 8, fieldRefs.size());
        for (int i = 0; i < fieldRefs.size(); i++) {
            Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
            Object standardWritableData = ObjectInspectorUtils.copyToStandardObject(fieldData, fieldRefs.get(i).getFieldObjectInspector(), ObjectInspectorCopyOption.WRITABLE);
            assertEquals("Field " + i, standardWritableData, expectedFieldsData[i]);
        }
        // Serialize
        assertEquals("Class of the serialized object should be BytesRefArrayWritable", BytesRefArrayWritable.class, serDe.getSerializedClass());
        BytesRefArrayWritable serializedText = (BytesRefArrayWritable) serDe.serialize(row, oi);
        assertEquals("Serialized data", s, serializedText);
        actualRead++;
    }
    reader.close();
    assertEquals("Expect " + count + " rows, actual read " + actualRead, actualRead, count);
    long cost = System.currentTimeMillis() - start;
    LOG.debug("reading fully costs:" + cost + " milliseconds");
}
Also used : BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) RecordReader(org.apache.hadoop.mapred.RecordReader) LongWritable(org.apache.hadoop.io.LongWritable) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
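
fullyReadTest calls ColumnProjectionUtils.setReadAllColumns(conf), so every column is materialized. The usual RCFile read path projects only the needed columns instead, which lets the reader skip the bytes of everything else. A minimal sketch of that, assuming the appendReadColumns(Configuration, List<Integer>) overload and using illustrative column ids and names:

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;

public class RCFileProjectedRead {

    // Read only columns 0 and 5 of each row; the unprojected columns are not decompressed.
    static void readProjected(FileSystem fs, Path file, Configuration conf) throws IOException {
        Configuration readConf = new Configuration(conf);
        ColumnProjectionUtils.appendReadColumns(readConf, Arrays.asList(0, 5));
        RCFile.Reader reader = new RCFile.Reader(fs, file, readConf);
        LongWritable rowId = new LongWritable();
        BytesRefArrayWritable cols = new BytesRefArrayWritable();
        try {
            while (reader.next(rowId)) {
                reader.getCurrentRow(cols);
                // only the projected columns carry data here; the others come back empty
            }
        } finally {
            reader.close();
        }
    }
}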

Example 25 with BytesRefArrayWritable

Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.

From the class PerformTestRCFileAndSeqFile, method performSequenceFileRead.

public void performSequenceFileRead(FileSystem fs, int count, Path file) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    ByteWritable key = new ByteWritable();
    BytesRefArrayWritable val = new BytesRefArrayWritable();
    for (int i = 0; i < count; i++) {
        reader.next(key, val);
    }
    reader.close();
}
Also used : SequenceFile(org.apache.hadoop.io.SequenceFile) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) ByteWritable(org.apache.hadoop.hive.serde2.io.ByteWritable)
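
The read loop above expects a SequenceFile keyed by the Hive ByteWritable with BytesRefArrayWritable values. A hedged sketch of a writer that produces such a file (the benchmark class has its own write path; the class name, row shape, and key scheme below are illustrative only):

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;

public class SequenceFileWriteSketch {

    // Write 'count' single-column rows so that performSequenceFileRead can read them back.
    static void writeRows(FileSystem fs, Configuration conf, Path file, int count) throws IOException {
        SequenceFile.Writer writer =
                SequenceFile.createWriter(fs, conf, file, ByteWritable.class, BytesRefArrayWritable.class);
        try {
            for (int i = 0; i < count; i++) {
                byte[] cell = Integer.toString(i).getBytes(StandardCharsets.UTF_8);
                BytesRefArrayWritable row = new BytesRefArrayWritable(1);
                row.set(0, new BytesRefWritable(cell, 0, cell.length));
                writer.append(new ByteWritable((byte) (i % 128)), row);
            }
        } finally {
            writer.close();
        }
    }
}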

Aggregations

BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable): 28
BytesRefWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefWritable): 17
Configuration (org.apache.hadoop.conf.Configuration): 13
LongWritable (org.apache.hadoop.io.LongWritable): 12
Path (org.apache.hadoop.fs.Path): 11
Test (org.junit.Test): 11
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 10
Properties (java.util.Properties): 7
RecordReader (org.apache.hadoop.mapred.RecordReader): 7
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 4
RCFile (org.apache.hadoop.hive.ql.io.RCFile): 4
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 4
JobConf (org.apache.hadoop.mapred.JobConf): 4
IOException (java.io.IOException): 3
ColumnarSerDe (org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe): 3
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 3
SimpleMapEqualComparer (org.apache.hadoop.hive.serde2.objectinspector.SimpleMapEqualComparer): 3
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 3
Random (java.util.Random): 2