Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.
The class TestRCFile, method testGetColumn.
/**
 * Tests {@link RCFile.Reader#getColumn(int, BytesRefArrayWritable)} method.
 * @throws IOException
 */
@Test
public void testGetColumn() throws IOException {
  cleanup();
  RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null,
      RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")),
      new DefaultCodec());
  byte[][] record_1 = { "123".getBytes(StandardCharsets.UTF_8), "456".getBytes(StandardCharsets.UTF_8),
      "789".getBytes(StandardCharsets.UTF_8), "1000".getBytes(StandardCharsets.UTF_8),
      "5.3".getBytes(StandardCharsets.UTF_8), "hive and hadoop".getBytes(StandardCharsets.UTF_8),
      new byte[0], "NULL".getBytes(StandardCharsets.UTF_8) };
  byte[][] record_2 = { "100".getBytes(StandardCharsets.UTF_8), "200".getBytes(StandardCharsets.UTF_8),
      "123".getBytes(StandardCharsets.UTF_8), "1000".getBytes(StandardCharsets.UTF_8),
      "5.3".getBytes(StandardCharsets.UTF_8), "hive and hadoop".getBytes(StandardCharsets.UTF_8),
      new byte[0], "NULL".getBytes(StandardCharsets.UTF_8) };
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
  for (int i = 0; i < record_1.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
    bytes.set(i, cu);
  }
  writer.append(bytes);
  bytes.clear();
  for (int i = 0; i < record_2.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
    bytes.set(i, cu);
  }
  writer.append(bytes);
  writer.close();
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  LongWritable rowID = new LongWritable();
  assertTrue(reader.next(rowID));
  assertEquals(rowID.get(), 0L);
  assertTrue(reader.next(rowID));
  assertEquals(rowID.get(), 1L);
  BytesRefArrayWritable result = null;
  BytesRefWritable brw;
  for (int col = 0; col < 8; col++) {
    BytesRefArrayWritable result2 = reader.getColumn(col, result);
    if (result == null) {
      assertNotNull(result2);
      result = result2;
    } else {
      // #getColumn(2) should return the instance passed in:
      assertSame(result2, result);
    }
    // each column has height of 2:
    assertEquals(2, result.size());
    for (int row = 0; row < result.size(); row++) {
      brw = result.get(row);
      int start = brw.getStart();
      int len = brw.getLength();
      byte[] actualData = Arrays.copyOfRange(brw.getData(), start, start + len);
      byte[] expectedData = (row == 0) ? record_1[col] : record_2[col];
      assertArrayEquals("col=" + col + " : row=" + row, expectedData, actualData);
    }
    result.clear();
  }
  reader.close();
}
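The assertions above pin down the buffer-reuse contract of RCFile.Reader#getColumn: a null argument makes the reader allocate a fresh BytesRefArrayWritable, while a non-null argument is refilled and returned as-is. Below is a minimal sketch of that pattern outside the test harness; it reuses the test class's fs and conf fields, the method name is illustrative, and path is assumed to point at an existing RCFile with at least two columns.

private void readColumnsSketch(Path path) throws IOException {
  // Sketch only: 'fs' and 'conf' come from the surrounding test class; 'path' is assumed
  // to name an existing RCFile with at least two columns.
  RCFile.Reader reader = new RCFile.Reader(fs, path, conf);
  LongWritable rowID = new LongWritable();
  // Advance into the first row group, as the test above does, before fetching columns.
  assertTrue(reader.next(rowID));
  BytesRefArrayWritable colBuf = null;
  for (int col = 0; col < 2; col++) {
    // First call allocates; later calls refill and return the same instance.
    colBuf = reader.getColumn(col, colBuf);
    for (int row = 0; row < colBuf.size(); row++) {
      BytesRefWritable cell = colBuf.get(row);
      byte[] data = Arrays.copyOfRange(cell.getData(), cell.getStart(), cell.getStart() + cell.getLength());
      // use 'data' ...
    }
    colBuf.clear();
  }
  reader.close();
}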
Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.
The class TestRCFile, method testSync.
@Test
public void testSync() throws IOException {
  Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsync");
  Path testFile = new Path(testDir, "test_rcfile");
  fs.delete(testFile, true);
  int intervalRecordCount = 500;
  CompressionCodec codec = null;
  int writeCount = 2500;
  Configuration cloneConf = new Configuration(conf);
  RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
  cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
  RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
  for (int i = 0; i < bytesArray.length; i++) {
    BytesRefWritable cu = null;
    cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
    bytes.set(i, cu);
  }
  for (int i = 0; i < writeCount; i++) {
    writer.append(bytes);
  }
  writer.close();
  long fileLen = fs.getFileStatus(testFile).getLen();
  RCFileInputFormat inputFormat = new RCFileInputFormat();
  JobConf jobconf = new JobConf(cloneConf);
  jobconf.set("mapred.input.dir", testDir.toString());
  HiveConf.setLongVar(jobconf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, fileLen);
  InputSplit[] splits = inputFormat.getSplits(jobconf, 1);
  RCFileRecordReader rr = new RCFileRecordReader(jobconf, (FileSplit) splits[0]);
  long lastSync = 0;
  for (int i = 0; i < 2500; i++) {
    rr.sync(i);
    if (rr.getPos() < lastSync) {
      String reason = String.format("Sync at offset %d skipped sync block at location %d (returned %d instead)", i - 1, rr.getPos(), lastSync);
      System.out.println(reason);
      fail(reason);
    }
    lastSync = rr.getPos();
  }
  rr.close();
}
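testSync only checks that sync never moves the reader backwards. A hedged sketch of how the same sync(long) call is typically used, namely to jump to an arbitrary byte offset and resume reading whole rows from the next sync marker within the split; the method name is illustrative, and jobconf and the FileSplit are assumed to be set up exactly as in the test above.

private void readFromOffsetSketch(JobConf jobconf, FileSplit split, long offset) throws IOException {
  RCFileRecordReader rr = new RCFileRecordReader(jobconf, split);
  // Position the reader at the first sync point at or after 'offset'.
  rr.sync(offset);
  LongWritable key = (LongWritable) rr.createKey();
  BytesRefArrayWritable value = (BytesRefArrayWritable) rr.createValue();
  while (rr.next(key, value)) {
    // one complete BytesRefArrayWritable per row, from 'offset' to the end of this split
  }
  rr.close();
}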
Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.
The class TestRCFile, method writeThenReadByRecordReader.
private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long minSplitSize, CompressionCodec codec) throws IOException {
  Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
  Path testFile = new Path(testDir, "test_rcfile");
  fs.delete(testFile, true);
  Configuration cloneConf = new Configuration(conf);
  RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
  cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
  RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
  for (int i = 0; i < bytesArray.length; i++) {
    BytesRefWritable cu = null;
    cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
    bytes.set(i, cu);
  }
  for (int i = 0; i < writeCount; i++) {
    if (i == intervalRecordCount) {
      System.out.println("write position:" + writer.getLength());
    }
    writer.append(bytes);
  }
  writer.close();
  RCFileInputFormat inputFormat = new RCFileInputFormat();
  JobConf jonconf = new JobConf(cloneConf);
  jonconf.set("mapred.input.dir", testDir.toString());
  HiveConf.setLongVar(jonconf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, minSplitSize);
  InputSplit[] splits = inputFormat.getSplits(jonconf, splitNumber);
  assertEquals("splits length should be " + splitNumber, splitNumber, splits.length);
  int readCount = 0;
  for (int i = 0; i < splits.length; i++) {
    int previousReadCount = readCount;
    RecordReader rr = inputFormat.getRecordReader(splits[i], jonconf, Reporter.NULL);
    Object key = rr.createKey();
    Object value = rr.createValue();
    while (rr.next(key, value)) {
      readCount++;
    }
    rr.close();
    System.out.println("The " + i + "th split read " + (readCount - previousReadCount));
  }
  assertEquals("readCount should be equal to writeCount", writeCount, readCount);
}
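The read half of this helper is the generic mapred pattern for consuming RCFiles; condensed into a reusable sketch below. The method name is illustrative, and the JobConf is assumed to already carry the column configuration set up above.

private int countRowsSketch(JobConf job, Path dir) throws IOException {
  // Count rows across all splits of the RCFiles under 'dir'.
  job.set("mapred.input.dir", dir.toString());
  RCFileInputFormat inputFormat = new RCFileInputFormat();
  int rows = 0;
  for (InputSplit split : inputFormat.getSplits(job, 1)) {
    RecordReader rr = inputFormat.getRecordReader(split, job, Reporter.NULL);
    Object key = rr.createKey();
    Object value = rr.createValue();
    while (rr.next(key, value)) {
      rows++;
    }
    rr.close();
  }
  return rows;
}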
Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.
The class TestRCFile, method fullyReadTest.
public void fullyReadTest(FileSystem fs, int count, Path file) throws IOException, SerDeException {
  LOG.debug("reading " + count + " records");
  long start = System.currentTimeMillis();
  ColumnProjectionUtils.setReadAllColumns(conf);
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  LongWritable rowID = new LongWritable();
  int actualRead = 0;
  BytesRefArrayWritable cols = new BytesRefArrayWritable();
  while (reader.next(rowID)) {
    reader.getCurrentRow(cols);
    cols.resetValid(8);
    Object row = serDe.deserialize(cols);
    StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
    List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
    assertEquals("Field size should be 8", 8, fieldRefs.size());
    for (int i = 0; i < fieldRefs.size(); i++) {
      Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
      Object standardWritableData = ObjectInspectorUtils.copyToStandardObject(fieldData, fieldRefs.get(i).getFieldObjectInspector(), ObjectInspectorCopyOption.WRITABLE);
      assertEquals("Field " + i, standardWritableData, expectedFieldsData[i]);
    }
    // Serialize
    assertEquals("Class of the serialized object should be BytesRefArrayWritable", BytesRefArrayWritable.class, serDe.getSerializedClass());
    BytesRefArrayWritable serializedText = (BytesRefArrayWritable) serDe.serialize(row, oi);
    assertEquals("Serialized data", s, serializedText);
    actualRead++;
  }
  reader.close();
  assertEquals("Expect " + count + " rows, actual read " + actualRead, actualRead, count);
  long cost = System.currentTimeMillis() - start;
  LOG.debug("reading fully costs:" + cost + " milliseconds");
}
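When no SerDe is involved, the same row-wise loop can decode the raw column bytes directly from each BytesRefWritable. A minimal sketch follows; the method name is illustrative, the test class's conf field is reused, and the columns are assumed to hold UTF-8 text.

private void printRowsSketch(FileSystem fs, Path file) throws IOException {
  // Row-wise read without a SerDe: each row arrives as a BytesRefArrayWritable whose
  // cells point into the reader's backing byte arrays.
  ColumnProjectionUtils.setReadAllColumns(conf);
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  LongWritable rowID = new LongWritable();
  BytesRefArrayWritable cols = new BytesRefArrayWritable();
  while (reader.next(rowID)) {
    reader.getCurrentRow(cols);
    for (int i = 0; i < cols.size(); i++) {
      BytesRefWritable cell = cols.get(i);
      String text = new String(cell.getData(), cell.getStart(), cell.getLength(), StandardCharsets.UTF_8);
      System.out.println(rowID.get() + "[" + i + "] = " + text);
    }
  }
  reader.close();
}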
Use of org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable in project hive by apache.
The class PerformTestRCFileAndSeqFile, method performSequenceFileRead.
public void performSequenceFileRead(FileSystem fs, int count, Path file) throws IOException {
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
  ByteWritable key = new ByteWritable();
  BytesRefArrayWritable val = new BytesRefArrayWritable();
  for (int i = 0; i < count; i++) {
    reader.next(key, val);
  }
}
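performSequenceFileRead assumes a SequenceFile keyed by ByteWritable with BytesRefArrayWritable values already exists. A hedged sketch of a writer that could produce such a file; the helper name and the column contents are illustrative, and the surrounding class's conf field is reused.

private void writeSequenceFileSketch(FileSystem fs, Path file, int count) throws IOException {
  // Write 'count' identical rows, each a two-column BytesRefArrayWritable.
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, ByteWritable.class, BytesRefArrayWritable.class);
  byte[] col0 = "col0".getBytes(StandardCharsets.UTF_8);
  byte[] col1 = "col1".getBytes(StandardCharsets.UTF_8);
  BytesRefArrayWritable val = new BytesRefArrayWritable(2);
  val.set(0, new BytesRefWritable(col0, 0, col0.length));
  val.set(1, new BytesRefWritable(col1, 0, col1.length));
  ByteWritable key = new ByteWritable();
  for (int i = 0; i < count; i++) {
    writer.append(key, val);
  }
  writer.close();
}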