Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class TestVectorizedListColumnReader, method testVectorizedRowBatchSizeChangeListRead.
private void testVectorizedRowBatchSizeChangeListRead() throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "list_binary_field_for_repeat_test");
  conf.set(IOConstants.COLUMNS_TYPES, "array<string>");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message hive_schema {repeated binary list_binary_field_for_repeat_test;}", conf);
  VectorizedRowBatch previous = reader.createValue();
  try {
    while (reader.next(NullWritable.get(), previous)) {
      ListColumnVector vector = (ListColumnVector) previous.cols[0];
      // With large inputs, the same VectorizedRowBatch is reused across file splits to cache
      // data. If the first split has only 100 rows, the batch caches them and its arrays are
      // resized to 100. The lines below simulate that size change; processing the next split,
      // which has more than 100 rows, must not throw an ArrayIndexOutOfBoundsException.
      vector.lengths = new long[100];
      vector.offsets = new long[100];
    }
  } finally {
    reader.close();
  }
}
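The shrunken-array scenario the comment describes can be guarded against in isolation. A minimal sketch, assuming the standard ColumnVector.ensureSize contract from Hive's storage API, of regrowing a ListColumnVector that a previous split shrank before it is filled with a full-size batch:

import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch only: a ListColumnVector whose arrays were shrunk to 100 entries
// by a previous split, regrown before the next, larger split is processed.
ListColumnVector vector = new ListColumnVector(100, new LongColumnVector());
int rowsInNextSplit = VectorizedRowBatch.DEFAULT_SIZE; // 1024 by default
if (vector.offsets.length < rowsInNextSplit) {
  vector.ensureSize(rowsInNextSplit, false); // false: old cached data need not survive
}
// offsets and lengths can now be indexed up to rowsInNextSplit - 1 safely.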
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class TestVectorizedListColumnReader, method testRepeateListRead.
private void testRepeateListRead(int elementNum, boolean isNull) throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "list_int32_field_for_repeat_test");
  conf.set(IOConstants.COLUMNS_TYPES, "array<int>");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message hive_schema {repeated int32 list_int32_field_for_repeat_test;}", conf);
  VectorizedRowBatch previous = reader.createValue();
  int row = 0;
  try {
    while (reader.next(NullWritable.get(), previous)) {
      ListColumnVector vector = (ListColumnVector) previous.cols[0];
      assertTrue(vector.isRepeating);
      assertEquals(isNull, vector.isNull[0]);
      for (int i = 0; i < vector.offsets.length; i++) {
        if (row == elementNum) {
          assertEquals(i, vector.offsets.length - 1);
          break;
        }
        row++;
      }
    }
    assertEquals("It doesn't exit at expected position", elementNum, row);
  } finally {
    reader.close();
  }
}
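For reference, isRepeating means that entry 0 of the vector describes every row in the batch; the assertions above rely on exactly that. A minimal sketch, with illustrative values, of how such a repeating list batch is laid out:

import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

// Illustrative values: a batch in which every row holds the same list [7, 7, 7].
LongColumnVector child = new LongColumnVector(3);
child.vector[0] = 7;
child.vector[1] = 7;
child.vector[2] = 7;
ListColumnVector lists = new ListColumnVector(1024, child);
lists.isRepeating = true;  // entry 0 stands for all rows in the batch
lists.isNull[0] = false;   // the isNull[0] assertion above checks this flag
lists.offsets[0] = 0;      // list starts at child index 0
lists.lengths[0] = 3;      // and spans three elements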
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class TestVectorizedListColumnReader, method testListRead.
private void testListRead(boolean isDictionaryEncoding, String type, int elementNum) throws Exception {
  Configuration conf = new Configuration();
  setTypeConfiguration(type, conf);
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(getSchema(type), conf);
  VectorizedRowBatch previous = reader.createValue();
  int row = 0;
  int index = 0;
  try {
    while (reader.next(NullWritable.get(), previous)) {
      ListColumnVector vector = (ListColumnVector) previous.cols[0];
      // isRepeating is set only when the batch holds a single offset entry.
      assertEquals((vector.offsets.length == 1), vector.isRepeating);
      for (int i = 0; i < vector.offsets.length; i++) {
        if (row == elementNum) {
          assertEquals(i, vector.offsets.length - 1);
          break;
        }
        long start = vector.offsets[i];
        long length = vector.lengths[i];
        boolean isNull = isNull(row);
        if (isNull) {
          assertTrue(vector.isNull[i]);
        } else {
          for (long j = 0; j < length; j++) {
            assertValue(type, vector.child, isDictionaryEncoding, index, (int) (start + j));
            index++;
          }
        }
        row++;
      }
    }
    assertEquals("It doesn't exit at expected position", elementNum, row);
  } finally {
    reader.close();
  }
}
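The offsets/lengths bookkeeping in the loop above generalizes into a small decoding helper. A sketch (the readRow name is ours) that materializes one row of a list-of-long column, handling the repeating and null cases the same way the test does:

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

// Hypothetical helper: rebuild row i of a list-of-long column vector.
static List<Long> readRow(ListColumnVector vector, int i) {
  int entry = vector.isRepeating ? 0 : i;  // repeating batches store only entry 0
  if (!vector.noNulls && vector.isNull[entry]) {
    return null;  // the whole list is NULL for this row
  }
  long start = vector.offsets[entry];
  long length = vector.lengths[entry];
  LongColumnVector child = (LongColumnVector) vector.child;
  List<Long> row = new ArrayList<>();
  for (long j = 0; j < length; j++) {
    row.add(child.vector[(int) (start + j)]);
  }
  return row;
}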
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class VectorizedColumnReaderTestBase, method longRead.
private void longRead(boolean isDictionaryEncoding, Configuration conf) throws Exception {
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message test { required int64 int64_field;}", conf);
  VectorizedRowBatch previous = reader.createValue();
  try {
    int c = 0;
    while (reader.next(NullWritable.get(), previous)) {
      LongColumnVector vector = (LongColumnVector) previous.cols[0];
      assertTrue(vector.noNulls);
      for (int i = 0; i < vector.vector.length; i++) {
        if (c == nElements) {
          break;
        }
        assertEquals("Failed at " + c, getLongValue(isDictionaryEncoding, c), vector.vector[i]);
        assertFalse(vector.isNull[i]);
        c++;
      }
    }
    assertEquals(nElements, c);
  } finally {
    reader.close();
  }
}
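Unlike the list tests, longRead takes its Configuration from the caller. A sketch of the setup a caller presumably provides, mirroring the projection keys used above; the column name int64_field comes from the schema string inside the method itself:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

// Presumed caller-side setup; the keys mirror the other tests on this page.
Configuration conf = new Configuration();
conf.set(IOConstants.COLUMNS, "int64_field");   // name taken from the Parquet schema above
conf.set(IOConstants.COLUMNS_TYPES, "bigint");  // Hive type corresponding to int64
conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
longRead(true, conf);  // true: exercise the dictionary-encoded path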
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class VectorizedColumnReaderTestBase, method stringReadDecimal.
protected void stringReadDecimal(boolean isDictionaryEncoding) throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "value");
  conf.set(IOConstants.COLUMNS_TYPES, "string");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message hive_schema { required binary value (DECIMAL(5,2));}", conf);
  VectorizedRowBatch previous = reader.createValue();
  try {
    int c = 0;
    while (reader.next(NullWritable.get(), previous)) {
      BytesColumnVector vector = (BytesColumnVector) previous.cols[0];
      assertTrue(vector.noNulls);
      for (int i = 0; i < vector.vector.length; i++) {
        if (c == nElements) {
          break;
        }
        String actual = new String(Arrays.copyOfRange(vector.vector[i], vector.start[i],
            vector.start[i] + vector.length[i]));
        assertEquals("Check failed at pos " + c, getDecimal(isDictionaryEncoding, c).toString(), actual);
        assertFalse(vector.isNull[i]);
        c++;
      }
    }
    assertEquals(nElements, c);
  } finally {
    reader.close();
  }
}
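The Arrays.copyOfRange slice above is the standard way to pull a value out of a BytesColumnVector, since each row is stored as a (bytes, start, length) triple. A small helper sketch (bytesAt is our name) that does the same with an explicit charset:

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

// Hypothetical helper: decode the UTF-8 string stored at row i.
static String bytesAt(BytesColumnVector vector, int i) {
  int entry = vector.isRepeating ? 0 : i;  // repeating batches store only entry 0
  return new String(vector.vector[entry], vector.start[entry],
      vector.length[entry], StandardCharsets.UTF_8);
}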