Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class TestVectorizedListColumnReader, method testVectorizedRowBatchSizeChangeListRead.
private void testVectorizedRowBatchSizeChangeListRead() throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "list_binary_field_for_repeat_test");
  conf.set(IOConstants.COLUMNS_TYPES, "array<string>");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message hive_schema {repeated binary list_binary_field_for_repeat_test;}", conf);
  VectorizedRowBatch previous = reader.createValue();
  try {
    while (reader.next(NullWritable.get(), previous)) {
      ListColumnVector vector = (ListColumnVector) previous.cols[0];
      // With large inputs, the same VectorizedRowBatch is reused across file splits to cache
      // data. If the first split has only 100 rows, the batch caches them and its arrays are
      // resized to 100. The lines below simulate that size change; processing the next split,
      // which has more than 100 rows, must not throw an ArrayIndexOutOfBoundsException.
      vector.lengths = new long[100];
      vector.offsets = new long[100];
    }
  } finally {
    reader.close();
  }
}
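The shrunken-array scenario the comment describes can be guarded against in isolation. A minimal sketch, assuming the standard ColumnVector.ensureSize contract from Hive's storage API, of regrowing a ListColumnVector that a previous split shrank before it is filled with a full-size batch:

import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch only: a ListColumnVector whose arrays were shrunk to 100 entries
// by a previous split, regrown before the next, larger split is processed.
ListColumnVector vector = new ListColumnVector(100, new LongColumnVector());
int rowsInNextSplit = VectorizedRowBatch.DEFAULT_SIZE; // 1024 by default
if (vector.offsets.length < rowsInNextSplit) {
  vector.ensureSize(rowsInNextSplit, false); // false: old cached data need not survive
}
// offsets and lengths can now be indexed up to rowsInNextSplit - 1 safely.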
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class TestVectorizedListColumnReader, method testRepeateListRead.
private void testRepeateListRead(int elementNum, boolean isNull) throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "list_int32_field_for_repeat_test");
  conf.set(IOConstants.COLUMNS_TYPES, "array<int>");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message hive_schema {repeated int32 list_int32_field_for_repeat_test;}", conf);
  VectorizedRowBatch previous = reader.createValue();
  int row = 0;
  try {
    while (reader.next(NullWritable.get(), previous)) {
      ListColumnVector vector = (ListColumnVector) previous.cols[0];
      assertTrue(vector.isRepeating);
      assertEquals(isNull, vector.isNull[0]);
      for (int i = 0; i < vector.offsets.length; i++) {
        if (row == elementNum) {
          assertEquals(i, vector.offsets.length - 1);
          break;
        }
        row++;
      }
    }
    assertEquals("It doesn't exit at expected position", elementNum, row);
  } finally {
    reader.close();
  }
}
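For reference, isRepeating means that entry 0 of the vector describes every row in the batch; the assertions above rely on exactly that. A minimal sketch, with illustrative values, of how such a repeating list batch is laid out:

import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

// Illustrative values: a batch in which every row holds the same list [7, 7, 7].
LongColumnVector child = new LongColumnVector(3);
child.vector[0] = 7;
child.vector[1] = 7;
child.vector[2] = 7;
ListColumnVector lists = new ListColumnVector(1024, child);
lists.isRepeating = true;  // entry 0 stands for all rows in the batch
lists.isNull[0] = false;   // the isNull[0] assertion above checks this flag
lists.offsets[0] = 0;      // list starts at child index 0
lists.lengths[0] = 3;      // and spans three elements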
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class TestVectorizedListColumnReader, method testListRead.
private void testListRead(boolean isDictionaryEncoding, String type, int elementNum) throws Exception {
  Configuration conf = new Configuration();
  setTypeConfiguration(type, conf);
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(getSchema(type), conf);
  VectorizedRowBatch previous = reader.createValue();
  int row = 0;
  int index = 0;
  try {
    while (reader.next(NullWritable.get(), previous)) {
      ListColumnVector vector = (ListColumnVector) previous.cols[0];
      // isRepeating is set only when the batch holds a single offset entry.
      assertEquals((vector.offsets.length == 1), vector.isRepeating);
      for (int i = 0; i < vector.offsets.length; i++) {
        if (row == elementNum) {
          assertEquals(i, vector.offsets.length - 1);
          break;
        }
        long start = vector.offsets[i];
        long length = vector.lengths[i];
        boolean isNull = isNull(row);
        if (isNull) {
          assertTrue(vector.isNull[i]);
        } else {
          for (long j = 0; j < length; j++) {
            assertValue(type, vector.child, isDictionaryEncoding, index, (int) (start + j));
            index++;
          }
        }
        row++;
      }
    }
    assertEquals("It doesn't exit at expected position", elementNum, row);
  } finally {
    reader.close();
  }
}
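The offsets/lengths bookkeeping in the loop above generalizes into a small decoding helper. A sketch (the readRow name is ours) that materializes one row of a list-of-long column, handling the repeating and null cases the same way the test does:

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

// Hypothetical helper: rebuild row i of a list-of-long column vector.
static List<Long> readRow(ListColumnVector vector, int i) {
  int entry = vector.isRepeating ? 0 : i;  // repeating batches store only entry 0
  if (!vector.noNulls && vector.isNull[entry]) {
    return null;  // the whole list is NULL for this row
  }
  long start = vector.offsets[entry];
  long length = vector.lengths[entry];
  LongColumnVector child = (LongColumnVector) vector.child;
  List<Long> row = new ArrayList<>();
  for (long j = 0; j < length; j++) {
    row.add(child.vector[(int) (start + j)]);
  }
  return row;
}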
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class VectorizedColumnReaderTestBase, method longRead.
private void longRead(boolean isDictionaryEncoding, Configuration conf) throws Exception {
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message test { required int64 int64_field;}", conf);
  VectorizedRowBatch previous = reader.createValue();
  try {
    int c = 0;
    while (reader.next(NullWritable.get(), previous)) {
      LongColumnVector vector = (LongColumnVector) previous.cols[0];
      assertTrue(vector.noNulls);
      for (int i = 0; i < vector.vector.length; i++) {
        if (c == nElements) {
          break;
        }
        assertEquals("Failed at " + c, getLongValue(isDictionaryEncoding, c), vector.vector[i]);
        assertFalse(vector.isNull[i]);
        c++;
      }
    }
    assertEquals(nElements, c);
  } finally {
    reader.close();
  }
}
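Unlike the list tests, longRead takes its Configuration from the caller. A sketch of the setup a caller presumably provides, mirroring the projection keys used above; the column name int64_field comes from the schema string inside the method itself:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

// Presumed caller-side setup; the keys mirror the other tests on this page.
Configuration conf = new Configuration();
conf.set(IOConstants.COLUMNS, "int64_field");   // name taken from the Parquet schema above
conf.set(IOConstants.COLUMNS_TYPES, "bigint");  // Hive type corresponding to int64
conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
longRead(true, conf);  // true: exercise the dictionary-encoded path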
Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.
The class VectorizedColumnReaderTestBase, method stringReadDecimal.
protected void stringReadDecimal(boolean isDictionaryEncoding) throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "value");
  conf.set(IOConstants.COLUMNS_TYPES, "string");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  VectorizedParquetRecordReader reader = createTestParquetReader(
      "message hive_schema { required binary value (DECIMAL(5,2));}", conf);
  VectorizedRowBatch previous = reader.createValue();
  try {
    int c = 0;
    while (reader.next(NullWritable.get(), previous)) {
      BytesColumnVector vector = (BytesColumnVector) previous.cols[0];
      assertTrue(vector.noNulls);
      for (int i = 0; i < vector.vector.length; i++) {
        if (c == nElements) {
          break;
        }
        String actual = new String(Arrays.copyOfRange(vector.vector[i], vector.start[i],
            vector.start[i] + vector.length[i]));
        assertEquals("Check failed at pos " + c, getDecimal(isDictionaryEncoding, c).toString(), actual);
        assertFalse(vector.isNull[i]);
        c++;
      }
    }
    assertEquals(nElements, c);
  } finally {
    reader.close();
  }
}
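The Arrays.copyOfRange slice above is the standard way to pull a value out of a BytesColumnVector, since each row is stored as a (bytes, start, length) triple. A small helper sketch (bytesAt is our name) that does the same with an explicit charset:

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

// Hypothetical helper: decode the UTF-8 string stored at row i.
static String bytesAt(BytesColumnVector vector, int i) {
  int entry = vector.isRepeating ? 0 : i;  // repeating batches store only entry 0
  return new String(vector.vector[entry], vector.start[entry],
      vector.length[entry], StandardCharsets.UTF_8);
}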