
Example 11 with VectorizedParquetRecordReader

Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

From class VectorizedColumnReaderTestBase, method timestampRead.

protected void timestampRead(boolean isDictionaryEncoding) throws InterruptedException, HiveException, IOException {
    conf.set(IOConstants.COLUMNS, "int96_field");
    conf.set(IOConstants.COLUMNS_TYPES, "timestamp");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    VectorizedParquetRecordReader reader = createTestParquetReader("message test { required int96 int96_field;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
        int c = 0;
        while (reader.next(NullWritable.get(), previous)) {
            TimestampColumnVector vector = (TimestampColumnVector) previous.cols[0];
            assertTrue(vector.noNulls);
            for (int i = 0; i < vector.nanos.length; i++) {
                if (c == nElements) {
                    break;
                }
                Timestamp expected = isDictionaryEncoding ? new Timestamp(c % UNIQUE_NUM) : new Timestamp(c);
                assertEquals("Not the same time at " + c, expected.getTime(), vector.getTime(i));
                assertEquals("Not the same nano at " + c, expected.getNanos(), vector.getNanos(i));
                assertFalse(vector.isNull[i]);
                c++;
            }
        }
        assertEquals(nElements, c);
    } finally {
        reader.close();
    }
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector), VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader), Timestamp (java.sql.Timestamp)
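
A concrete test typically drives this helper through a subclass whose setup writes the Parquet file with a matching encoding. The following JUnit sketch is an assumption for illustration — the class name and the isDictionaryEncoding constant are not part of the excerpt above, and the flag must agree with how the file under test was written:

import org.junit.Test;

// Hypothetical driver subclass; isDictionaryEncoding is assumed to match the
// encoding used when the shared Parquet test file was written in setup.
public class TestTimestampColumnRead extends VectorizedColumnReaderTestBase {
    private static final boolean isDictionaryEncoding = false;

    @Test
    public void testTimestampRead() throws Exception {
        timestampRead(isDictionaryEncoding);
    }
}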

Example 12 with VectorizedParquetRecordReader

Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

From class VectorizedColumnReaderTestBase, method decimalRead.

protected void decimalRead(boolean isDictionaryEncoding) throws Exception {
    Configuration readerConf = new Configuration();
    readerConf.set(IOConstants.COLUMNS, "value");
    readerConf.set(IOConstants.COLUMNS_TYPES, "decimal(5,2)");
    readerConf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    readerConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    // DECIMAL must annotate a concrete primitive type; precision 5 fits in a 3-byte fixed array.
    VectorizedParquetRecordReader reader = createTestParquetReader("message hive_schema { required fixed_len_byte_array(3) value (DECIMAL(5,2));}", readerConf);
    VectorizedRowBatch previous = reader.createValue();
    try {
        int c = 0;
        while (reader.next(NullWritable.get(), previous)) {
            DecimalColumnVector vector = (DecimalColumnVector) previous.cols[0];
            assertTrue(vector.noNulls);
            for (int i = 0; i < vector.vector.length; i++) {
                if (c == nElements) {
                    break;
                }
                assertEquals("Check failed at pos " + c, getDecimal(isDictionaryEncoding, c), vector.vector[i].getHiveDecimal());
                assertFalse(vector.isNull[i]);
                c++;
            }
        }
        assertEquals(nElements, c);
    } finally {
        reader.close();
    }
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector), Configuration (org.apache.hadoop.conf.Configuration), VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader)
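
The expected values come from a getDecimal helper defined elsewhere in the test base and not shown in this excerpt. Below is a sketch of the shape such a helper plausibly takes — the value formula is an assumption; the essential property is that dictionary-encoded expectations wrap modulo UNIQUE_NUM exactly as the writer did:

import java.math.BigDecimal;
import org.apache.hadoop.hive.common.type.HiveDecimal;

// Hypothetical reconstruction of the expected-value helper.
protected static HiveDecimal getDecimal(boolean isDictionaryEncoding, int index) {
    int i = isDictionaryEncoding ? index % UNIQUE_NUM : index;
    // Shift two decimal places to match the declared decimal(5,2) column type.
    return HiveDecimal.create(BigDecimal.valueOf(i).movePointLeft(2));
}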

Example 13 with VectorizedParquetRecordReader

Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

From class VectorizedColumnReaderTestBase, method stringReadDouble.

protected void stringReadDouble(boolean isDictionaryEncoding) throws Exception {
    Configuration readerConf = new Configuration();
    readerConf.set(IOConstants.COLUMNS, "double_field");
    readerConf.set(IOConstants.COLUMNS_TYPES, "string");
    readerConf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    readerConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    VectorizedParquetRecordReader reader = createTestParquetReader("message test { required double double_field;}", readerConf);
    VectorizedRowBatch previous = reader.createValue();
    try {
        int c = 0;
        while (reader.next(NullWritable.get(), previous)) {
            BytesColumnVector vector = (BytesColumnVector) previous.cols[0];
            assertTrue(vector.noNulls);
            for (int i = 0; i < vector.vector.length; i++) {
                if (c == nElements) {
                    break;
                }
                String actual = new String(Arrays.copyOfRange(vector.vector[i], vector.start[i], vector.start[i] + vector.length[i]));
                assertEquals("Failed at " + c, String.valueOf(getDoubleValue(isDictionaryEncoding, c)), actual);
                assertFalse(vector.isNull[i]);
                c++;
            }
        }
        assertEquals(nElements, c);
    } finally {
        reader.close();
    }
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), Configuration (org.apache.hadoop.conf.Configuration), VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader), BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector)
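
The Arrays.copyOfRange call above materializes one row of the BytesColumnVector through an intermediate byte[] copy; the String constructor that takes an offset and length does the same job in one step. A small reusable helper (assumed here, not part of the test base) would be:

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

// Illustrative helper: decode row i of a BytesColumnVector without the
// intermediate Arrays.copyOfRange allocation.
static String stringAt(BytesColumnVector v, int i) {
    return new String(v.vector[i], v.start[i], v.length[i], StandardCharsets.UTF_8);
}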

Example 14 with VectorizedParquetRecordReader

Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

From class VectorizedColumnReaderTestBase, method doubleReadInt.

protected void doubleReadInt(boolean isDictionaryEncoding) throws InterruptedException, HiveException, IOException {
    conf.set(IOConstants.COLUMNS, "int32_field");
    conf.set(IOConstants.COLUMNS_TYPES, "double");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    VectorizedParquetRecordReader reader = createTestParquetReader("message test { required int32 int32_field;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
        int c = 0;
        while (reader.next(NullWritable.get(), previous)) {
            DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0];
            assertTrue(vector.noNulls);
            for (int i = 0; i < vector.vector.length; i++) {
                if (c == nElements) {
                    break;
                }
                assertEquals("Failed at " + c, getIntValue(isDictionaryEncoding, c), vector.vector[i], 0);
                assertFalse(vector.isNull[i]);
                c++;
            }
        }
        assertEquals(nElements, c);
    } finally {
        reader.close();
    }
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector), VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader)

Example 15 with VectorizedParquetRecordReader

Use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

From class VectorizedColumnReaderTestBase, method createParquetReader.

protected VectorizedParquetRecordReader createParquetReader(String schemaString, Configuration conf) throws IOException, InterruptedException, HiveException {
    conf.set(PARQUET_READ_SCHEMA, schemaString);
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
    HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
    Job vectorJob = new Job(conf, "read vector");
    ParquetInputFormat.setInputPaths(vectorJob, file);
    ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
    ParquetInputSplit split = (ParquetInputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
    initialVectorizedRowBatchCtx(conf);
    return new VectorizedParquetRecordReader(split, new JobConf(conf));
}
Also used: VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader), ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit), ParquetInputFormat (org.apache.parquet.hadoop.ParquetInputFormat), Job (org.apache.hadoop.mapreduce.Job), JobConf (org.apache.hadoop.mapred.JobConf)
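
One note on the raw-typed ParquetInputFormat construction above: it compiles with an unchecked warning. Since GroupReadSupport reads parquet-example Group records, the parameterized form below is equivalent at runtime and warning-free (a stylistic sketch, not a behavior change):

import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.example.GroupReadSupport;

// Same behavior as the raw-typed construction, minus the unchecked warning.
ParquetInputFormat<Group> parquetInputFormat = new ParquetInputFormat<>(GroupReadSupport.class);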

Aggregations

VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader): 28 uses
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 26 uses
Configuration (org.apache.hadoop.conf.Configuration): 18 uses
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector): 9 uses
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 7 uses
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 5 uses
ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector): 4 uses
StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector): 4 uses
Timestamp (java.sql.Timestamp): 2 uses
MapColumnVector (org.apache.hadoop.hive.ql.exec.vector.MapColumnVector): 2 uses
JobConf (org.apache.hadoop.mapred.JobConf): 2 uses
Job (org.apache.hadoop.mapreduce.Job): 2 uses
DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector): 1 use
TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector): 1 use
ParquetInputFormat (org.apache.parquet.hadoop.ParquetInputFormat): 1 use
ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit): 1 use