Search in sources :

Example 21 with VectorizedParquetRecordReader

use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

the class VectorizedColumnReaderTestBase method nestedStructRead1.

/**
 * Reads the {@code nested_struct_field} column, declared as {@code struct<nsf:struct<c:int>>},
 * through a vectorized Parquet reader and verifies the innermost int field row by row.
 *
 * <p>For every row the value of {@code c} must equal {@code getIntValue(isDictionaryEncoding, row)},
 * the outer struct entry must not be null, and the outer vector must not be flagged as repeating.
 * The test finally checks that exactly {@code nElements} rows were verified.
 *
 * @param isDictionaryEncoding forwarded to {@code getIntValue} to select the expected values;
 *     presumably mirrors how the test file was written (confirm against the writer side)
 * @throws Exception if the reader cannot be created, read from, or closed
 */
protected void nestedStructRead1(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "nested_struct_field");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<nsf:struct<c:int>>");
    // Project only the first (and only) declared column.
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    // NOTE(review): this schema text never closes the outer "message" brace; confirm whether
    // createTestParquetReader actually parses it before relying on it.
    String schema = "message hive_schema {\n"
        + "group nested_struct_field {\n"
        + "  optional group nsf {\n"
        + "    optional int32 c;\n"
        + "  }"
        + "}\n";
    VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
    int c = 0;
    try {
        // Created inside the try so the reader is still closed if batch creation fails.
        VectorizedRowBatch previous = reader.createValue();
        while (reader.next(NullWritable.get(), previous)) {
            StructColumnVector vector = (StructColumnVector) previous.cols[0];
            StructColumnVector sv = (StructColumnVector) vector.fields[0];
            LongColumnVector cv = (LongColumnVector) sv.fields[0];
            for (int i = 0; i < cv.vector.length; i++) {
                // The loop walks the whole backing array, so stop once every expected row
                // has been verified (presumably the last batch is only partially filled).
                if (c == nElements) {
                    break;
                }
                assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
                assertFalse(vector.isNull[i]);
                assertFalse(vector.isRepeating);
                c++;
            }
        }
        assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
        reader.close();
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) Configuration(org.apache.hadoop.conf.Configuration) VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)

Example 22 with VectorizedParquetRecordReader

use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

the class VectorizedColumnReaderTestBase method floatReadLong.

/**
 * Reads the Parquet {@code int64_field} column while declaring its Hive type as {@code float},
 * and verifies that the values arrive in a {@link DoubleColumnVector}.
 *
 * <p>Each row must equal {@code getLongValue(isDictionaryEncoding, row)} exactly (delta 0), no
 * row may be null, and exactly {@code nElements} rows must be verified.
 *
 * @param isDictionaryEncoding forwarded to {@code getLongValue} to select the expected values;
 *     presumably mirrors how the test file was written (confirm against the writer side)
 * @throws Exception if the reader cannot be created, read from, or closed
 */
protected void floatReadLong(boolean isDictionaryEncoding) throws Exception {
    // Named "conf" to match the sibling tests, which reserve "c" for the row counter.
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "int64_field");
    conf.set(IOConstants.COLUMNS_TYPES, "float");
    // Project only the first (and only) declared column.
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    VectorizedParquetRecordReader reader = createTestParquetReader("message test { required int64 int64_field;}", conf);
    try {
        // Created inside the try so the reader is still closed if batch creation fails.
        VectorizedRowBatch previous = reader.createValue();
        int count = 0;
        while (reader.next(NullWritable.get(), previous)) {
            DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0];
            assertTrue(vector.noNulls);
            for (int i = 0; i < vector.vector.length; i++) {
                // The loop walks the whole backing array, so stop once every expected row
                // has been verified (presumably the last batch is only partially filled).
                if (count == nElements) {
                    break;
                }
                assertEquals("Failed at " + count, getLongValue(isDictionaryEncoding, count), vector.vector[i], 0);
                assertFalse(vector.isNull[i]);
                count++;
            }
        }
        assertEquals(nElements, count);
    } finally {
        reader.close();
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader)

Example 23 with VectorizedParquetRecordReader

use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

the class VectorizedColumnReaderTestBase method stringReadBoolean.

/**
 * Reads the Parquet {@code boolean_field} column while declaring its Hive type as
 * {@code string}, and verifies that each row holds the textual form of the expected boolean.
 *
 * <p>Each row, decoded from the {@link BytesColumnVector} slice, must equal
 * {@code String.valueOf(getBooleanValue(row))}; no row may be null, and exactly
 * {@code nElements} rows must be verified.
 *
 * @throws Exception if the reader cannot be created, read from, or closed
 */
protected void stringReadBoolean() throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "boolean_field");
    conf.set(IOConstants.COLUMNS_TYPES, "string");
    // Project only the first (and only) declared column.
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    VectorizedParquetRecordReader reader = createTestParquetReader("message test { required boolean boolean_field;}", conf);
    try {
        // Created inside the try so the reader is still closed if batch creation fails.
        VectorizedRowBatch previous = reader.createValue();
        int c = 0;
        while (reader.next(NullWritable.get(), previous)) {
            BytesColumnVector vector = (BytesColumnVector) previous.cols[0];
            assertTrue(vector.noNulls);
            for (int i = 0; i < vector.vector.length; i++) {
                // The loop walks the whole backing array, so stop once every expected row
                // has been verified (presumably the last batch is only partially filled).
                if (c == nElements) {
                    break;
                }
                // Decode the row's slice directly with an explicit charset: this avoids the
                // intermediate array copy and does not depend on the platform default encoding.
                String actual = new String(vector.vector[i], vector.start[i], vector.length[i],
                    java.nio.charset.StandardCharsets.UTF_8);
                assertEquals("Failed at " + c, String.valueOf(getBooleanValue(c)), actual);
                assertFalse(vector.isNull[i]);
                c++;
            }
        }
        assertEquals(nElements, c);
    } finally {
        reader.close();
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) Configuration(org.apache.hadoop.conf.Configuration) VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector)

Example 24 with VectorizedParquetRecordReader

use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

the class VectorizedColumnReaderTestBase method structRead.

/**
 * Reads the {@code struct_field} column, declared as {@code struct<a:int,b:double>}, through a
 * vectorized Parquet reader and verifies both fields row by row.
 *
 * <p>For every row, {@code a} must equal {@code getIntValue(isDictionaryEncoding, row)} and
 * {@code b} must equal {@code getDoubleValue(isDictionaryEncoding, row)} exactly (delta 0); the
 * struct entry must not be null and the struct vector must not be flagged as repeating. The test
 * finally checks that exactly {@code nElements} rows were verified.
 *
 * @param isDictionaryEncoding forwarded to the expected-value helpers; presumably mirrors how
 *     the test file was written (confirm against the writer side)
 * @throws Exception if the reader cannot be created, read from, or closed
 */
protected void structRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "struct_field");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<a:int,b:double>");
    // Project only the first (and only) declared column.
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    String schema = "message hive_schema {\n"
        + "group struct_field {\n"
        + "  optional int32 a;\n"
        + "  optional double b;\n"
        + "}\n"
        + "}\n";
    VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
    int c = 0;
    try {
        // Created inside the try so the reader is still closed if batch creation fails.
        VectorizedRowBatch previous = reader.createValue();
        while (reader.next(NullWritable.get(), previous)) {
            StructColumnVector vector = (StructColumnVector) previous.cols[0];
            LongColumnVector cv = (LongColumnVector) vector.fields[0];
            DoubleColumnVector dv = (DoubleColumnVector) vector.fields[1];
            for (int i = 0; i < cv.vector.length; i++) {
                // The loop walks the whole backing array, so stop once every expected row
                // has been verified (presumably the last batch is only partially filled).
                if (c == nElements) {
                    break;
                }
                assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
                assertEquals(getDoubleValue(isDictionaryEncoding, c), dv.vector[i], 0);
                assertFalse(vector.isNull[i]);
                assertFalse(vector.isRepeating);
                c++;
            }
        }
        assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
        reader.close();
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)

Example 25 with VectorizedParquetRecordReader

use of org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader in project hive by apache.

the class VectorizedColumnReaderTestBase method structReadSomeNull.

/**
 * Reads the {@code struct_field_some_null} column, declared as {@code struct<f:int,g:double>},
 * and verifies the null pattern of the struct and of both fields.
 *
 * <p>Per row index: {@code f} must be null exactly on multiples of 2, {@code g} exactly on
 * multiples of 3, and the struct itself exactly on multiples of 6. Values are compared against
 * {@code getIntValue} / {@code getDoubleValue} only where neither the struct nor the field is
 * null. The {@code f} vector must not be flagged as repeating, and exactly {@code nElements}
 * rows must be verified.
 *
 * @param isDictionaryEncoding forwarded to the expected-value helpers; presumably mirrors how
 *     the test file was written (confirm against the writer side)
 * @throws Exception if the reader cannot be created, read from, or closed
 */
protected void structReadSomeNull(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "struct_field_some_null");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<f:int,g:double>");
    // Project only the first (and only) declared column.
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    // NOTE(review): this schema text never closes the outer "message" brace; confirm whether
    // createTestParquetReader actually parses it before relying on it.
    String schema = "message hive_schema {\n"
        + "group struct_field_some_null {\n"
        + "  optional int32 f;\n"
        + "  optional double g;\n"
        + "}\n";
    VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
    int c = 0;
    try {
        // Created inside the try so the reader is still closed if batch creation fails.
        VectorizedRowBatch previous = reader.createValue();
        while (reader.next(NullWritable.get(), previous)) {
            StructColumnVector sv = (StructColumnVector) previous.cols[0];
            LongColumnVector fv = (LongColumnVector) sv.fields[0];
            DoubleColumnVector gv = (DoubleColumnVector) sv.fields[1];
            for (int i = 0; i < fv.vector.length; i++) {
                // The loop walks the whole backing array, so stop once every expected row
                // has been verified (presumably the last batch is only partially filled).
                if (c == nElements) {
                    break;
                }
                assertEquals(c % 2 == 0, fv.isNull[i]);
                assertEquals(c % 3 == 0, gv.isNull[i]);
                // 6 = 2 * 3: the rows on which both f and g are expected to be null.
                assertEquals(c % 6 == 0, sv.isNull[i]);
                if (!sv.isNull[i]) {
                    if (!fv.isNull[i]) {
                        assertEquals(getIntValue(isDictionaryEncoding, c), fv.vector[i]);
                    }
                    if (!gv.isNull[i]) {
                        assertEquals(getDoubleValue(isDictionaryEncoding, c), gv.vector[i], 0);
                    }
                }
                assertFalse(fv.isRepeating);
                c++;
            }
        }
        assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
        reader.close();
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)

Aggregations

VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader): 28; VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 26; Configuration (org.apache.hadoop.conf.Configuration): 18; DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector): 9; LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 7; BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 5; ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector): 4; StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector): 4; Timestamp (java.sql.Timestamp): 2; MapColumnVector (org.apache.hadoop.hive.ql.exec.vector.MapColumnVector): 2; JobConf (org.apache.hadoop.mapred.JobConf): 2; Job (org.apache.hadoop.mapreduce.Job): 2; DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector): 1; TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector): 1; ParquetInputFormat (org.apache.parquet.hadoop.ParquetInputFormat): 1; ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit): 1