Use of org.apache.parquet.schema.MessageType in project hive by apache.
From the class TestArrayCompatibility, method testUnannotatedListOfPrimitives:
@Test
public void testUnannotatedListOfPrimitives() throws Exception {
  MessageType fileSchema = Types.buildMessage()
      .repeated(INT32).named("list_of_ints")
      .named("UnannotatedListOfPrimitives");
  Path test = writeDirect("UnannotatedListOfPrimitives", fileSchema, new DirectWriter() {
    @Override
    public void write(RecordConsumer rc) {
      rc.startMessage();
      rc.startField("list_of_ints", 0);
      rc.addInteger(34);
      rc.addInteger(35);
      rc.addInteger(36);
      rc.endField("list_of_ints", 0);
      rc.endMessage();
    }
  });
  ArrayWritable expected = list(
      new IntWritable(34), new IntWritable(35), new IntWritable(36));
  List<ArrayWritable> records = read(test);
  Assert.assertEquals("Should have only one record", 1, records.size());
  assertEquals("Should match expected record", expected, records.get(0));
}
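For comparison, the schema built with the Types builder above can also be expressed as a Parquet schema string. The following standalone sketch (not part of the Hive test; the class name SchemaBuilderSketch is made up for illustration) only assumes the parquet-mr schema classes already used above:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Types;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

public class SchemaBuilderSketch {
  public static void main(String[] args) {
    // Same schema as in the test above, built programmatically ...
    MessageType built = Types.buildMessage()
        .repeated(INT32).named("list_of_ints")
        .named("UnannotatedListOfPrimitives");
    // ... and parsed from its textual representation.
    MessageType parsed = MessageTypeParser.parseMessageType(
        "message UnannotatedListOfPrimitives {\n" +
        "  repeated int32 list_of_ints;\n" +
        "}");
    // Both forms should describe the same repeated primitive field.
    System.out.println(built.equals(parsed)); // expected: true
    System.out.println(built);
  }
}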
Use of org.apache.parquet.schema.MessageType in project hive by apache.
From the class TestParquetRecordReaderWrapper, method testBuilder:
@Test
public void testBuilder() throws Exception {
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startNot()
        .startOr()
          .isNull("x", PredicateLeaf.Type.LONG)
          .between("y", PredicateLeaf.Type.LONG, 10L, 20L)
          .in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L)
          .nullSafeEquals("a", PredicateLeaf.Type.STRING, "stinger")
        .end()
      .end()
      .build();
  MessageType schema = MessageTypeParser.parseMessageType("message test {" +
      " optional int32 x; required int32 y; required int32 z;" +
      " optional binary a;}");
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
  String expected =
      "and(and(and(not(eq(x, null)), not(and(lteq(y, 20), not(lt(y, 10))))), not(or(or(eq(z, 1), " +
      "eq(z, 2)), eq(z, 3)))), not(eq(a, Binary{\"stinger\"})))";
  assertEquals(expected, p.toString());
}
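A smaller, self-contained version of the same conversion path may make the mapping easier to follow. This is a sketch using only the classes already referenced by the test above; the printed output is an expectation, not a verified result:

// Minimal sketch: build a single-leaf SearchArgument and convert it against a
// one-column Parquet schema, mirroring the test above on a smaller scale.
SearchArgument simpleSarg = SearchArgumentFactory.newBuilder()
    .startAnd()
      .lessThan("x", PredicateLeaf.Type.LONG, 10L)
    .end()
    .build();
MessageType simpleSchema = MessageTypeParser.parseMessageType(
    "message test { required int32 x; }");
FilterPredicate converted = ParquetFilterPredicateConverter.toFilterPredicate(simpleSarg, simpleSchema);
System.out.println(converted); // expected to be something like: lt(x, 10)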
Use of org.apache.parquet.schema.MessageType in project hive by apache.
From the class TestParquetRecordReaderWrapper, method testBuilderFloat:
@Test
public void testBuilderFloat() throws Exception {
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startAnd()
        .lessThan("x", PredicateLeaf.Type.LONG, 22L)
        .lessThan("x1", PredicateLeaf.Type.LONG, 22L)
        .lessThanEquals("y", PredicateLeaf.Type.STRING, new HiveChar("hi", 10).toString())
        .equals("z", PredicateLeaf.Type.FLOAT, new Double(0.22))
        .equals("z1", PredicateLeaf.Type.FLOAT, new Double(0.22))
      .end()
      .build();
  MessageType schema = MessageTypeParser.parseMessageType("message test {" +
      " required int32 x; required int32 x1;" +
      " required binary y; required float z; required float z1;}");
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
  String expected = "and(and(and(and(lt(x, 22), lt(x1, 22))," +
      " lteq(y, Binary{\"hi        \"})), eq(z, " +
      "0.22)), eq(z1, 0.22))";
  assertEquals(expected, p.toString());
}
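The padded value inside Binary{...} in the expected string comes from HiveChar: toString() returns the value space-padded to the declared length, so the char(10) value "hi" reaches the predicate as a 10-character string. A quick sketch of that behavior (assuming HiveChar from org.apache.hadoop.hive.common.type, as used above):

HiveChar paddedChar = new HiveChar("hi", 10);
// toString() returns the space-padded value; this is what the test passes into
// lessThanEquals() and what shows up inside Binary{...} in the expected string.
System.out.println("[" + paddedChar.toString() + "]"); // expected: [hi        ]
System.out.println(paddedChar.toString().length());    // expected: 10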
Use of org.apache.parquet.schema.MessageType in project hive by apache.
From the class TestParquetRowGroupFilter, method testRowGroupFilterTakeEffect:
@Test
public void testRowGroupFilterTakeEffect() throws Exception {
  // define schema
  columnNames = "intCol";
  columnTypes = "int";
  StructObjectInspector inspector = getObjectInspector(columnNames, columnTypes);
  MessageType fileSchema = MessageTypeParser.parseMessageType("message hive_schema {\n" +
      " optional int32 intCol;\n" +
      "}\n");
  conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "intCol");
  conf.set("columns", "intCol");
  conf.set("columns.types", "int");
  // create Parquet file with specific data
  Path testPath = writeDirect("RowGroupFilterTakeEffect", fileSchema, new DirectWriter() {
    @Override
    public void write(RecordConsumer consumer) {
      for (int i = 0; i < 100; i++) {
        consumer.startMessage();
        consumer.startField("int", 0);
        consumer.addInteger(i);
        consumer.endField("int", 0);
        consumer.endMessage();
      }
    }
  });
  // > 50
  GenericUDF udf = new GenericUDFOPGreaterThan();
  List<ExprNodeDesc> children = Lists.newArrayList();
  ExprNodeColumnDesc columnDesc = new ExprNodeColumnDesc(Integer.class, "intCol", "T", false);
  ExprNodeConstantDesc constantDesc = new ExprNodeConstantDesc(50);
  children.add(columnDesc);
  children.add(constantDesc);
  ExprNodeGenericFuncDesc genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
  String searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
  conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
  ParquetRecordReaderWrapper recordReader = (ParquetRecordReaderWrapper)
      new MapredParquetInputFormat().getRecordReader(
          new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
  Assert.assertEquals("row group is not filtered correctly", 1,
      recordReader.getFiltedBlocks().size());
  // > 100
  constantDesc = new ExprNodeConstantDesc(100);
  children.set(1, constantDesc);
  genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
  searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
  conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
  recordReader = (ParquetRecordReaderWrapper)
      new MapredParquetInputFormat().getRecordReader(
          new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
  Assert.assertEquals("row group is not filtered correctly", 0,
      recordReader.getFiltedBlocks().size());
}
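To observe the effect of row-group filtering on the rows actually returned, the wrapped reader can be drained in the usual mapred fashion. This sketch is not part of the test above; it assumes ParquetRecordReaderWrapper follows the RecordReader<NullWritable, ArrayWritable> contract implied by MapredParquetInputFormat:

// Count the rows that survive row-group filtering (illustrative sketch only).
NullWritable key = recordReader.createKey();
ArrayWritable value = recordReader.createValue();
int rows = 0;
while (recordReader.next(key, value)) {
  rows++;
}
recordReader.close();
System.out.println("rows read after row-group filtering: " + rows);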
Use of org.apache.parquet.schema.MessageType in project hive by apache.
From the class VectorizedParquetRecordReader, method initialize:
public void initialize(ParquetInputSplit split, JobConf configuration)
    throws IOException, InterruptedException {
  jobConf = configuration;
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  boolean indexAccess =
      configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  if (rowGroupOffsets == null) {
    //TODO check whether rowGroupOffsets can be null
    // then we need to apply the predicate push down filter
    footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readFooter(configuration, file, NO_FILTER);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException(
          "All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  MessageType tableSchema;
  if (indexAccess) {
    List<Integer> indexSequence = new ArrayList<>();
    // Generates a sequence list of indexes
    for (int i = 0; i < columnNamesList.size(); i++) {
      indexSequence.add(i);
    }
    tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
  } else {
    tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
  }
  indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
  if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
    requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
  } else {
    requestedSchema = fileSchema;
  }
  this.reader = new ParquetFileReader(
      configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
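All of the column information consumed by initialize() comes from the JobConf, so a caller has to populate the same keys before handing over the split. A hedged sketch of that setup (the property names are the ones read in the method above; appendReadColumns is assumed to be the usual Hive ColumnProjectionUtils helper):

// Sketch: configure the JobConf the way initialize() expects to find it.
JobConf conf = new JobConf();
conf.set(IOConstants.COLUMNS, "intCol,strCol");     // consumed as columnNamesList
conf.set(IOConstants.COLUMNS_TYPES, "int,string");  // consumed as columnTypesList
// Project only the first column; initialize() reads these ids back through
// ColumnProjectionUtils.getReadColumnIDs(conf).
ColumnProjectionUtils.appendReadColumns(conf, Arrays.asList(0));
// Optionally switch to index-based schema resolution instead of name-based.
conf.setBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);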