Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
From the class TestColumnIndexBuilder, method testBuildUInt8.
/**
 * Exercises the column index builder for an INT32 column annotated as UINT_8.
 *
 * Three column indexes are built in turn, whose boundary orders are asserted to be
 * UNORDERED, ASCENDING and DESCENDING. For each one the test checks the page count,
 * min/max size, null counts, null pages, min/max values and the pages selected by a
 * range of filter predicates.
 *
 * NOTE(review): judging by the null-count assertions, each {@code null} argument passed to
 * {@code sb.stats(...)} appears to stand for one null value in that page, and a call with only
 * nulls appears to describe a null page - confirm against StatsBuilder.
 * NOTE(review): the trailing ints of {@code assertCorrectFiltering} are presumably the indexes
 * of the pages expected to be kept - confirm against the helper.
 */
@Test
public void testBuildUInt8() {
PrimitiveType type = Types.required(INT32).as(UINT_8).named("test_uint8");
ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
assertThat(builder, instanceOf(IntColumnIndexBuilder.class));
// With no page statistics added yet, build() is expected to return null.
assertNull(builder.build());
IntColumn col = intColumn("test_col");
// ---- Case 1: six pages, boundary order asserted as UNORDERED ----
StatsBuilder sb = new StatsBuilder();
builder.add(sb.stats(type, 4, 10));
builder.add(sb.stats(type, 11, 17, null));
builder.add(sb.stats(type, 2, 2, null, null));
builder.add(sb.stats(type, null, null, null));
builder.add(sb.stats(type, 1, 0xFF));
builder.add(sb.stats(type, 0xEF, 0xFA));
assertEquals(6, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
ColumnIndex columnIndex = builder.build();
assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0);
assertCorrectNullPages(columnIndex, false, false, false, true, false, false);
assertCorrectValues(columnIndex.getMaxValues(), 10, 17, 2, null, 0xFF, 0xFA);
assertCorrectValues(columnIndex.getMinValues(), 4, 11, 2, null, 1, 0xEF);
assertCorrectFiltering(columnIndex, eq(col, 2), 2, 4);
assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3);
// NOTE(review): throughout this test the expected notIn pages are exactly the complement of the
// expected in pages for the same set - confirm this is the intended notIn behaviour.
Set<Integer> set1 = new HashSet<>();
set1.add(2);
assertCorrectFiltering(columnIndex, in(col, set1), 2, 4);
assertCorrectFiltering(columnIndex, notIn(col, set1), 0, 1, 3, 5);
// The same set is reused after adding null, so the next two assertions cover {2, null}.
set1.add(null);
assertCorrectFiltering(columnIndex, in(col, set1), 1, 2, 3, 4);
assertCorrectFiltering(columnIndex, notIn(col, set1), 0, 5);
assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5);
assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, gt(col, 2), 0, 1, 4, 5);
assertCorrectFiltering(columnIndex, gtEq(col, 2), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, lt(col, 0xEF), 0, 1, 2, 4);
assertCorrectFiltering(columnIndex, ltEq(col, 0xEF), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 0, 1, 4, 5);
assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5);
// ---- Case 2: nine pages, boundary order asserted as ASCENDING ----
// A fresh builder and StatsBuilder are used so nothing carries over from case 1.
builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
sb = new StatsBuilder();
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, 0, 0, null, null));
builder.add(sb.stats(type, 0, 42, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, null, null, null));
builder.add(sb.stats(type, 42, 0xEE));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, 0xEF, 0xFF));
builder.add(sb.stats(type, null, null));
assertEquals(9, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
columnIndex = builder.build();
assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2);
assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true);
assertCorrectValues(columnIndex.getMaxValues(), null, 0, 42, null, null, 0xEE, null, 0xFF, null);
assertCorrectValues(columnIndex.getMinValues(), null, 0, 0, null, null, 42, null, 0xEF, null);
assertCorrectFiltering(columnIndex, eq(col, 2), 2);
assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8);
Set<Integer> set2 = new HashSet<>();
set2.add(2);
assertCorrectFiltering(columnIndex, in(col, set2), 2);
assertCorrectFiltering(columnIndex, notIn(col, set2), 0, 1, 3, 4, 5, 6, 7, 8);
// Set becomes {2, null} for the next two assertions.
set2.add(null);
assertCorrectFiltering(columnIndex, in(col, set2), 0, 1, 2, 3, 4, 6, 8);
assertCorrectFiltering(columnIndex, notIn(col, set2), 5, 7);
assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8);
assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7);
assertCorrectFiltering(columnIndex, gt(col, 0xEE), 7);
assertCorrectFiltering(columnIndex, gtEq(col, 0xEE), 5, 7);
assertCorrectFiltering(columnIndex, lt(col, 42), 1, 2);
assertCorrectFiltering(columnIndex, ltEq(col, 42), 1, 2, 5);
assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 2, 5, 7);
assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
// ---- Case 3: nine pages, boundary order asserted as DESCENDING ----
builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
sb = new StatsBuilder();
builder.add(sb.stats(type, null, null, null, null, null));
builder.add(sb.stats(type, 0xFF, 0xFF));
builder.add(sb.stats(type, null, null, null));
builder.add(sb.stats(type, 0xEF, 0xEA, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, 0xEE, 42));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, 41, 0));
assertEquals(9, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
columnIndex = builder.build();
assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0);
assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false);
assertCorrectValues(columnIndex.getMaxValues(), null, 0xFF, null, 0xEF, null, 0xEE, null, null, 41);
assertCorrectValues(columnIndex.getMinValues(), null, 0xFF, null, 0xEA, null, 42, null, null, 0);
assertCorrectFiltering(columnIndex, eq(col, 0xAB), 5);
assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7);
Set<Integer> set3 = new HashSet<>();
set3.add(0xAB);
assertCorrectFiltering(columnIndex, in(col, set3), 5);
assertCorrectFiltering(columnIndex, notIn(col, set3), 0, 1, 2, 3, 4, 6, 7, 8);
// Set becomes {0xAB, null} for the next two assertions.
set3.add(null);
assertCorrectFiltering(columnIndex, in(col, set3), 0, 2, 3, 4, 5, 6, 7);
assertCorrectFiltering(columnIndex, notIn(col, set3), 1, 8);
assertCorrectFiltering(columnIndex, notEq(col, 0xFF), 0, 2, 3, 4, 5, 6, 7, 8);
assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8);
// No page indexes are listed here: no page is expected to match gt(0xFF).
assertCorrectFiltering(columnIndex, gt(col, 0xFF));
assertCorrectFiltering(columnIndex, gtEq(col, 0xFF), 1);
assertCorrectFiltering(columnIndex, lt(col, 42), 8);
assertCorrectFiltering(columnIndex, ltEq(col, 42), 5, 8);
assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 3, 5, 8);
assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 2, 3, 4, 5, 6, 7, 8);
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
From the class TestColumnIndexBuilder, method testBuildInt32.
/**
 * Exercises the column index builder for a plain (unannotated) INT32 column, including
 * negative values.
 *
 * Three column indexes are built in turn, whose boundary orders are asserted to be
 * UNORDERED, ASCENDING and DESCENDING. For each one the test checks the page count,
 * min/max size, null counts, null pages, min/max values and the pages selected by a
 * range of filter predicates.
 *
 * NOTE(review): judging by the null-count assertions, each {@code null} argument passed to
 * {@code sb.stats(...)} appears to stand for one null value in that page, and a call with only
 * nulls appears to describe a null page - confirm against StatsBuilder.
 * NOTE(review): the trailing ints of {@code assertCorrectFiltering} are presumably the indexes
 * of the pages expected to be kept - confirm against the helper.
 */
@Test
public void testBuildInt32() {
PrimitiveType type = Types.required(INT32).named("test_int32");
ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
assertThat(builder, instanceOf(IntColumnIndexBuilder.class));
// With no page statistics added yet, build() is expected to return null.
assertNull(builder.build());
IntColumn col = intColumn("test_col");
// ---- Case 1: six pages, boundary order asserted as UNORDERED ----
StatsBuilder sb = new StatsBuilder();
builder.add(sb.stats(type, -4, 10));
builder.add(sb.stats(type, -11, 7, null));
builder.add(sb.stats(type, 2, 2, null, null));
builder.add(sb.stats(type, null, null, null));
builder.add(sb.stats(type, 1, 2));
builder.add(sb.stats(type, -21, 8));
assertEquals(6, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
ColumnIndex columnIndex = builder.build();
assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0);
assertCorrectNullPages(columnIndex, false, false, false, true, false, false);
assertCorrectValues(columnIndex.getMaxValues(), 10, 7, 2, null, 2, 8);
assertCorrectValues(columnIndex.getMinValues(), -4, -11, 2, null, 1, -21);
assertCorrectFiltering(columnIndex, eq(col, 2), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3);
// NOTE(review): throughout this test the expected notIn pages are exactly the complement of the
// expected in pages for the same set - confirm this is the intended notIn behaviour.
Set<Integer> set1 = new HashSet<>();
set1.add(2);
assertCorrectFiltering(columnIndex, in(col, set1), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, notIn(col, set1), 3);
// The same set is reused after adding null, so the next two assertions cover {2, null}.
set1.add(null);
assertCorrectFiltering(columnIndex, in(col, set1), 0, 1, 2, 3, 4, 5);
// An explicit empty array is passed: no page is expected to match.
assertCorrectFiltering(columnIndex, notIn(col, set1), new int[0]);
assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5);
assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, gt(col, 2), 0, 1, 5);
assertCorrectFiltering(columnIndex, gtEq(col, 2), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, lt(col, 2), 0, 1, 4, 5);
assertCorrectFiltering(columnIndex, ltEq(col, 2), 0, 1, 2, 4, 5);
assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 0, 1, 5);
assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5);
// ---- Case 2: nine pages, boundary order asserted as ASCENDING ----
// A fresh builder and StatsBuilder are used so nothing carries over from case 1.
builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
sb = new StatsBuilder();
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, -532, -345, null, null));
builder.add(sb.stats(type, -500, -42, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, null, null, null));
builder.add(sb.stats(type, -42, 2));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, 3, 42));
builder.add(sb.stats(type, null, null));
assertEquals(9, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
columnIndex = builder.build();
assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2);
assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true);
assertCorrectValues(columnIndex.getMaxValues(), null, -345, -42, null, null, 2, null, 42, null);
assertCorrectValues(columnIndex.getMinValues(), null, -532, -500, null, null, -42, null, 3, null);
assertCorrectFiltering(columnIndex, eq(col, 2), 5);
assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8);
Set<Integer> set2 = new HashSet<>();
set2.add(2);
assertCorrectFiltering(columnIndex, in(col, set2), 5);
assertCorrectFiltering(columnIndex, notIn(col, set2), 0, 1, 2, 3, 4, 6, 7, 8);
// Set becomes {2, null} for the next two assertions.
set2.add(null);
assertCorrectFiltering(columnIndex, in(col, set2), 0, 1, 2, 3, 4, 5, 6, 8);
assertCorrectFiltering(columnIndex, notIn(col, set2), 7);
assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8);
assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7);
assertCorrectFiltering(columnIndex, gt(col, 2), 7);
assertCorrectFiltering(columnIndex, gtEq(col, 2), 5, 7);
assertCorrectFiltering(columnIndex, lt(col, 2), 1, 2, 5);
assertCorrectFiltering(columnIndex, ltEq(col, 2), 1, 2, 5);
assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 2, 5, 7);
assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
// ---- Case 3: nine pages, boundary order asserted as DESCENDING ----
builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
sb = new StatsBuilder();
builder.add(sb.stats(type, null, null, null, null, null));
builder.add(sb.stats(type, 532, 345));
builder.add(sb.stats(type, null, null, null));
builder.add(sb.stats(type, 234, 42, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, 42, -2));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, -3, -42));
assertEquals(9, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
columnIndex = builder.build();
assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0);
assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false);
assertCorrectValues(columnIndex.getMaxValues(), null, 532, null, 234, null, 42, null, null, -3);
assertCorrectValues(columnIndex.getMinValues(), null, 345, null, 42, null, -2, null, null, -42);
assertCorrectFiltering(columnIndex, eq(col, 2), 5);
assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7);
Set<Integer> set3 = new HashSet<>();
set3.add(2);
assertCorrectFiltering(columnIndex, in(col, set3), 5);
assertCorrectFiltering(columnIndex, notIn(col, set3), 0, 1, 2, 3, 4, 6, 7, 8);
// Set becomes {2, null} for the next two assertions.
set3.add(null);
assertCorrectFiltering(columnIndex, in(col, set3), 0, 2, 3, 4, 5, 6, 7);
assertCorrectFiltering(columnIndex, notIn(col, set3), 1, 8);
assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8);
assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8);
assertCorrectFiltering(columnIndex, gt(col, 2), 1, 3, 5);
assertCorrectFiltering(columnIndex, gtEq(col, 2), 1, 3, 5);
assertCorrectFiltering(columnIndex, lt(col, 2), 5, 8);
assertCorrectFiltering(columnIndex, ltEq(col, 2), 5, 8);
assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 3, 5, 8);
assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
From the class ValidatingColumnWriteStore, method testReadUsingProjectedSchema.
/**
 * Writes a single record using a schema with two required INT32 columns ("a" and "b"),
 * then reads it back through a projected schema that contains only "b" and checks that
 * the one row read carries the value written for "b".
 */
@Test
public void testReadUsingProjectedSchema() {
  // Schema used for writing: required "a" and required "b".
  PrimitiveType requiredA = new PrimitiveType(REQUIRED, INT32, "a");
  PrimitiveType requiredB = new PrimitiveType(REQUIRED, INT32, "b");
  MessageType writeSchema = new MessageType("schema", requiredA, requiredB);

  // Schema requested on read: only "b", declared optional here.
  PrimitiveType optionalB = new PrimitiveType(OPTIONAL, INT32, "b");
  MessageType readSchema = new MessageType("schema", optionalB);

  MemPageStore pageStore = new MemPageStore(1);
  SimpleGroupFactory factory = new SimpleGroupFactory(writeSchema);
  writeGroups(writeSchema, pageStore, factory.newGroup().append("a", 1).append("b", 2));

  // Read one record through the projection and compare it with the expected row.
  List<Group> readBack = new ArrayList<>(readGroups(pageStore, writeSchema, readSchema, 1));
  Object[][] expectedRows = { { 2 } };
  validateGroups(readBack, expectedRows);
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
From the class ValidatingColumnWriteStore, method testReadUsingRequestedSchemaWithExtraFields.
/**
 * Reads records through a requested schema that has one more field ("c") than the schema
 * one of the records was written with.
 *
 * One record is written with the two-field schema and one with the three-field schema.
 * Both are read back with the three-field schema; the expected rows list the values in
 * the requested field order (b, a, c), with {@code null} for "c" in the record that was
 * written without it.
 */
@Test
public void testReadUsingRequestedSchemaWithExtraFields() {
  MessageType writtenSchema = new MessageType("schema",
      new PrimitiveType(REQUIRED, INT32, "a"),
      new PrimitiveType(OPTIONAL, INT32, "b"));
  MessageType widerSchema = new MessageType("schema",
      new PrimitiveType(OPTIONAL, INT32, "b"),
      new PrimitiveType(OPTIONAL, INT32, "a"),
      new PrimitiveType(OPTIONAL, INT32, "c"));

  MemPageStore writtenStore = new MemPageStore(1);
  MemPageStore widerStore = new MemPageStore(1);

  // One record per store, each written with that store's own schema.
  SimpleGroupFactory writtenFactory = new SimpleGroupFactory(writtenSchema);
  writeGroups(writtenSchema, writtenStore, writtenFactory.newGroup().append("a", 1).append("b", 2));
  SimpleGroupFactory widerFactory = new SimpleGroupFactory(widerSchema);
  writeGroups(widerSchema, widerStore, widerFactory.newGroup().append("a", 1).append("b", 2).append("c", 3));

  // Both stores are read with the wider schema as the requested schema.
  List<Group> readBack = new ArrayList<>(readGroups(writtenStore, writtenSchema, widerSchema, 1));
  readBack.addAll(readGroups(widerStore, widerSchema, widerSchema, 1));
  // TODO: also cover an empty projection once it is supported.

  Object[][] expectedRows = {
      { 2, 1, null },
      { 2, 1, 3 }
  };
  validateGroups(readBack, expectedRows);
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
From the class TestParquetParser, method testPaperExample.
/**
 * Parses two schema strings and checks each against an equivalent hand-built
 * {@code MessageType}: first the "Document" example schema, then a small schema with two
 * required groups. In both cases the parsed schema is also printed with
 * {@code toString()} and parsed again, and the result must still equal the hand-built one.
 */
@Test
public void testPaperExample() {
  String documentText = "message Document {\n"
      + " required int64 DocId;\n"
      + " optional group Links {\n"
      + " repeated int64 Backward;\n"
      + " repeated int64 Forward; }\n"
      + " repeated group Name {\n"
      + " repeated group Language {\n"
      + " required binary Code;\n"
      + " required binary Country; }\n"
      + " optional binary Url; }}";
  MessageType documentParsed = parseMessageType(documentText);

  // Hand-built equivalent of the Document schema, assembled field by field.
  PrimitiveType docId = new PrimitiveType(REQUIRED, INT64, "DocId");
  GroupType links = new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward"));
  GroupType language = new GroupType(REPEATED, "Language",
      new PrimitiveType(REQUIRED, BINARY, "Code"),
      new PrimitiveType(REQUIRED, BINARY, "Country"));
  GroupType name = new GroupType(REPEATED, "Name",
      language,
      new PrimitiveType(OPTIONAL, BINARY, "Url"));
  MessageType documentExpected = new MessageType("Document", docId, links, name);

  assertEquals(documentExpected, documentParsed);
  // Round trip: printing the parsed schema and parsing it again must give the same schema.
  MessageType documentReparsed = parseMessageType(documentParsed.toString());
  assertEquals(documentExpected, documentReparsed);

  // Second schema: two required groups, each holding a single required primitive.
  MessageType smallParsed = parseMessageType("message m { required group a {required binary b;} required group c { required int64 d; }}");
  GroupType groupA = new GroupType(REQUIRED, "a", new PrimitiveType(REQUIRED, BINARY, "b"));
  GroupType groupC = new GroupType(REQUIRED, "c", new PrimitiveType(REQUIRED, INT64, "d"));
  MessageType smallExpected = new MessageType("m", groupA, groupC);

  assertEquals(smallExpected, smallParsed);
  MessageType smallReparsed = parseMessageType(smallParsed.toString());
  assertEquals(smallExpected, smallReparsed);
}
Aggregations