Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class DictionaryFilter, method expandDictionary.
@SuppressWarnings("unchecked")
private <T extends Comparable<T>> Set<T> expandDictionary(ColumnChunkMetaData meta) throws IOException {
  ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1);
  DictionaryPage page = dictionaries.readDictionaryPage(col);
  // the chunk may not be dictionary-encoded
  if (page == null) {
    return null;
  }
  Dictionary dict = page.getEncoding().initDictionary(col, page);
  // raw set: the decode* methods return different concrete types; the caller's cast restores T
  Set dictSet = new HashSet<T>();
  for (int i = 0; i <= dict.getMaxId(); i++) {
    switch (meta.getType()) {
      case BINARY:
        dictSet.add(dict.decodeToBinary(i));
        break;
      case INT32:
        dictSet.add(dict.decodeToInt(i));
        break;
      case INT64:
        dictSet.add(dict.decodeToLong(i));
        break;
      case FLOAT:
        dictSet.add(dict.decodeToFloat(i));
        break;
      case DOUBLE:
        dictSet.add(dict.decodeToDouble(i));
        break;
      default:
        LOG.warn("Unknown dictionary type: {}", meta.getType());
    }
  }
  return (Set<T>) dictSet;
}
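This is the heart of dictionary-based row-group filtering: once every distinct value in a chunk is known, a predicate that matches none of them proves the chunk can be skipped. A minimal self-contained sketch of that pruning idea, using plain JDK types instead of the real DictionaryFilter plumbing (the canDrop helper and the sample values are ours, not parquet-mr API):

  import java.util.Arrays;
  import java.util.HashSet;
  import java.util.Set;
  import java.util.function.Predicate;

  public class DictionaryPruneSketch {

    // A chunk can be dropped when no dictionary value satisfies the predicate.
    // A null set means the chunk was not dictionary-encoded, so we must keep it.
    static <T> boolean canDrop(Set<T> dictValues, Predicate<T> predicate) {
      if (dictValues == null) {
        return false; // no dictionary: cannot prove anything, keep the chunk
      }
      return dictValues.stream().noneMatch(predicate);
    }

    public static void main(String[] args) {
      Set<String> dict = new HashSet<>(Arrays.asList("s", "t", "z"));
      // predicate column == "a": no dictionary entry matches, chunk is droppable
      System.out.println(canDrop(dict, "a"::equals)); // true
      // predicate column == "z": a match exists, chunk must be scanned
      System.out.println(canDrop(dict, "z"::equals)); // false
    }
  }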
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestParquetFileWriter, method testWriteReadStatistics.
@Test
public void testWriteReadStatistics() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  MessageType schema = MessageTypeParser.parseMessageType(
      "message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
  String[] path1 = { "a", "b" };
  ColumnDescriptor c1 = schema.getColumnDescription(path1);
  String[] path2 = { "c", "d" };
  ColumnDescriptor c2 = schema.getColumnDescription(path2);
  byte[] bytes1 = { 0, 1, 2, 3 };
  byte[] bytes2 = { 1, 2, 3, 4 };
  byte[] bytes3 = { 2, 3, 4, 5 };
  byte[] bytes4 = { 3, 4, 5, 6 };
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics statsB1C1P1 = new BinaryStatistics();
  BinaryStatistics statsB1C1P2 = new BinaryStatistics();
  LongStatistics statsB1C2P1 = new LongStatistics();
  LongStatistics statsB1C2P2 = new LongStatistics();
  BinaryStatistics statsB2C1P1 = new BinaryStatistics();
  LongStatistics statsB2C2P1 = new LongStatistics();
  statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
  statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
  statsB1C2P1.setMinMax(2L, 10L);
  statsB1C2P2.setMinMax(-6L, 4L);
  statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  statsB2C2P1.setMinMax(11L, 122L);
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 6, codec);
  w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 8, codec);
  w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  // walk the footer; getPath() is called only to exercise the metadata
  for (BlockMetaData block : readFooter.getBlocks()) {
    for (ColumnChunkMetaData col : block.getColumns()) {
      col.getPath();
    }
  }
  // expected (merged) statistics per column chunk
  BinaryStatistics bs1 = new BinaryStatistics();
  bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
  LongStatistics ls1 = new LongStatistics();
  ls1.setMinMax(-6L, 10L);
  BinaryStatistics bs2 = new BinaryStatistics();
  bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  LongStatistics ls2 = new LongStatistics();
  ls2.setMinMax(11L, 122L);
  {
    // assert stats are correct for the first block
    BinaryStatistics bsout = (BinaryStatistics) readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
    // raw min/max bytes, decoded but not asserted
    String str = new String(bsout.getMaxBytes());
    String str2 = new String(bsout.getMinBytes());
    TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
  }
  {
    // assert stats are correct for the second block
    TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
  }
}
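The footer loop in the middle of the test only touches col.getPath(); the same walk can print the merged statistics per chunk. A quick sketch, assuming the test's configuration and path variables and the same (now deprecated) readFooter API used above:

  ParquetMetadata footer = ParquetFileReader.readFooter(configuration, path);
  for (BlockMetaData block : footer.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      // Statistics.toString() renders the chunk's min/max and null count
      System.out.println(column.getPath() + " -> " + column.getStatistics());
    }
  }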
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestColumnChunkPageWriteStore, method testColumnOrderV1.
@Test
public void testColumnOrderV1() throws IOException {
  ParquetFileWriter mockFileWriter = Mockito.mock(ParquetFileWriter.class);
  InOrder inOrder = inOrder(mockFileWriter);
  MessageType schema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("a_string")
      .required(INT32).named("an_int")
      .required(INT64).named("a_long")
      .required(FLOAT).named("a_float")
      .required(DOUBLE).named("a_double")
      .named("order_test");
  BytesInput fakeData = BytesInput.fromInt(34);
  int fakeCount = 3;
  BinaryStatistics fakeStats = new BinaryStatistics();
  // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes
  // see comment at this constructor
  ColumnChunkPageWriteStore store =
      new ColumnChunkPageWriteStore(compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator());
  for (ColumnDescriptor col : schema.getColumns()) {
    PageWriter pageWriter = store.getPageWriter(col);
    pageWriter.writePage(fakeData, fakeCount, fakeStats, RLE, RLE, PLAIN);
  }
  // flush to the mock writer
  store.flushToFileWriter(mockFileWriter);
  // verify that the columns were started on the writer in schema order
  for (ColumnDescriptor col : schema.getColumns()) {
    inOrder.verify(mockFileWriter).startColumn(eq(col), eq((long) fakeCount), eq(UNCOMPRESSED));
  }
}
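The ordering check relies on Mockito's InOrder rather than anything Parquet-specific: verifications fail unless the recorded calls happened in the stated sequence. A standalone illustration of the same pattern against a plain java.util.List mock:

  import static org.mockito.Mockito.inOrder;
  import static org.mockito.Mockito.mock;

  import java.util.List;
  import org.mockito.InOrder;

  public class InOrderSketch {
    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
      List<String> mockList = mock(List.class);
      mockList.add("first");
      mockList.add("second");
      // verification fails if the calls happened in a different order
      InOrder inOrder = inOrder(mockList);
      inOrder.verify(mockList).add("first");
      inOrder.verify(mockList).add("second");
    }
  }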
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestColumnChunkPageWriteStore, method test.
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(
      Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  {
    // write a single v2 data page through the page write store
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
    writer.start();
    writer.startBlock(rowCount);
    {
      ColumnChunkPageWriteStore store =
          new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
      store.flushToFileWriter(writer);
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    // read the page back and check that every field round-tripped
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    assertEquals(statistics.toString(), page.getStatistics().toString());
    reader.close();
  }
}
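The assertions above call an intValue helper that this excerpt does not show. A plausible reconstruction, assuming BytesInput.fromInt wrote a single little-endian int and using Parquet's LittleEndianDataInputStream (treat it as a sketch, not the verbatim helper):

  // hypothetical reconstruction of the helper used by the assertions above
  private int intValue(BytesInput in) throws IOException {
    LittleEndianDataInputStream data = new LittleEndianDataInputStream(in.toInputStream());
    int value = data.readInt();
    data.close();
    return value;
  }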
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestInputFormat, method createParquetFile.
private void createParquetFile(File file) throws IOException {
  Path path = new Path(file.toURI());
  Configuration configuration = new Configuration();
  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  String[] columnPath = { "a", "b" };
  ColumnDescriptor c1 = schema.getColumnDescription(columnPath);
  byte[] bytes1 = { 0, 1, 2, 3 };
  byte[] bytes2 = { 2, 3, 4, 5 };
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics stats = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  // first row group: two pages in one column chunk
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  // second row group: a single page
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes2), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
}
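A hypothetical caller of this helper (not shown in the excerpt) could verify the layout it produces: two row groups with the record counts passed to startBlock. Sketched with the same deprecated readFooter API used elsewhere on this page:

  // hypothetical usage: the file should contain exactly the two row groups written above
  File testFile = File.createTempFile("test", ".parquet");
  testFile.delete();
  createParquetFile(testFile);
  ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), new Path(testFile.toURI()));
  assertEquals(2, footer.getBlocks().size());
  assertEquals(3, footer.getBlocks().get(0).getRowCount());
  assertEquals(4, footer.getBlocks().get(1).getRowCount());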