Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class DictionaryFilter, method hasNonDictionaryPages.
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }
  // without EncodingStats, fall back to testing the encoding list
  Set<Encoding> encodings = new HashSet<Encoding>(meta.getEncodings());
  if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
    // if remove returned true, PLAIN_DICTIONARY was present, which means at
    // least one page was dictionary encoded and 1.0 encodings are used
    // RLE and BIT_PACKED are only used for repetition or definition levels
    encodings.remove(Encoding.RLE);
    encodings.remove(Encoding.BIT_PACKED);
    if (encodings.isEmpty()) {
      // no encodings other than dictionary or rep/def levels
      return false;
    }
    return true;
  } else {
    // if PLAIN_DICTIONARY wasn't present, then either the column is not
    // dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
    // for 2.0, this cannot determine whether a page fell back without
    // page encoding stats
    return true;
  }
}
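The fallback branch works purely on the set of encodings recorded in the column chunk metadata. A minimal, JDK-only sketch of that set logic follows; the mini Encoding enum, class name, and method here are illustrative stand-ins, not parquet-mr API.

import java.util.EnumSet;
import java.util.Set;

public class EncodingCheckSketch {

  enum Encoding { PLAIN, PLAIN_DICTIONARY, RLE, BIT_PACKED }

  // mirrors the fallback logic above; mutates the set passed in
  static boolean hasNonDictionaryPages(Set<Encoding> encodings) {
    if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
      encodings.remove(Encoding.RLE);        // rep/def levels only
      encodings.remove(Encoding.BIT_PACKED); // rep/def levels only
      return !encodings.isEmpty();           // anything left is a value encoding
    }
    return true; // no PLAIN_DICTIONARY: cannot prove all pages are dictionary-encoded
  }

  public static void main(String[] args) {
    // fully dictionary-encoded chunk: only dictionary + level encodings -> false
    System.out.println(hasNonDictionaryPages(
        EnumSet.of(Encoding.PLAIN_DICTIONARY, Encoding.RLE, Encoding.BIT_PACKED)));
    // chunk where some pages fell back to PLAIN -> true
    System.out.println(hasNonDictionaryPages(
        EnumSet.of(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, Encoding.RLE)));
  }
}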
Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class TestInputFormat, method makeBlockFromStats.
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  BlockMetaData blockMetaData = new BlockMetaData();
  ColumnChunkMetaData column = ColumnChunkMetaData.get(
      ColumnPath.get("foo"), PrimitiveTypeName.INT32, CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats, 100L, 100L, valueCount, 100L, 100L);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(200L);
  blockMetaData.setRowCount(valueCount);
  return blockMetaData;
}
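A usage sketch in the spirit of the surrounding tests; the min/max values and counts below are arbitrary, and IntStatistics.setMinMax is parquet-mr's statistics API.

IntStatistics stats = new IntStatistics();
stats.setMinMax(10, 100); // column "foo" claims all values lie in [10, 100]
BlockMetaData block = makeBlockFromStats(stats, 1000L /* valueCount */);
// a statistics-based row-group filter can now drop this block for a
// predicate like foo < 10 without reading any data pages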
Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class TestParquetFileWriter, method testAlignmentWithPadding.
@Test
public void testAlignmentWithPadding() throws Exception {
  File testFile = temp.newFile();
  Path path = new Path(testFile.toURI());
  Configuration conf = new Configuration();
  // uses the test constructor (block size 120, max padding 60)
  ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 120, 60);
  w.start();
  w.startBlock(3);
  w.startColumn(C1, 5, CODEC);
  long c1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c1Ends = w.getPos();
  w.startColumn(C2, 6, CODEC);
  long c2Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c2Ends = w.getPos();
  w.endBlock();
  long firstRowGroupEnds = w.getPos(); // should be 109
  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  long secondRowGroupEnds = w.getPos();
  w.end(new HashMap<String, String>());
  FileSystem fs = path.getFileSystem(conf);
  long fileLen = fs.getFileStatus(path).getLen();
  FSDataInputStream data = fs.open(path);
  data.seek(fileLen - 8); // last 8 bytes: 4-byte footer length + "PAR1" magic
  long footerLen = BytesUtils.readIntLittleEndian(data);
  long startFooter = fileLen - footerLen - 8;
  assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);
  ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
  assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
  assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
  assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
  assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
  HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
  expectedEncoding.add(PLAIN);
  expectedEncoding.add(BIT_PACKED);
  assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
  // verify block starting positions with padding
  assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
  assertTrue("First row group should end before the block size (120)", firstRowGroupEnds < 120);
  assertEquals("Second row group should start at the block size", 120, readFooter.getBlocks().get(1).getStartingPos());
  {
    // read the first row group, col #1 only
    ParquetFileReader r = new ParquetFileReader(
        conf, readFooter.getFileMetaData(), path,
        Arrays.asList(readFooter.getBlocks().get(0)),
        Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
    PageReadStore pages = r.readNextRowGroup();
    assertEquals(3, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
    assertNull(r.readNextRowGroup());
  }
  {
    // read all row groups, col #1 and #2
    ParquetFileReader r = new ParquetFileReader(
        conf, readFooter.getFileMetaData(), path, readFooter.getBlocks(),
        Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
    PageReadStore pages = r.readNextRowGroup();
    assertEquals(3, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
    validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
    validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
    pages = r.readNextRowGroup();
    assertEquals(4, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
    validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
    assertNull(r.readNextRowGroup());
  }
  PrintFooter.main(new String[] { path.toString() });
}
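The seek to fileLen - 8 relies on the Parquet file layout: every file ends with a 4-byte little-endian footer length followed by the 4-byte magic "PAR1". A JDK-only sketch of reading that tail follows (not parquet-mr API; the class name and the args[0] file path are illustrative).

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;

public class FooterTailSketch {
  public static void main(String[] args) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(args[0], "r")) {
      long fileLen = file.length();
      file.seek(fileLen - 8);
      byte[] tail = new byte[8];
      file.readFully(tail);
      // footer length, little-endian
      int footerLen = (tail[0] & 0xff)
          | (tail[1] & 0xff) << 8
          | (tail[2] & 0xff) << 16
          | (tail[3] & 0xff) << 24;
      String magic = new String(tail, 4, 4, StandardCharsets.US_ASCII);
      System.out.println("magic=" + magic
          + ", footer starts at " + (fileLen - footerLen - 8));
    }
  }
}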
Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class TestParquetWriter, method test.
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  enforceEmptyDir(conf, root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file, new GroupWriteSupport(), UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64L)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();
      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64L, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
      assertEquals("Object model property should be example", "example",
          footer.getFileMetaData().getKeyValueMetaData().get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
    }
  }
}
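The expected-encodings map encodes the dictionary fallback behavior: with modulo 10 the binary_field dictionary stays tiny and both writer versions keep dictionary encoding; with modulo 1000 the dictionary outgrows the 512-byte dictionary page size the test configures, so the 1.0 writer falls back to PLAIN and the 2.0 writer to DELTA_BYTE_ARRAY. A JDK-only sketch of that size intuition follows; the 4-byte length prefix model and the simple threshold are simplifying assumptions, not the writer's actual fallback logic.

public class DictionaryFallbackSketch {
  public static void main(String[] args) {
    int dictionaryPageSize = 512; // same cap the test passes to ParquetWriter
    for (int modulo : new int[] { 10, 1000 }) {
      int dictBytes = 0;
      for (int v = 0; v < modulo; v++) {
        // assumed cost model: 4-byte length prefix + UTF-8 bytes per entry
        dictBytes += 4 + ("test" + v).length();
      }
      System.out.println("modulo " + modulo + ": dictionary ~" + dictBytes
          + " bytes -> " + (dictBytes <= dictionaryPageSize ? "dictionary" : "fallback"));
    }
  }
}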
Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.
The class TestColumnChunkMetaData, method newMD.
private ColumnChunkMetaData newMD(long big) {
  Set<Encoding> e = new HashSet<Encoding>();
  PrimitiveTypeName t = BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s, big, 0, 0, 0, 0);
  return md;
}
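A hedged usage sketch in the spirit of the surrounding test: the first long argument is the first data page offset, and offsets on both sides of Integer.MAX_VALUE must survive the metadata round trip. The test method name below is hypothetical, and the compact int-backed representation parquet-mr picks for small values is an internal detail assumed here.

@Test
public void testOffsetsRoundTrip() {
  long small = 42L;
  long big = (long) Integer.MAX_VALUE + 1;
  assertEquals(small, newMD(small).getFirstDataPageOffset());
  assertEquals(big, newMD(big).getFirstDataPageOffset()); // must survive the long path
}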