Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class TestParquetFileWriter, method testAlignmentWithNoPaddingNeeded.
@Test
public void testAlignmentWithNoPaddingNeeded() throws Exception {
  File testFile = temp.newFile();
  Path path = new Path(testFile.toURI());
  Configuration conf = new Configuration();
  // uses the test constructor (row group size 100, max padding 50)
  ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 100, 50);
  w.start();
  w.startBlock(3);
  w.startColumn(C1, 5, CODEC);
  long c1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c1Ends = w.getPos();
  w.startColumn(C2, 6, CODEC);
  long c2Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c2Ends = w.getPos();
  w.endBlock();
  long firstRowGroupEnds = w.getPos(); // should be 109
  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  long secondRowGroupEnds = w.getPos();
  w.end(new HashMap<String, String>());
  FileSystem fs = path.getFileSystem(conf);
  long fileLen = fs.getFileStatus(path).getLen();
  FSDataInputStream data = fs.open(path);
  // the file ends with a 4-byte little-endian footer length followed by the 4-byte magic "PAR1"
  data.seek(fileLen - 8);
  long footerLen = BytesUtils.readIntLittleEndian(data);
  long startFooter = fileLen - footerLen - 8;
  assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);
  ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
  assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
  assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
  assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
  assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
  HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
  expectedEncoding.add(PLAIN);
  expectedEncoding.add(BIT_PACKED);
  assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
  // verify block starting positions without padding
  assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
  assertTrue("First row group should end past the 100-byte alignment boundary", firstRowGroupEnds > 100);
  assertEquals("Second row group should start after no padding", 109, readFooter.getBlocks().get(1).getStartingPos());
  {
    // read first block of col #1
    ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
        Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
    PageReadStore pages = r.readNextRowGroup();
    assertEquals(3, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
    assertNull(r.readNextRowGroup());
  }
  {
    // read all blocks of col #1 and #2
    ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
        readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
    PageReadStore pages = r.readNextRowGroup();
    assertEquals(3, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
    validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
    validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
    pages = r.readNextRowGroup();
    assertEquals(4, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
    validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
    assertNull(r.readNextRowGroup());
  }
  PrintFooter.main(new String[] { path.toString() });
}
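The seek arithmetic above follows from the Parquet file layout: the serialized footer is followed by a 4-byte little-endian footer length and the 4-byte magic "PAR1" at the very end of the file. A minimal standalone sketch of locating the footer start, assuming an already-written file at path (variable names here are illustrative, not from the test):

FileSystem fs = path.getFileSystem(conf);
long fileLen = fs.getFileStatus(path).getLen();
try (FSDataInputStream in = fs.open(path)) {
  in.seek(fileLen - 8); // skip back over the footer length (4 bytes) and "PAR1" (4 bytes)
  long footerLen = BytesUtils.readIntLittleEndian(in);
  long footerStart = fileLen - 8 - footerLen; // first byte of the serialized footer
  System.out.println("footer starts at " + footerStart);
}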
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class ShowPagesCommand, method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
  MessageType schema = reader.getFileMetaData().getSchema();
  Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
  if (this.columns == null || this.columns.isEmpty()) {
    for (ColumnDescriptor descriptor : schema.getColumns()) {
      columns.put(descriptor, primitive(schema, descriptor.getPath()));
    }
  } else {
    for (String column : this.columns) {
      columns.put(descriptor(column, schema), primitive(column, schema));
    }
  }
  CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
  // accumulate formatted lines to print by column
  Map<String, List<String>> formatted = Maps.newLinkedHashMap();
  PageFormatter formatter = new PageFormatter();
  PageReadStore pageStore;
  int rowGroupNum = 0;
  while ((pageStore = reader.readNextRowGroup()) != null) {
    for (ColumnDescriptor descriptor : columns.keySet()) {
      List<String> lines = formatted.get(columnName(descriptor));
      if (lines == null) {
        lines = Lists.newArrayList();
        formatted.put(columnName(descriptor), lines);
      }
      formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
      PageReader pages = pageStore.getPageReader(descriptor);
      DictionaryPage dict = pages.readDictionaryPage();
      if (dict != null) {
        lines.add(formatter.format(dict));
      }
      DataPage page;
      while ((page = pages.readPage()) != null) {
        lines.add(formatter.format(page));
      }
    }
    rowGroupNum += 1;
  }
  // TODO: Show total column size and overall size per value in the column summary line
  for (String columnName : formatted.keySet()) {
    console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-')));
    console.info(formatter.getHeader());
    for (String line : formatted.get(columnName)) {
      console.info(line);
    }
    console.info("");
  }
  return 0;
}
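Stripped of the formatting concerns, the core of the command is the standard PageReadStore iteration: read row groups until readNextRowGroup() returns null, then drain each column chunk's pages. A minimal sketch of that pattern, assuming an open ParquetFileReader reader and a ColumnDescriptor descriptor (both placeholders here):

PageReadStore rowGroup;
while ((rowGroup = reader.readNextRowGroup()) != null) {
  PageReader pages = rowGroup.getPageReader(descriptor);
  DictionaryPage dict = pages.readDictionaryPage(); // null when the chunk has no dictionary page
  DataPage page;
  while ((page = pages.readPage()) != null) {
    System.out.println(page.getValueCount() + " values, " + page.getCompressedSize() + " compressed bytes");
  }
}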
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class TestParquetFileWriter, method testWriteRead.
@Test
public void testWriteRead() throws Exception {
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
  w.start();
  w.startBlock(3);
  w.startColumn(C1, 5, CODEC);
  long c1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c1Ends = w.getPos();
  w.startColumn(C2, 6, CODEC);
  long c2Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c2Ends = w.getPos();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
  assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
  assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
  assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
  HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
  expectedEncoding.add(PLAIN);
  expectedEncoding.add(BIT_PACKED);
  assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
  {
    // read first block of col #1
    ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
        Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
    PageReadStore pages = r.readNextRowGroup();
    assertEquals(3, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
    assertNull(r.readNextRowGroup());
  }
  {
    // read all blocks of col #1 and #2
    ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
        readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
    PageReadStore pages = r.readNextRowGroup();
    assertEquals(3, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
    validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
    validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
    validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
    pages = r.readNextRowGroup();
    assertEquals(4, pages.getRowCount());
    validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
    validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
    assertNull(r.readNextRowGroup());
  }
  PrintFooter.main(new String[] { path.toString() });
}
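The validateContains helper is not part of this excerpt. Based on how it is called, a plausible sketch is a helper that reads the next page for the given column and checks its value count and raw bytes; this is an assumption about the test class, not quoted source:

// Hypothetical reconstruction of the test helper, inferred from its call sites above.
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException {
  PageReader pageReader = pages.getPageReader(schema.getColumnDescription(path));
  DataPage page = pageReader.readPage();
  assertEquals(values, page.getValueCount()); // value count recorded when the page was written
  assertArrayEquals(bytes.toByteArray(), ((DataPageV1) page).getBytes().toByteArray()); // raw page bytes round-trip
}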
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class TestColumnChunkPageWriteStore, method test.
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  {
    // write a single v2 data page into one row group
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
    writer.start();
    writer.startBlock(rowCount);
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
      store.flushToFileWriter(writer);
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    // read the page back and verify every field round-trips
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    assertEquals(statistics.toString(), page.getStatistics().toString());
    reader.close();
  }
}
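The intValue helper is likewise not shown in this excerpt. Since BytesInput.fromInt writes a single little-endian int, a matching decoder could look like this (a sketch under that assumption, not the exact test helper):

private int intValue(BytesInput in) throws IOException {
  // BytesInput.fromInt produced 4 little-endian bytes; read them back the same way
  return BytesUtils.readIntLittleEndian(new ByteArrayInputStream(in.toByteArray()));
}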
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class CheckParquet251Command, method check.
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

      @Override
      public boolean apply(@Nullable ColumnDescriptor input) {
        return input != null && input.getType() == BINARY;
      }
    }));
    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }
  return null;
}
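The shouldIgnoreStatistics guard comes from PARQUET-251: binary min/max statistics written by parquet-mr releases before the 1.8.0 fix can be corrupt, so readers discard them based on the file's created_by string. The same check can be exercised in isolation; the created_by string below is a made-up example:

boolean ignore = CorruptStatistics.shouldIgnoreStatistics(
    "parquet-mr version 1.6.0 (build abcd123)", // hypothetical created_by value
    PrimitiveTypeName.BINARY);
// expected to be true here, since 1.6.0 predates the statistics fix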