Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class TestDataPageV1Checksums, method testWriteOffVerifyOn.
/**
 * Do not write out page level crc checksums, but enable verification on the read path. Tests
 * that the read still succeeds and does not throw an exception.
 */
@Test
public void testWriteOffVerifyOn() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
  try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();
    assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(), colAPage1Bytes);
    assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(), colAPage2Bytes);
    assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(), colBPage1Bytes);
    assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(), colBPage2Bytes);
  }
}
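The test above reads a single row group; in general a file may contain several. A minimal sketch of the pattern every example on this page shares, calling readNextRowGroup() until it returns null and reading pages through the PageReadStore (the file path, class name and counting logic here are illustrative, not part of the test above):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class PageReadStoreSketch {
  // Counts the data pages of one column across all row groups of a file.
  static long countDataPages(Path file, Configuration conf, ColumnDescriptor column) throws IOException {
    long pages = 0;
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
      PageReadStore rowGroup;
      // readNextRowGroup() returns null once every row group has been consumed
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        PageReader pageReader = rowGroup.getPageReader(column);
        DataPage page;
        while ((page = pageReader.readPage()) != null) {
          pages++;
        }
      }
    }
    return pages;
  }
}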
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class ShowPagesCommand, method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
    MessageType schema = reader.getFileMetaData().getSchema();
    Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
    if (this.columns == null || this.columns.isEmpty()) {
      for (ColumnDescriptor descriptor : schema.getColumns()) {
        columns.put(descriptor, primitive(schema, descriptor.getPath()));
      }
    } else {
      for (String column : this.columns) {
        columns.put(descriptor(column, schema), primitive(column, schema));
      }
    }
    CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
    // accumulate formatted lines to print by column
    Map<String, List<String>> formatted = Maps.newLinkedHashMap();
    PageFormatter formatter = new PageFormatter();
    PageReadStore pageStore;
    int rowGroupNum = 0;
    while ((pageStore = reader.readNextRowGroup()) != null) {
      for (ColumnDescriptor descriptor : columns.keySet()) {
        List<String> lines = formatted.get(columnName(descriptor));
        if (lines == null) {
          lines = Lists.newArrayList();
          formatted.put(columnName(descriptor), lines);
        }
        formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
        PageReader pages = pageStore.getPageReader(descriptor);
        DictionaryPage dict = pages.readDictionaryPage();
        if (dict != null) {
          lines.add(formatter.format(dict));
        }
        DataPage page;
        while ((page = pages.readPage()) != null) {
          lines.add(formatter.format(page));
        }
      }
      rowGroupNum += 1;
    }
    // TODO: Show total column size and overall size per value in the column summary line
    for (String columnName : formatted.keySet()) {
      console.info(String.format("\nColumn: %s\n%s", columnName,
          new TextStringBuilder(80).appendPadding(80, '-')));
      console.info(formatter.getHeader());
      for (String line : formatted.get(columnName)) {
        console.info(line);
      }
      console.info("");
    }
  }
  return 0;
}
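The PageFormatter used above belongs to parquet-cli and is not shown here. As a hedged sketch of how per-page details could be rendered with the DataPage.Visitor API that such a formatter builds on (the output strings are an assumption, not the actual PageFormatter format):

import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;

// Illustrative only: a visitor that renders a one-line description per data page.
class PageDescriber implements DataPage.Visitor<String> {
  @Override
  public String visit(DataPageV1 page) {
    return "v1 page: " + page.getValueCount() + " values, " + page.getCompressedSize() + " compressed bytes";
  }

  @Override
  public String visit(DataPageV2 page) {
    return "v2 page: " + page.getValueCount() + " values, " + page.getNullCount() + " nulls";
  }
}

A page read from the PageReader would then be described with page.accept(new PageDescriber()).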
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class ColumnIndexValidator, method checkContractViolations.
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(
          rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(), rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader, minValue, maxValue, prevMinValue, prevMaxValue,
              boundaryOrder, nullCounts.get(pageNumber), isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
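A hedged usage sketch for the validator above. The file path is a placeholder, the import for ColumnIndexValidator and ContractViolation is omitted because the snippet does not show their package, and printing relies on ContractViolation's string form:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

public class ValidateColumnIndexes {
  // Reports every column-index contract violation found in a Parquet file.
  static void validate(String file) throws IOException {
    InputFile input = HadoopInputFile.fromPath(new Path(file), new Configuration());
    List<ContractViolation> violations = ColumnIndexValidator.checkContractViolations(input);
    System.out.println(violations.size() + " violation(s) found");
    for (ContractViolation violation : violations) {
      System.out.println(violation);
    }
  }
}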
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class DumpCommand, method dump.
public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema,
    Path inpath, boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
  Configuration conf = new Configuration();
  List<BlockMetaData> blocks = meta.getBlocks();
  List<ColumnDescriptor> columns = schema.getColumns();
  if (showColumns != null) {
    columns = new ArrayList<ColumnDescriptor>();
    for (ColumnDescriptor column : schema.getColumns()) {
      String path = Joiner.on('.').skipNulls().join(column.getPath());
      if (showColumns.contains(path)) {
        columns.add(column);
      }
    }
  }
  ParquetFileReader freader = null;
  if (showmd) {
    try {
      long group = 0;
      for (BlockMetaData block : blocks) {
        if (group != 0)
          out.println();
        out.format("row group %d%n", group++);
        out.rule('-');
        List<ColumnChunkMetaData> ccmds = block.getColumns();
        if (showColumns != null) {
          ccmds = new ArrayList<ColumnChunkMetaData>();
          for (ColumnChunkMetaData ccmd : block.getColumns()) {
            String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
            if (showColumns.contains(path)) {
              ccmds.add(ccmd);
            }
          }
        }
        MetadataUtils.showDetails(out, ccmds);
        List<BlockMetaData> rblocks = Collections.singletonList(block);
        freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, rblocks, columns);
        PageReadStore store = freader.readNextRowGroup();
        while (store != null) {
          out.incrementTabLevel();
          for (ColumnDescriptor column : columns) {
            out.println();
            dump(out, store, column);
          }
          out.decrementTabLevel();
          store = freader.readNextRowGroup();
        }
        out.flushColumns();
      }
    } finally {
      if (freader != null) {
        freader.close();
      }
    }
  }
  if (showdt) {
    boolean first = true;
    for (ColumnDescriptor column : columns) {
      if (!first || showmd)
        out.println();
      first = false;
      out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
      out.rule('-');
      try {
        long page = 1;
        long total = blocks.size();
        long offset = 1;
        freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, blocks,
            Collections.singletonList(column));
        PageReadStore store = freader.readNextRowGroup();
        while (store != null) {
          ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(
              store, new DumpGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
          dump(out, crstore, column, page++, total, offset);
          offset += store.getRowCount();
          store = freader.readNextRowGroup();
        }
        out.flushColumns();
      } finally {
        out.flushColumns();
        if (freader != null) {
          freader.close();
        }
      }
    }
  }
}
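The inner dump(out, crstore, column, ...) overload is not shown here. As a rough sketch of how values can be pulled from a ColumnReader obtained the same way, assuming an INT64 column and reusing the no-op DumpGroupConverter from the snippet above (the printing is illustrative, not the actual DumpCommand output):

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.schema.MessageType;

// Illustrative only: print the non-null values of one INT64 column in a row group.
static void dumpLongColumn(PageReadStore store, MessageType schema, ColumnDescriptor column, String createdBy) {
  ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema, createdBy);
  ColumnReader reader = crstore.getColumnReader(column);
  long values = reader.getTotalValueCount();
  for (long i = 0; i < values; i++) {
    // a value is present only when the definition level reaches the column's maximum
    if (reader.getCurrentDefinitionLevel() == column.getMaxDefinitionLevel()) {
      System.out.println(reader.getLong());
    }
    reader.consume();
  }
}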
Use of org.apache.parquet.column.page.PageReadStore in project hive by apache.
The class VectorizedParquetRecordReader, method checkEndOfRowGroup.
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) {
    return;
  }
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read " + rowsReturned
        + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) {
    // However, if colsToInclude is not empty we should initialize each columnReader
    if (!colsToInclude.isEmpty()) {
      for (int i = 0; i < types.size(); ++i) {
        columnReaders[i] = buildVectorizedParquetReader(
            columnTypesList.get(colsToInclude.get(i)), types.get(i), pages,
            requestedSchema.getColumns(), skipTimestampConversion, writerTimezone,
            skipProlepticConversion, legacyConversionEnabled, 0);
      }
    }
  } else {
    for (int i = 0; i < types.size(); ++i) {
      columnReaders[i] = buildVectorizedParquetReader(
          columnTypesList.get(i), types.get(i), pages, requestedSchema.getColumns(),
          skipTimestampConversion, writerTimezone, skipProlepticConversion,
          legacyConversionEnabled, 0);
    }
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
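The fields rowsReturned, totalCountLoadedSoFar, totalRowCount and the column setup belong to VectorizedParquetRecordReader and are not shown here. Stripped of the Hive-specific column readers, the lazy row-group advance that checkEndOfRowGroup() implements looks roughly like this (a hypothetical sketch, not the Hive class):

import java.io.IOException;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;

// Illustrative only: advance to the next row group once all loaded rows have been returned.
class RowGroupCursor {
  private final ParquetFileReader reader;
  private final long totalRowCount;
  private long rowsReturned = 0;
  private long totalCountLoadedSoFar = 0;

  RowGroupCursor(ParquetFileReader reader, long totalRowCount) {
    this.reader = reader;
    this.totalRowCount = totalRowCount;
  }

  // Returns the number of rows still available in the current row group,
  // loading the next row group only when the previous one is exhausted.
  long ensureRowGroup() throws IOException {
    if (rowsReturned == totalCountLoadedSoFar) {
      PageReadStore pages = reader.readNextRowGroup();
      if (pages == null) {
        throw new IOException("expecting more rows but reached last block");
      }
      totalCountLoadedSoFar += pages.getRowCount();
    }
    return totalCountLoadedSoFar - rowsReturned;
  }

  void markReturned(long rows) {
    rowsReturned += rows;
  }

  boolean finished() {
    return rowsReturned >= totalRowCount;
  }
}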