use of org.apache.parquet.column.page.DataPageV1 in project parquet-mr by apache.
the class TestParquetFileWriter method validateContains.
/**
 * Asserts that the first page read for the column at {@code path} has the expected
 * value count and carries exactly the expected bytes.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages  page store to read the column's first page from
 * @param path   path of the column inside {@code schema}
 * @param values expected value count of the page
 * @param bytes  expected page content
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException {
  final DataPage firstPage = pages.getPageReader(schema.getColumnDescription(path)).readPage();
  assertEquals(values, firstPage.getValueCount());

  // The byte comparison goes through DataPageV1, so the page is cast to that type here.
  final byte[] expectedBytes = bytes.toByteArray();
  final BytesInput actualInput = ((DataPageV1) firstPage).getBytes();
  assertArrayEquals(expectedBytes, actualInput.toByteArray());
}
use of org.apache.parquet.column.page.DataPageV1 in project parquet-mr by apache.
the class MemPageWriter method writePage.
/**
 * Stores a copy of the given bytes as a {@link DataPageV1} in the in-memory page list
 * and updates the running byte and value totals.
 *
 * @param bytesInput     content of the page; copied before being stored
 * @param valueCount     number of values in the page, must not be 0
 * @param statistics     statistics attached to the page
 * @param rlEncoding     encoding passed through as the repetition-level encoding
 * @param dlEncoding     encoding passed through as the definition-level encoding
 * @param valuesEncoding encoding passed through as the values encoding
 * @throws ParquetEncodingException if {@code valueCount} is 0
 */
@Override
public void writePage(BytesInput bytesInput, int valueCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException {
  if (valueCount == 0) {
    throw new ParquetEncodingException("illegal page of 0 values");
  }

  final long byteCount = bytesInput.size();
  memSize += byteCount;

  final BytesInput pageBytes = BytesInput.copy(bytesInput);
  final DataPageV1 page =
      new DataPageV1(pageBytes, valueCount, (int) byteCount, statistics, rlEncoding, dlEncoding, valuesEncoding);
  pages.add(page);

  totalValueCount += valueCount;
  LOG.debug("page written for {} bytes and {} records", byteCount, valueCount);
}
use of org.apache.parquet.column.page.DataPageV1 in project parquet-mr by apache.
the class DumpCommand method dump.
/**
 * Writes a summary of one column to {@code out}: a header line with the dotted column
 * path, total value count and maximum repetition/definition levels (plus dictionary size
 * and encoding when a dictionary page is present), a rule, and then one line per data
 * page with its encodings, statistics, uncompressed size and value count.
 *
 * @param out    writer receiving the formatted output
 * @param store  page store that supplies the page reader for {@code column}
 * @param column descriptor of the column to dump
 * @throws IOException declared by the signature for callers to handle
 */
public static void dump(final PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column) throws IOException {
  final PageReader reader = store.getPageReader(column);

  // Header line: path, total values, max repetition level, max definition level.
  final long totalValues = reader.getTotalValueCount();
  final int maxRepetition = column.getMaxRepetitionLevel();
  final int maxDefinition = column.getMaxDefinitionLevel();
  final String dottedPath = Joiner.on('.').skipNulls().join(column.getPath());
  out.format("%s TV=%d RL=%d DL=%d", dottedPath, totalValues, maxRepetition, maxDefinition);

  final DictionaryPage dictionary = reader.readDictionaryPage();
  if (dictionary != null) {
    out.format(" DS:%d", dictionary.getDictionarySize());
    out.format(" DE:%s", dictionary.getEncoding());
  }
  out.println();
  out.rule('-');

  // The visitor only captures 'out', so a single instance serves every page.
  final Visitor<Void> pageDetails = new Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 v1) {
      out.format(" DLE:%s", v1.getDlEncoding());
      out.format(" RLE:%s", v1.getRlEncoding());
      out.format(" VLE:%s", v1.getValueEncoding());
      Statistics<?> stats = v1.getStatistics();
      if (stats == null) {
        out.format(" ST:[none]");
      } else {
        out.format(" ST:[%s]", stats);
      }
      return null;
    }

    @Override
    public Void visit(DataPageV2 v2) {
      // V2 pages print fixed RLE labels for the level encodings.
      out.format(" DLE:RLE");
      out.format(" RLE:RLE");
      out.format(" VLE:%s", v2.getDataEncoding());
      Statistics<?> stats = v2.getStatistics();
      if (stats == null) {
        out.format(" ST:[none]");
      } else {
        out.format(" ST:[%s]", stats);
      }
      return null;
    }
  };

  // One output line per data page until the reader returns null.
  long pageIndex = 0;
  DataPage current = reader.readPage();
  while (current != null) {
    out.format("page %d:", pageIndex);
    current.accept(pageDetails);
    out.format(" SZ:%d", current.getUncompressedSize());
    out.format(" VC:%d", current.getValueCount());
    out.println();

    pageIndex++;
    current = reader.readPage();
  }
}
use of org.apache.parquet.column.page.DataPageV1 in project flink by apache.
the class AbstractColumnReader method readToVector.
/**
 * Reads {@code readNumber} values from this column reader into {@code vector}.
 *
 * <p>Values are consumed page by page. Whenever the current page is exhausted
 * ({@code endOfPageValueCount == valuesRead}) the next page is fetched from
 * {@code pageReader} and handed to {@code readPageV1} or {@code readPageV2}. Within a page,
 * dictionary-encoded values are either left as dictionary ids (with the dictionary attached
 * to the vector) or decoded eagerly; plain pages are decoded through {@code readBatch}.
 *
 * @param readNumber number of values to read
 * @param vector     destination vector; filled starting at row 0
 * @throws IOException if the page reader returns no page while values are still requested
 * @throws RuntimeException if the page is neither a {@code DataPageV1} nor a {@code DataPageV2}
 */
@Override
public final void readToVector(int readNumber, VECTOR vector) throws IOException {
    int rowId = 0;
    WritableIntVector dictionaryIds = null;
    if (dictionary != null) {
        dictionaryIds = vector.reserveDictionaryIds(readNumber);
    }
    while (readNumber > 0) {
        // Compute the number of values we want to read in this page.
        int leftInPage = (int) (endOfPageValueCount - valuesRead);
        if (leftInPage == 0) {
            DataPage page = pageReader.readPage();
            if (page == null) {
                // Without this guard the "unsupported page type" branch below would hit a
                // NullPointerException while building its own error message.
                throw new IOException(
                        "No more pages to read, but " + readNumber + " values are still requested");
            }
            if (page instanceof DataPageV1) {
                readPageV1((DataPageV1) page);
            } else if (page instanceof DataPageV2) {
                readPageV2((DataPageV2) page);
            } else {
                throw new RuntimeException("Unsupported page type: " + page.getClass());
            }
            // Recompute after loading the new page.
            leftInPage = (int) (endOfPageValueCount - valuesRead);
        }
        int num = Math.min(readNumber, leftInPage);
        if (isCurrentPageDictionaryEncoded) {
            // Read and decode dictionary ids.
            runLenDecoder.readDictionaryIds(num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder);
            if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) {
                // Column vector supports lazy decoding of dictionary values so just set the
                // dictionary. We can't do this if rowId != 0 AND the column doesn't have a
                // dictionary (i.e. some non-dictionary encoded values have already been added).
                vector.setDictionary(new ParquetDictionary(dictionary));
            } else {
                readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds);
            }
        } else {
            if (vector.hasDictionary() && rowId != 0) {
                // This batch already has dictionary encoded values but this new page is not.
                // The batch does not support a mix of dictionary and not, so we decode the
                // dictionary ids written so far (rows 0..rowId).
                readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds());
            }
            vector.setDictionary(null);
            readBatch(rowId, num, vector);
        }
        valuesRead += num;
        rowId += num;
        readNumber -= num;
    }
}
use of org.apache.parquet.column.page.DataPageV1 in project hive by apache.
the class VectorizedPrimitiveColumnReader method readPage.
/**
 * Fetches the next data page from {@code pageReader} and forwards it to
 * {@code readPageV1} or {@code readPageV2} according to the page's version.
 */
private void readPage() throws IOException {
  // TODO: Why is this a visitor?
  final DataPage.Visitor<Void> versionDispatcher = new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 v1Page) {
      readPageV1(v1Page);
      return null;
    }

    @Override
    public Void visit(DataPageV2 v2Page) {
      readPageV2(v2Page);
      return null;
    }
  };

  pageReader.readPage().accept(versionDispatcher);
}
Aggregations