Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
The applyPredicate method of the class ColumnIndexFilter: the OffsetIndex and ColumnIndex are fetched from the ColumnIndexStore and used to build the RowRanges matching the predicate.
private RowRanges applyPredicate(Column<?> column, Function<ColumnIndex, PrimitiveIterator.OfInt> func,
    RowRanges rangesForMissingColumns) {
  ColumnPath columnPath = column.getColumnPath();
  if (!columns.contains(columnPath)) {
    return rangesForMissingColumns;
  }

  OffsetIndex oi = columnIndexStore.getOffsetIndex(columnPath);
  ColumnIndex ci = columnIndexStore.getColumnIndex(columnPath);
  if (ci == null) {
    LOGGER.info("No column index for column {} is available; Unable to filter on this column", columnPath);
    return allRows();
  }

  return RowRanges.create(rowCount, func.apply(ci), oi);
}
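For context, here is a minimal standalone sketch (not part of parquet-mr) that walks the OffsetIndex of every column chunk in a file and prints the page locations. The input path and the printed fields are illustrative assumptions; the reader and index calls mirror those used in the snippets on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;

public class OffsetIndexDump {
  public static void main(String[] args) throws Exception {
    Path file = new Path(args[0]); // hypothetical input file
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(file, new Configuration()))) {
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData chunk : block.getColumns()) {
          OffsetIndex oi = reader.readOffsetIndex(chunk);
          if (oi == null) {
            continue; // files written without page indexes have no offset index
          }
          for (int i = 0; i < oi.getPageCount(); i++) {
            System.out.printf("%s page %d: offset=%d, compressedSize=%d, firstRow=%d%n",
                chunk.getPath(), i, oi.getOffset(i), oi.getCompressedPageSize(i), oi.getFirstRowIndex(i));
          }
        }
      }
    }
  }
}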
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
The processChunk method of the class ColumnMasker: chunks that are not masked are copied verbatim, together with their bloom filter, ColumnIndex and OffsetIndex, via ParquetFileWriter.appendColumnChunk.
private void processChunk(ColumnDescriptor descriptor, ColumnChunkMetaData chunk, ColumnReadStoreImpl crStore,
    TransParquetFileReader reader, ParquetFileWriter writer, MessageType schema, Set<ColumnPath> paths,
    MaskMode maskMode) throws IOException {
  reader.setStreamPosition(chunk.getStartingPos());
  if (paths.contains(chunk.getPath())) {
    if (maskMode.equals(MaskMode.NULLIFY)) {
      Type.Repetition repetition = descriptor.getPrimitiveType().getRepetition();
      if (repetition.equals(Type.Repetition.REQUIRED)) {
        throw new IOException("Required column [" + descriptor.getPrimitiveType().getName() + "] cannot be nullified");
      }
      nullifyColumn(descriptor, chunk, crStore, writer, schema);
    } else {
      throw new UnsupportedOperationException("Only nullify is supported for now");
    }
  } else {
    // Columns that are not masked are copied as-is along with their indexes
    BloomFilter bloomFilter = reader.readBloomFilter(chunk);
    ColumnIndex columnIndex = reader.readColumnIndex(chunk);
    OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
    writer.appendColumnChunk(descriptor, reader.getStream(), chunk, bloomFilter, columnIndex, offsetIndex);
  }
}
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
The processChunk method of the class CompressionConverter: when present, the OffsetIndex supplies the per-page row counts needed to rewrite V1 data pages with a different compression codec.
private void processChunk(TransParquetFileReader reader, ParquetFileWriter writer, ColumnChunkMetaData chunk,
    String createdBy, CompressionCodecName codecName) throws IOException {
  CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
  CompressionCodecFactory.BytesInputDecompressor decompressor = codecFactory.getDecompressor(chunk.getCodec());
  CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(codecName);
  ColumnIndex columnIndex = reader.readColumnIndex(chunk);
  OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
  reader.setStreamPosition(chunk.getStartingPos());
  DictionaryPage dictionaryPage = null;
  long readValues = 0;
  Statistics statistics = null;
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  int pageIndex = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    PageHeader pageHeader = reader.readPageHeader();
    int compressedPageSize = pageHeader.getCompressed_page_size();
    byte[] pageLoad;
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new IOException("has more than one dictionary page in column chunk");
        }
        DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor,
            pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        writer.writeDictionaryPage(new DictionaryPage(BytesInput.from(pageLoad),
            pageHeader.getUncompressed_page_size(), dictPageHeader.getNum_values(),
            converter.getEncoding(dictPageHeader.getEncoding())));
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor,
            pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV1.getStatistics(),
            columnIndex, pageIndex, converter);
        readValues += headerV1.getNum_values();
        if (offsetIndex != null) {
          // V1 page headers do not carry a row count; derive it from the OffsetIndex
          long rowCount = 1 + offsetIndex.getLastRowIndex(pageIndex, totalChunkValues)
              - offsetIndex.getFirstRowIndex(pageIndex);
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad), statistics, toIntWithCheck(rowCount),
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()));
        } else {
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad), statistics,
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()));
        }
        pageIndex++;
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        int rlLength = headerV2.getRepetition_levels_byte_length();
        BytesInput rlLevels = readBlockAllocate(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        BytesInput dlLevels = readBlockAllocate(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
        pageLoad = translatePageLoad(reader, headerV2.is_compressed, compressor, decompressor,
            payLoadLength, rawDataLength);
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV2.getStatistics(),
            columnIndex, pageIndex, converter);
        readValues += headerV2.getNum_values();
        writer.writeDataPageV2(headerV2.getNum_rows(), headerV2.getNum_nulls(), headerV2.getNum_values(),
            rlLevels, dlLevels, converter.getEncoding(headerV2.getEncoding()), BytesInput.from(pageLoad),
            rawDataLength, statistics);
        pageIndex++;
        break;
      default:
        LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
        break;
    }
  }
}
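The arithmetic in the DATA_PAGE branch is worth calling out: because V1 page headers do not record row counts, the count is reconstructed from the first and last row indexes reported by the OffsetIndex. A small hedged sketch of that derivation, where the helper method name is an assumption and not parquet-mr API:

// Hypothetical helper: rows in page i = lastRowIndex(i) - firstRowIndex(i) + 1.
// The second argument to getLastRowIndex bounds the final page of the chunk.
static long pageRowCount(OffsetIndex offsetIndex, int pageIndex, long totalRowCount) {
  return offsetIndex.getLastRowIndex(pageIndex, totalRowCount) - offsetIndex.getFirstRowIndex(pageIndex) + 1;
}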
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
The validateColumns method of the class ColumnEncryptorTest: the page counts of the input and output OffsetIndexes are compared before the pages themselves are validated.
private void validateColumns(TransParquetFileReader inReader, TransParquetFileReader outReader,
    List<ColumnChunkMetaData> inColumns, List<ColumnChunkMetaData> outColumns) throws IOException {
  for (int i = 0; i < inColumns.size(); i++) {
    ColumnChunkMetaData inChunk = inColumns.get(i);
    ColumnChunkMetaData outChunk = outColumns.get(i);
    OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
    OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);
    assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
    if (outChunk.isEncrypted()) {
      continue;
    }
    validatePages(inReader, outReader, inOffsetIndex, outOffsetIndex);
  }
}
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
The test method of the class TestColumnChunkPageWriteStore: after writing a single V2 data page, the test reads back the ColumnIndex and OffsetIndex and asserts the recorded statistics, page count, offset and size.
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(
      Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] { 0, 1, 2 }, new byte[] { 0, 1, 2, 3 });
  long pageOffset;
  long pageSize;
  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels,
          dataEncoding, data, statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file,
        footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    // Checking column/offset indexes for the one page
    ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
    ColumnIndex columnIndex = reader.readColumnIndex(column);
    assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
    assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
    assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
    assertFalse(columnIndex.getNullPages().get(0));
    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
    assertEquals(1, offsetIndex.getPageCount());
    assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(pageOffset, offsetIndex.getOffset(0));
    reader.close();
  }
}