Example usage of org.apache.parquet.column.statistics.BinaryStatistics in project drill by apache:
class ParquetFooterStatCollector, method collectColStat.
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();
  // Detect whether this footer was written by a Drill version that stored corrupt
  // DATE values, so date statistics can be corrected below.
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
  // map from column name to ColumnDescriptor
  Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
  // map from column name to ColumnChunkMetaData
  final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
  // map from column name to MajorType
  final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();
  // map from column name to SchemaElement
  final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
  // map from column name to column statistics.
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
  final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  // Index the requested fields by descriptor, schema element and chunk metadata
  // so they can be joined per-column in the loop below.
  for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
    if (fields.contains(schemaPath)) {
      columnDescMap.put(schemaPath, column);
    }
  }
  for (final SchemaElement se : fileMetaData.getSchema()) {
    final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
    if (fields.contains(schemaPath)) {
      schemaElementMap.put(schemaPath, se);
    }
  }
  for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
    if (fields.contains(schemaPath)) {
      columnChkMetaMap.put(schemaPath, colMetaData);
    }
  }
  for (final SchemaPath path : fields) {
    if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
      ColumnDescriptor columnDesc = columnDescMap.get(path);
      SchemaElement se = schemaElementMap.get(path);
      ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
      TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
      columnTypeMap.put(path, type);
      Statistics stat = metaData.getStatistics();
      if (type.getMinorType() == TypeProtos.MinorType.DATE) {
        // Reuse the already-fetched statistics instead of re-reading from metadata.
        stat = convertDateStatIfNecessary(stat, containsCorruptDates);
      }
      statMap.put(path, new ColumnStatistics(stat, type));
    } else {
      // Column is absent from the footer: if it is an implicit column (filename,
      // dir0, ...) synthesize single-valued VARCHAR statistics from its value.
      final String columnName = path.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        byte[] val = implicitColValues.get(columnName).getBytes();
        stat.setMinMaxFromBytes(val, val);
        statMap.put(path, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    // Fixed log message: was missing the verb "collect".
    logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }
  return statMap;
}
Example usage of org.apache.parquet.column.statistics.BinaryStatistics in project drill by apache:
class ParquetMetaStatCollector, method collectColStat.
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();
  // map from column to ColumnMetadata
  final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
  // map from column name to column statistics.
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
  for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
    SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
    columnMetadataMap.put(schemaPath, columnMetadata);
  }
  for (final SchemaPath schemaPath : fields) {
    final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(schemaPath);
    if (columnMetadata != null) {
      // Build statistics from the cached Parquet table metadata.
      final Object min = columnMetadata.getMinValue();
      final Object max = columnMetadata.getMaxValue();
      final Long numNull = columnMetadata.getNulls();
      // Declared here (not at loop scope) because they are only needed in this branch.
      final PrimitiveType.PrimitiveTypeName primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
      final OriginalType originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
      final Integer repetitionLevel = this.parquetTableMetadata.getRepetitionLevel(columnMetadata.getName());
      statMap.put(schemaPath, getStat(min, max, numNull, primitiveType, originalType, repetitionLevel));
    } else {
      // Column not present in metadata: if it is an implicit column (filename,
      // dir0, ...) synthesize single-valued VARCHAR statistics from its value.
      final String columnName = schemaPath.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        byte[] val = implicitColValues.get(columnName).getBytes();
        stat.setMinMaxFromBytes(val, val);
        statMap.put(schemaPath, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    // Fixed log message: was missing the verb "collect".
    logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }
  return statMap;
}
Example usage of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache:
class TestParquetFileWriter, method createFile.
/**
 * Writes a small two-block Parquet file at {@code path} for use by read-side tests.
 * Block 1 holds two pages for column a.b and three pages for column c.d; block 2
 * holds one page for each column. Extra key/value metadata ("foo" -> "bar" and the
 * file name mapped to itself) is recorded in the footer.
 */
private void createFile(Configuration configuration, Path path, MessageType schema) throws IOException {
  String[] pathAB = { "a", "b" };
  ColumnDescriptor colAB = schema.getColumnDescription(pathAB);
  String[] pathCD = { "c", "d" };
  ColumnDescriptor colCD = schema.getColumnDescription(pathCD);
  byte[] pageData1 = { 0, 1, 2, 3 };
  byte[] pageData2 = { 1, 2, 3, 4 };
  byte[] pageData3 = { 2, 3, 4, 5 };
  byte[] pageData4 = { 3, 4, 5, 6 };
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics statsAB = new BinaryStatistics();
  BinaryStatistics statsCD = new BinaryStatistics();
  ParquetFileWriter writer = new ParquetFileWriter(configuration, schema, path);
  writer.start();
  // First row group: 3 rows.
  writer.startBlock(3);
  writer.startColumn(colAB, 5, codec);
  writer.writeDataPage(2, 4, BytesInput.from(pageData1), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(pageData1), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.startColumn(colCD, 6, codec);
  writer.writeDataPage(2, 4, BytesInput.from(pageData2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(pageData2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(1, 4, BytesInput.from(pageData2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();
  // Second row group: 4 rows, one page per column.
  writer.startBlock(4);
  writer.startColumn(colAB, 7, codec);
  writer.writeDataPage(7, 4, BytesInput.from(pageData3), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.startColumn(colCD, 8, codec);
  writer.writeDataPage(8, 4, BytesInput.from(pageData4), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();
  final HashMap<String, String> extraMetaData = new HashMap<>();
  extraMetaData.put("foo", "bar");
  extraMetaData.put(path.getName(), path.getName());
  writer.end(extraMetaData);
}
Example usage of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache:
class TestCorruptDeltaByteArrays, method testColumnReaderImplWithCorruptPage.
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
// Single optional-less BINARY column "s" with max repetition/definition level 0.
ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
MemPageStore pages = new MemPageStore(0);
PageWriter memWriter = pages.getPageWriter(column);
ParquetProperties parquetProps = ParquetProperties.builder().withDictionaryEncoding(false).build();
// get generic repetition and definition level bytes to use for pages
ValuesWriter rdValues = parquetProps.newDefinitionLevelWriter(column);
for (int i = 0; i < 10; i += 1) {
rdValues.writeInteger(0);
}
// use a byte array backed BytesInput because it is reused
BytesInput rd = BytesInput.from(rdValues.getBytes().toByteArray());
DeltaByteArrayWriter writer = getDeltaByteArrayWriter();
String lastValue = null;
List<String> values = new ArrayList<String>();
// First page: 10 delta-byte-array encoded values; remember the last one so the
// second page's prefix state can be deliberately corrupted against it.
for (int i = 0; i < 10; i += 1) {
lastValue = str(i);
writer.writeBytes(Binary.fromString(lastValue));
values.add(lastValue);
}
memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, /* number of values in the page */
new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
pages.addRowCount(10);
// sets previous to new byte[0]
writer.reset();
// Re-seed the writer with the last value of page 1 so page 2 is encoded as if
// it continued from page 1 — reproducing the PARQUET-246 corruption scenario.
corruptWriter(writer, lastValue);
for (int i = 10; i < 20; i += 1) {
String value = str(i);
writer.writeBytes(Binary.fromString(value));
values.add(value);
}
memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, /* number of values in the page */
new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
pages.addRowCount(10);
final List<String> actualValues = new ArrayList<String>();
// Collect every decoded binary value as a UTF-8 string.
PrimitiveConverter converter = new PrimitiveConverter() {

@Override
public void addBinary(Binary value) {
actualValues.add(value.toStringUsingUTF8());
}
};
// Version "parquet-mr 1.6.0" is one of the writer versions whose corrupt delta
// pages the reader knows how to work around, so reading should still succeed.
ColumnReaderImpl columnReader = new ColumnReaderImpl(column, pages.getPageReader(column), converter, new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
while (actualValues.size() < columnReader.getTotalValueCount()) {
columnReader.writeCurrentValueToConverter();
columnReader.consume();
}
// All 20 values must round-trip despite the corrupted second page.
Assert.assertEquals(values, actualValues);
}
Example usage of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache:
class TestParquetFileWriter, method testWriteReadStatistics.
/**
 * Writes a two-row-group file with per-page binary and long statistics, reads the
 * footer back, and asserts that each column chunk's statistics are the merge of
 * its pages' statistics.
 */
@Test
public void testWriteReadStatistics() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  // Allow binary min/max to be read back even though they use signed comparison.
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
  String[] path1 = { "a", "b" };
  ColumnDescriptor c1 = schema.getColumnDescription(path1);
  String[] path2 = { "c", "d" };
  ColumnDescriptor c2 = schema.getColumnDescription(path2);
  byte[] bytes1 = { 0, 1, 2, 3 };
  byte[] bytes2 = { 1, 2, 3, 4 };
  byte[] bytes3 = { 2, 3, 4, 5 };
  byte[] bytes4 = { 3, 4, 5, 6 };
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  // Per-page statistics, named statsB<block>C<column>P<page>.
  BinaryStatistics statsB1C1P1 = new BinaryStatistics();
  BinaryStatistics statsB1C1P2 = new BinaryStatistics();
  LongStatistics statsB1C2P1 = new LongStatistics();
  LongStatistics statsB1C2P2 = new LongStatistics();
  BinaryStatistics statsB2C1P1 = new BinaryStatistics();
  LongStatistics statsB2C2P1 = new LongStatistics();
  statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
  statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
  // Use uppercase L suffix for long literals (lowercase l is easily misread as 1).
  statsB1C2P1.setMinMax(2L, 10L);
  statsB1C2P2.setMinMax(-6L, 4L);
  statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  statsB2C2P1.setMinMax(11L, 122L);
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 6, codec);
  w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 8, codec);
  w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  for (BlockMetaData block : readFooter.getBlocks()) {
    for (ColumnChunkMetaData col : block.getColumns()) {
      col.getPath();
    }
  }
  // Expected chunk-level statistics: the min/max union of each chunk's pages.
  BinaryStatistics bs1 = new BinaryStatistics();
  bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
  LongStatistics ls1 = new LongStatistics();
  ls1.setMinMax(-6L, 10L);
  BinaryStatistics bs2 = new BinaryStatistics();
  bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  LongStatistics ls2 = new LongStatistics();
  ls2.setMinMax(11L, 122L);
  {
    // assert stats are correct for the first block
    // (removed dead locals bsout/str/str2 — they were computed but never asserted)
    TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
  }
  {
    // assert stats are correct for the second block
    TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
  }
}
Aggregations