use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
the class ShowPagesCommand method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
String source = targets.get(0);
ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
MessageType schema = reader.getFileMetaData().getSchema();
Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
if (this.columns == null || this.columns.isEmpty()) {
for (ColumnDescriptor descriptor : schema.getColumns()) {
columns.put(descriptor, primitive(schema, descriptor.getPath()));
}
} else {
for (String column : this.columns) {
columns.put(descriptor(column, schema), primitive(column, schema));
}
}
CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
// accumulate formatted lines to print by column
Map<String, List<String>> formatted = Maps.newLinkedHashMap();
PageFormatter formatter = new PageFormatter();
PageReadStore pageStore;
int rowGroupNum = 0;
while ((pageStore = reader.readNextRowGroup()) != null) {
for (ColumnDescriptor descriptor : columns.keySet()) {
List<String> lines = formatted.get(columnName(descriptor));
if (lines == null) {
lines = Lists.newArrayList();
formatted.put(columnName(descriptor), lines);
}
formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
PageReader pages = pageStore.getPageReader(descriptor);
DictionaryPage dict = pages.readDictionaryPage();
if (dict != null) {
lines.add(formatter.format(dict));
}
DataPage page;
while ((page = pages.readPage()) != null) {
lines.add(formatter.format(page));
}
}
rowGroupNum += 1;
}
// TODO: Show total column size and overall size per value in the column summary line
for (String columnName : formatted.keySet()) {
console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-')));
console.info(formatter.getHeader());
for (String line : formatted.get(columnName)) {
console.info(line);
}
console.info("");
}
return 0;
}
use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
the class TestCorruptDeltaByteArrays method testColumnReaderImplWithCorruptPage.
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
MemPageStore pages = new MemPageStore(0);
PageWriter memWriter = pages.getPageWriter(column);
ParquetProperties parquetProps = ParquetProperties.builder().withDictionaryEncoding(false).build();
// get generic repetition and definition level bytes to use for pages
ValuesWriter rdValues = parquetProps.newDefinitionLevelWriter(column);
for (int i = 0; i < 10; i += 1) {
rdValues.writeInteger(0);
}
// use a byte array backed BytesInput because it is reused
BytesInput rd = BytesInput.from(rdValues.getBytes().toByteArray());
DeltaByteArrayWriter writer = getDeltaByteArrayWriter();
String lastValue = null;
List<String> values = new ArrayList<String>();
for (int i = 0; i < 10; i += 1) {
lastValue = str(i);
writer.writeBytes(Binary.fromString(lastValue));
values.add(lastValue);
}
memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, /* number of values in the page */
new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
pages.addRowCount(10);
// sets previous to new byte[0]
writer.reset();
corruptWriter(writer, lastValue);
for (int i = 10; i < 20; i += 1) {
String value = str(i);
writer.writeBytes(Binary.fromString(value));
values.add(value);
}
memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, /* number of values in the page */
new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
pages.addRowCount(10);
final List<String> actualValues = new ArrayList<String>();
PrimitiveConverter converter = new PrimitiveConverter() {
@Override
public void addBinary(Binary value) {
actualValues.add(value.toStringUsingUTF8());
}
};
ColumnReaderImpl columnReader = new ColumnReaderImpl(column, pages.getPageReader(column), converter, new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
while (actualValues.size() < columnReader.getTotalValueCount()) {
columnReader.writeCurrentValueToConverter();
columnReader.consume();
}
Assert.assertEquals(values, actualValues);
}
use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
the class SchemaCompatibilityValidator method validateColumn.
private <T extends Comparable<T>> void validateColumn(Column<T> column) {
ColumnPath path = column.getColumnPath();
Class<?> alreadySeen = columnTypesEncountered.get(path);
if (alreadySeen != null && !alreadySeen.equals(column.getColumnType())) {
throw new IllegalArgumentException("Column: " + path.toDotString() + " was provided with different types in the same predicate." + " Found both: (" + alreadySeen + ", " + column.getColumnType() + ")");
}
if (alreadySeen == null) {
columnTypesEncountered.put(path, column.getColumnType());
}
ColumnDescriptor descriptor = getColumnDescriptor(path);
if (descriptor == null) {
// updateNull() a value is missing, so this will be handled correctly.
return;
}
if (descriptor.getMaxRepetitionLevel() > 0) {
throw new IllegalArgumentException("FilterPredicates do not currently support repeated columns. " + "Column " + path.toDotString() + " is repeated.");
}
ValidTypeMap.assertTypeValid(column, descriptor.getType());
}
use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
the class MessageType method getColumns.
public List<ColumnDescriptor> getColumns() {
List<String[]> paths = this.getPaths(0);
List<ColumnDescriptor> columns = new ArrayList<ColumnDescriptor>(paths.size());
for (String[] path : paths) {
// TODO: optimize this
PrimitiveType primitiveType = getType(path).asPrimitiveType();
columns.add(new ColumnDescriptor(path, primitiveType, getMaxRepetitionLevel(path), getMaxDefinitionLevel(path)));
}
return columns;
}
use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
the class PrimitiveColumnIO method setLevels.
@Override
void setLevels(int r, int d, String[] fieldPath, int[] fieldIndexPath, List<ColumnIO> repetition, List<ColumnIO> path) {
super.setLevels(r, d, fieldPath, fieldIndexPath, repetition, path);
PrimitiveType type = getType().asPrimitiveType();
this.columnDescriptor = new ColumnDescriptor(fieldPath, type, getRepetitionLevel(), getDefinitionLevel());
this.path = path.toArray(new ColumnIO[path.size()]);
}
Aggregations