use of org.apache.parquet.column.Dictionary in project parquet-mr by apache.
the class ShowDictionaryCommand method run.
/**
 * Prints the dictionary of the selected column for every row group of a single Parquet file.
 *
 * <p>Exactly one target file must be supplied. For each row group that exposes a dictionary
 * reader, the dictionary page of {@code column} is decoded and every entry is logged together
 * with its dictionary id. Row groups in which the column has no dictionary page are reported
 * and skipped instead of failing.
 *
 * @return {@code 0} once all row groups have been processed
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if the column's primitive type is not one of BINARY, INT32,
 *     INT64, FLOAT or DOUBLE
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  // try-with-resources so the underlying file is released even when decoding throws.
  try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
    MessageType schema = reader.getFileMetaData().getSchema();
    ColumnDescriptor descriptor = Util.descriptor(column, schema);
    PrimitiveType type = Util.primitive(column, schema);
    Preconditions.checkNotNull(type);
    DictionaryPageReadStore dictionaryReader;
    int rowGroup = 0;
    while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
      DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
      if (page == null) {
        // The column chunk in this row group is not dictionary-encoded; nothing to decode.
        console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
      } else {
        Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
        console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
        for (int i = 0; i <= dict.getMaxId(); i += 1) {
          String id = String.format("%6d", i);
          switch (type.getPrimitiveTypeName()) {
            case BINARY:
              // UTF8-annotated binaries are shown as text, everything else as raw bytes.
              if (type.getOriginalType() == OriginalType.UTF8) {
                console.info("{}: {}", id, Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
              } else {
                console.info("{}: {}", id, Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
              }
              break;
            case INT32:
              console.info("{}: {}", id, dict.decodeToInt(i));
              break;
            case INT64:
              console.info("{}: {}", id, dict.decodeToLong(i));
              break;
            case FLOAT:
              console.info("{}: {}", id, dict.decodeToFloat(i));
              break;
            case DOUBLE:
              console.info("{}: {}", id, dict.decodeToDouble(i));
              break;
            default:
              throw new IllegalArgumentException("Unknown dictionary type: " + type.getPrimitiveTypeName());
          }
        }
      }
      // Advance the reader past this row group before asking for the next dictionary reader.
      reader.skipNextRowGroup();
      rowGroup += 1;
    }
  }
  console.info("");
  return 0;
}
use of org.apache.parquet.column.Dictionary in project parquet-mr by apache.
the class DictionaryFilter method expandDictionary.
/**
 * Decodes every entry of a column chunk's dictionary page into a set of values.
 *
 * @param meta metadata of the column chunk whose dictionary should be expanded
 * @param <T> value type the caller expects the dictionary entries to have
 * @return the distinct dictionary values, or {@code null} when the values cannot be determined:
 *     either the chunk has no dictionary page or its primitive type is not supported here
 * @throws IOException if reading the dictionary page fails
 */
@SuppressWarnings("unchecked")
private <T extends Comparable<T>> Set<T> expandDictionary(ColumnChunkMetaData meta) throws IOException {
  ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1);
  DictionaryPage page = dictionaries.readDictionaryPage(col);
  // the chunk may not be dictionary-encoded
  if (page == null) {
    return null;
  }
  Dictionary dict = page.getEncoding().initDictionary(col, page);
  // Raw on purpose: the decode methods return different types and the caller picks T.
  Set dictSet = new HashSet<T>();
  for (int i = 0; i <= dict.getMaxId(); i++) {
    switch (meta.getType()) {
      case BINARY:
        dictSet.add(dict.decodeToBinary(i));
        break;
      case INT32:
        dictSet.add(dict.decodeToInt(i));
        break;
      case INT64:
        dictSet.add(dict.decodeToLong(i));
        break;
      case FLOAT:
        dictSet.add(dict.decodeToFloat(i));
        break;
      case DOUBLE:
        dictSet.add(dict.decodeToDouble(i));
        break;
      default:
        // An empty set would claim "this chunk contains no values"; report "unknown" instead,
        // using the same null signal as the missing-dictionary case above. Returning here also
        // avoids logging the same warning once per dictionary entry.
        LOG.warn("Unknown dictionary type {}", meta.getType());
        return null;
    }
  }
  return (Set<T>) dictSet;
}
use of org.apache.parquet.column.Dictionary in project parquet-mr by apache.
the class TestDictionary method initDicReader.
/**
 * Builds a dictionary-backed values reader from the dictionary page produced by a writer.
 *
 * <p>The writer's dictionary page is obtained via {@code toDictPageAndClose()} and copied, then
 * loaded as a PLAIN dictionary for a single-path column named "foo" of the given type.
 *
 * @param cw writer whose dictionary page is taken (and which is closed by that call)
 * @param type primitive type used for the column descriptor
 * @return a reader that resolves values through the decoded dictionary
 * @throws IOException if producing or decoding the dictionary page fails
 */
private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type) throws IOException {
  DictionaryPage pageCopy = cw.toDictPageAndClose().copy();
  ColumnDescriptor fooColumn = new ColumnDescriptor(new String[] { "foo" }, type, 0, 0);
  Dictionary decoded = PLAIN.initDictionary(fooColumn, pageCopy);
  return new DictionaryValuesReader(decoded);
}
Aggregations