Use of org.apache.parquet.column.ColumnDescriptor in project drill by axbaretto.
The class ParquetColumnChunkPageWriteStore, method flushToFileWriter:
/**
 * Writes the column chunks in the corresponding row group
 * @param writer the parquet file writer
 * @throws IOException if the file cannot be created
 */
public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
  for (ColumnDescriptor path : schema.getColumns()) {
    ColumnChunkPageWriter pageWriter = writers.get(path);
    pageWriter.writeToFileWriter(writer);
  }
}
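For context, a hedged sketch of where this flush sits in the write lifecycle: the caller owns the ParquetFileWriter and brackets the flush with startBlock/endBlock, mirroring the writer calls in the createFile test further down. The pageStore and rowCount names are assumptions for illustration, not part of the snippet above.

// pageStore: a populated ParquetColumnChunkPageWriteStore (hypothetical variable)
writer.start();
writer.startBlock(rowCount); // rowCount: rows buffered in this row group (assumed)
pageStore.flushToFileWriter(writer);
writer.endBlock();
writer.end(new HashMap<String, String>()); // footer key/value metadata, empty here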
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class ParquetFileReader, method readNextRowGroup:
/**
 * Reads all the columns requested from the row group at the current file position.
 * @throws IOException if an error occurs while reading
 * @return the PageReadStore which can provide PageReaders for each column.
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
  // prepare the list of consecutive chunks to read them in one scan
  List<ConsecutiveChunkList> allChunks = new ArrayList<ConsecutiveChunkList>();
  ConsecutiveChunkList currentChunks = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first chunk or not consecutive => new list
      if (currentChunks == null || currentChunks.endPos() != startingPos) {
        currentChunks = new ConsecutiveChunkList(startingPos);
        allChunks.add(currentChunks);
      }
      currentChunks.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) mc.getTotalSize()));
    }
  }
  // actually read all the chunks
  for (ConsecutiveChunkList consecutiveChunks : allChunks) {
    final List<Chunk> chunks = consecutiveChunks.readAll(f);
    for (Chunk chunk : chunks) {
      currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
    }
  }
  // avoid re-reading bytes: the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }
  advanceToNextBlock();
  return currentRowGroup;
}
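A minimal caller-side sketch of this API, assuming a local file at a made-up path; readNextRowGroup returns null once every row group has been consumed:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;

try (ParquetFileReader reader =
    ParquetFileReader.open(new Configuration(), new Path("/tmp/example.parquet"))) { // hypothetical path
  PageReadStore rowGroup;
  while ((rowGroup = reader.readNextRowGroup()) != null) {
    System.out.println("read a row group of " + rowGroup.getRowCount() + " rows");
  }
}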
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class ColumnChunkPageWriteStore, method flushToFileWriter:
public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
  for (ColumnDescriptor path : schema.getColumns()) {
    ColumnChunkPageWriter pageWriter = writers.get(path);
    pageWriter.writeToFileWriter(writer);
  }
}
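Both flushToFileWriter variants drive their loop with schema.getColumns(). A self-contained sketch of how a MessageType exposes its leaf columns as ColumnDescriptors; the schema string here is made up for illustration:

import java.util.Arrays;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

MessageType schema = MessageTypeParser.parseMessageType(
    "message example { required binary name (UTF8); optional int32 age; }");
for (ColumnDescriptor col : schema.getColumns()) {
  // each descriptor carries the leaf path plus its max repetition/definition levels
  System.out.println(Arrays.toString(col.getPath())
      + " r=" + col.getMaxRepetitionLevel()
      + " d=" + col.getMaxDefinitionLevel());
}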
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestParquetFileWriter, method createFile:
private void createFile(Configuration configuration, Path path, MessageType schema) throws IOException {
  String[] path1 = { "a", "b" };
  ColumnDescriptor c1 = schema.getColumnDescription(path1);
  String[] path2 = { "c", "d" };
  ColumnDescriptor c2 = schema.getColumnDescription(path2);
  byte[] bytes1 = { 0, 1, 2, 3 };
  byte[] bytes2 = { 1, 2, 3, 4 };
  byte[] bytes3 = { 2, 3, 4, 5 };
  byte[] bytes4 = { 3, 4, 5, 6 };
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics stats1 = new BinaryStatistics();
  BinaryStatistics stats2 = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 6, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes3), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 8, codec);
  w.writeDataPage(8, 4, BytesInput.from(bytes4), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  final HashMap<String, String> extraMetaData = new HashMap<String, String>();
  extraMetaData.put("foo", "bar");
  extraMetaData.put(path.getName(), path.getName());
  w.end(extraMetaData);
}
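To sanity-check what createFile wrote, a short sketch that reads the footer back, assuming the same configuration and path values as above; it should report two row groups of 3 and 4 rows:

import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

ParquetMetadata footer = ParquetFileReader.readFooter(configuration, path);
for (BlockMetaData block : footer.getBlocks()) {
  System.out.println("row group: " + block.getRowCount() + " rows, "
      + block.getColumns().size() + " column chunks");
}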
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class ShowDictionaryCommand, method run:
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
  MessageType schema = reader.getFileMetaData().getSchema();
  ColumnDescriptor descriptor = Util.descriptor(column, schema);
  PrimitiveType type = Util.primitive(column, schema);
  Preconditions.checkNotNull(type);
  DictionaryPageReadStore dictionaryReader;
  int rowGroup = 0;
  while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
    DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
    if (page == null) {
      // no dictionary page: this column chunk is not dictionary-encoded in this row group
      console.info("\nRow group {} dictionary for \"{}\": NONE", rowGroup, column);
      reader.skipNextRowGroup();
      rowGroup += 1;
      continue;
    }
    Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
    console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
    for (int i = 0; i <= dict.getMaxId(); i += 1) {
      switch (type.getPrimitiveTypeName()) {
        case BINARY:
          if (type.getOriginalType() == OriginalType.UTF8) {
            console.info("{}: {}", String.format("%6d", i), Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
          } else {
            console.info("{}: {}", String.format("%6d", i), Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
          }
          break;
        case INT32:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToInt(i));
          break;
        case INT64:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToLong(i));
          break;
        case FLOAT:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToFloat(i));
          break;
        case DOUBLE:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToDouble(i));
          break;
        default:
          throw new IllegalArgumentException("Unknown dictionary type: " + type.getPrimitiveTypeName());
      }
    }
    reader.skipNextRowGroup();
    rowGroup += 1;
  }
  console.info("");
  return 0;
}
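Util.descriptor and Util.primitive above are parquet-cli helpers. A hedged stand-in for the descriptor lookup using only the public MessageType API; the helper name and the dot-splitting of the column argument are assumptions, not the library's implementation:

// hypothetical equivalent of Util.descriptor: resolve a dotted column name
// (e.g. "a.b") to the matching leaf ColumnDescriptor
static ColumnDescriptor descriptor(String column, MessageType schema) {
  return schema.getColumnDescription(column.split("\\."));
}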