Example 6 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project drill by axbaretto.

the class ParquetColumnChunkPageWriteStore, method flushToFileWriter.

/**
 * Writes the column chunks in the corresponding row group
 * @param writer the parquet file writer
 * @throws IOException if the file can not be created
 */
public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
    for (ColumnDescriptor path : schema.getColumns()) {
        ColumnChunkPageWriter pageWriter = writers.get(path);
        pageWriter.writeToFileWriter(writer);
    }
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor)
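
As a quick sketch of where those ColumnDescriptor instances come from: the loop above simply walks schema.getColumns(). The snippet below is a minimal, standalone illustration; the schema string is made up and not taken from the Drill code.

import java.util.Arrays;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnDescriptorTour {
    public static void main(String[] args) {
        // Made-up schema; any MessageType exposes its leaf columns the same way.
        MessageType schema = MessageTypeParser.parseMessageType(
            "message example { required binary name (UTF8); optional int32 age; }");
        // The same call the flushToFileWriter loop above iterates over.
        for (ColumnDescriptor column : schema.getColumns()) {
            System.out.println(Arrays.toString(column.getPath())
                + " maxDef=" + column.getMaxDefinitionLevel()
                + " maxRep=" + column.getMaxRepetitionLevel());
        }
    }
}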

Example 7 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

the class ParquetFileReader, method readNextRowGroup.

/**
 * Reads all the columns requested from the row group at the current file position.
 * @throws IOException if an error occurs while reading
 * @return the PageReadStore which can provide PageReaders for each column.
 */
public PageReadStore readNextRowGroup() throws IOException {
    if (currentBlock == blocks.size()) {
        return null;
    }
    BlockMetaData block = blocks.get(currentBlock);
    if (block.getRowCount() == 0) {
        throw new RuntimeException("Illegal row group of 0 rows");
    }
    this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
    // prepare the list of consecutive chunks to read them in one scan
    List<ConsecutiveChunkList> allChunks = new ArrayList<ConsecutiveChunkList>();
    ConsecutiveChunkList currentChunks = null;
    for (ColumnChunkMetaData mc : block.getColumns()) {
        ColumnPath pathKey = mc.getPath();
        BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
        ColumnDescriptor columnDescriptor = paths.get(pathKey);
        if (columnDescriptor != null) {
            long startingPos = mc.getStartingPos();
            // first chunk or not consecutive => new list
            if (currentChunks == null || currentChunks.endPos() != startingPos) {
                currentChunks = new ConsecutiveChunkList(startingPos);
                allChunks.add(currentChunks);
            }
            currentChunks.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) mc.getTotalSize()));
        }
    }
    // actually read all the chunks
    for (ConsecutiveChunkList consecutiveChunks : allChunks) {
        final List<Chunk> chunks = consecutiveChunks.readAll(f);
        for (Chunk chunk : chunks) {
            currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
        }
    }
    // avoid re-reading bytes; the dictionary reader is used after this call
    if (nextDictionaryReader != null) {
        nextDictionaryReader.setRowGroup(currentRowGroup);
    }
    advanceToNextBlock();
    return currentRowGroup;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath)
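
A minimal caller for readNextRowGroup(), sketched under the assumption that the file is opened through HadoopInputFile; the file path is a placeholder. It loops until the method returns null, which is how the reader signals that every row group has been consumed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class RowGroupScan {
    public static void main(String[] args) throws Exception {
        // "data.parquet" is a placeholder; pass a real file path as the first argument.
        Path file = new Path(args.length > 0 ? args[0] : "data.parquet");
        try (ParquetFileReader reader =
                ParquetFileReader.open(HadoopInputFile.fromPath(file, new Configuration()))) {
            PageReadStore rowGroup;
            int index = 0;
            // readNextRowGroup() returns null once every row group has been read.
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                System.out.println("row group " + index++ + ": " + rowGroup.getRowCount() + " rows");
            }
        }
    }
}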

Example 8 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

the class ColumnChunkPageWriteStore, method flushToFileWriter.

public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
    for (ColumnDescriptor path : schema.getColumns()) {
        ColumnChunkPageWriter pageWriter = writers.get(path);
        pageWriter.writeToFileWriter(writer);
    }
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor)
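
The writers map in this example is keyed by ColumnDescriptor, which works because ColumnDescriptor bases equality on the column path. A small, self-contained sketch of that property (the schema here is invented for illustration):

import java.util.HashMap;
import java.util.Map;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class DescriptorKeyedMap {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message m { required int64 id; optional double score; }");
        // ColumnDescriptor equality is based on the column path, so descriptors
        // work as map keys the same way the writers map in the example does.
        Map<ColumnDescriptor, String> labels = new HashMap<>();
        for (ColumnDescriptor column : schema.getColumns()) {
            labels.put(column, String.join(".", column.getPath()));
        }
        // A second getColumns() call yields equal descriptors, so lookups still hit.
        for (ColumnDescriptor column : schema.getColumns()) {
            System.out.println(labels.get(column));
        }
    }
}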

Example 9 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

the class TestParquetFileWriter, method createFile.

private void createFile(Configuration configuration, Path path, MessageType schema) throws IOException {
    String[] path1 = { "a", "b" };
    ColumnDescriptor c1 = schema.getColumnDescription(path1);
    String[] path2 = { "c", "d" };
    ColumnDescriptor c2 = schema.getColumnDescription(path2);
    byte[] bytes1 = { 0, 1, 2, 3 };
    byte[] bytes2 = { 1, 2, 3, 4 };
    byte[] bytes3 = { 2, 3, 4, 5 };
    byte[] bytes4 = { 3, 4, 5, 6 };
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    BinaryStatistics stats1 = new BinaryStatistics();
    BinaryStatistics stats2 = new BinaryStatistics();
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    w.startBlock(3);
    w.startColumn(c1, 5, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 6, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(c1, 7, codec);
    w.writeDataPage(7, 4, BytesInput.from(bytes3), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 8, codec);
    w.writeDataPage(8, 4, BytesInput.from(bytes4), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    final HashMap<String, String> extraMetaData = new HashMap<String, String>();
    extraMetaData.put("foo", "bar");
    extraMetaData.put(path.getName(), path.getName());
    w.end(extraMetaData);
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics)
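
To see what createFile actually produced, the footer can be read back through the public ParquetFileReader API. This is a hedged sketch, not part of the test above; the file path is a placeholder for whatever path the test writes to.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterInspect {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point it at the file produced by createFile above.
        Path file = new Path(args.length > 0 ? args[0] : "test.parquet");
        try (ParquetFileReader reader =
                ParquetFileReader.open(HadoopInputFile.fromPath(file, new Configuration()))) {
            for (BlockMetaData block : reader.getFooter().getBlocks()) {
                System.out.println("row group with " + block.getRowCount() + " rows");
                for (ColumnChunkMetaData chunk : block.getColumns()) {
                    System.out.println("  " + chunk.getPath() + ": "
                        + chunk.getValueCount() + " values, "
                        + chunk.getTotalSize() + " bytes");
                }
            }
        }
    }
}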

Example 10 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

the class ShowDictionaryCommand, method run.

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
    MessageType schema = reader.getFileMetaData().getSchema();
    ColumnDescriptor descriptor = Util.descriptor(column, schema);
    PrimitiveType type = Util.primitive(column, schema);
    Preconditions.checkNotNull(type);
    DictionaryPageReadStore dictionaryReader;
    int rowGroup = 0;
    while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
        DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
        Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
        console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column, page.getCompressedSize());
        for (int i = 0; i <= dict.getMaxId(); i += 1) {
            switch(type.getPrimitiveTypeName()) {
                case BINARY:
                    if (type.getOriginalType() == OriginalType.UTF8) {
                        console.info("{}: {}", String.format("%6d", i), Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
                    } else {
                        console.info("{}: {}", String.format("%6d", i), Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
                    }
                    break;
                case INT32:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToInt(i));
                    break;
                case INT64:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToLong(i));
                    break;
                case FLOAT:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToFloat(i));
                    break;
                case DOUBLE:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToDouble(i));
                    break;
                default:
                    throw new IllegalArgumentException("Unknown dictionary type: " + type.getPrimitiveTypeName());
            }
        }
        reader.skipNextRowGroup();
        rowGroup += 1;
    }
    console.info("");
    return 0;
}
Also used : Dictionary(org.apache.parquet.column.Dictionary) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PrimitiveType(org.apache.parquet.schema.PrimitiveType) DictionaryPageReadStore(org.apache.parquet.column.page.DictionaryPageReadStore) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)
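
Util.descriptor is a parquet-cli helper; a plausible stand-in (an assumption, not the actual implementation) is to split the dotted column name and look it up with MessageType.getColumnDescription, as in this small sketch:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class DescriptorLookup {
    // Hypothetical stand-in for Util.descriptor(column, schema): split the dotted
    // name into a path and ask the schema for the matching leaf descriptor.
    static ColumnDescriptor descriptor(String column, MessageType schema) {
        return schema.getColumnDescription(column.split("\\."));
    }

    public static void main(String[] args) {
        // Invented nested schema for illustration.
        MessageType schema = MessageTypeParser.parseMessageType(
            "message m { required group user { required binary name (UTF8); } }");
        ColumnDescriptor d = descriptor("user.name", schema);
        System.out.println(String.join(".", d.getPath()) + " type=" + d.getPrimitiveType());
    }
}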

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 88
MessageType (org.apache.parquet.schema.MessageType): 33
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 18
Test (org.testng.annotations.Test): 18
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor): 16
ArrayList (java.util.ArrayList): 16
GroupType (org.apache.parquet.schema.GroupType): 14
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 12
Test (org.junit.Test): 12
Domain (com.facebook.presto.common.predicate.Domain): 11
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 11
Path (org.apache.hadoop.fs.Path): 11
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 11
List (java.util.List): 10
ImmutableList (com.google.common.collect.ImmutableList): 9
HashMap (java.util.HashMap): 9
Configuration (org.apache.hadoop.conf.Configuration): 9
Type (org.apache.parquet.schema.Type): 9
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle): 8
IOException (java.io.IOException): 7