
Example 86 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

Class ParquetFileWriter, method appendRowGroup:

public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, boolean dropColumns) throws IOException {
    startBlock(rowGroup.getRowCount());
    Map<String, ColumnChunkMetaData> columnsToCopy = new HashMap<String, ColumnChunkMetaData>();
    for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
        columnsToCopy.put(chunk.getPath().toDotString(), chunk);
    }
    List<ColumnChunkMetaData> columnsInOrder = new ArrayList<ColumnChunkMetaData>();
    for (ColumnDescriptor descriptor : schema.getColumns()) {
        String path = ColumnPath.get(descriptor.getPath()).toDotString();
        ColumnChunkMetaData chunk = columnsToCopy.remove(path);
        if (chunk != null) {
            columnsInOrder.add(chunk);
        } else {
            throw new IllegalArgumentException(String.format("Missing column '%s', cannot copy row group: %s", path, rowGroup));
        }
    }
    // complain if some columns would be dropped and that's not okay
    if (!dropColumns && !columnsToCopy.isEmpty()) {
        throw new IllegalArgumentException(String.format("Columns cannot be copied (missing from target schema): %s", String.join(", ", columnsToCopy.keySet())));
    }
    // copy the data for all chunks
    long start = -1;
    long length = 0;
    long blockUncompressedSize = 0L;
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
        ColumnChunkMetaData chunk = columnsInOrder.get(i);
        // get this chunk's start position in the new file
        long newChunkStart = out.getPos() + length;
        // add this chunk to be copied with any previous chunks
        if (start < 0) {
            // no previous chunk included, start at this chunk's starting pos
            start = chunk.getStartingPos();
        }
        length += chunk.getTotalSize();
        if ((i + 1) == columnsInOrder.size() || columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
            // last chunk, or the next chunk is not contiguous with this run: copy the buffered range now
            copy(from, out, start, length);
            // reset to start at the next column chunk
            start = -1;
            length = 0;
        }
        // TODO: column/offset indexes are not copied
        // (it would require seeking to the end of the file for each row group)
        currentColumnIndexes.add(null);
        currentOffsetIndexes.add(null);
        Offsets offsets = Offsets.getOffsets(from, chunk, newChunkStart);
        currentBlock.addColumn(ColumnChunkMetaData.get(
                chunk.getPath(),
                chunk.getPrimitiveType(),
                chunk.getCodec(),
                chunk.getEncodingStats(),
                chunk.getEncodings(),
                chunk.getStatistics(),
                offsets.firstDataPageOffset,
                offsets.dictionaryPageOffset,
                chunk.getValueCount(),
                chunk.getTotalSize(),
                chunk.getTotalUncompressedSize()));
        blockUncompressedSize += chunk.getTotalUncompressedSize();
    }
    currentBlock.setTotalByteSize(blockUncompressedSize);
    endBlock();
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), HashMap (java.util.HashMap), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList)
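
The method above is parquet-mr's fast-append path: whole column chunks are copied byte-for-byte, and contiguous chunks are batched into a single copy call. Below is a minimal sketch (not taken from the Drill source) of how appendRowGroup might be driven to concatenate every row group of an existing file into a new one; the method name copyAllRowGroups, the file paths, and the deprecated ParquetFileWriter constructor overload are assumptions, and constructor overloads vary across parquet-mr versions.

public static void copyAllRowGroups(Configuration conf, Path inputPath, Path outputPath) throws IOException {
    // hypothetical driver, not part of the Drill source
    HadoopInputFile inputFile = HadoopInputFile.fromPath(inputPath, conf);
    ParquetMetadata footer;
    try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
        footer = reader.getFooter();
    }
    MessageType schema = footer.getFileMetaData().getSchema();
    // deprecated constructor overload, used here for brevity
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outputPath);
    writer.start();
    try (SeekableInputStream from = inputFile.newStream()) {
        for (BlockMetaData rowGroup : footer.getBlocks()) {
            // dropColumns = false: fail fast if a column is missing from the target schema
            writer.appendRowGroup(from, rowGroup, false);
        }
    }
    writer.end(Collections.emptyMap());
}
Also used in this sketch: HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), Collections (java.util.Collections)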

Example 87 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

Class ParquetColumnChunkPageWriteStore, method flushToFileWriter:

public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
    for (ColumnDescriptor path : schema.getColumns()) {
        ColumnChunkPageWriter pageWriter = writers.get(path);
        pageWriter.writeToFileWriter(writer);
    }
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)
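
flushToFileWriter emits each column's buffered pages in schema order. Below is a hedged sketch of the typical call pattern around it, assuming the caller has already buffered one row group's worth of pages into the store; the variable names store, writer, and rowCount are assumptions, not the Drill call site.

// hypothetical call site, not from the Drill source
writer.startBlock(rowCount);      // open a new row group in the target file
store.flushToFileWriter(writer);  // write every column's buffered pages, in schema order
writer.endBlock();                // close the row group and record its metadata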

Example 88 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

Class ParquetReaderUtility, method getColNameToColumnDescriptorMapping:

/**
 * Maps each full column path to its ColumnDescriptor in the file schema.
 *
 * @param footer Parquet file metadata
 * @return       map from full column path to ColumnDescriptor
 */
public static Map<String, ColumnDescriptor> getColNameToColumnDescriptorMapping(ParquetMetadata footer) {
    Map<String, ColumnDescriptor> colDescMap = new HashMap<>();
    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    for (ColumnDescriptor column : columns) {
        colDescMap.put(getFullColumnPath(column), column);
    }
    return colDescMap;
}
Also used: HashMap (java.util.HashMap), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)
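
A minimal sketch (not from the Drill source) of how the mapping might be built from a file's footer and inspected follows; the input path is an assumption, and the key format is whatever getFullColumnPath produces for the file's schema.

// hypothetical caller, not part of the Drill source
Configuration conf = new Configuration();
Path dataPath = new Path("/tmp/data.parquet"); // hypothetical path
ParquetMetadata footer;
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(dataPath, conf))) {
    footer = reader.getFooter();
}
Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
for (Map.Entry<String, ColumnDescriptor> entry : colDescMap.entrySet()) {
    // key format is determined by getFullColumnPath
    System.out.println(entry.getKey() + " -> " + entry.getValue());
}
Also used in this sketch: ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile), Configuration (org.apache.hadoop.conf.Configuration), Path (org.apache.hadoop.fs.Path)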

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 88 usages
MessageType (org.apache.parquet.schema.MessageType): 33 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 18 usages
Test (org.testng.annotations.Test): 18 usages
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor): 16 usages
ArrayList (java.util.ArrayList): 16 usages
GroupType (org.apache.parquet.schema.GroupType): 14 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 12 usages
Test (org.junit.Test): 12 usages
Domain (com.facebook.presto.common.predicate.Domain): 11 usages
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 11 usages
Path (org.apache.hadoop.fs.Path): 11 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 11 usages
List (java.util.List): 10 usages
ImmutableList (com.google.common.collect.ImmutableList): 9 usages
HashMap (java.util.HashMap): 9 usages
Configuration (org.apache.hadoop.conf.Configuration): 9 usages
Type (org.apache.parquet.schema.Type): 9 usages
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle): 8 usages
IOException (java.io.IOException): 7 usages