Search in sources :

Example 81 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project flink by apache.

the class ParquetColumnarRowSplitReader method checkSchema.

/**
 * Validates that the requested read schema is compatible with this reader and
 * with the file being read: the number of selected types must equal the
 * requested field count, every requested field must be a flat (non-repeated)
 * primitive, and any column present in the file must have a descriptor
 * identical to the requested one (schema evolution is not supported).
 *
 * @throws IOException if a required column (maxDefinitionLevel == 0, i.e.
 *     required per Parquet definition-level semantics) is missing from the
 *     data file
 * @throws UnsupportedOperationException if a requested field is a complex or
 *     repeated type, or its descriptor in the file differs from the request
 */
private void checkSchema() throws IOException, UnsupportedOperationException {
    if (selectedTypes.length != requestedSchema.getFieldCount()) {
        // Message fixed: this guards the *quantity* (count) of field types,
        // not their "quality" as the previous typo claimed.
        throw new RuntimeException("The quantity of field types is incompatible with the request schema!");
    }
    /*
         * Check that the requested schema is supported.
         */
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
        Type t = requestedSchema.getFields().get(i);
        // Only flat, non-repeated primitive columns are readable here.
        if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
            throw new UnsupportedOperationException("Complex types not supported.");
        }
        String[] colPath = requestedSchema.getPaths().get(i);
        if (fileSchema.containsPath(colPath)) {
            ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
            // The file's descriptor must match the requested one exactly;
            // any difference would require schema-evolution handling.
            if (!fd.equals(requestedSchema.getColumns().get(i))) {
                throw new UnsupportedOperationException("Schema evolution not supported.");
            }
        } else {
            // Column absent from the file: tolerable only when it is not
            // required (maxDefinitionLevel > 0).
            if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) {
                // invalid.
                throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath));
            }
        }
    }
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) LogicalType(org.apache.flink.table.types.logical.LogicalType) Type(org.apache.parquet.schema.Type) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) IOException(java.io.IOException)

Example 82 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project flink by apache.

the class ParquetColumnarRowSplitReader method readNextRowGroup.

/**
 * Advances the underlying reader to the next row group and rebuilds the
 * per-column readers for every requested column.
 *
 * @throws IOException if no further row group is available even though more
 *     rows were expected
 */
private void readNextRowGroup() throws IOException {
    PageReadStore rowGroup = reader.readNextRowGroup();
    if (rowGroup == null) {
        throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount);
    }
    List<ColumnDescriptor> requestedColumns = requestedSchema.getColumns();
    int columnCount = requestedColumns.size();
    columnReaders = new AbstractColumnReader[columnCount];
    for (int idx = 0; idx < columnCount; idx++) {
        ColumnDescriptor descriptor = requestedColumns.get(idx);
        columnReaders[idx] = createColumnReader(utcTimestamp, selectedTypes[idx], descriptor, rowGroup.getPageReader(descriptor));
    }
    totalCountLoadedSoFar += rowGroup.getRowCount();
}
Also used : PageReadStore(org.apache.parquet.column.page.PageReadStore) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) IOException(java.io.IOException)

Example 83 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

the class TestFileGenerator method generateParquetFile.

/**
 * Hand-assembles a test Parquet file at {@code filename}: builds a flat
 * "message m" schema from {@code props.fields}, then writes
 * {@code props.numberRowGroups} row groups, encoding each column's pages
 * manually (boolean bit-packing, fixed-width primitives, and length-prefixed
 * variable-length binaries) together with RLE rep/def levels.
 *
 * @param filename path of the file to (re)create; an existing file is deleted
 * @param props    test configuration: fields, row-group/page counts, sizes
 * @throws Exception on any filesystem or Parquet writer failure
 */
public static void generateParquetFile(String filename, ParquetTestProperties props) throws Exception {
    // Index of the byte currently being filled while bit-packing booleans.
    int currentBooleanByte = 0;
    WrapAroundCounter booleanBitCounter = new WrapAroundCounter(7);
    Configuration configuration = new Configuration();
    configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "file:///");
    // "message m { required int32 integer; required int64 integer64; required boolean b; required float f; required double d;}"
    FileSystem fs = FileSystem.get(configuration);
    Path path = new Path(filename);
    if (fs.exists(path)) {
        fs.delete(path, false);
    }
    // Build a flat schema string with one "required" field per FieldInfo.
    String messageSchema = "message m {";
    for (FieldInfo fieldInfo : props.fields.values()) {
        messageSchema += " required " + fieldInfo.parquetType + " " + fieldInfo.name + ";";
    }
    // remove the last semicolon, java really needs a join method for strings...
    // TODO - nvm apparently it requires a semicolon after every field decl, might want to file a bug
    // messageSchema = messageSchema.substring(schemaType, messageSchema.length() - 1);
    messageSchema += "}";
    MessageType schema = MessageTypeParser.parseMessageType(messageSchema);
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    // Per-column running count of values written, carried across row groups.
    HashMap<String, Integer> columnValuesWritten = new HashMap<>();
    int valsWritten;
    for (int k = 0; k < props.numberRowGroups; k++) {
        w.startBlock(props.recordsPerRowGroup);
        currentBooleanByte = 0;
        booleanBitCounter.reset();
        for (FieldInfo fieldInfo : props.fields.values()) {
            // Resume the rotating sample-value index for this column, if any.
            if (!columnValuesWritten.containsKey(fieldInfo.name)) {
                columnValuesWritten.put(fieldInfo.name, 0);
                valsWritten = 0;
            } else {
                valsWritten = columnValuesWritten.get(fieldInfo.name);
            }
            String[] path1 = { fieldInfo.name };
            ColumnDescriptor c1 = schema.getColumnDescription(path1);
            w.startColumn(c1, props.recordsPerRowGroup, codec);
            final int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
            // 1 MB
            final int PAGE_SIZE = 1024 * 1024;
            byte[] bytes;
            // NOTE(review): the rep-level writer is also sized with the
            // *definition*-level bit width constant — presumably fine since
            // rep levels here are all 0, but confirm.
            RunLengthBitPackingHybridValuesWriter defLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            RunLengthBitPackingHybridValuesWriter repLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            // for variable length binary fields
            int bytesNeededToEncodeLength = 4;
            if (fieldInfo.bitLength > 0) {
                // Fixed-width field: size the page buffer from the bit length.
                bytes = new byte[(int) Math.ceil(valsPerPage * fieldInfo.bitLength / 8.0)];
            } else {
                // Variable-length field: the three sample values rotate, so a
                // full rotation occupies their combined length plus three
                // 4-byte length prefixes.
                // the twelve at the end is to account for storing a 4 byte length with each value
                int totalValLength = ((byte[]) fieldInfo.values[0]).length + ((byte[]) fieldInfo.values[1]).length + ((byte[]) fieldInfo.values[2]).length + 3 * bytesNeededToEncodeLength;
                // used for the case where there is a number of values in this row group that is not divisible by 3
                // NOTE(review): leftover sizing uses values[1]/values[2]
                // lengths regardless of where the rotation actually stands —
                // assumes the three sample values have comparable lengths;
                // confirm against the test fixtures.
                int leftOverBytes = 0;
                if (valsPerPage % 3 > 0) {
                    leftOverBytes += ((byte[]) fieldInfo.values[1]).length + bytesNeededToEncodeLength;
                }
                if (valsPerPage % 3 > 1) {
                    leftOverBytes += ((byte[]) fieldInfo.values[2]).length + bytesNeededToEncodeLength;
                }
                bytes = new byte[valsPerPage / 3 * totalValLength + leftOverBytes];
            }
            int bytesPerPage = (int) (valsPerPage * (fieldInfo.bitLength / 8.0));
            int bytesWritten = 0;
            for (int z = 0; z < fieldInfo.numberOfPages; z++, bytesWritten = 0) {
                for (int i = 0; i < valsPerPage; i++) {
                    // Flat required schema: rep level 0, def level 1 for every value.
                    repLevels.writeInteger(0);
                    defLevels.writeInteger(1);
                    if (fieldInfo.values[0] instanceof Boolean) {
                        // Booleans are packed one bit at a time into the current byte.
                        bytes[currentBooleanByte] |= bitFields[booleanBitCounter.val] & ((boolean) fieldInfo.values[valsWritten % 3] ? allBitsTrue : allBitsFalse);
                        booleanBitCounter.increment();
                        if (booleanBitCounter.val == 0) {
                            currentBooleanByte++;
                        }
                        valsWritten++;
                        if (currentBooleanByte > bytesPerPage) {
                            break;
                        }
                    } else {
                        if (fieldInfo.values[valsWritten % 3] instanceof byte[]) {
                            // Variable-length binary: 4-byte little prefix with the
                            // value's length, then the raw bytes.
                            System.arraycopy(ByteArrayUtil.toByta(((byte[]) fieldInfo.values[valsWritten % 3]).length), 0, bytes, bytesWritten, bytesNeededToEncodeLength);
                            System.arraycopy(fieldInfo.values[valsWritten % 3], 0, bytes, bytesWritten + bytesNeededToEncodeLength, ((byte[]) fieldInfo.values[valsWritten % 3]).length);
                            bytesWritten += ((byte[]) fieldInfo.values[valsWritten % 3]).length + bytesNeededToEncodeLength;
                        } else {
                            // Fixed-width primitive: copy at its slot, offset by index.
                            System.arraycopy(ByteArrayUtil.toByta(fieldInfo.values[valsWritten % 3]), 0, bytes, i * (fieldInfo.bitLength / 8), fieldInfo.bitLength / 8);
                        }
                        valsWritten++;
                    }
                }
                // Assemble the page: value bytes first, then rep levels, then def
                // levels. NOTE(review): the Parquet v1 data page layout is
                // rep-levels, def-levels, then data — confirm this ordering is
                // deliberate for these tests.
                byte[] fullPage = new byte[2 * 4 * valsPerPage + bytes.length];
                byte[] repLevelBytes = repLevels.getBytes().toByteArray();
                byte[] defLevelBytes = defLevels.getBytes().toByteArray();
                System.arraycopy(bytes, 0, fullPage, 0, bytes.length);
                System.arraycopy(repLevelBytes, 0, fullPage, bytes.length, repLevelBytes.length);
                System.arraycopy(defLevelBytes, 0, fullPage, bytes.length + repLevelBytes.length, defLevelBytes.length);
                w.writeDataPage((props.recordsPerRowGroup / fieldInfo.numberOfPages), fullPage.length, BytesInput.from(fullPage), RLE, RLE, PLAIN);
                currentBooleanByte = 0;
            }
            w.endColumn();
            // Persist the updated rotation index for the next row group.
            columnValuesWritten.remove(fieldInfo.name);
            columnValuesWritten.put(fieldInfo.name, valsWritten);
        }
        w.endBlock();
    }
    w.end(new HashMap<String, String>());
    logger.debug("Finished generating parquet file {}", path.getName());
}
Also used : Path(org.apache.hadoop.fs.Path) DirectByteBufferAllocator(org.apache.parquet.bytes.DirectByteBufferAllocator) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetFileWriter(org.apache.parquet.hadoop.ParquetFileWriter) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) RunLengthBitPackingHybridValuesWriter(org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType)

Example 84 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

the class ParquetSchema method loadParquetSchema.

/**
 * Scan the Parquet footer, then map each Parquet column to the list of columns
 * we want to read. Track those to be read.
 */
private void loadParquetSchema() {
    // TODO - figure out how to deal with this better once we add nested reading, note also look where this map is used below
    // Column name -> schema element, carrying converted types when non-null.
    Map<String, SchemaElement> elementsByName = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
    // Walk every column in the file schema, resolve its Drill type, and keep
    // only the columns that the projection selected.
    for (ColumnDescriptor descriptor : footer.getFileMetaData().getSchema().getColumns()) {
        ParquetColumnMetadata metadata = new ParquetColumnMetadata(descriptor);
        metadata.resolveDrillType(elementsByName, options);
        if (columnSelected(descriptor)) {
            selectedColumnMetadata.add(metadata);
        }
    }
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) SchemaElement(org.apache.parquet.format.SchemaElement)

Example 85 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

the class DrillParquetReader method getAllColumnsFrom.

/**
 * Convert the columns in the parquet schema to a list of SchemaPath columns so that they can be compared in case
 * insensitive manner to the projection columns.
 *
 * @param schema Parquet file schema
 * @return paths to all fields in schema
 */
private static List<SchemaPath> getAllColumnsFrom(MessageType schema) {
    List<SchemaPath> paths = new LinkedList<>();
    for (ColumnDescriptor descriptor : schema.getColumns()) {
        // Defensively copy the path array before handing it to SchemaPath.
        String[] pathElements = descriptor.getPath();
        String[] pathCopy = Arrays.copyOf(pathElements, pathElements.length);
        paths.add(SchemaPath.getCompoundPath(pathCopy));
    }
    return paths;
}
Also used : SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) LinkedList(java.util.LinkedList)

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)88 MessageType (org.apache.parquet.schema.MessageType)33 PrimitiveType (org.apache.parquet.schema.PrimitiveType)18 Test (org.testng.annotations.Test)18 RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor)16 ArrayList (java.util.ArrayList)16 GroupType (org.apache.parquet.schema.GroupType)14 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)12 Test (org.junit.Test)12 Domain (com.facebook.presto.common.predicate.Domain)11 TupleDomain (com.facebook.presto.common.predicate.TupleDomain)11 Path (org.apache.hadoop.fs.Path)11 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)11 List (java.util.List)10 ImmutableList (com.google.common.collect.ImmutableList)9 HashMap (java.util.HashMap)9 Configuration (org.apache.hadoop.conf.Configuration)9 Type (org.apache.parquet.schema.Type)9 HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle)8 IOException (java.io.IOException)7