
Example 1 with FileMetaData

Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.

From the class ParquetMetadataConverter, method toParquetMetadata:

public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long numRows = 0;
    for (BlockMetaData block : blocks) {
        numRows += block.getRowCount();
        addRowGroup(parquetMetadata, rowGroups, block);
    }
    FileMetaData fileMetaData = new FileMetaData(currentVersion, toParquetSchema(parquetMetadata.getFileMetaData().getSchema()), numRows, rowGroups);
    Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
    for (Entry<String, String> keyValue : keyValues) {
        addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
    }
    fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
    fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
    return fileMetaData;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) Util.readFileMetaData(org.apache.parquet.format.Util.readFileMetaData) FileMetaData(org.apache.parquet.format.FileMetaData)
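
A minimal follow-up sketch of how the Thrift FileMetaData produced by toParquetMetadata might be serialized, assuming a hadoop-level ParquetMetadata obtained elsewhere; the class name FooterSerializationSketch and the serializeFooter helper are illustrative, not part of parquet-mr.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterSerializationSketch {

    // 'parquetMetadata' is assumed to be provided by the caller, for example the
    // footer metadata of a file that was just written.
    public static byte[] serializeFooter(ParquetMetadata parquetMetadata) throws IOException {
        ParquetMetadataConverter converter = new ParquetMetadataConverter();
        // Convert the hadoop-level metadata into the Thrift representation shown above.
        FileMetaData formatMetadata = converter.toParquetMetadata(1, parquetMetadata);
        // Serialize the Thrift struct the same way a footer is written to a file.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Util.writeFileMetaData(formatMetadata, out);
        return out.toByteArray();
    }
}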

Example 2 with FileMetaData

Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.

From the class TestParquetMetadataConverter, method testNullFieldMetadataDebugLogging:

@Test
public void testNullFieldMetadataDebugLogging() {
    MessageType schema = parseMessageType("message test { optional binary some_null_field; }");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
    BlockMetaData blockMetaData = new BlockMetaData();
    blockMetaData.addColumn(createColumnChunkMetaData());
    blockMetaDataList.add(blockMetaData);
    ParquetMetadata metadata = new ParquetMetadata(fileMetaData, blockMetaDataList);
    ParquetMetadata.toJSON(metadata);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) FileMetaData(org.apache.parquet.format.FileMetaData) Test(org.junit.Test)

Example 3 with FileMetaData

Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.

From the class TestParquetMetadataConverter, method testColumnOrders:

@Test
public void testColumnOrders() throws IOException {
    MessageType schema = parseMessageType("message test {" +
        // Normal column with type defined column order -> typeDefined
        "  optional binary binary_col;" +
        "  optional group map_col (MAP) {" +
        "    repeated group map (MAP_KEY_VALUE) {" +
        // Key to be hacked to have unknown column order -> undefined
        "        required binary key (UTF8);" +
        "        optional group list_col (LIST) {" +
        "          repeated group list {" +
        // INT96 element with type defined column order -> undefined
        "            optional int96 array_element;" +
        "          }" +
        "        }" +
        "    }" +
        "  }" +
        "}");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList<BlockMetaData>());
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata);
    List<org.apache.parquet.format.ColumnOrder> columnOrders = formatMetadata.getColumn_orders();
    assertEquals(3, columnOrders.size());
    for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) {
        assertTrue(columnOrder.isSetTYPE_ORDER());
    }
    // Simulate that thrift got a union type that is not in the generated code
    // (when the file contains a not-yet-supported column order)
    columnOrders.get(1).clear();
    MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema();
    List<ColumnDescriptor> columns = resultSchema.getColumns();
    assertEquals(3, columns.size());
    assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) FileMetaData(org.apache.parquet.format.FileMetaData) ColumnOrder(org.apache.parquet.schema.ColumnOrder) Test(org.junit.Test)
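
In the Thrift model, ColumnOrder is a union; an order written by a newer writer that the reader's generated code does not recognize deserializes with no branch set, which is what the clear() call above simulates. A minimal sketch of that behavior, assuming the standard Thrift-generated TYPE_ORDER factory (the class name ColumnOrderUnionSketch is illustrative):

import org.apache.parquet.format.ColumnOrder;
import org.apache.parquet.format.TypeDefinedOrder;

public class ColumnOrderUnionSketch {
    public static void main(String[] args) {
        // The TYPE_ORDER branch is the only order current readers understand.
        ColumnOrder order = ColumnOrder.TYPE_ORDER(new TypeDefinedOrder());
        System.out.println(order.isSetTYPE_ORDER()); // true

        // After clear() no branch is set, mimicking an unknown order from a newer
        // writer; the converter then maps it to an undefined column order in the
        // resulting schema, as the assertions above check.
        order.clear();
        System.out.println(order.isSetTYPE_ORDER()); // false
    }
}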

Example 4 with FileMetaData

Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.

From the class TestParquetMetadataConverter, method metadata:

private FileMetaData metadata(long... sizes) {
    List<SchemaElement> schema = emptyList();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long offset = 0;
    for (long size : sizes) {
        ColumnChunk columnChunk = new ColumnChunk(offset);
        columnChunk.setMeta_data(new ColumnMetaData(INT32, Collections.<org.apache.parquet.format.Encoding>emptyList(), Collections.<String>emptyList(), UNCOMPRESSED, 10L, size * 2, size, offset));
        rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
        offset += size;
    }
    return new FileMetaData(1, schema, sizes.length, rowGroups);
}
Also used : RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) ColumnChunk(org.apache.parquet.format.ColumnChunk) SchemaElement(org.apache.parquet.format.SchemaElement) ColumnMetaData(org.apache.parquet.format.ColumnMetaData) FileMetaData(org.apache.parquet.format.FileMetaData)
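
The helper builds row groups that tile the file contiguously (each one starts where the previous one ended), which is the layout the offset- and range-based metadata filters in Example 5 are exercised against. A minimal sketch of inspecting that layout, assuming access to the FileMetaData returned by the helper; the class and method names are illustrative.

import java.util.List;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.RowGroup;

public class SyntheticMetadataSketch {

    static void printRowGroupLayout(FileMetaData formatMetadata) {
        long offset = 0;
        List<RowGroup> rowGroups = formatMetadata.getRow_groups();
        for (RowGroup rowGroup : rowGroups) {
            // Each synthetic row group occupies [offset, offset + total_byte_size).
            System.out.println("row group at offset " + offset
                + ", total_byte_size " + rowGroup.getTotal_byte_size());
            offset += rowGroup.getTotal_byte_size();
        }
    }
}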

Example 5 with FileMetaData

Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.

From the class ParquetMetadataConverter, method readParquetMetadata:

public ParquetMetadata readParquetMetadata(final InputStream from, MetadataFilter filter) throws IOException {
    FileMetaData fileMetaData = filter.accept(new MetadataFilterVisitor<FileMetaData, IOException>() {

        @Override
        public FileMetaData visit(NoFilter filter) throws IOException {
            return readFileMetaData(from);
        }

        @Override
        public FileMetaData visit(SkipMetadataFilter filter) throws IOException {
            return readFileMetaData(from, true);
        }

        @Override
        public FileMetaData visit(OffsetMetadataFilter filter) throws IOException {
            return filterFileMetaDataByStart(readFileMetaData(from), filter);
        }

        @Override
        public FileMetaData visit(RangeMetadataFilter filter) throws IOException {
            return filterFileMetaDataByMidpoint(readFileMetaData(from), filter);
        }
    });
    LOG.debug("{}", fileMetaData);
    ParquetMetadata parquetMetadata = fromParquetMetadata(fileMetaData);
    if (LOG.isDebugEnabled())
        LOG.debug(ParquetMetadata.toPrettyJSON(parquetMetadata));
    return parquetMetadata;
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) IOException(java.io.IOException) Util.readFileMetaData(org.apache.parquet.format.Util.readFileMetaData) FileMetaData(org.apache.parquet.format.FileMetaData)
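
A minimal usage sketch of readParquetMetadata, assuming the input stream is already positioned at the start of the serialized footer (in parquet-mr, ParquetFileReader locates the footer via the footer length and magic bytes at the end of the file). The class name FooterReadSketch is illustrative; NO_FILTER and SKIP_ROW_GROUPS are the MetadataFilter constants exposed by ParquetMetadataConverter.

import java.io.IOException;
import java.io.InputStream;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterReadSketch {

    static ParquetMetadata readFooter(InputStream footerStream, boolean skipRowGroups) throws IOException {
        ParquetMetadataConverter converter = new ParquetMetadataConverter();
        // SKIP_ROW_GROUPS drops per-row-group details (schema and key/value metadata only);
        // NO_FILTER keeps every row group.
        return converter.readParquetMetadata(
            footerStream,
            skipRowGroups ? ParquetMetadataConverter.SKIP_ROW_GROUPS : ParquetMetadataConverter.NO_FILTER);
    }
}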

Aggregations

FileMetaData (org.apache.parquet.format.FileMetaData): 9
ArrayList (java.util.ArrayList): 4
SchemaElement (org.apache.parquet.format.SchemaElement): 4
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 4
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 4
HashMap (java.util.HashMap): 3
RowGroup (org.apache.parquet.format.RowGroup): 3
Util.readFileMetaData (org.apache.parquet.format.Util.readFileMetaData): 3
ParquetMetadataConverter (org.apache.parquet.format.converter.ParquetMetadataConverter): 3
MessageType (org.apache.parquet.schema.MessageType): 3
IOException (java.io.IOException): 2
ColumnChunk (org.apache.parquet.format.ColumnChunk): 2
ColumnMetaData (org.apache.parquet.format.ColumnMetaData): 2
MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType): 2
Test (org.junit.Test): 2
ParquetCorruptionException (com.facebook.presto.parquet.ParquetCorruptionException): 1
ParquetDataSource (com.facebook.presto.parquet.ParquetDataSource): 1
ParquetValidationUtils.validateParquet (com.facebook.presto.parquet.ParquetValidationUtils.validateParquet): 1
DynamicSliceOutput (io.airlift.slice.DynamicSliceOutput): 1
Slice (io.airlift.slice.Slice): 1