Example 16 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class ParquetMetadataConverter, method fromParquetMetadata:

public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
    MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    List<RowGroup> row_groups = parquetMetadata.getRow_groups();
    if (row_groups != null) {
        for (RowGroup rowGroup : row_groups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                if ((filePath == null && columnChunk.getFile_path() != null) || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
                    throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
                }
                ColumnMetaData metaData = columnChunk.meta_data;
                ColumnPath path = getPath(metaData);
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        path,
                        messageType.getType(path.toArray()).asPrimitiveType(),
                        fromFormatCodec(metaData.codec),
                        convertEncodingStats(metaData.getEncoding_stats()),
                        fromFormatEncodings(metaData.encodings),
                        fromParquetStatistics(
                                parquetMetadata.getCreated_by(),
                                metaData.statistics,
                                messageType.getType(path.toArray()).asPrimitiveType()),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                // TODO
                // index_page_offset
                // key_value_metadata
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }
    Map<String, String> keyValueMetaData = new HashMap<String, String>();
    List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
    if (key_value_metadata != null) {
        for (KeyValue keyValue : key_value_metadata) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()), blocks);
}
Also used: BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) KeyValue(org.apache.parquet.format.KeyValue) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) ColumnChunk(org.apache.parquet.format.ColumnChunk) ColumnMetaData(org.apache.parquet.format.ColumnMetaData) MessageType(org.apache.parquet.schema.MessageType)
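
How the converter is typically driven is not shown above; here is a minimal sketch, assuming the raw footer bytes are already available as a stream and assuming Util.readFileMetaData from parquet-format. The class name FooterConversionSketch is invented for illustration.

import java.io.IOException;
import java.io.InputStream;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterConversionSketch {

    // Deserialize the Thrift footer, then map it to the parquet-mr metadata
    // model using the fromParquetMetadata method shown above.
    public static ParquetMetadata convert(InputStream footerBytes) throws IOException {
        FileMetaData thriftFooter = Util.readFileMetaData(footerBytes);
        return new ParquetMetadataConverter().fromParquetMetadata(thriftFooter);
    }
}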

Example 17 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class ParquetFileReader, method footersFromSummaryFile:

static List<Footer> footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
    Map<Path, ParquetMetadata> footers = new HashMap<Path, ParquetMetadata>();
    List<BlockMetaData> blocks = mergedFooters.getBlocks();
    for (BlockMetaData block : blocks) {
        String path = block.getPath();
        Path fullPath = new Path(parent, path);
        ParquetMetadata current = footers.get(fullPath);
        if (current == null) {
            current = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
            footers.put(fullPath, current);
        }
        current.getBlocks().add(block);
    }
    List<Footer> result = new ArrayList<Footer>();
    for (Entry<Path, ParquetMetadata> entry : footers.entrySet()) {
        result.add(new Footer(entry.getKey(), entry.getValue()));
    }
    return result;
}
Also used: Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList)
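
A hedged sketch of the call site: read the merged footer of a _metadata summary file, then split it back into one Footer per data file. The summary path and class name are hypothetical, and since footersFromSummaryFile is package-private, this only compiles inside org.apache.parquet.hadoop.

package org.apache.parquet.hadoop; // required: footersFromSummaryFile is package-private

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class SummaryFooterSketch {

    // Each block of the merged footer carries a path relative to the summary's
    // directory; footersFromSummaryFile groups the blocks by that path.
    static List<Footer> perFileFooters(Configuration conf, Path summary) throws IOException {
        ParquetMetadata merged = ParquetFileReader.readFooter(conf, summary, ParquetMetadataConverter.NO_FILTER);
        return ParquetFileReader.footersFromSummaryFile(summary.getParent(), merged);
    }
}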

Example 18 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class ParquetFileWriter, method mergeFooters:

static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
        }
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
Also used: BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetEncodingException(org.apache.parquet.io.ParquetEncodingException) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData)
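
The other half of the summary-file story, again as a sketch under the same package-private caveat: collect the footers of every data file under a root and merge them, which is what parquet-mr does before writing a _metadata file. The class name is invented for illustration.

package org.apache.parquet.hadoop; // required: mergeFooters is package-private

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class MergeFootersSketch {

    // Read the footer of every file below root, then merge them; mergeFooters
    // rewrites each block's path to be relative to root, as shown above.
    static ParquetMetadata mergeUnder(Configuration conf, Path root) throws IOException {
        FileStatus rootStatus = root.getFileSystem(conf).getFileStatus(root);
        List<Footer> footers = ParquetFileReader.readAllFootersInParallel(conf, rootStatus);
        return ParquetFileWriter.mergeFooters(root, footers);
    }
}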

Example 19 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class TestParquetWriterNewPage, method test:

@Test
public void test() throws Exception {
    Configuration conf = new Configuration();
    Path root = new Path("target/tests/TestParquetWriter/");
    FileSystem fs = root.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = parseMessageType("message test { "
            + "required binary binary_field; "
            + "required int32 int32_field; "
            + "required int64 int64_field; "
            + "required boolean boolean_field; "
            + "required float float_field; "
            + "required double double_field; "
            + "required fixed_len_byte_array(3) flba_field; "
            + "required int96 int96_field; "
            + "optional binary null_field; "
            + "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    Map<String, Encoding> expected = new HashMap<String, Encoding>();
    expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
    expected.put("1000-" + PARQUET_1_0, PLAIN);
    expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
    expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
    for (int modulo : asList(10, 1000)) {
        for (WriterVersion version : WriterVersion.values()) {
            Path file = new Path(root, version.name() + "_" + modulo);
            ParquetWriter<Group> writer = new ParquetWriter<Group>(file, new GroupWriteSupport(), UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
            for (int i = 0; i < 1000; i++) {
                writer.write(f.newGroup()
                        .append("binary_field", "test" + (i % modulo))
                        .append("int32_field", 32)
                        .append("int64_field", 64L)
                        .append("boolean_field", true)
                        .append("float_field", 1.0f)
                        .append("double_field", 2.0d)
                        .append("flba_field", "foo")
                        .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
            }
            writer.close();
            ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
            for (int i = 0; i < 1000; i++) {
                Group group = reader.read();
                assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
                assertEquals(32, group.getInteger("int32_field", 0));
                assertEquals(64L, group.getLong("int64_field", 0));
                assertEquals(true, group.getBoolean("boolean_field", 0));
                assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
                assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
                assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
                assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
                assertEquals(0, group.getFieldRepetitionCount("null_field"));
            }
            reader.close();
            ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
            for (BlockMetaData blockMetaData : footer.getBlocks()) {
                for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
                    if (column.getPath().toDotString().equals("binary_field")) {
                        String key = modulo + "-" + version;
                        Encoding expectedEncoding = expected.get(key);
                        assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding, column.getEncodings().contains(expectedEncoding));
                    }
                }
            }
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) Encoding(org.apache.parquet.column.Encoding) WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) FileSystem(org.apache.hadoop.fs.FileSystem) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
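
The footer loop at the end of the test generalizes into a small inspection helper. A sketch, with the name dumpEncodings invented here, using only calls that already appear above:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class EncodingDump {

    // Print the set of encodings used by every column chunk in every row group.
    static void dumpEncodings(Configuration conf, Path file) throws IOException {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
        for (BlockMetaData block : footer.getBlocks()) {
            for (ColumnChunkMetaData column : block.getColumns()) {
                System.out.println(column.getPath().toDotString() + " -> " + column.getEncodings());
            }
        }
    }
}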

Example 20 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class TestParquetWriterAppendBlocks, method testAllowDroppingColumns:

@Test
public void testAllowDroppingColumns() throws IOException {
    MessageType droppedColumnSchema = Types.buildMessage().required(BINARY).as(UTF8).named("string").named("AppendTest");
    Path droppedColumnFile = newTemp();
    ParquetFileWriter writer = new ParquetFileWriter(CONF, droppedColumnSchema, droppedColumnFile);
    writer.start();
    writer.appendFile(CONF, file1);
    writer.appendFile(CONF, file2);
    writer.end(EMPTY_METADATA);
    LinkedList<Group> expected = new LinkedList<Group>();
    expected.addAll(file1content);
    expected.addAll(file2content);
    ParquetMetadata footer = ParquetFileReader.readFooter(CONF, droppedColumnFile, NO_FILTER);
    for (BlockMetaData rowGroup : footer.getBlocks()) {
        Assert.assertEquals("Should have only the string column", 1, rowGroup.getColumns().size());
    }
    ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), droppedColumnFile).build();
    Group next;
    while ((next = reader.read()) != null) {
        Group expectedNext = expected.removeFirst();
        Assert.assertEquals("Each string should match", expectedNext.getString("string", 0), next.getString("string", 0));
    }
    Assert.assertEquals("All records should be present", 0, expected.size());
}
Also used: Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) MessageType(org.apache.parquet.schema.MessageType) LinkedList(java.util.LinkedList) Test(org.junit.Test)
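
Stripped of assertions, the append workflow the test exercises reduces to roughly the sketch below; the class and parameter names are illustrative, and the target schema may be a subset of the inputs' schemas, which is exactly what drops the other columns.

import java.io.IOException;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;

public class AppendSketch {

    // Copy whole row groups from existing files into one output file, keeping
    // only the columns present in targetSchema; row groups are not re-encoded.
    public static void merge(Configuration conf, MessageType targetSchema,
                             Path out, Path... inputs) throws IOException {
        ParquetFileWriter writer = new ParquetFileWriter(conf, targetSchema, out);
        writer.start();
        for (Path input : inputs) {
            writer.appendFile(conf, input);
        }
        writer.end(Collections.<String, String>emptyMap());
    }
}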

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76
Path (org.apache.hadoop.fs.Path): 39
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27
Configuration (org.apache.hadoop.conf.Configuration): 21
MessageType (org.apache.parquet.schema.MessageType): 21
ArrayList (java.util.ArrayList): 19
IOException (java.io.IOException): 18
Test (org.junit.Test): 17
FileSystem (org.apache.hadoop.fs.FileSystem): 16
Map (java.util.Map): 11
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11
File (java.io.File): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9
HashMap (java.util.HashMap): 8
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7
List (java.util.List): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6