Example 56 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class ParquetFileWriter, the method writeMergedMetadataFile:

/**
 * Given a list of metadata files, merge them into a single metadata file.
 * Requires that the schemas be compatible, and the extraMetaData be exactly equal.
 * This is useful when merging 2 directories of parquet files into a single directory, as long
 * as both directories were written with compatible schemas and equal extraMetaData.
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static void writeMergedMetadataFile(List<Path> files, Path outputPath, Configuration conf) throws IOException {
    ParquetMetadata merged = mergeMetadataFiles(files, conf);
    writeMetadataFile(outputPath, merged, outputPath.getFileSystem(conf));
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata)
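
A minimal caller sketch may help; the paths below are hypothetical, and the call uses the deprecated static API exactly as shown above.

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;

public class MergeSummariesExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical _metadata summary files written by two earlier jobs.
        List<Path> summaries = Arrays.asList(
                new Path("hdfs:///data/part1/_metadata"),
                new Path("hdfs:///data/part2/_metadata"));
        // Merge into a single summary for the combined directory; per the
        // Javadoc above, this requires compatible schemas and exactly equal
        // extraMetaData.
        ParquetFileWriter.writeMergedMetadataFile(
                summaries, new Path("hdfs:///data/merged/_metadata"), conf);
    }
}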

Example 57 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class ParquetFileReader, the method readSummaryFile:

/**
 * Specifically reads a given summary file
 * @param configuration the Hadoop configuration to use
 * @param summaryStatus the file status of the summary file to read
 * @return the metadata translated for each file
 * @throws IOException if the summary file cannot be read
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static List<Footer> readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
    final Path parent = summaryStatus.getPath().getParent();
    ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus, filter(false));
    return footersFromSummaryFile(parent, mergedFooters);
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata)
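
A short sketch of reading a summary file back with the deprecated method above; the summary path is hypothetical (reusing the merged file from the previous sketch).

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;

public class ReadSummaryExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical summary file location.
        Path summary = new Path("hdfs:///data/merged/_metadata");
        FileStatus status = summary.getFileSystem(conf).getFileStatus(summary);
        // readSummaryFile returns one Footer per data file listed in the summary.
        List<Footer> footers = ParquetFileReader.readSummaryFile(conf, status);
        for (Footer footer : footers) {
            System.out.println(footer.getFile() + ": "
                    + footer.getParquetMetadata().getBlocks().size() + " row group(s)");
        }
    }
}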

Example 58 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class DictionaryFilterTest, the method setUp:

@Before
public void setUp() throws Exception {
    reader = ParquetFileReader.open(conf, file);
    ParquetMetadata meta = reader.getFooter();
    ccmd = meta.getBlocks().get(0).getColumns();
    dictionaries = reader.getDictionaryReader(meta.getBlocks().get(0));
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Before(org.junit.Before)
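
Once setUp has run, the fixture exposes the column chunk metadata (ccmd) and a DictionaryPageReadStore (dictionaries). The probe below is an illustrative sketch, not taken from DictionaryFilterTest, and assumes the usual imports for MessageType, ColumnDescriptor, DictionaryPage, and Arrays.

@Test
public void printDictionarySizes() throws Exception {
    // Walk the leaf columns of the file schema and ask the dictionary
    // store obtained in setUp for each column's dictionary page.
    MessageType schema = reader.getFooter().getFileMetaData().getSchema();
    for (ColumnDescriptor descriptor : schema.getColumns()) {
        // readDictionaryPage returns null for columns without a dictionary.
        DictionaryPage page = dictionaries.readDictionaryPage(descriptor);
        System.out.println(Arrays.toString(descriptor.getPath()) + " -> "
                + (page == null ? "no dictionary" : page.getDictionarySize() + " entries"));
    }
}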

Example 59 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class TestInputFormat, the method getDummyCacheValue:

private ParquetInputFormat.FootersCacheValue getDummyCacheValue(File file, FileSystem fs) throws IOException {
    Path path = new Path(file.getPath());
    FileStatus status = fs.getFileStatus(path);
    ParquetInputFormat.FileStatusWrapper statusWrapper = new ParquetInputFormat.FileStatusWrapper(status);
    ParquetMetadata mockMetadata = mock(ParquetMetadata.class);
    ParquetInputFormat.FootersCacheValue cacheValue = new ParquetInputFormat.FootersCacheValue(statusWrapper, new Footer(path, mockMetadata));
    assertTrue(cacheValue.isCurrent(statusWrapper));
    return cacheValue;
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata)
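
The helper above can back a staleness check. The following sketch assumes, as the surrounding test class suggests, that isCurrent compares file modification times; the test name is hypothetical, and it must live in the same class (and package) to reach getDummyCacheValue.

@Test
public void testCacheValueGoesStale() throws IOException {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    File file = File.createTempFile("dummy", ".parquet");
    ParquetInputFormat.FootersCacheValue cacheValue = getDummyCacheValue(file, fs);
    // Bump the modification time; the cached footer should no longer be current.
    assertTrue(file.setLastModified(file.lastModified() + 5000));
    ParquetInputFormat.FileStatusWrapper newStatus = new ParquetInputFormat.FileStatusWrapper(
            fs.getFileStatus(new Path(file.getPath())));
    assertFalse(cacheValue.isCurrent(newStatus));
}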

Example 60 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From the class TestParquetWriter, the method test:

@Test
public void test() throws Exception {
    Configuration conf = new Configuration();
    Path root = new Path("target/tests/TestParquetWriter/");
    enforceEmptyDir(conf, root);
    MessageType schema = parseMessageType("message test { "
            + "required binary binary_field; "
            + "required int32 int32_field; "
            + "required int64 int64_field; "
            + "required boolean boolean_field; "
            + "required float float_field; "
            + "required double double_field; "
            + "required fixed_len_byte_array(3) flba_field; "
            + "required int96 int96_field; "
            + "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    Map<String, Encoding> expected = new HashMap<String, Encoding>();
    expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
    expected.put("1000-" + PARQUET_1_0, PLAIN);
    expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
    expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
    for (int modulo : asList(10, 1000)) {
        for (WriterVersion version : WriterVersion.values()) {
            Path file = new Path(root, version.name() + "_" + modulo);
            ParquetWriter<Group> writer = new ParquetWriter<Group>(
                    file, new GroupWriteSupport(), UNCOMPRESSED,
                    1024,   // row group (block) size in bytes
                    1024,   // page size in bytes
                    512,    // dictionary page size in bytes
                    true,   // enable dictionary encoding
                    false,  // disable write validation
                    version, conf);
            for (int i = 0; i < 1000; i++) {
                writer.write(f.newGroup()
                        .append("binary_field", "test" + (i % modulo))
                        .append("int32_field", 32)
                        .append("int64_field", 64L)
                        .append("boolean_field", true)
                        .append("float_field", 1.0f)
                        .append("double_field", 2.0d)
                        .append("flba_field", "foo")
                        .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
            }
            writer.close();
            ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
            for (int i = 0; i < 1000; i++) {
                Group group = reader.read();
                assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
                assertEquals(32, group.getInteger("int32_field", 0));
                assertEquals(64L, group.getLong("int64_field", 0));
                assertTrue(group.getBoolean("boolean_field", 0));
                assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
                assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
                assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
                assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
            }
            reader.close();
            ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
            for (BlockMetaData blockMetaData : footer.getBlocks()) {
                for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
                    if (column.getPath().toDotString().equals("binary_field")) {
                        String key = modulo + "-" + version;
                        Encoding expectedEncoding = expected.get(key);
                        assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding, column.getEncodings().contains(expectedEncoding));
                    }
                }
            }
            assertEquals("Object model property should be example", "example", footer.getFileMetaData().getKeyValueMetaData().get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
        }
    }
}
Also used :
Path(org.apache.hadoop.fs.Path)
Group(org.apache.parquet.example.data.Group)
BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData)
GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport)
Configuration(org.apache.hadoop.conf.Configuration)
HashMap(java.util.HashMap)
ExampleParquetWriter(org.apache.parquet.hadoop.example.ExampleParquetWriter)
ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata)
ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)
SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)
Encoding(org.apache.parquet.column.Encoding)
WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion)
GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport)
MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType)
MessageType(org.apache.parquet.schema.MessageType)
Test(org.junit.Test)
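
The ten-argument ParquetWriter constructor used in this test is deprecated; ExampleParquetWriter, already present in the import list, offers a builder covering the same settings. A sketch of the equivalent construction, reusing the test's file, schema, version, and conf variables (not part of the original test):

ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
        .withConf(conf)
        .withType(schema)
        .withWriterVersion(version)
        .withCompressionCodec(UNCOMPRESSED)
        .withRowGroupSize(1024)
        .withPageSize(1024)
        .withDictionaryPageSize(512)
        .withDictionaryEncoding(true)
        .withValidation(false)
        .build();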

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata) 76
Path (org.apache.hadoop.fs.Path) 39
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData) 27
Configuration (org.apache.hadoop.conf.Configuration) 21
MessageType (org.apache.parquet.schema.MessageType) 21
ArrayList (java.util.ArrayList) 19
IOException (java.io.IOException) 18
Test (org.junit.Test) 17
FileSystem (org.apache.hadoop.fs.FileSystem) 16
Map (java.util.Map) 11
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData) 11
File (java.io.File) 10
FileStatus (org.apache.hadoop.fs.FileStatus) 10
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath) 9
HashMap (java.util.HashMap) 8
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) 7
List (java.util.List) 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream) 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor) 6
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader) 6