
Example 26 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project zeppelin by apache.

the class SqlInterpreterTest method createParquetFile.

public File createParquetFile(int[] values, ParquetProperties.WriterVersion version) throws IOException {
    // Reserve a temp file name, then delete it so ParquetWriter can create the file itself.
    File file = File.createTempFile("zeppelin-flink-input", ".par");
    file.delete();
    Path path = new Path(file.getAbsolutePath());
    Configuration conf = new Configuration();
    // Schema with a single required int32 column.
    MessageType schema = MessageTypeParser.parseMessageType("message test { required int32 int32_field; }");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    // Deprecated constructor: block size 1024, page size 1024, dictionary page size 512,
    // dictionary encoding enabled, validation disabled.
    ParquetWriter<Group> writer = new ParquetWriter<>(path, new GroupWriteSupport(),
        CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
    for (int value : values) {
        writer.write(f.newGroup().append("int32_field", value));
    }
    writer.close();
    return file;
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) InterpreterGroup(org.apache.zeppelin.interpreter.InterpreterGroup) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) Configuration(org.apache.hadoop.conf.Configuration) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) OrcFile(org.apache.orc.OrcFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType)
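For completeness, a minimal sketch of reading such a file back with the GroupReadSupport pattern used in the later examples; the variable names and the int[] argument here are illustrative, not part of the Zeppelin test:

File file = createParquetFile(new int[] {1, 2, 3}, ParquetProperties.WriterVersion.PARQUET_1_0);
Path path = new Path(file.getAbsolutePath());
try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).withConf(new Configuration()).build()) {
    Group group;
    // read() returns null once all records have been consumed.
    while ((group = reader.read()) != null) {
        System.out.println(group.getInteger("int32_field", 0));
    }
}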

Example 27 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project drill by apache.

the class ParquetSimpleTestFileGenerator method main.

public static void main(String[] args) throws IOException {
    // One group factory per schema: simple/complex crossed with non-nullable/nullable.
    SimpleGroupFactory sgf = new SimpleGroupFactory(simpleSchema);
    GroupFactory gf = new SimpleGroupFactory(complexSchema);
    SimpleGroupFactory sngf = new SimpleGroupFactory(simpleNullableSchema);
    GroupFactory ngf = new SimpleGroupFactory(complexNullableSchema);
    // Generate files with dictionary encoding enabled and disabled
    ParquetWriter<Group> simpleWriter = initWriter(simpleSchema, "drill/parquet_test_file_simple", true);
    ParquetWriter<Group> complexWriter = initWriter(complexSchema, "drill/parquet_test_file_complex", true);
    ParquetWriter<Group> simpleNullableWriter = initWriter(simpleNullableSchema, "drill/parquet_test_file_simple_nullable", true);
    ParquetWriter<Group> complexNullableWriter = initWriter(complexNullableSchema, "drill/parquet_test_file_complex_nullable", true);
    ParquetWriter<Group> simpleNoDictWriter = initWriter(simpleSchema, "drill/parquet_test_file_simple_nodict", false);
    ParquetWriter<Group> complexNoDictWriter = initWriter(complexSchema, "drill/parquet_test_file_complex_nodict", false);
    ParquetWriter<Group> simpleNullableNoDictWriter = initWriter(simpleNullableSchema, "drill/parquet_test_file_simple_nullable_nodict", false);
    ParquetWriter<Group> complexNullableNoDictWriter = initWriter(complexNullableSchema, "drill/parquet_test_file_complex_nullable_nodict", false);
    ParquetSimpleTestFileGenerator.writeSimpleValues(sgf, simpleWriter, false);
    ParquetSimpleTestFileGenerator.writeSimpleValues(sngf, simpleNullableWriter, true);
    ParquetSimpleTestFileGenerator.writeComplexValues(gf, complexWriter, false);
    ParquetSimpleTestFileGenerator.writeComplexValues(ngf, complexNullableWriter, true);
    ParquetSimpleTestFileGenerator.writeSimpleValues(sgf, simpleNoDictWriter, false);
    ParquetSimpleTestFileGenerator.writeSimpleValues(sngf, simpleNullableNoDictWriter, true);
    ParquetSimpleTestFileGenerator.writeComplexValues(gf, complexNoDictWriter, false);
    ParquetSimpleTestFileGenerator.writeComplexValues(ngf, complexNullableNoDictWriter, true);
    // Close all eight writers to flush buffered pages and write file footers.
    simpleWriter.close();
    complexWriter.close();
    simpleNullableWriter.close();
    complexNullableWriter.close();
    simpleNoDictWriter.close();
    complexNoDictWriter.close();
    simpleNullableNoDictWriter.close();
    complexNullableNoDictWriter.close();
}
Also used : Group(org.apache.parquet.example.data.Group) GroupFactory(org.apache.parquet.example.data.GroupFactory) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)
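The initWriter helper is defined elsewhere in ParquetSimpleTestFileGenerator and is not shown above. A plausible sketch, assuming it wraps the ExampleParquetWriter builder seen in the later examples; the signature and body here are an assumption, not the Drill source:

private static ParquetWriter<Group> initWriter(MessageType schema, String fileName, boolean enableDictionary) throws IOException {
    // Assumption: each generated file is written to a local path derived from fileName.
    Configuration conf = new Configuration();
    return ExampleParquetWriter.builder(new Path(fileName))
        .withConf(conf)
        .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
        .withType(schema)
        .withDictionaryEncoding(enableDictionary)
        .build();
}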

Example 28 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class TestDataPageV1Checksums method writeNestedWithNullsSampleParquetFile.

private Path writeNestedWithNullsSampleParquetFile(Configuration conf, boolean dictionaryEncoding, CompressionCodecName compression) throws IOException {
    File file = tempFolder.newFile();
    file.delete();
    Path path = new Path(file.toURI());
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
            .withConf(conf)
            .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
            .withCompressionCodec(compression)
            .withDictionaryEncoding(dictionaryEncoding)
            .withType(schemaNestedWithNulls)
            .withPageWriteChecksumEnabled(ParquetOutputFormat.getPageWriteChecksumEnabled(conf))
            .build()) {
        GroupFactory groupFactory = new SimpleGroupFactory(schemaNestedWithNulls);
        Random rand = new Random(42);
        for (int i = 0; i < numRecordsNestedWithNullsFile; i++) {
            Group group = groupFactory.newGroup();
            if (rand.nextDouble() > nullRatio) {
                // With equal probability, write either one or three values for the
                // repeated field; to ensure the values are dictionary encoded when
                // required, take them modulo 10.
                if (rand.nextDouble() > 0.5) {
                    group.addGroup("c").append("id", (long) i).addGroup("d").append("val", rand.nextInt() % 10);
                } else {
                    group.addGroup("c").append("id", (long) i).addGroup("d").append("val", rand.nextInt() % 10).append("val", rand.nextInt() % 10).append("val", rand.nextInt() % 10);
                }
            }
            writer.write(group);
        }
    }
    return path;
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) Random(java.util.Random) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) GroupFactory(org.apache.parquet.example.data.GroupFactory) InputFile(org.apache.parquet.io.InputFile) HadoopOutputFile(org.apache.parquet.hadoop.util.HadoopOutputFile) File(java.io.File) OutputFile(org.apache.parquet.io.OutputFile) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile)
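The schemaNestedWithNulls field is declared elsewhere in the test class. From the append calls above it plausibly looks like the following; this is a reconstruction offered as an assumption, not the actual parquet-mr definition:

MessageType schemaNestedWithNulls = MessageTypeParser.parseMessageType(
    // Assumed shape: an optional outer group so some records can be null,
    // a required int64 id, and a repeated int32 val inside a nested group.
    "message test { "
    + "optional group c { "
    + "required int64 id; "
    + "optional group d { repeated int32 val; } "
    + "} "
    + "}");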

Example 29 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class TestParquetWriter method test.

@Test
public void test() throws Exception {
    Configuration conf = new Configuration();
    Path root = new Path("target/tests/TestParquetWriter/");
    enforceEmptyDir(conf, root);
    MessageType schema = parseMessageType("message test { " + "required binary binary_field; " + "required int32 int32_field; " + "required int64 int64_field; " + "required boolean boolean_field; " + "required float float_field; " + "required double double_field; " + "required fixed_len_byte_array(3) flba_field; " + "required int96 int96_field; " + "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    Map<String, Encoding> expected = new HashMap<>();
    expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
    expected.put("1000-" + PARQUET_1_0, PLAIN);
    expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
    expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
    for (int modulo : asList(10, 1000)) {
        for (WriterVersion version : WriterVersion.values()) {
            Path file = new Path(root, version.name() + "_" + modulo);
            ParquetWriter<Group> writer = ExampleParquetWriter.builder(new TestOutputFile(file, conf))
                .withCompressionCodec(UNCOMPRESSED)
                .withRowGroupSize(1024)
                .withPageSize(1024)
                .withDictionaryPageSize(512)
                .enableDictionaryEncoding()
                .withValidation(false)
                .withWriterVersion(version)
                .withConf(conf)
                .build();
            for (int i = 0; i < 1000; i++) {
                writer.write(f.newGroup().append("binary_field", "test" + (i % modulo)).append("int32_field", 32).append("int64_field", 64L).append("boolean_field", true).append("float_field", 1.0f).append("double_field", 2.0d).append("flba_field", "foo").append("int96_field", Binary.fromConstantByteArray(new byte[12])));
            }
            writer.close();
            ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
            for (int i = 0; i < 1000; i++) {
                Group group = reader.read();
                assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
                assertEquals(32, group.getInteger("int32_field", 0));
                assertEquals(64L, group.getLong("int64_field", 0));
                assertTrue(group.getBoolean("boolean_field", 0));
                assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
                assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
                assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
                assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
            }
            reader.close();
            ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
            for (BlockMetaData blockMetaData : footer.getBlocks()) {
                for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
                    if (column.getPath().toDotString().equals("binary_field")) {
                        String key = modulo + "-" + version;
                        Encoding expectedEncoding = expected.get(key);
                        assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding, column.getEncodings().contains(expectedEncoding));
                    }
                }
            }
            assertEquals("Object model property should be example", "example", footer.getFileMetaData().getKeyValueMetaData().get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) Encoding(org.apache.parquet.column.Encoding) WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) Test(org.junit.Test)
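The readFooter(conf, file, NO_FILTER) call above refers to the older static ParquetFileReader.readFooter API. A sketch of the same footer inspection through ParquetFileReader.open, the pattern the next example uses; the loop body is elided here:

try (ParquetFileReader fileReader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
    ParquetMetadata footer = fileReader.getFooter();
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
            // Inspect column.getEncodings() exactly as in the loop above.
        }
    }
}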

Example 30 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class TestParquetWriter method testParquetFileNumberOfBlocks.

private void testParquetFileNumberOfBlocks(int minRowCountForPageSizeCheck, int maxRowCountForPageSizeCheck, int expectedNumberOfBlocks) throws IOException {
    MessageType schema = Types.buildMessage().required(BINARY).as(stringType()).named("str").named("msg");
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(schema, conf);
    File file = temp.newFile();
    // Remove the pre-created file so the writer can create it fresh.
    file.delete();
    Path path = new Path(file.getAbsolutePath());
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
            .withConf(conf)
            .withRowGroupSize(1)
            .withMinRowCountForPageSizeCheck(minRowCountForPageSizeCheck)
            .withMaxRowCountForPageSizeCheck(maxRowCountForPageSizeCheck)
            .build()) {
        SimpleGroupFactory factory = new SimpleGroupFactory(schema);
        writer.write(factory.newGroup().append("str", "foo"));
        writer.write(factory.newGroup().append("str", "bar"));
        writer.write(factory.newGroup().append("str", "baz"));
    }
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
        ParquetMetadata footer = reader.getFooter();
        assertEquals(expectedNumberOfBlocks, footer.getBlocks().size());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) OutputFile(org.apache.parquet.io.OutputFile) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) HadoopOutputFile(org.apache.parquet.hadoop.util.HadoopOutputFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType)
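A hedged usage note: how many blocks the three records produce depends on when the page size check first runs relative to the 1-byte row group limit. Invocations along these lines exercise both extremes; the argument values are illustrative assumptions, not the original test's parameters:

// Size check after every row: the tiny row group limit is exceeded immediately,
// so each of the three records should land in its own block.
testParquetFileNumberOfBlocks(1, 1, 3);
// Size check deferred until well past three rows: all records fit in one block.
testParquetFileNumberOfBlocks(10000, 10000, 1);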

Aggregations

SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory): 37
Group (org.apache.parquet.example.data.Group): 33
MessageType (org.apache.parquet.schema.MessageType): 20
Path (org.apache.hadoop.fs.Path): 16
Configuration (org.apache.hadoop.conf.Configuration): 13
Test (org.junit.Test): 12
File (java.io.File): 10
GroupFactory (org.apache.parquet.example.data.GroupFactory): 10
MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType): 8
GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport): 7
ParquetWriter (org.apache.parquet.hadoop.ParquetWriter): 6
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 5
MemPageStore (org.apache.parquet.column.page.mem.MemPageStore): 4
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 4
HadoopOutputFile (org.apache.parquet.hadoop.util.HadoopOutputFile): 4
OutputFile (org.apache.parquet.io.OutputFile): 4
Binary (org.apache.parquet.io.api.Binary): 4
ArrayList (java.util.ArrayList): 3
HashMap (java.util.HashMap): 3
Random (java.util.Random): 3