
Example 1 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project h2o-3 by h2oai.

From the class ParquetFileGenerator, method generateParquetFile:

static File generateParquetFile(File parentDir, String filename, int nrows, Date date) throws IOException {
    File f = new File(parentDir, filename);
    Configuration conf = new Configuration();
    MessageType schema = parseMessageType(
        "message test { " +
        "required int32 int32_field; " +
        "required int64 int64_field; " +
        "required float float_field; " +
        "required double double_field; " +
        "required int64 timestamp_field (TIMESTAMP_MILLIS);" +
        "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory fact = new SimpleGroupFactory(schema);
    ParquetWriter<Group> writer = new ParquetWriter<Group>(
        new Path(f.getPath()), new GroupWriteSupport(), UNCOMPRESSED,
        1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf);
    try {
        for (int i = 0; i < nrows; i++) {
            writer.write(fact.newGroup()
                .append("int32_field", 32 + i)
                .append("int64_field", 64L + i)
                .append("float_field", 1.0f + i)
                .append("double_field", 2.0d + i)
                .append("timestamp_field", date.getTime() + (i * 117)));
        }
    } finally {
        writer.close();
    }
    return f;
}
Also used: Path(org.apache.hadoop.fs.Path) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) Group(org.apache.parquet.example.data.Group) Configuration(org.apache.hadoop.conf.Configuration) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) File(java.io.File) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) MessageType(org.apache.parquet.schema.MessageType)
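For reference, reading the generated file back uses the matching GroupReadSupport from the same example package. A minimal sketch (the reader-builder call mirrors the one in Example 3 below; variable names are illustrative):

ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(f.getPath()))
    .withConf(conf)
    .build();
try {
    Group row;
    while ((row = reader.read()) != null) {
        // Non-repeated fields are read by name with repetition index 0.
        long ts = row.getLong("timestamp_field", 0);
    }
} finally {
    reader.close();
}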

Example 2 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project h2o-3 by h2oai.

From the class ParquetFileGenerator, method generateSparseParquetFile:

static File generateSparseParquetFile(File parentDir, String filename, int nrows) throws IOException {
    File f = new File(parentDir, filename);
    Configuration conf = new Configuration();
    MessageType schema = parseMessageType(
        "message test { optional int32 int32_field; optional binary string_field (UTF8); " +
        "required int32 row; optional int32 int32_field2; } ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory fact = new SimpleGroupFactory(schema);
    ParquetWriter<Group> writer = new ParquetWriter<Group>(
        new Path(f.getPath()), new GroupWriteSupport(), UNCOMPRESSED,
        1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf);
    try {
        for (int i = 0; i < nrows; i++) {
            Group g = fact.newGroup();
            // Only every 10th row gets values for the optional fields;
            // all other rows leave them unset, producing a sparse file.
            if (i % 10 == 0) {
                g = g.append("int32_field", i);
            }
            if (i % 10 == 0) {
                g = g.append("string_field", "CAT_" + (i % 10));
            }
            if (i % 10 == 0) {
                g = g.append("int32_field2", i);
            }
            writer.write(g.append("row", i));
        }
    } finally {
        writer.close();
    }
    return f;
}
Also used: Path(org.apache.hadoop.fs.Path) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) Group(org.apache.parquet.example.data.Group) Configuration(org.apache.hadoop.conf.Configuration) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) File(java.io.File) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) MessageType(org.apache.parquet.schema.MessageType)
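Because only every 10th row carries values for the optional fields, code reading this file back should check field presence before fetching a value; an unset optional field has a repetition count of 0 (the same check Example 3 uses for its null_field). A minimal sketch:

Group row = reader.read();
// Fetching an unset optional field by index would fail, so test presence first.
if (row.getFieldRepetitionCount("int32_field") > 0) {
    int value = row.getInteger("int32_field", 0);
}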

Example 3 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

From the class TestParquetWriterNewPage, method test:

@Test
public void test() throws Exception {
    Configuration conf = new Configuration();
    Path root = new Path("target/tests/TestParquetWriter/");
    FileSystem fs = root.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = parseMessageType(
        "message test { " +
        "required binary binary_field; " +
        "required int32 int32_field; " +
        "required int64 int64_field; " +
        "required boolean boolean_field; " +
        "required float float_field; " +
        "required double double_field; " +
        "required fixed_len_byte_array(3) flba_field; " +
        "required int96 int96_field; " +
        "optional binary null_field; " +
        "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    // Expected encoding of binary_field, keyed by "<cardinality>-<writer version>":
    // low-cardinality values dictionary-encode; high-cardinality values fall back.
    Map<String, Encoding> expected = new HashMap<String, Encoding>();
    expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
    expected.put("1000-" + PARQUET_1_0, PLAIN);
    expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
    expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
    for (int modulo : asList(10, 1000)) {
        for (WriterVersion version : WriterVersion.values()) {
            Path file = new Path(root, version.name() + "_" + modulo);
            ParquetWriter<Group> writer = new ParquetWriter<Group>(
                file, new GroupWriteSupport(), UNCOMPRESSED,
                1024, 1024, 512, true, false, version, conf);
            for (int i = 0; i < 1000; i++) {
                writer.write(f.newGroup()
                    .append("binary_field", "test" + (i % modulo))
                    .append("int32_field", 32)
                    .append("int64_field", 64L)
                    .append("boolean_field", true)
                    .append("float_field", 1.0f)
                    .append("double_field", 2.0d)
                    .append("flba_field", "foo")
                    .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
            }
            writer.close();
            ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
            for (int i = 0; i < 1000; i++) {
                Group group = reader.read();
                assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
                assertEquals(32, group.getInteger("int32_field", 0));
                assertEquals(64L, group.getLong("int64_field", 0));
                assertEquals(true, group.getBoolean("boolean_field", 0));
                assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
                assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
                assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
                assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
                // null_field was never written, so its repetition count is 0.
                assertEquals(0, group.getFieldRepetitionCount("null_field"));
            }
            reader.close();
            ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
            for (BlockMetaData blockMetaData : footer.getBlocks()) {
                for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
                    if (column.getPath().toDotString().equals("binary_field")) {
                        String key = modulo + "-" + version;
                        Encoding expectedEncoding = expected.get(key);
                        assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                            column.getEncodings().contains(expectedEncoding));
                    }
                }
            }
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) Encoding(org.apache.parquet.column.Encoding) WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) FileSystem(org.apache.hadoop.fs.FileSystem) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
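A side note on the footer call: the static ParquetFileReader.readFooter helper is deprecated in recent parquet-mr releases in favor of opening the file and asking the reader for its footer. A hedged sketch of the newer pattern (verify the method names against the parquet-mr version in use):

try (ParquetFileReader fileReader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
    ParquetMetadata footer = fileReader.getFooter();
    // footer.getBlocks() and ColumnChunkMetaData.getEncodings() are inspected as above.
}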

Example 4 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

From the class TestReadWriteEncodingStats, method writeData:

private static void writeData(ParquetWriter<Group> writer) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(SCHEMA);
    for (int i = 0; i < NUM_RECORDS; i += 1) {
        int index = i % ALPHABET.length();
        Group group = f.newGroup()
            // Low-cardinality values: stay dictionary-encoded.
            .append("dict_binary_field", ALPHABET.substring(index, index + 1))
            .append("plain_int32_field", i)
            // Dictionary-encodes at first, then falls back once random UUIDs appear.
            .append("fallback_binary_field", i < (NUM_RECORDS / 2)
                ? ALPHABET.substring(index, index + 1)
                : UUID.randomUUID().toString());
        writer.write(group);
    }
}
Also used: Group(org.apache.parquet.example.data.Group) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)
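writeData only consumes an already-built writer; SCHEMA, NUM_RECORDS, and ALPHABET are constants defined elsewhere in TestReadWriteEncodingStats. One plausible way to construct a compatible writer is parquet-mr's example builder; the path below is purely illustrative, not the test's actual setup:

Path file = new Path("target/tests/encoding_stats.parquet"); // illustrative path
ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
    .withType(SCHEMA)                 // the test's MessageType constant
    .withConf(new Configuration())
    .build();
writeData(writer);
writer.close();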

Example 5 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

From the class ValidatingColumnWriteStore, method testReadUsingRequestedSchemaWithExtraFields:

@Test
public void testReadUsingRequestedSchemaWithExtraFields() {
    MessageType originalSchema = new MessageType("schema",
        new PrimitiveType(REQUIRED, INT32, "a"),
        new PrimitiveType(OPTIONAL, INT32, "b"));
    MessageType schemaWithExtraField = new MessageType("schema",
        new PrimitiveType(OPTIONAL, INT32, "b"),
        new PrimitiveType(OPTIONAL, INT32, "a"),
        new PrimitiveType(OPTIONAL, INT32, "c"));
    MemPageStore memPageStoreForOriginalSchema = new MemPageStore(1);
    MemPageStore memPageStoreForSchemaWithExtraField = new MemPageStore(1);
    SimpleGroupFactory groupFactory = new SimpleGroupFactory(originalSchema);
    writeGroups(originalSchema, memPageStoreForOriginalSchema,
        groupFactory.newGroup().append("a", 1).append("b", 2));
    SimpleGroupFactory groupFactory2 = new SimpleGroupFactory(schemaWithExtraField);
    writeGroups(schemaWithExtraField, memPageStoreForSchemaWithExtraField,
        groupFactory2.newGroup().append("a", 1).append("b", 2).append("c", 3));
    {
        List<Group> groups = new ArrayList<Group>();
        // Read both stores through the wider schema; the missing field "c" reads as null.
        groups.addAll(readGroups(memPageStoreForOriginalSchema, originalSchema, schemaWithExtraField, 1));
        groups.addAll(readGroups(memPageStoreForSchemaWithExtraField, schemaWithExtraField, schemaWithExtraField, 1));
        // TODO: add once we have the support for empty projection
        // groups1.addAll(readGroups(memPageStore3, schema3, schema2, 1));
        Object[][] expected = {
            { 2, 1, null },
            { 2, 1, 3 }
            // { null, null }
        };
        validateGroups(groups, expected);
    }
}
Also used: Group(org.apache.parquet.example.data.Group) PrimitiveType(org.apache.parquet.schema.PrimitiveType) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) ArrayList(java.util.ArrayList) List(java.util.List) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
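Independent of the writeGroups/readGroups helpers (defined elsewhere in this test class), the factory itself is small: it binds new groups to a schema, and values append and read back by field name. A minimal standalone sketch:

MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT32, "a"),
    new PrimitiveType(OPTIONAL, INT32, "b"));
SimpleGroupFactory factory = new SimpleGroupFactory(schema);
Group g = factory.newGroup().append("a", 1).append("b", 2);
// Values are retrieved by field name and repetition index.
assertEquals(1, g.getInteger("a", 0));
// Appending a field name that is not in the schema throws InvalidRecordException.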

Aggregations

SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory): 27 usages
Group (org.apache.parquet.example.data.Group): 25 usages
MessageType (org.apache.parquet.schema.MessageType): 15 usages
Test (org.junit.Test): 10 usages
Path (org.apache.hadoop.fs.Path): 8 usages
GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport): 7 usages
Configuration (org.apache.hadoop.conf.Configuration): 6 usages
File (java.io.File): 5 usages
GroupFactory (org.apache.parquet.example.data.GroupFactory): 5 usages
ParquetWriter (org.apache.parquet.hadoop.ParquetWriter): 5 usages
MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType): 5 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 5 usages
MemPageStore (org.apache.parquet.column.page.mem.MemPageStore): 4 usages
ArrayList (java.util.ArrayList): 3 usages
HashMap (java.util.HashMap): 3 usages
HiveDecimal (org.apache.hadoop.hive.common.type.HiveDecimal): 3 usages
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 3 usages
Binary (org.apache.parquet.io.api.Binary): 3 usages
List (java.util.List): 2 usages
AvroParquetWriter (org.apache.parquet.avro.AvroParquetWriter): 2 usages