
Example 11 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by Apache.

From the class ValidatingColumnWriteStore, method testReadUsingRequestedSchemaWithExtraFields:

@Test
public void testReadUsingRequestedSchemaWithExtraFields() {
    MessageType orginalSchema = new MessageType("schema",
        new PrimitiveType(REQUIRED, INT32, "a"),
        new PrimitiveType(OPTIONAL, INT32, "b"));
    MessageType schemaWithExtraField = new MessageType("schema",
        new PrimitiveType(OPTIONAL, INT32, "b"),
        new PrimitiveType(OPTIONAL, INT32, "a"),
        new PrimitiveType(OPTIONAL, INT32, "c"));
    MemPageStore memPageStoreForOriginalSchema = new MemPageStore(1);
    MemPageStore memPageStoreForSchemaWithExtraField = new MemPageStore(1);
    SimpleGroupFactory groupFactory = new SimpleGroupFactory(orginalSchema);
    writeGroups(orginalSchema, memPageStoreForOriginalSchema,
        groupFactory.newGroup().append("a", 1).append("b", 2));
    SimpleGroupFactory groupFactory2 = new SimpleGroupFactory(schemaWithExtraField);
    writeGroups(schemaWithExtraField, memPageStoreForSchemaWithExtraField,
        groupFactory2.newGroup().append("a", 1).append("b", 2).append("c", 3));
    {
        List<Group> groups = new ArrayList<>();
        groups.addAll(readGroups(memPageStoreForOriginalSchema, orginalSchema, schemaWithExtraField, 1));
        groups.addAll(readGroups(memPageStoreForSchemaWithExtraField, schemaWithExtraField, schemaWithExtraField, 1));
        // TODO: add once we have the support for empty projection
        // groups1.addAll(readGroups(memPageStore3, schema3, schema2, 1));
        Object[][] expected = {
            { 2, 1, null },
            { 2, 1, 3 }
            // { null, null }
        };
        validateGroups(groups, expected);
    }
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) ArrayList(java.util.ArrayList) List(java.util.List) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 12 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by Apache.

From the class TestReadWriteEncodingStats, method writeData:

private static void writeData(ParquetWriter<Group> writer) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(SCHEMA);
    for (int i = 0; i < NUM_RECORDS; i += 1) {
        int index = i % ALPHABET.length();
        Group group = f.newGroup()
            .append("dict_binary_field", ALPHABET.substring(index, index + 1))
            .append("plain_int32_field", i)
            .append("fallback_binary_field", i < (NUM_RECORDS / 2)
                ? ALPHABET.substring(index, index + 1)
                : UUID.randomUUID().toString());
        writer.write(group);
    }
}
Also used : Group(org.apache.parquet.example.data.Group) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)
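
For context, the ParquetWriter<Group> handed to writeData is built elsewhere in the test. The sketch below shows one minimal way to construct it with ExampleParquetWriter; the helper name and the output path are placeholders and are not part of TestReadWriteEncodingStats.

private static void writeSampleFile() throws IOException {
    // Hypothetical helper (placeholder name and path), assuming SCHEMA is the MessageType
    // declared by the test.
    Path file = new Path("/tmp/encoding-stats.parquet");
    Configuration conf = new Configuration();
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
            .withConf(conf)
            .withType(SCHEMA)
            .build()) {
        writeData(writer);
    }
}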

Example 13 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by Apache.

From the class TestEncryptionOptions, method testWriteEncryptedParquetFiles:

private void testWriteEncryptedParquetFiles(Path root, List<SingleRow> data) throws IOException {
    Configuration conf = new Configuration();
    // Ensure that several pages will be created
    int pageSize = data.size() / 10;
    // Ensure that there are more row-groups created
    int rowGroupSize = pageSize * 6 * 5;
    SimpleGroupFactory f = new SimpleGroupFactory(SCHEMA);
    EncryptionConfiguration[] encryptionConfigurations = EncryptionConfiguration.values();
    for (EncryptionConfiguration encryptionConfiguration : encryptionConfigurations) {
        Path file = new Path(root, getFileName(encryptionConfiguration));
        FileEncryptionProperties encryptionProperties = encryptionConfiguration.getEncryptionProperties();
        LOG.info("\nWrite " + file.toString());
        try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
                .withWriteMode(OVERWRITE)
                .withRowGroupSize(rowGroupSize)
                .withPageSize(pageSize)
                .withType(SCHEMA)
                .withConf(conf)
                .withEncryption(encryptionProperties)
                .build()) {
            for (SingleRow singleRow : data) {
                writer.write(f.newGroup()
                    .append(SingleRow.BOOLEAN_FIELD_NAME, singleRow.boolean_field)
                    .append(SingleRow.INT32_FIELD_NAME, singleRow.int32_field)
                    .append(SingleRow.FLOAT_FIELD_NAME, singleRow.float_field)
                    .append(SingleRow.DOUBLE_FIELD_NAME, singleRow.double_field)
                    .append(SingleRow.BINARY_FIELD_NAME, Binary.fromConstantByteArray(singleRow.ba_field))
                    .append(SingleRow.FIXED_LENGTH_BINARY_FIELD_NAME, Binary.fromConstantByteArray(singleRow.flba_field))
                    .append(SingleRow.PLAINTEXT_INT32_FIELD_NAME, singleRow.plaintext_int32_field));
            }
        }
    }
}
Also used : ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) Configuration(org.apache.hadoop.conf.Configuration) FileEncryptionProperties(org.apache.parquet.crypto.FileEncryptionProperties) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) SingleRow(org.apache.parquet.crypto.SingleRow)
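
As a follow-up, here is a hedged sketch of reading one of these files back with GroupReadSupport. The helper is not part of TestEncryptionOptions, and for the encrypted configurations a matching set of decryption properties would also have to be supplied to the reader (omitted here).

private void readBackSketch(Path file, Configuration conf) throws IOException {
    // Hypothetical read-back helper; decryption settings are intentionally left out.
    try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
            .withConf(conf)
            .build()) {
        for (Group group = reader.read(); group != null; group = reader.read()) {
            LOG.info("int32_field = " + group.getInteger(SingleRow.INT32_FIELD_NAME, 0));
        }
    }
}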

Example 14 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by Apache.

From the class TestLargeColumnChunk, method createFile:

@BeforeClass
public static void createFile() throws IOException {
    file = new Path(folder.newFile().getAbsolutePath());
    GroupFactory factory = new SimpleGroupFactory(SCHEMA);
    Random random = new Random(RANDOM_SEED);
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(SCHEMA, conf);
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(HadoopOutputFile.fromPath(file, conf))
            .withWriteMode(OVERWRITE)
            .withConf(conf)
            .withCompressionCodec(UNCOMPRESSED)
            // 4G to ensure all data goes to one row group
            .withRowGroupSize(4L * 1024 * 1024 * 1024)
            .withBloomFilterEnabled(true)
            .build()) {
        for (long id = 0; id < ROW_COUNT; ++id) {
            Group group = factory.newGroup();
            group.add(ID_INDEX, id);
            Binary data = nextBinary(random);
            group.add(DATA_INDEX, data);
            writer.write(group);
            if (id == ID_OF_FILTERED_DATA) {
                VALUE_IN_DATA = data;
            }
        }
    }
    VALUE_NOT_IN_DATA = nextBinary(random);
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) Random(java.util.Random) Configuration(org.apache.hadoop.conf.Configuration) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) GroupFactory(org.apache.parquet.example.data.GroupFactory) Binary(org.apache.parquet.io.api.Binary)
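
The bloom filter enabled above can later be exercised with a predicate on the data column. The sketch below is not part of TestLargeColumnChunk; the column name "data" is an assumption made for illustration, since the real test derives its column names from SCHEMA.

private static void probeWithBloomFilter() throws IOException {
    // Hypothetical sketch: look up the binary value that was written (VALUE_IN_DATA).
    // With VALUE_NOT_IN_DATA instead, the bloom filter lets the reader skip the row group.
    FilterPredicate predicate = FilterApi.eq(FilterApi.binaryColumn("data"), VALUE_IN_DATA);
    try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
            .withFilter(FilterCompat.get(predicate))
            .build()) {
        Group match = reader.read();
        // match is expected to be the row whose id equals ID_OF_FILTERED_DATA
    }
}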

Example 15 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by Apache.

From the class TestParquetWriter, method testNullValuesWithPageRowLimit:

// Testing the issue of PARQUET-1531 where writing null nested rows leads to empty pages if the page row count limit
// is reached.
@Test
public void testNullValuesWithPageRowLimit() throws IOException {
    MessageType schema = Types.buildMessage()
        .optionalList()
            .optionalElement(BINARY).as(stringType())
        .named("str_list")
        .named("msg");
    final int recordCount = 100;
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(schema, conf);
    GroupFactory factory = new SimpleGroupFactory(schema);
    Group listNull = factory.newGroup();
    File file = temp.newFile();
    file.delete();
    Path path = new Path(file.getAbsolutePath());
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
            .withPageRowCountLimit(10)
            .withConf(conf)
            .build()) {
        for (int i = 0; i < recordCount; ++i) {
            writer.write(listNull);
        }
    }
    try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build()) {
        int readRecordCount = 0;
        for (Group group = reader.read(); group != null; group = reader.read()) {
            assertEquals(listNull.toString(), group.toString());
            ++readRecordCount;
        }
        assertEquals("Number of written records should be equal to the read one", recordCount, readRecordCount);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport) Configuration(org.apache.hadoop.conf.Configuration) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) GroupFactory(org.apache.parquet.example.data.GroupFactory) OutputFile(org.apache.parquet.io.OutputFile) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) HadoopOutputFile(org.apache.parquet.hadoop.util.HadoopOutputFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) Test(org.junit.Test)
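
Because every record written in this test is the empty listNull group, it may help to see how a populated record for the same schema could be assembled. The sketch below is an assumption based on the standard 3-level LIST layout (inner groups named "list" and "element") that the Types builder generates; it is not part of TestParquetWriter.

Group withValues = factory.newGroup();
Group strList = withValues.addGroup("str_list");
// each "list" repetition holds one optional "element"
strList.addGroup("list").append("element", "first");
strList.addGroup("list").append("element", "second");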

Aggregations

SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory): 37 uses
Group (org.apache.parquet.example.data.Group): 33 uses
MessageType (org.apache.parquet.schema.MessageType): 20 uses
Path (org.apache.hadoop.fs.Path): 16 uses
Configuration (org.apache.hadoop.conf.Configuration): 13 uses
Test (org.junit.Test): 12 uses
File (java.io.File): 10 uses
GroupFactory (org.apache.parquet.example.data.GroupFactory): 10 uses
MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType): 8 uses
GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport): 7 uses
ParquetWriter (org.apache.parquet.hadoop.ParquetWriter): 6 uses
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 5 uses
MemPageStore (org.apache.parquet.column.page.mem.MemPageStore): 4 uses
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 4 uses
HadoopOutputFile (org.apache.parquet.hadoop.util.HadoopOutputFile): 4 uses
OutputFile (org.apache.parquet.io.OutputFile): 4 uses
Binary (org.apache.parquet.io.api.Binary): 4 uses
ArrayList (java.util.ArrayList): 3 uses
HashMap (java.util.HashMap): 3 uses
Random (java.util.Random): 3 uses