Search in sources:

Example 31 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class TestMergeMetadataFiles method writeFile.

/**
 * Writes 1000 example rows to a Parquet file at {@code out}.
 *
 * <p>The write schema registered on {@code conf} is {@code schema2} when
 * {@code useSchema2} is set and {@code schema} otherwise. The choice is also
 * recorded in the file's extra metadata under the key {@code "schema_num"}
 * as {@code "2"} or {@code "1"}. The {@code int96_field} value is appended
 * only when the first schema is in use.
 *
 * @param out destination file; its absolute path is used as the Parquet path
 * @param conf configuration that receives the write schema and is passed to the writer builder
 * @param useSchema2 {@code true} to register {@code schema2}, {@code false} to register {@code schema}
 * @throws IOException if building the writer, writing a row, or closing the writer fails
 */
private static void writeFile(File out, Configuration conf, boolean useSchema2) throws IOException {
    GroupWriteSupport.setSchema(useSchema2 ? schema2 : schema, conf);
    // Groups are always created from the first schema, regardless of useSchema2.
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    Map<String, String> extraMetaData = new HashMap<>();
    extraMetaData.put("schema_num", useSchema2 ? "2" : "1");
    // try-with-resources closes the writer even when a write fails part-way through.
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path(out.getAbsolutePath()))
            .withConf(conf)
            .withExtraMetaData(extraMetaData)
            .build()) {
        for (int i = 0; i < 1000; i++) {
            Group g = f.newGroup()
                    .append("binary_field", "test" + i)
                    .append("int32_field", i)
                    .append("int64_field", (long) i)
                    .append("boolean_field", i % 2 == 0)
                    .append("float_field", (float) i)
                    .append("double_field", (double) i)
                    .append("flba_field", "foo");
            if (!useSchema2) {
                g = g.append("int96_field", Binary.fromConstantByteArray(new byte[12]));
            }
            writer.write(g);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) HashMap(java.util.HashMap) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)

Example 32 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class TestPropertiesDrivenEncryption method writeEncryptedParquetFile.

/**
 * Writes every row of {@code data} to a Parquet file under {@code root}.
 *
 * <p>The file name comes from {@code getFileName(root, encryptionConfiguration, threadNumber)}.
 * When the encryption configuration supplies a Hadoop configuration, file encryption
 * properties are obtained from the factory loaded from it; otherwise a fresh
 * {@code Configuration} is used and {@code null} encryption properties are passed to the
 * writer builder. Failures are not thrown: they are reported through
 * {@code addErrorToErrorCollectorAndLog}.
 *
 * @param root directory the file is created in
 * @param data rows to write; the page size is derived from its size
 * @param encryptionConfiguration source of the Hadoop configuration, also passed to the error collector
 * @param threadNumber passed to {@code getFileName}
 */
private void writeEncryptedParquetFile(Path root, List<SingleRow> data, EncryptionConfiguration encryptionConfiguration, int threadNumber) {
    final MessageType rowSchema = SingleRow.getSchema();
    final SimpleGroupFactory groupFactory = new SimpleGroupFactory(rowSchema);
    // A page size of a tenth of the row count ensures several pages are created.
    final int pageSize = data.size() / 10;
    // A row-group size tied to the page size ensures more row groups are created.
    final int rowGroupSize = pageSize * 6 * 5;
    final Path target = new Path(root, getFileName(root, encryptionConfiguration, threadNumber));
    LOG.info("\nWrite " + target.toString());

    Configuration hadoopConf = encryptionConfiguration.getHadoopConfiguration(this);
    FileEncryptionProperties encryptionProperties = null;
    try {
        if (hadoopConf != null) {
            EncryptionPropertiesFactory propertiesFactory = EncryptionPropertiesFactory.loadFactory(hadoopConf);
            encryptionProperties = propertiesFactory.getFileEncryptionProperties(hadoopConf, target, null);
        } else {
            hadoopConf = new Configuration();
        }
    } catch (Exception e) {
        addErrorToErrorCollectorAndLog("Failed writing " + target.toString(), e, encryptionConfiguration, null);
        return;
    }

    try (ParquetWriter<Group> parquetWriter = ExampleParquetWriter.builder(target)
            .withConf(hadoopConf)
            .withWriteMode(OVERWRITE)
            .withType(rowSchema)
            .withPageSize(pageSize)
            .withRowGroupSize(rowGroupSize)
            .withEncryption(encryptionProperties)
            .build()) {
        for (SingleRow row : data) {
            Group group = groupFactory.newGroup()
                    .append(SingleRow.BOOLEAN_FIELD_NAME, row.boolean_field)
                    .append(SingleRow.INT32_FIELD_NAME, row.int32_field)
                    .append(SingleRow.FLOAT_FIELD_NAME, row.float_field)
                    .append(SingleRow.DOUBLE_FIELD_NAME, row.double_field)
                    .append(SingleRow.BINARY_FIELD_NAME, Binary.fromConstantByteArray(row.ba_field))
                    .append(SingleRow.FIXED_LENGTH_BINARY_FIELD_NAME, Binary.fromConstantByteArray(row.flba_field))
                    .append(SingleRow.PLAINTEXT_INT32_FIELD_NAME, row.plaintext_int32_field);
            parquetWriter.write(group);
        }
    } catch (Exception e) {
        addErrorToErrorCollectorAndLog("Failed writing " + target.toString(), e, encryptionConfiguration, null);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) Configuration(org.apache.hadoop.conf.Configuration) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) MessageType(org.apache.parquet.schema.MessageType) IOException(java.io.IOException)

Example 33 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class DictionaryFilterTest method prepareFile.

/**
 * Builds a dictionary-encoded, GZIP-compressed example writer for {@code file}
 * and passes it, together with a group factory for {@code schema}, to {@code writeData}.
 *
 * @param version writer version handed to the builder
 * @param file path of the Parquet file to create
 * @throws IOException if the writer cannot be built or {@code writeData} fails
 */
private static void prepareFile(WriterVersion version, Path file) throws IOException {
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
    ParquetWriter<Group> parquetWriter = ExampleParquetWriter.builder(file)
            .withWriterVersion(version)
            .withCompressionCodec(GZIP)
            .withRowGroupSize(1024 * 1024)
            .withPageSize(1024)
            .enableDictionaryEncoding()
            .withDictionaryPageSize(2 * 1024)
            .withConf(conf)
            .build();
    // NOTE(review): the writer is not closed here; presumably writeData closes it -- confirm.
    writeData(groupFactory, parquetWriter);
}
Also used : Group(org.apache.parquet.example.data.Group) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)

Example 34 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class ValidatingColumnWriteStore method testOptionalRequiredInteraction.

/**
 * Runs {@code testSchema} over three families of nested schemas, each built around a
 * single BINARY leaf named "primitive" wrapped in groups named "req0", "req1", ...
 */
@Test
public void testOptionalRequiredInteraction() {
    // Family 1: a REQUIRED leaf wrapped in 0..5 REQUIRED groups, with one fully populated record.
    for (int depth = 0; depth < 6; depth++) {
        Type nested = new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "primitive");
        for (int level = 0; level < depth; level++) {
            nested = new GroupType(Repetition.REQUIRED, "req" + level, nested);
        }
        MessageType messageType = new MessageType("schema" + depth, nested);
        GroupFactory factory = new SimpleGroupFactory(messageType);
        List<Group> records = new ArrayList<>();
        Group top = factory.newGroup();
        Group cursor = top;
        for (int level = 0; level < depth; level++) {
            cursor = cursor.addGroup(0);
        }
        cursor.add(0, Binary.fromString("foo"));
        records.add(top);
        testSchema(messageType, records);
    }

    // Family 2: an OPTIONAL leaf wrapped in 0..5 REQUIRED groups. Two records share the
    // same group nesting; only the first one gets a leaf value.
    for (int depth = 0; depth < 6; depth++) {
        Type nested = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "primitive");
        for (int level = 0; level < depth; level++) {
            nested = new GroupType(Repetition.REQUIRED, "req" + level, nested);
        }
        MessageType messageType = new MessageType("schema" + (depth + 6), nested);
        GroupFactory factory = new SimpleGroupFactory(messageType);
        List<Group> records = new ArrayList<>();
        Group withValue = factory.newGroup();
        Group withoutValue = factory.newGroup();
        Group withValueCursor = withValue;
        Group withoutValueCursor = withoutValue;
        for (int level = 0; level < depth; level++) {
            withValueCursor = withValueCursor.addGroup(0);
            withoutValueCursor = withoutValueCursor.addGroup(0);
        }
        withValueCursor.add(0, Binary.fromString("foo"));
        records.add(withValue);
        records.add(withoutValue);
        testSchema(messageType, records);
    }

    // Family 3: an OPTIONAL leaf wrapped in six groups, where only the group whose index
    // equals optionalLevel is OPTIONAL and the rest are REQUIRED. The first record is fully
    // populated; the second adds a nested group only on iterations where optionalLevel < level.
    for (int optionalLevel = 0; optionalLevel < 6; optionalLevel++) {
        Type nested = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "primitive");
        for (int level = 0; level < 6; level++) {
            Repetition repetition;
            if (optionalLevel == level) {
                repetition = Repetition.OPTIONAL;
            } else {
                repetition = Repetition.REQUIRED;
            }
            nested = new GroupType(repetition, "req" + level, nested);
        }
        MessageType messageType = new MessageType("schema" + (optionalLevel + 12), nested);
        GroupFactory factory = new SimpleGroupFactory(messageType);
        List<Group> records = new ArrayList<>();
        Group withValue = factory.newGroup();
        Group withoutValue = factory.newGroup();
        Group withValueCursor = withValue;
        Group withoutValueCursor = withoutValue;
        for (int level = 0; level < 6; level++) {
            withValueCursor = withValueCursor.addGroup(0);
            if (optionalLevel < level) {
                withoutValueCursor = withoutValueCursor.addGroup(0);
            }
        }
        withValueCursor.add(0, Binary.fromString("foo"));
        records.add(withValue);
        records.add(withoutValue);
        testSchema(messageType, records);
    }
}
Also used : Group(org.apache.parquet.example.data.Group) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) GroupType(org.apache.parquet.schema.GroupType) ArrayList(java.util.ArrayList) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) GroupFactory(org.apache.parquet.example.data.GroupFactory) PrimitiveType(org.apache.parquet.schema.PrimitiveType) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 35 with SimpleGroupFactory

use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.

the class ValidatingColumnWriteStore method testOneOfEach.

/**
 * Parses the {@code oneOfEach} schema, builds one group with a value appended for each of
 * the fields "a" through "h", and passes it to {@code testSchema}.
 */
@Test
public void testOneOfEach() {
    MessageType schema = MessageTypeParser.parseMessageType(oneOfEach);
    GroupFactory factory = new SimpleGroupFactory(schema);
    Group row = factory.newGroup()
            .append("a", 1L)
            .append("b", 2)
            .append("c", 3.0f)
            .append("d", 4.0d)
            .append("e", true)
            .append("f", Binary.fromString("6"))
            .append("g", new NanoTime(1234, System.currentTimeMillis() * 1000))
            .append("h", Binary.fromString("abc"));
    testSchema(schema, Arrays.asList(row));
}
Also used : NanoTime(org.apache.parquet.example.data.simple.NanoTime) Group(org.apache.parquet.example.data.Group) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) GroupFactory(org.apache.parquet.example.data.GroupFactory) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Aggregations

SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory): 37; Group (org.apache.parquet.example.data.Group): 33; MessageType (org.apache.parquet.schema.MessageType): 20; Path (org.apache.hadoop.fs.Path): 16; Configuration (org.apache.hadoop.conf.Configuration): 13; Test (org.junit.Test): 12; File (java.io.File): 10; GroupFactory (org.apache.parquet.example.data.GroupFactory): 10; MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType): 8; GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport): 7; ParquetWriter (org.apache.parquet.hadoop.ParquetWriter): 6; PrimitiveType (org.apache.parquet.schema.PrimitiveType): 5; MemPageStore (org.apache.parquet.column.page.mem.MemPageStore): 4; HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 4; HadoopOutputFile (org.apache.parquet.hadoop.util.HadoopOutputFile): 4; OutputFile (org.apache.parquet.io.OutputFile): 4; Binary (org.apache.parquet.io.api.Binary): 4; ArrayList (java.util.ArrayList): 3; HashMap (java.util.HashMap): 3; Random (java.util.Random): 3