Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class ValidatingColumnWriteStore, method testReadUsingRequestedSchemaWithExtraFields.
@Test
public void testReadUsingRequestedSchemaWithExtraFields() {
  MessageType originalSchema = new MessageType("schema",
      new PrimitiveType(REQUIRED, INT32, "a"),
      new PrimitiveType(OPTIONAL, INT32, "b"));
  MessageType schemaWithExtraField = new MessageType("schema",
      new PrimitiveType(OPTIONAL, INT32, "b"),
      new PrimitiveType(OPTIONAL, INT32, "a"),
      new PrimitiveType(OPTIONAL, INT32, "c"));
  MemPageStore memPageStoreForOriginalSchema = new MemPageStore(1);
  MemPageStore memPageStoreForSchemaWithExtraField = new MemPageStore(1);
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(originalSchema);
  writeGroups(originalSchema, memPageStoreForOriginalSchema,
      groupFactory.newGroup().append("a", 1).append("b", 2));
  SimpleGroupFactory groupFactory2 = new SimpleGroupFactory(schemaWithExtraField);
  writeGroups(schemaWithExtraField, memPageStoreForSchemaWithExtraField,
      groupFactory2.newGroup().append("a", 1).append("b", 2).append("c", 3));
  {
    List<Group> groups = new ArrayList<>();
    groups.addAll(readGroups(memPageStoreForOriginalSchema, originalSchema, schemaWithExtraField, 1));
    groups.addAll(readGroups(memPageStoreForSchemaWithExtraField, schemaWithExtraField, schemaWithExtraField, 1));
    // TODO: add once we have the support for empty projection
    // groups1.addAll(readGroups(memPageStore3, schema3, schema2, 1));
    Object[][] expected = {
      { 2, 1, null },
      { 2, 1, 3 }
      // { null, null }
    };
    validateGroups(groups, expected);
  }
}
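The writeGroups and readGroups helpers belong to the surrounding test class and are not shown here. A minimal sketch of what the read side might look like, assuming the low-level parquet-column record-assembly API (ColumnIOFactory, MessageColumnIO, GroupRecordConverter); this helper is a reconstruction, not the verbatim source:

private static List<Group> readGroups(MemPageStore pageStore, MessageType fileSchema,
                                      MessageType requestedSchema, int count) {
  // Project the file schema onto the requested schema; fields missing from the
  // file (such as "c" above) are assembled as nulls.
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(requestedSchema, fileSchema);
  RecordReader<Group> recordReader =
      columnIO.getRecordReader(pageStore, new GroupRecordConverter(requestedSchema));
  List<Group> groups = new ArrayList<>();
  for (int i = 0; i < count; i++) {
    groups.add(recordReader.read());
  }
  return groups;
}

This projection is also why the expected rows list "b" before "a": values come back in the field order of the requested schema (b, a, c), not the order of the file schema.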
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class TestReadWriteEncodingStats, method writeData.
private static void writeData(ParquetWriter<Group> writer) throws IOException {
  SimpleGroupFactory f = new SimpleGroupFactory(SCHEMA);
  for (int i = 0; i < NUM_RECORDS; i += 1) {
    int index = i % ALPHABET.length();
    Group group = f.newGroup()
        .append("dict_binary_field", ALPHABET.substring(index, index + 1))
        .append("plain_int32_field", i)
        .append("fallback_binary_field", i < (NUM_RECORDS / 2)
            ? ALPHABET.substring(index, index + 1)
            : UUID.randomUUID().toString());
    writer.write(group);
  }
}
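SCHEMA, NUM_RECORDS, and ALPHABET are constants of the test class and are not part of the snippet; plausible definitions consistent with the field names used above (a sketch, assuming all three fields are required):

private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";
private static final int NUM_RECORDS = 2000;
private static final MessageType SCHEMA = MessageTypeParser.parseMessageType(
    "message test {"
        + " required binary dict_binary_field;"
        + " required int32 plain_int32_field;"
        + " required binary fallback_binary_field;"
        + " }");

The shape of the data is the point of the test: dict_binary_field cycles through a small set of values and stays dictionary-encoded, while fallback_binary_field switches to random UUID strings halfway through, forcing its dictionary encoding to fall back to plain.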
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class TestEncryptionOptions, method testWriteEncryptedParquetFiles.
private void testWriteEncryptedParquetFiles(Path root, List<SingleRow> data) throws IOException {
  Configuration conf = new Configuration();
  // Ensure that several pages will be created
  int pageSize = data.size() / 10;
  // Ensure that more than one row group is created
  int rowGroupSize = pageSize * 6 * 5;
  SimpleGroupFactory f = new SimpleGroupFactory(SCHEMA);
  EncryptionConfiguration[] encryptionConfigurations = EncryptionConfiguration.values();
  for (EncryptionConfiguration encryptionConfiguration : encryptionConfigurations) {
    Path file = new Path(root, getFileName(encryptionConfiguration));
    FileEncryptionProperties encryptionProperties = encryptionConfiguration.getEncryptionProperties();
    LOG.info("\nWrite " + file.toString());
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
        .withWriteMode(OVERWRITE)
        .withRowGroupSize(rowGroupSize)
        .withPageSize(pageSize)
        .withType(SCHEMA)
        .withConf(conf)
        .withEncryption(encryptionProperties)
        .build()) {
      for (SingleRow singleRow : data) {
        writer.write(f.newGroup()
            .append(SingleRow.BOOLEAN_FIELD_NAME, singleRow.boolean_field)
            .append(SingleRow.INT32_FIELD_NAME, singleRow.int32_field)
            .append(SingleRow.FLOAT_FIELD_NAME, singleRow.float_field)
            .append(SingleRow.DOUBLE_FIELD_NAME, singleRow.double_field)
            .append(SingleRow.BINARY_FIELD_NAME, Binary.fromConstantByteArray(singleRow.ba_field))
            .append(SingleRow.FIXED_LENGTH_BINARY_FIELD_NAME, Binary.fromConstantByteArray(singleRow.flba_field))
            .append(SingleRow.PLAINTEXT_INT32_FIELD_NAME, singleRow.plaintext_int32_field));
      }
    }
  }
}
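Each EncryptionConfiguration enum value supplies its own FileEncryptionProperties; the enum itself is not shown. A minimal sketch of building such properties with the parquet-mr crypto API, where the key bytes and the encrypted column name are illustrative placeholders:

// Illustrative 128-bit AES keys; the real tests derive theirs from fixed constants.
byte[] footerKey = "0123456789012345".getBytes(StandardCharsets.UTF_8);
byte[] columnKey = "1234567890123450".getBytes(StandardCharsets.UTF_8);

ColumnEncryptionProperties columnProperties = ColumnEncryptionProperties
    .builder("double_field") // hypothetical column choice
    .withKey(columnKey)
    .build();
Map<ColumnPath, ColumnEncryptionProperties> columnPropertiesMap = new HashMap<>();
columnPropertiesMap.put(columnProperties.getPath(), columnProperties);

FileEncryptionProperties encryptionProperties = FileEncryptionProperties
    .builder(footerKey)
    .withEncryptedColumns(columnPropertiesMap)
    .build();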
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class TestLargeColumnChunk, method createFile.
@BeforeClass
public static void createFile() throws IOException {
  file = new Path(folder.newFile().getAbsolutePath());
  GroupFactory factory = new SimpleGroupFactory(SCHEMA);
  Random random = new Random(RANDOM_SEED);
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(SCHEMA, conf);
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(HadoopOutputFile.fromPath(file, conf))
      .withWriteMode(OVERWRITE)
      .withConf(conf)
      .withCompressionCodec(UNCOMPRESSED)
      .withRowGroupSize(4L * 1024 * 1024 * 1024) // 4G to ensure all data goes to one row group
      .withBloomFilterEnabled(true)
      .build()) {
    for (long id = 0; id < ROW_COUNT; ++id) {
      Group group = factory.newGroup();
      group.add(ID_INDEX, id);
      Binary data = nextBinary(random);
      group.add(DATA_INDEX, data);
      writer.write(group);
      if (id == ID_OF_FILTERED_DATA) {
        VALUE_IN_DATA = data;
      }
    }
  }
  VALUE_NOT_IN_DATA = nextBinary(random);
}
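nextBinary is a helper of the test class that is not shown; a plausible implementation, assuming a VALUE_SIZE constant for the payload length (both the constant and the body are a reconstruction):

private static Binary nextBinary(Random random) {
  // Fill a buffer with pseudo-random bytes and wrap it without copying,
  // producing the large values this column-chunk test needs.
  byte[] bytes = new byte[VALUE_SIZE];
  random.nextBytes(bytes);
  return Binary.fromConstantByteArray(bytes);
}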
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class TestParquetWriter, method testNullValuesWithPageRowLimit.
// Tests the issue of PARQUET-1531, where writing null nested rows leads to empty pages
// if the page row count limit is reached.
@Test
public void testNullValuesWithPageRowLimit() throws IOException {
  MessageType schema = Types.buildMessage()
      .optionalList().optionalElement(BINARY).as(stringType()).named("str_list")
      .named("msg");
  final int recordCount = 100;
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);
  GroupFactory factory = new SimpleGroupFactory(schema);
  Group listNull = factory.newGroup();
  File file = temp.newFile();
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withPageRowCountLimit(10)
      .withConf(conf)
      .build()) {
    for (int i = 0; i < recordCount; ++i) {
      writer.write(listNull);
    }
  }
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build()) {
    int readRecordCount = 0;
    for (Group group = reader.read(); group != null; group = reader.read()) {
      assertEquals(listNull.toString(), group.toString());
      ++readRecordCount;
    }
    assertEquals("Number of records read should equal the number written", recordCount, readRecordCount);
  }
}
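For reference, the Types builder above expands to the standard three-level LIST structure; written out as a schema string it is equivalent to the following (the annotation may print as UTF8 rather than STRING in older parquet-mr versions):

message msg {
  optional group str_list (LIST) {
    repeated group list {
      optional binary element (STRING);
    }
  }
}

Every record written is a group with the list field absent, so the pages contain only nulls; withPageRowCountLimit(10) forces those null-only pages to be flushed frequently, which is exactly the PARQUET-1531 condition the test exercises.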