Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class TestParquetWriter, method testParquetFileWithBloomFilter.
@Test
public void testParquetFileWithBloomFilter() throws IOException {
  MessageType schema = Types.buildMessage()
      .required(BINARY).as(stringType()).named("name")
      .named("msg");
  String[] testNames = { "hello", "parquet", "bloom", "filter" };
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);
  GroupFactory factory = new SimpleGroupFactory(schema);
  File file = temp.newFile();
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withPageRowCountLimit(10)
      .withConf(conf)
      .withDictionaryEncoding(false)
      .withBloomFilterEnabled("name", true)
      .build()) {
    for (String testName : testNames) {
      writer.write(factory.newGroup().append("name", testName));
    }
  }
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
    BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
    BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
        .readBloomFilter(blockMetaData.getColumns().get(0));
    for (String name : testNames) {
      assertTrue(bloomFilter.findHash(
          LongHashFunction.xx(0).hashBytes(Binary.fromString(name).toByteBuffer())));
    }
  }
}
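Above, the test verifies the serialized bloom filter directly through ParquetFileReader's low-level API. In normal use the filter is consulted transparently when an equality predicate is pushed down at read time. A minimal sketch of that read path, reusing path and conf from the test (FilterApi, FilterCompat, and GroupReadSupport are standard parquet-mr classes; imports are omitted to match the snippets on this page):

// Row groups whose bloom filter reports "parquet" as absent can be skipped entirely.
FilterPredicate predicate = FilterApi.eq(FilterApi.binaryColumn("name"), Binary.fromString("parquet"));
try (ParquetReader<Group> filtered = ParquetReader.builder(new GroupReadSupport(), path)
    .withConf(conf)
    .withFilter(FilterCompat.get(predicate))
    .build()) {
  Group g;
  while ((g = filtered.read()) != null) {
    // only records whose "name" equals "parquet" are returned
  }
}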
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class TestParquetWriterNewPage, method test.
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file, new GroupWriteSupport(), UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64L)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();
      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64L, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
        assertEquals(0, group.getFieldRepetitionCount("null_field"));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}
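The ten-argument ParquetWriter constructor used above is deprecated in recent parquet-mr releases in favor of the builder API. A sketch of an equivalent builder-based construction, mapping each positional argument to its named method (method availability varies slightly across parquet-mr versions):

ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
    .withConf(conf)                      // schema travels via conf, set by GroupWriteSupport.setSchema above
    .withCompressionCodec(UNCOMPRESSED)  // CompressionCodecName
    .withRowGroupSize(1024)              // block (row group) size in bytes
    .withPageSize(1024)                  // page size in bytes
    .withDictionaryPageSize(512)         // dictionary page size in bytes
    .withDictionaryEncoding(true)        // enableDictionary
    .withValidation(false)               // disable write-time schema validation
    .withWriterVersion(version)          // PARQUET_1_0 or PARQUET_2_0
    .build();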
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class FileEncodingsIT, method writeValuesToFile.
/**
 * Writes a set of values to a parquet file.
 * The ParquetWriter writes the values with dictionary encoding enabled or disabled, as requested,
 * so that specific encodings can be tested for each type.
 */
private void writeValuesToFile(
    Path file, PrimitiveTypeName type, List<?> values, int rowGroupSize, int pageSize,
    boolean enableDictionary, WriterVersion version) throws IOException {
  MessageType schema;
  if (type == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    schema = Types.buildMessage().required(type).length(FIXED_LENGTH).named("field").named("test");
  } else {
    schema = Types.buildMessage().required(type).named("field").named("test");
  }
  SimpleGroupFactory message = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, configuration);
  ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withCompressionCodec(compression)
      .withRowGroupSize(rowGroupSize)
      .withPageSize(pageSize)
      .withDictionaryPageSize(TEST_DICT_PAGE_SIZE)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(version)
      .withConf(configuration)
      .build();
  for (Object o : values) {
    switch (type) {
      case BOOLEAN:
        writer.write(message.newGroup().append("field", (Boolean) o));
        break;
      case INT32:
        writer.write(message.newGroup().append("field", (Integer) o));
        break;
      case INT64:
        writer.write(message.newGroup().append("field", (Long) o));
        break;
      case FLOAT:
        writer.write(message.newGroup().append("field", (Float) o));
        break;
      case DOUBLE:
        writer.write(message.newGroup().append("field", (Double) o));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        writer.write(message.newGroup().append("field", (Binary) o));
        break;
      default:
        throw new IllegalArgumentException("Unknown type name: " + type);
    }
  }
  writer.close();
}
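A hypothetical invocation of this helper for BINARY values (the path and sizes below are made up for illustration; compression, configuration, FIXED_LENGTH, and TEST_DICT_PAGE_SIZE are fields of FileEncodingsIT):

List<Binary> values = Arrays.asList(
    Binary.fromString("a"), Binary.fromString("b"), Binary.fromString("c"));
writeValuesToFile(
    new Path("target/tests/binary-encodings.parquet"),
    PrimitiveTypeName.BINARY, values,
    128 * 1024,  // rowGroupSize
    64 * 1024,   // pageSize
    true,        // enableDictionary
    WriterVersion.PARQUET_2_0);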
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class DataGenerator, method generateData.
public void generateData(Path outFile, Configuration configuration, ParquetProperties.WriterVersion version,
    int blockSize, int pageSize, int fixedLenByteArraySize, CompressionCodecName codec, int nRows)
    throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }
  System.out.println("Generating data @ " + outFile);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(" + fixedLenByteArraySize + ") flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, configuration);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(
      outFile, new GroupWriteSupport(), codec, blockSize, pageSize, DICT_PAGE_SIZE, true, false, version, configuration);
  // generate some data for the fixed len byte array field
  char[] chars = new char[fixedLenByteArraySize];
  Arrays.fill(chars, '*');
  for (int i = 0; i < nRows; i++) {
    writer.write(f.newGroup()
        .append("binary_field", randomUUID().toString())
        .append("int32_field", i)
        .append("int64_field", 64L)
        .append("boolean_field", true)
        .append("float_field", 1.0f)
        .append("double_field", 2.0d)
        .append("flba_field", new String(chars))
        .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
  }
  writer.close();
}
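A hypothetical call generating one million rows (all argument values are made up, and a no-arg DataGenerator constructor is assumed; note that the flba_field length in the schema and the padding string are both derived from the same fixedLenByteArraySize parameter, so they always agree):

new DataGenerator().generateData(
    new Path("target/tests/generated.parquet"),
    new Configuration(),
    ParquetProperties.WriterVersion.PARQUET_2_0,
    128 * 1024 * 1024,  // blockSize: 128 MB row groups
    1024 * 1024,        // pageSize: 1 MB
    16,                 // fixedLenByteArraySize
    CompressionCodecName.SNAPPY,
    1000000);           // nRows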
Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in project parquet-mr by apache.
From the class PageChecksumDataGenerator, method generateData.
public void generateData(Path outFile, int nRows, boolean writeChecksums, CompressionCodecName compression)
    throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }
  ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
      .withConf(configuration)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withCompressionCodec(compression)
      .withDictionaryEncoding(true)
      .withType(SCHEMA)
      .withPageWriteChecksumEnabled(writeChecksums)
      .build();
  GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
  Random rand = new Random(42);
  for (int i = 0; i < nRows; i++) {
    Group group = groupFactory.newGroup();
    group.append("long_field", (long) i)
        .append("binary_field", randomUUID().toString())
        .addGroup("group")
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100);
    writer.write(group);
  }
  writer.close();
}
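Checksums written via withPageWriteChecksumEnabled(true) are only validated if verification is enabled on the read path. A sketch of the read side (the property key is an assumption based on ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED in recent parquet-mr; confirm it for your version):

Configuration readConf = new Configuration();
// assumed key; mirrors ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED
readConf.setBoolean("parquet.page.verify-checksum.enabled", true);
try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), outFile)
    .withConf(readConf)
    .build()) {
  Group g;
  while ((g = reader.read()) != null) {
    // a corrupted page now fails fast with a checksum-mismatch exception
  }
}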