Use of org.apache.parquet.hadoop.example.GroupReadSupport in project parquet-mr by apache.
The class TestParquetWriterNewPage, method test.
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType("message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file, new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64L)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();
      ParquetReader<Group> reader =
          ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64L, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
        assertEquals(0, group.getFieldRepetitionCount("null_field"));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}
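The ten-argument ParquetWriter constructor used above is one of the older convenience constructors; current parquet-mr code usually goes through a builder instead. A minimal sketch of a builder-based equivalent, assuming the ExampleParquetWriter builder from org.apache.parquet.hadoop.example, the same imports as the test above, and that its withType/withWriterVersion/withDictionaryEncoding options behave as described (check against the parquet-mr version in use):

// Sketch only: builder-style construction of a Group writer for a given schema and writer version.
ParquetWriter<Group> builderWriter = ExampleParquetWriter.builder(file)
    .withConf(conf)
    .withType(schema)                                     // schema consumed by GroupWriteSupport
    .withWriterVersion(version)                           // PARQUET_1_0 or PARQUET_2_0
    .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
    .withDictionaryEncoding(true)
    .build();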
Use of org.apache.parquet.hadoop.example.GroupReadSupport in project parquet-mr by apache.
The class TestParquetWriterAppendBlocks, method testBasicBehavior.
@Test
public void testBasicBehavior() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), combinedFile).build();
  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    // check each value; equals is not supported for simple records
    Assert.assertEquals("Each id should match",
        expectedNext.getInteger("id", 0), next.getInteger("id", 0));
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }
  Assert.assertEquals("All records should be present", 0, expected.size());
}
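The test relies on fixture files file1 and file2 (and their remembered contents file1content and file2content) prepared elsewhere in the test class. A minimal sketch of how such a fixture could be written, assuming FILE_SCHEMA declares the "id" (int32) and "string" (UTF8 binary) fields that the assertions read back, and reusing the ParquetWriter constructor shown in the first example; the record count and values are illustrative:

// Sketch only: write an illustrative fixture file and return the records that went into it.
private List<Group> writeFixture(Path file) throws IOException {
  GroupWriteSupport.setSchema(FILE_SCHEMA, CONF);
  SimpleGroupFactory factory = new SimpleGroupFactory(FILE_SCHEMA);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(
      file, new GroupWriteSupport(), UNCOMPRESSED, 1024, 1024, 512, true, false,
      WriterVersion.PARQUET_1_0, CONF);
  List<Group> written = new ArrayList<Group>();
  for (int i = 0; i < 100; i++) {
    Group g = factory.newGroup().append("id", i).append("string", "record-" + i);
    writer.write(g);
    written.add(g);
  }
  writer.close();
  return written;
}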
Use of org.apache.parquet.hadoop.example.GroupReadSupport in project parquet-mr by apache.
The class TestParquetWriterAppendBlocks, method testAllowDroppingColumns.
@Test
public void testAllowDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");
  Path droppedColumnFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetMetadata footer = ParquetFileReader.readFooter(CONF, droppedColumnFile, NO_FILTER);
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    Assert.assertEquals("Should have only the string column", 1, rowGroup.getColumns().size());
  }

  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), droppedColumnFile).build();
  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }
  Assert.assertEquals("All records should be present", 0, expected.size());
}
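The reduced schema above keeps only the string column, so appendFile copies just that column chunk from each source file. The full FILE_SCHEMA is defined elsewhere in the test class; a plausible sketch of it, assuming only the two fields asserted in testBasicBehavior ("id" as int32 and "string" as a UTF8 binary):

// Sketch only: a two-column schema consistent with the assertions above; the real FILE_SCHEMA may differ.
MessageType fullSchema = Types.buildMessage()
    .required(INT32).named("id")
    .required(BINARY).as(UTF8).named("string")
    .named("AppendTest");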
Use of org.apache.parquet.hadoop.example.GroupReadSupport in project parquet-mr by apache.
The class TestFiltersWithMissingColumns, method countFilteredRecords.
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();
  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
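A typical caller builds the predicate with org.apache.parquet.filter2.predicate.FilterApi and passes it in. A minimal usage sketch, with an illustrative column name and file path (not taken from the test fixtures):

// Sketch only: count rows whose int32 "id" column equals 7.
FilterPredicate pred = FilterApi.eq(FilterApi.intColumn("id"), 7);
long matches = countFilteredRecords(new Path("target/tests/sample.parquet"), pred);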
Use of org.apache.parquet.hadoop.example.GroupReadSupport in project apex-malhar by apache.
The class AbstractParquetFileReader, method openFile.
/**
 * Opens the file for reading using GroupReadSupport.
 */
@Override
protected InputStream openFile(Path path) throws IOException {
  InputStream is = super.openFile(path);
  GroupReadSupport readSupport = new GroupReadSupport();
  readSupport.init(configuration, null, schema);
  reader = new ParquetReader<>(path, readSupport);
  return is;
}
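Calling readSupport.init(...) directly and constructing the reader with ParquetReader(Path, ReadSupport) is an older pattern; that constructor is deprecated in later parquet-mr releases. A minimal sketch of a builder-based alternative, assuming schema is a MessageType and that the requested (projection) schema can be handed to GroupReadSupport through the parquet.read.schema configuration key (ReadSupport.PARQUET_READ_SCHEMA):

// Sketch only: builder-based reader construction with the requested schema pushed via the configuration.
configuration.set(ReadSupport.PARQUET_READ_SCHEMA, schema.toString());
reader = ParquetReader.builder(new GroupReadSupport(), path)
    .withConf(configuration)
    .build();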