Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class TestParquetWriterAppendBlocks, method testFailDroppingColumns.
@Test
public void testFailDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");
  final ParquetMetadata footer = ParquetFileReader.readFooter(CONF, file1, NO_FILTER);
  final FSDataInputStream incoming = file1.getFileSystem(CONF).open(file1);
  Path droppedColumnFile = newTemp();
  final ParquetFileWriter writer = new ParquetFileWriter(CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  TestUtils.assertThrows("Should complain that id column is dropped", IllegalArgumentException.class,
      new Callable<Void>() {
        @Override
        public Void call() throws Exception {
          writer.appendRowGroups(incoming, footer.getBlocks(), false);
          return null;
        }
      });
}
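By contrast, when the target writer reuses the full schema taken from the source footer, the same appendRowGroups call with dropColumns = false succeeds. A minimal sketch of that success path, assuming the test's CONF, file1, and newTemp() helpers and the usual imports (this is not part of the original test; sourceFooter, fullSchema, and appendWriter are hypothetical names):

  // Sketch: append file1's row groups into a writer whose schema keeps every column.
  ParquetMetadata sourceFooter = ParquetFileReader.readFooter(CONF, file1, NO_FILTER);
  MessageType fullSchema = sourceFooter.getFileMetaData().getSchema();
  ParquetFileWriter appendWriter = new ParquetFileWriter(CONF, fullSchema, newTemp());
  appendWriter.start();
  try (FSDataInputStream from = file1.getFileSystem(CONF).open(file1)) {
    // same call as in the test above, but no column is dropped, so no exception
    appendWriter.appendRowGroups(from, sourceFooter.getBlocks(), false);
  }
  appendWriter.end(new HashMap<String, String>());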
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class TestThriftToParquetFileWriter, method testWriteStatistics.
@Test
public void testWriteStatistics() throws Exception {
  // create correct stats small numbers
  IntStatistics intStatsSmall = new IntStatistics();
  intStatsSmall.setMinMax(2, 100);
  LongStatistics longStatsSmall = new LongStatistics();
  longStatsSmall.setMinMax(-17L, 287L);
  DoubleStatistics doubleStatsSmall = new DoubleStatistics();
  doubleStatsSmall.setMinMax(-15.55d, 9.63d);
  BinaryStatistics binaryStatsSmall = new BinaryStatistics();
  binaryStatsSmall.setMinMax(Binary.fromString("as"), Binary.fromString("world"));
  BooleanStatistics boolStats = new BooleanStatistics();
  boolStats.setMinMax(false, true);
  // write rows to a file
  Path p = createFile(
      new RequiredPrimitiveFixture(false, (byte) 32, (short) 32, 2, 90L, -15.55d, "as"),
      new RequiredPrimitiveFixture(false, (byte) 100, (short) 100, 100, 287L, -9.0d, "world"),
      new RequiredPrimitiveFixture(true, (byte) 2, (short) 2, 9, -17L, 9.63d, "hello"));
  final Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  final FileSystem fs = p.getFileSystem(configuration);
  FileStatus fileStatus = fs.getFileStatus(p);
  ParquetMetadata footer = ParquetFileReader.readFooter(configuration, p);
  for (BlockMetaData bmd : footer.getBlocks()) {
    for (ColumnChunkMetaData cmd : bmd.getColumns()) {
      switch (cmd.getType()) {
        case INT32:
          TestUtils.assertStatsValuesEqual(intStatsSmall, cmd.getStatistics());
          break;
        case INT64:
          TestUtils.assertStatsValuesEqual(longStatsSmall, cmd.getStatistics());
          break;
        case DOUBLE:
          TestUtils.assertStatsValuesEqual(doubleStatsSmall, cmd.getStatistics());
          break;
        case BOOLEAN:
          TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics());
          break;
        case BINARY:
          // there is also info_string that has no statistics
          if (cmd.getPath().toString().equals("[test_string]"))
            TestUtils.assertStatsValuesEqual(binaryStatsSmall, cmd.getStatistics());
          break;
      }
    }
  }
  // create correct stats large numbers
  IntStatistics intStatsLarge = new IntStatistics();
  intStatsLarge.setMinMax(-Integer.MAX_VALUE, Integer.MAX_VALUE);
  LongStatistics longStatsLarge = new LongStatistics();
  longStatsLarge.setMinMax(-Long.MAX_VALUE, Long.MAX_VALUE);
  DoubleStatistics doubleStatsLarge = new DoubleStatistics();
  doubleStatsLarge.setMinMax(-Double.MAX_VALUE, Double.MAX_VALUE);
  BinaryStatistics binaryStatsLarge = new BinaryStatistics();
  binaryStatsLarge.setMinMax(Binary.fromString("some small string"),
      Binary.fromString("some very large string here to test in this function"));
  // write rows to a file
  Path p_large = createFile(
      new RequiredPrimitiveFixture(false, (byte) 2, (short) 32, -Integer.MAX_VALUE, -Long.MAX_VALUE, -Double.MAX_VALUE, "some small string"),
      new RequiredPrimitiveFixture(false, (byte) 100, (short) 100, Integer.MAX_VALUE, Long.MAX_VALUE, Double.MAX_VALUE, "some very large string here to test in this function"),
      new RequiredPrimitiveFixture(true, (byte) 2, (short) 2, 9, -17L, 9.63d, "hello"));
  // make new configuration and create file with new large stats
  final Configuration configuration_large = new Configuration();
  configuration_large.setBoolean("parquet.strings.signed-min-max.enabled", true);
  final FileSystem fs_large = p_large.getFileSystem(configuration_large);
  FileStatus fileStatus_large = fs_large.getFileStatus(p_large);
  ParquetMetadata footer_large = ParquetFileReader.readFooter(configuration_large, p_large);
  for (BlockMetaData bmd : footer_large.getBlocks()) {
    for (ColumnChunkMetaData cmd : bmd.getColumns()) {
      switch (cmd.getType()) {
        case INT32:
          // testing the correct limits of an int32, there are also byte and short, tested earlier
          if (cmd.getPath().toString().equals("[test_i32]"))
            TestUtils.assertStatsValuesEqual(intStatsLarge, cmd.getStatistics());
          break;
        case INT64:
          TestUtils.assertStatsValuesEqual(longStatsLarge, cmd.getStatistics());
          break;
        case DOUBLE:
          TestUtils.assertStatsValuesEqual(doubleStatsLarge, cmd.getStatistics());
          break;
        case BOOLEAN:
          TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics());
          break;
        case BINARY:
          // there is also info_string that has no statistics
          if (cmd.getPath().toString().equals("[test_string]"))
            TestUtils.assertStatsValuesEqual(binaryStatsLarge, cmd.getStatistics());
          break;
      }
    }
  }
}
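The same footer can also be inspected directly rather than through the test's assertion helpers. A minimal sketch, assuming a ParquetMetadata footer read as above and the usual imports (the loop and variable names here are illustrative, not part of the test):

  // Sketch: dump min/max and null counts for every column chunk in every row group.
  for (BlockMetaData block : footer.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      Statistics<?> stats = column.getStatistics();
      if (stats != null && !stats.isEmpty()) {
        System.out.println(column.getPath() + " (" + column.getType() + "): min="
            + stats.genericGetMin() + ", max=" + stats.genericGetMax()
            + ", nulls=" + stats.getNumNulls());
      }
    }
  }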
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class TestInputOutputFormatWithPadding, method testBasicBehaviorWithPadding.
@Test
public void testBasicBehaviorWithPadding() throws Exception {
  HadoopOutputFile.getBlockFileSystems().add("file");
  File inputFile = temp.newFile();
  FileOutputStream out = new FileOutputStream(inputFile);
  out.write(FILE_CONTENT.getBytes("UTF-8"));
  out.close();
  File tempFolder = temp.newFolder();
  tempFolder.delete();
  Path tempPath = new Path(tempFolder.toURI());
  File outputFolder = temp.newFile();
  outputFolder.delete();
  Configuration conf = new Configuration();
  // May test against multiple hadoop versions
  conf.set("dfs.block.size", "1024");
  conf.set("dfs.blocksize", "1024");
  conf.set("dfs.blockSize", "1024");
  conf.set("fs.local.block.size", "1024");
  // don't use a cached FS with a different block size
  conf.set("fs.file.impl.disable.cache", "true");
  // disable summary metadata, it isn't needed
  conf.set("parquet.enable.summary-metadata", "false");
  conf.set("parquet.example.schema", PARQUET_TYPE.toString());
  {
    Job writeJob = new Job(conf, "write");
    writeJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));
    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(Writer.class);
    // write directly to Parquet without reduce
    writeJob.setNumReduceTasks(0);
    ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
    ParquetOutputFormat.setBlockSize(writeJob, 1024);
    ParquetOutputFormat.setPageSize(writeJob, 512);
    ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
    ParquetOutputFormat.setEnableDictionary(writeJob, true);
    // always pad
    ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023);
    ParquetOutputFormat.setOutputPath(writeJob, tempPath);
    waitForJob(writeJob);
  }
  // make sure padding was added
  File parquetFile = getDataFile(tempFolder);
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(parquetFile.toString()), ParquetMetadataConverter.NO_FILTER);
  for (BlockMetaData block : footer.getBlocks()) {
    Assert.assertTrue("Block should start at a multiple of the block size", block.getStartingPos() % 1024 == 0);
  }
  {
    Job readJob = new Job(conf, "read");
    readJob.setInputFormatClass(NoSplits.class);
    ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
    TextInputFormat.addInputPath(readJob, tempPath);
    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapperClass(Reader.class);
    // write directly to text without reduce
    readJob.setNumReduceTasks(0);
    TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));
    waitForJob(readJob);
  }
  File dataFile = getDataFile(outputFolder);
  Assert.assertNotNull("Should find a data file", dataFile);
  StringBuilder contentBuilder = new StringBuilder();
  for (String line : Files.readAllLines(dataFile, UTF_8)) {
    contentBuilder.append(line);
  }
  String reconstructed = contentBuilder.toString();
  Assert.assertEquals("Should match written file content", FILE_CONTENT, reconstructed);
  HadoopOutputFile.getBlockFileSystems().remove("file");
}
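Because setMaxPaddingSize(writeJob, 1023) allows up to blockSize - 1 bytes of padding, every row group in the output starts on a 1024-byte block boundary, which is exactly what the footer check above asserts. A minimal sketch of the same alignment check written against whatever block size the file system actually reports (this helper and its name are hypothetical, not part of the test):

  // Sketch: verify that every row group starts on a file-system block boundary.
  private static void assertRowGroupsAligned(Configuration conf, Path file) throws IOException {
    long blockSize = file.getFileSystem(conf).getFileStatus(file).getBlockSize();
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {
      Assert.assertTrue("Row group should start at a block boundary",
          block.getStartingPos() % blockSize == 0);
    }
  }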
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class TestColumnChunkPageWriteStore, method test.
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(
      Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  {
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
    writer.start();
    writer.startBlock(rowCount);
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
      store.flushToFileWriter(writer);
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    assertEquals(statistics.toString(), page.getStatistics().toString());
    reader.close();
  }
}
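Beyond the page-level assertions, the footer written by this test also carries row-group and column-chunk metadata. A minimal sketch of inspecting it, reusing conf, file, and NO_FILTER from the block above (the printed fields are illustrative; "written" and "firstRowGroup" are hypothetical names, not part of the test):

  // Sketch: read the footer back and look at row-group level metadata.
  ParquetMetadata written = ParquetFileReader.readFooter(conf, file, NO_FILTER);
  BlockMetaData firstRowGroup = written.getBlocks().get(0);
  System.out.println("rows=" + firstRowGroup.getRowCount()
      + ", totalBytes=" + firstRowGroup.getTotalByteSize());
  for (ColumnChunkMetaData chunk : firstRowGroup.getColumns()) {
    System.out.println(chunk.getPath() + " codec=" + chunk.getCodec()
        + " encodings=" + chunk.getEncodings());
  }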
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class TestParquetMetadataConverter, method testNullFieldMetadataDebugLogging.
@Test
public void testNullFieldMetadataDebugLogging() {
  MessageType schema = parseMessageType("message test { optional binary some_null_field; }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
      new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  blockMetaData.addColumn(createColumnChunkMetaData());
  blockMetaDataList.add(blockMetaData);
  ParquetMetadata metadata = new ParquetMetadata(fileMetaData, blockMetaDataList);
  ParquetMetadata.toJSON(metadata);
}
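The test only checks that toJSON does not throw when fields such as createdBy are null. For debugging, the same metadata object can be printed in a readable form; a minimal sketch, assuming the metadata built above and that ParquetMetadata.toPrettyJSON behaves like toJSON with indentation (this call is not exercised by the test itself):

  // Sketch: pretty-print the footer built in this test for inspection.
  String json = ParquetMetadata.toPrettyJSON(metadata);
  System.out.println(json);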