
Example 21 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

the class TestParquetWriterAppendBlocks method testFailDroppingColumns.

@Test
public void testFailDroppingColumns() throws IOException {
    MessageType droppedColumnSchema = Types.buildMessage().required(BINARY).as(UTF8).named("string").named("AppendTest");
    final ParquetMetadata footer = ParquetFileReader.readFooter(CONF, file1, NO_FILTER);
    final FSDataInputStream incoming = file1.getFileSystem(CONF).open(file1);
    Path droppedColumnFile = newTemp();
    final ParquetFileWriter writer = new ParquetFileWriter(CONF, droppedColumnSchema, droppedColumnFile);
    writer.start();
    TestUtils.assertThrows("Should complain that id column is dropped", IllegalArgumentException.class, new Callable<Void>() {

        @Override
        public Void call() throws Exception {
            writer.appendRowGroups(incoming, footer.getBlocks(), false);
            return null;
        }
    });
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) MessageType(org.apache.parquet.schema.MessageType) IOException(java.io.IOException) Test(org.junit.Test)
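
For contrast, a minimal sketch (not part of the test above) of the append path that does succeed: when the writer is created with the source file's own schema, taken from the footer, whole row groups can be copied without re-encoding. CONF, file1 and newTemp() are assumed to be set up as in the test.

ParquetMetadata sourceFooter = ParquetFileReader.readFooter(CONF, file1, NO_FILTER);
MessageType sameSchema = sourceFooter.getFileMetaData().getSchema();
ParquetFileWriter appendWriter = new ParquetFileWriter(CONF, sameSchema, newTemp());
appendWriter.start();
// copies the row groups as-is because the schemas match
appendWriter.appendFile(CONF, file1);
appendWriter.end(new HashMap<String, String>());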

Example 22 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

the class TestThriftToParquetFileWriter method testWriteStatistics.

@Test
public void testWriteStatistics() throws Exception {
    // create correct stats small numbers
    IntStatistics intStatsSmall = new IntStatistics();
    intStatsSmall.setMinMax(2, 100);
    LongStatistics longStatsSmall = new LongStatistics();
    longStatsSmall.setMinMax(-17L, 287L);
    DoubleStatistics doubleStatsSmall = new DoubleStatistics();
    doubleStatsSmall.setMinMax(-15.55d, 9.63d);
    BinaryStatistics binaryStatsSmall = new BinaryStatistics();
    binaryStatsSmall.setMinMax(Binary.fromString("as"), Binary.fromString("world"));
    BooleanStatistics boolStats = new BooleanStatistics();
    boolStats.setMinMax(false, true);
    // write rows to a file
    Path p = createFile(new RequiredPrimitiveFixture(false, (byte) 32, (short) 32, 2, 90L, -15.55d, "as"), new RequiredPrimitiveFixture(false, (byte) 100, (short) 100, 100, 287L, -9.0d, "world"), new RequiredPrimitiveFixture(true, (byte) 2, (short) 2, 9, -17L, 9.63d, "hello"));
    final Configuration configuration = new Configuration();
    configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
    final FileSystem fs = p.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(p);
    ParquetMetadata footer = ParquetFileReader.readFooter(configuration, p);
    for (BlockMetaData bmd : footer.getBlocks()) {
        for (ColumnChunkMetaData cmd : bmd.getColumns()) {
            switch(cmd.getType()) {
                case INT32:
                    TestUtils.assertStatsValuesEqual(intStatsSmall, cmd.getStatistics());
                    break;
                case INT64:
                    TestUtils.assertStatsValuesEqual(longStatsSmall, cmd.getStatistics());
                    break;
                case DOUBLE:
                    TestUtils.assertStatsValuesEqual(doubleStatsSmall, cmd.getStatistics());
                    break;
                case BOOLEAN:
                    TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics());
                    break;
                case BINARY:
                    // there is also info_string that has no statistics
                    if (cmd.getPath().toString() == "[test_string]")
                        TestUtils.assertStatsValuesEqual(binaryStatsSmall, cmd.getStatistics());
                    break;
            }
        }
    }
    // create correct stats large numbers
    IntStatistics intStatsLarge = new IntStatistics();
    intStatsLarge.setMinMax(-Integer.MAX_VALUE, Integer.MAX_VALUE);
    LongStatistics longStatsLarge = new LongStatistics();
    longStatsLarge.setMinMax(-Long.MAX_VALUE, Long.MAX_VALUE);
    DoubleStatistics doubleStatsLarge = new DoubleStatistics();
    doubleStatsLarge.setMinMax(-Double.MAX_VALUE, Double.MAX_VALUE);
    BinaryStatistics binaryStatsLarge = new BinaryStatistics();
    binaryStatsLarge.setMinMax(Binary.fromString("some small string"), Binary.fromString("some very large string here to test in this function"));
    // write rows to a file
    Path p_large = createFile(new RequiredPrimitiveFixture(false, (byte) 2, (short) 32, -Integer.MAX_VALUE, -Long.MAX_VALUE, -Double.MAX_VALUE, "some small string"), new RequiredPrimitiveFixture(false, (byte) 100, (short) 100, Integer.MAX_VALUE, Long.MAX_VALUE, Double.MAX_VALUE, "some very large string here to test in this function"), new RequiredPrimitiveFixture(true, (byte) 2, (short) 2, 9, -17L, 9.63d, "hello"));
    // make new configuration and create file with new large stats
    final Configuration configuration_large = new Configuration();
    configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
    final FileSystem fs_large = p_large.getFileSystem(configuration_large);
    FileStatus fileStatus_large = fs_large.getFileStatus(p_large);
    ParquetMetadata footer_large = ParquetFileReader.readFooter(configuration_large, p_large);
    for (BlockMetaData bmd : footer_large.getBlocks()) {
        for (ColumnChunkMetaData cmd : bmd.getColumns()) {
            switch(cmd.getType()) {
                case INT32:
                    // testing the correct limits of an int32, there are also byte and short, tested earlier
                    if (cmd.getPath().toString() == "[test_i32]")
                        TestUtils.assertStatsValuesEqual(intStatsLarge, cmd.getStatistics());
                    break;
                case INT64:
                    TestUtils.assertStatsValuesEqual(longStatsLarge, cmd.getStatistics());
                    break;
                case DOUBLE:
                    TestUtils.assertStatsValuesEqual(doubleStatsLarge, cmd.getStatistics());
                    break;
                case BOOLEAN:
                    TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics());
                    break;
                case BINARY:
                    // there is also info_string that has no statistics
                    if (cmd.getPath().toString() == "[test_string]")
                        TestUtils.assertStatsValuesEqual(binaryStatsLarge, cmd.getStatistics());
                    break;
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) RequiredPrimitiveFixture(org.apache.parquet.thrift.test.RequiredPrimitiveFixture) FileSystem(org.apache.hadoop.fs.FileSystem) Test(org.junit.Test)
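
As a smaller, standalone sketch of the same idea: per-column statistics can be pulled straight from the footer without knowing the column types in advance. This assumes "p" and "configuration" are set up as in the test above, and that Statistics (org.apache.parquet.column.statistics.Statistics) is imported.

ParquetMetadata statsFooter = ParquetFileReader.readFooter(configuration, p);
for (BlockMetaData block : statsFooter.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
        Statistics<?> stats = column.getStatistics();
        // genericGetMin/genericGetMax return the min/max as the column's Java type
        System.out.println(column.getPath() + ": min=" + stats.genericGetMin()
            + ", max=" + stats.genericGetMax() + ", nulls=" + stats.getNumNulls());
    }
}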

Example 23 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

the class TestInputOutputFormatWithPadding method testBasicBehaviorWithPadding.

@Test
public void testBasicBehaviorWithPadding() throws Exception {
    HadoopOutputFile.getBlockFileSystems().add("file");
    File inputFile = temp.newFile();
    FileOutputStream out = new FileOutputStream(inputFile);
    out.write(FILE_CONTENT.getBytes("UTF-8"));
    out.close();
    File tempFolder = temp.newFolder();
    tempFolder.delete();
    Path tempPath = new Path(tempFolder.toURI());
    File outputFolder = temp.newFile();
    outputFolder.delete();
    Configuration conf = new Configuration();
    // May test against multiple hadoop versions
    conf.set("dfs.block.size", "1024");
    conf.set("dfs.blocksize", "1024");
    conf.set("dfs.blockSize", "1024");
    conf.set("fs.local.block.size", "1024");
    // don't use a cached FS with a different block size
    conf.set("fs.file.impl.disable.cache", "true");
    // disable summary metadata, it isn't needed
    conf.set("parquet.enable.summary-metadata", "false");
    conf.set("parquet.example.schema", PARQUET_TYPE.toString());
    {
        Job writeJob = new Job(conf, "write");
        writeJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));
        writeJob.setOutputFormatClass(ParquetOutputFormat.class);
        writeJob.setMapperClass(Writer.class);
        // write directly to Parquet without reduce
        writeJob.setNumReduceTasks(0);
        ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
        ParquetOutputFormat.setBlockSize(writeJob, 1024);
        ParquetOutputFormat.setPageSize(writeJob, 512);
        ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
        ParquetOutputFormat.setEnableDictionary(writeJob, true);
        // always pad
        ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023);
        ParquetOutputFormat.setOutputPath(writeJob, tempPath);
        waitForJob(writeJob);
    }
    // make sure padding was added
    File parquetFile = getDataFile(tempFolder);
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(parquetFile.toString()), ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {
        Assert.assertTrue("Block should start at a multiple of the block size", block.getStartingPos() % 1024 == 0);
    }
    {
        Job readJob = new Job(conf, "read");
        readJob.setInputFormatClass(NoSplits.class);
        ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
        TextInputFormat.addInputPath(readJob, tempPath);
        readJob.setOutputFormatClass(TextOutputFormat.class);
        readJob.setMapperClass(Reader.class);
        // write directly to text without reduce
        readJob.setNumReduceTasks(0);
        TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));
        waitForJob(readJob);
    }
    File dataFile = getDataFile(outputFolder);
    Assert.assertNotNull("Should find a data file", dataFile);
    StringBuilder contentBuilder = new StringBuilder();
    for (String line : Files.readAllLines(dataFile, UTF_8)) {
        contentBuilder.append(line);
    }
    String reconstructed = contentBuilder.toString();
    Assert.assertEquals("Should match written file content", FILE_CONTENT, reconstructed);
    HadoopOutputFile.getBlockFileSystems().remove("file");
}
Also used : Path(org.apache.hadoop.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) TextOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) FileOutputStream(java.io.FileOutputStream) Job(org.apache.hadoop.mapreduce.Job) HadoopOutputFile(org.apache.parquet.hadoop.util.HadoopOutputFile) File(java.io.File) OutputFile(org.apache.parquet.io.OutputFile) Test(org.junit.Test)
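
A hedged sketch of the padding check on its own: each row group's starting offset and size are available from the footer, so alignment to the configured block size can be verified outside a MapReduce job. "conf" and "parquetFile" are assumed to be set up as in the test above.

ParquetMetadata paddedFooter = ParquetFileReader.readFooter(conf, new Path(parquetFile.toString()), ParquetMetadataConverter.NO_FILTER);
for (BlockMetaData block : paddedFooter.getBlocks()) {
    // a padded writer aligns every row group to the file system block size
    System.out.println("row group at offset " + block.getStartingPos()
        + " (" + block.getRowCount() + " rows, "
        + block.getTotalByteSize() + " uncompressed bytes)");
}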

Example 24 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

the class TestColumnChunkPageWriteStore method test.

@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    {
        ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
        writer.start();
        writer.startBlock(rowCount);
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
        PageReadStore rowGroup = reader.readNextRowGroup();
        PageReader pageReader = rowGroup.getPageReader(col);
        DataPageV2 page = (DataPageV2) pageReader.readPage();
        assertEquals(rowCount, page.getRowCount());
        assertEquals(nullCount, page.getNullCount());
        assertEquals(valueCount, page.getValueCount());
        assertEquals(d, intValue(page.getDefinitionLevels()));
        assertEquals(r, intValue(page.getRepetitionLevels()));
        assertEquals(dataEncoding, page.getDataEncoding());
        assertEquals(v, intValue(page.getData()));
        assertEquals(statistics.toString(), page.getStatistics().toString());
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BytesInput(org.apache.parquet.bytes.BytesInput) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Encoding(org.apache.parquet.column.Encoding) DataPageV2(org.apache.parquet.column.page.DataPageV2) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)
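
The same read path can be written more compactly with the static open(...) helper and try-with-resources, which closes the underlying stream automatically. A minimal sketch, assuming "conf" and "file" as in the test above.

try (ParquetFileReader fileReader = ParquetFileReader.open(conf, file)) {
    PageReadStore rowGroup = fileReader.readNextRowGroup();
    while (rowGroup != null) {
        System.out.println("row group with " + rowGroup.getRowCount() + " rows");
        rowGroup = fileReader.readNextRowGroup();
    }
}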

Example 25 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

the class TestParquetMetadataConverter method testNullFieldMetadataDebugLogging.

@Test
public void testNullFieldMetadataDebugLogging() {
    MessageType schema = parseMessageType("message test { optional binary some_null_field; }");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
    BlockMetaData blockMetaData = new BlockMetaData();
    blockMetaData.addColumn(createColumnChunkMetaData());
    blockMetaDataList.add(blockMetaData);
    ParquetMetadata metadata = new ParquetMetadata(fileMetaData, blockMetaDataList);
    ParquetMetadata.toJSON(metadata);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) FileMetaData(org.apache.parquet.format.FileMetaData) Test(org.junit.Test)
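
Beyond the null-field logging case, the same JSON helpers are handy for dumping a real footer while debugging. A minimal sketch, assuming "conf" and "file" point at an existing Parquet file.

ParquetMetadata realFooter = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
// toPrettyJSON renders the schema, row groups and column chunks in readable form
System.out.println(ParquetMetadata.toPrettyJSON(realFooter));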

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76 usages
Path (org.apache.hadoop.fs.Path): 39 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27 usages
Configuration (org.apache.hadoop.conf.Configuration): 21 usages
MessageType (org.apache.parquet.schema.MessageType): 21 usages
ArrayList (java.util.ArrayList): 19 usages
IOException (java.io.IOException): 18 usages
Test (org.junit.Test): 17 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 16 usages
Map (java.util.Map): 11 usages
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11 usages
File (java.io.File): 10 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 10 usages
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9 usages
HashMap (java.util.HashMap): 8 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7 usages
List (java.util.List): 6 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6 usages
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6 usages