Search in sources:

Example 1 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.

Class AsyncPageReader, method decompress.

private DrillBuf decompress(PageHeader pageHeader, DrillBuf compressedData) {
    DrillBuf pageDataBuf = null;
    Stopwatch timer = Stopwatch.createUnstarted();
    long timeToRead;
    int compressedSize = pageHeader.getCompressed_page_size();
    int uncompressedSize = pageHeader.getUncompressed_page_size();
    pageDataBuf = allocateTemporaryBuffer(uncompressedSize);
    try {
        timer.start();
        CompressionCodecName codecName = parentColumnReader.columnChunkMetaData.getCodec();
        ByteBuffer input = compressedData.nioBuffer(0, compressedSize);
        ByteBuffer output = pageDataBuf.nioBuffer(0, uncompressedSize);
        DecompressionHelper decompressionHelper = new DecompressionHelper(codecName);
        decompressionHelper.decompress(input, compressedSize, output, uncompressedSize);
        pageDataBuf.writerIndex(uncompressedSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        this.updateStats(pageHeader, "Decompress", 0, timeToRead, compressedSize, uncompressedSize);
    } catch (IOException e) {
        handleAndThrowException(e, "Error decompressing data.");
    }
    return pageDataBuf;
}
Also used: CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName), Stopwatch (com.google.common.base.Stopwatch), IOException (java.io.IOException), ByteBuffer (java.nio.ByteBuffer), DrillBuf (io.netty.buffer.DrillBuf)
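The DecompressionHelper above dispatches on the CompressionCodecName pulled from the column chunk metadata. Below is a minimal, self-contained sketch of that dispatch pattern; SimpleDecompressionHelper is a hypothetical stand-in for Drill's helper, and only UNCOMPRESSED and GZIP are handled here (real codecs such as SNAPPY need their own libraries).

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.zip.GZIPInputStream;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

// Hypothetical stand-in for Drill's DecompressionHelper: same shape of API,
// but only a sketch covering two codecs.
class SimpleDecompressionHelper {
    private final CompressionCodecName codecName;

    SimpleDecompressionHelper(CompressionCodecName codecName) {
        this.codecName = codecName;
    }

    void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int uncompressedSize) throws IOException {
        if (output.remaining() < uncompressedSize) {
            throw new IOException("Output buffer smaller than uncompressed page size");
        }
        switch (codecName) {
            case UNCOMPRESSED:
                // No codec: copy exactly compressedSize bytes through
                // (assumes input's position starts at 0, as in the caller above).
                input.limit(compressedSize);
                output.put(input);
                break;
            case GZIP: {
                // Inflate through a stream; fine for a sketch, not tuned for speed.
                byte[] compressed = new byte[compressedSize];
                input.get(compressed);
                try (GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(compressed))) {
                    byte[] buf = new byte[4096];
                    int n;
                    while ((n = in.read(buf)) != -1) {
                        output.put(buf, 0, n);
                    }
                }
                break;
            }
            default:
                throw new IOException("Codec not handled in this sketch: " + codecName);
        }
    }
}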

Example 2 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.

Class ShowPagesCommand, method run.

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
    MessageType schema = reader.getFileMetaData().getSchema();
    Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
    if (this.columns == null || this.columns.isEmpty()) {
        for (ColumnDescriptor descriptor : schema.getColumns()) {
            columns.put(descriptor, primitive(schema, descriptor.getPath()));
        }
    } else {
        for (String column : this.columns) {
            columns.put(descriptor(column, schema), primitive(column, schema));
        }
    }
    CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
    // accumulate formatted lines to print by column
    Map<String, List<String>> formatted = Maps.newLinkedHashMap();
    PageFormatter formatter = new PageFormatter();
    PageReadStore pageStore;
    int rowGroupNum = 0;
    while ((pageStore = reader.readNextRowGroup()) != null) {
        for (ColumnDescriptor descriptor : columns.keySet()) {
            List<String> lines = formatted.get(columnName(descriptor));
            if (lines == null) {
                lines = Lists.newArrayList();
                formatted.put(columnName(descriptor), lines);
            }
            formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
            PageReader pages = pageStore.getPageReader(descriptor);
            DictionaryPage dict = pages.readDictionaryPage();
            if (dict != null) {
                lines.add(formatter.format(dict));
            }
            DataPage page;
            while ((page = pages.readPage()) != null) {
                lines.add(formatter.format(page));
            }
        }
        rowGroupNum += 1;
    }
    // TODO: Show total column size and overall size per value in the column summary line
    for (String columnName : formatted.keySet()) {
        console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-')));
        console.info(formatter.getHeader());
        for (String line : formatted.get(columnName)) {
            console.info(line);
        }
        console.info("");
    }
    return 0;
}
Also used: DataPage (org.apache.parquet.column.page.DataPage), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), PageReader (org.apache.parquet.column.page.PageReader), Util.minMaxAsString (org.apache.parquet.cli.Util.minMaxAsString), Util.encodingAsString (org.apache.parquet.cli.Util.encodingAsString), CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName), PageReadStore (org.apache.parquet.column.page.PageReadStore), PrimitiveType (org.apache.parquet.schema.PrimitiveType), List (java.util.List), MessageType (org.apache.parquet.schema.MessageType), DictionaryPage (org.apache.parquet.column.page.DictionaryPage)
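Note that the command above takes the codec from the first column chunk of the first row group and reuses it for every page. For reference, here is a minimal sketch, assuming a local Parquet file path in args[0], that walks the footer metadata with the same ParquetFileReader entry point and prints each column chunk's codec per row group:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

public class PrintColumnCodecs {
    public static void main(String[] args) throws Exception {
        // Open the file and iterate the row-group (block) metadata from the footer.
        try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), new Path(args[0]))) {
            int rowGroup = 0;
            for (BlockMetaData block : reader.getRowGroups()) {
                for (ColumnChunkMetaData chunk : block.getColumns()) {
                    System.out.printf("row group %d, column %s: %s%n", rowGroup, chunk.getPath(), chunk.getCodec());
                }
                rowGroup++;
            }
        }
    }
}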

Example 3 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.

Class FileEncodingsIT, method getParameters.

@Parameterized.Parameters
public static Collection<Object[]> getParameters() {
    List<PrimitiveTypeName> types = Arrays.asList(PrimitiveTypeName.BOOLEAN, PrimitiveTypeName.INT32, PrimitiveTypeName.INT64, PrimitiveTypeName.INT96, PrimitiveTypeName.FLOAT, PrimitiveTypeName.DOUBLE, PrimitiveTypeName.BINARY, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
    List<CompressionCodecName> codecs;
    String codecList = System.getenv("TEST_CODECS");
    if (codecList != null) {
        codecs = new ArrayList<CompressionCodecName>();
        for (String codec : codecList.split(",")) {
            codecs.add(CompressionCodecName.valueOf(codec.toUpperCase(Locale.ENGLISH)));
        }
    } else {
        // otherwise test just UNCOMPRESSED
        codecs = Arrays.asList(CompressionCodecName.UNCOMPRESSED);
    }
    System.err.println("Testing codecs: " + codecs);
    List<Object[]> parameters = new ArrayList<Object[]>();
    for (PrimitiveTypeName type : types) {
        for (CompressionCodecName codec : codecs) {
            parameters.add(new Object[] { type, codec });
        }
    }
    return parameters;
}
Also used: CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName), PrimitiveTypeName (org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)
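Because enum valueOf is case-sensitive, the test upper-cases each TEST_CODECS entry before the lookup. A tiny sketch of that parsing, using a made-up TEST_CODECS value rather than the project's actual CI setting:

import java.util.Locale;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecParsingDemo {
    public static void main(String[] args) {
        // Hypothetical TEST_CODECS value; the enum constants are upper case,
        // so the lookup only succeeds after toUpperCase.
        for (String codec : "snappy,gzip,uncompressed".split(",")) {
            CompressionCodecName name = CompressionCodecName.valueOf(codec.toUpperCase(Locale.ENGLISH));
            System.out.println(codec + " -> " + name);
        }
    }
}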

Example 4 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.

Class TestInputFormat, method createParquetFile.

private void createParquetFile(File file) throws IOException {
    Path path = new Path(file.toURI());
    Configuration configuration = new Configuration();
    MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
    String[] columnPath = { "a", "b" };
    ColumnDescriptor c1 = schema.getColumnDescription(columnPath);
    byte[] bytes1 = { 0, 1, 2, 3 };
    byte[] bytes2 = { 2, 3, 4, 5 };
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    BinaryStatistics stats = new BinaryStatistics();
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    w.startBlock(3);
    w.startColumn(c1, 5, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(c1, 7, codec);
    w.writeDataPage(7, 4, BytesInput.from(bytes2), stats, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
}
Also used: Path (org.apache.hadoop.fs.Path), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), Configuration (org.apache.hadoop.conf.Configuration), CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics), MessageType (org.apache.parquet.schema.MessageType)
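To see the metadata this writer records, here is a minimal companion sketch, assuming the file written above is passed in args[0], that reads the footer back and prints the row count and codec of each block (readFooter is the classic entry point, deprecated in newer parquet-mr releases):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class VerifyWrittenFile {
    public static void main(String[] args) throws Exception {
        // Read just the footer; for the file above this should show two blocks
        // (3 and 4 rows) with codec UNCOMPRESSED on the a.b column chunk.
        ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), new Path(args[0]));
        for (BlockMetaData block : footer.getBlocks()) {
            System.out.println("rows=" + block.getRowCount() + " codec=" + block.getColumns().get(0).getCodec());
        }
    }
}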

Aggregations

CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 25
DrillBuf (io.netty.buffer.DrillBuf): 6
ByteBuffer (java.nio.ByteBuffer): 6
Configuration (org.apache.hadoop.conf.Configuration): 6
Path (org.apache.hadoop.fs.Path): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 5
ConnectorSession (com.facebook.presto.spi.ConnectorSession): 4
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 4
FileSystem (org.apache.hadoop.fs.FileSystem): 4
MessageType (org.apache.parquet.schema.MessageType): 4
TestingConnectorSession (com.facebook.presto.testing.TestingConnectorSession): 3
IOException (java.io.IOException): 3
JobConf (org.apache.hadoop.mapred.JobConf): 3
WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion): 3
ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter): 3
HiveClientConfig (com.facebook.presto.hive.HiveClientConfig): 2
Stopwatch (com.google.common.base.Stopwatch): 2
HashMap (java.util.HashMap): 2
HashSet (java.util.HashSet): 2
List (java.util.List): 2