
Example 21 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.

Class ParquetFileWriterFactory, method createFileWriter:

@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf conf, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation) {
    if (!isParquetOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    if (!MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxPageSize(getParquetWriterPageSize(session))
            .setMaxBlockSize(getParquetWriterBlockSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompression(conf);
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings()
            .splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream()
            .map(hiveType -> hiveType.getType(typeManager))
            .collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        return Optional.of(new ParquetFileWriter(fileSystem.create(path), rollbackAction, fileColumnNames, fileColumnTypes, parquetWriterOptions, fileInputColumnIndexes, compressionCodecName));
    } catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) Inject(com.google.inject.Inject) HiveSessionProperties.getParquetWriterPageSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterPageSize) Callable(java.util.concurrent.Callable) META_TABLE_COLUMNS(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS) PrestoException(com.facebook.presto.spi.PrestoException) HiveSessionProperties.isParquetOptimizedWriterEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetOptimizedWriterEnabled) NodeVersion(com.facebook.presto.hive.NodeVersion) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getParquetWriterBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) Splitter(com.google.common.base.Splitter) Type(com.facebook.presto.common.type.Type) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) META_TABLE_COLUMN_TYPES(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES) Properties(java.util.Properties) ParquetOutputFormat(org.apache.parquet.hadoop.ParquetOutputFormat) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) HiveFileWriterFactory(com.facebook.presto.hive.HiveFileWriterFactory) ParquetWriterOptions(com.facebook.presto.parquet.writer.ParquetWriterOptions) HiveFileWriter(com.facebook.presto.hive.HiveFileWriter) IOException(java.io.IOException) ConnectorSession(com.facebook.presto.spi.ConnectorSession) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HIVE_WRITER_OPEN_ERROR(com.facebook.presto.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) HiveType.toHiveTypes(com.facebook.presto.hive.HiveType.toHiveTypes)
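
The helper getCompression(conf) is not shown in this snippet. As a minimal sketch (not taken from the Presto source), assuming the codec name travels in the JobConf under ParquetOutputFormat.COMPRESSION and defaults to GZIP, such a helper could look like this:

import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

final class CompressionLookup {
    private CompressionLookup() {
    }

    // Hypothetical stand-in for the getCompression(conf) call above: read the codec
    // name from the job configuration and fall back to GZIP when it is absent (the
    // key and the default are assumptions, not taken from ParquetFileWriterFactory).
    static CompressionCodecName getCompression(JobConf conf) {
        String name = conf.get(ParquetOutputFormat.COMPRESSION, CompressionCodecName.GZIP.name());
        return CompressionCodecName.fromConf(name);
    }
}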

Example 22 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.

Class TestFileGenerator, method generateParquetFile:

public static void generateParquetFile(String filename, ParquetTestProperties props) throws Exception {
    int currentBooleanByte = 0;
    WrapAroundCounter booleanBitCounter = new WrapAroundCounter(7);
    Configuration configuration = new Configuration();
    configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "file:///");
    // "message m { required int32 integer; required int64 integer64; required boolean b; required float f; required double d;}"
    FileSystem fs = FileSystem.get(configuration);
    Path path = new Path(filename);
    if (fs.exists(path)) {
        fs.delete(path, false);
    }
    String messageSchema = "message m {";
    for (FieldInfo fieldInfo : props.fields.values()) {
        messageSchema += " required " + fieldInfo.parquetType + " " + fieldInfo.name + ";";
    }
    // remove the last semicolon, java really needs a join method for strings...
    // TODO - nvm apparently it requires a semicolon after every field decl, might want to file a bug
    // messageSchema = messageSchema.substring(schemaType, messageSchema.length() - 1);
    messageSchema += "}";
    MessageType schema = MessageTypeParser.parseMessageType(messageSchema);
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    HashMap<String, Integer> columnValuesWritten = new HashMap<>();
    int valsWritten;
    for (int k = 0; k < props.numberRowGroups; k++) {
        w.startBlock(props.recordsPerRowGroup);
        currentBooleanByte = 0;
        booleanBitCounter.reset();
        for (FieldInfo fieldInfo : props.fields.values()) {
            if (!columnValuesWritten.containsKey(fieldInfo.name)) {
                columnValuesWritten.put(fieldInfo.name, 0);
                valsWritten = 0;
            } else {
                valsWritten = columnValuesWritten.get(fieldInfo.name);
            }
            String[] path1 = { fieldInfo.name };
            ColumnDescriptor c1 = schema.getColumnDescription(path1);
            w.startColumn(c1, props.recordsPerRowGroup, codec);
            final int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
            // 1 MB
            final int PAGE_SIZE = 1024 * 1024;
            byte[] bytes;
            RunLengthBitPackingHybridValuesWriter defLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            RunLengthBitPackingHybridValuesWriter repLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            // for variable length binary fields
            int bytesNeededToEncodeLength = 4;
            if (fieldInfo.bitLength > 0) {
                bytes = new byte[(int) Math.ceil(valsPerPage * fieldInfo.bitLength / 8.0)];
            } else {
                // the 3 * bytesNeededToEncodeLength (12 bytes) at the end accounts for the
                // 4-byte length stored with each of the three values
                int totalValLength = ((byte[]) fieldInfo.values[0]).length + ((byte[]) fieldInfo.values[1]).length + ((byte[]) fieldInfo.values[2]).length + 3 * bytesNeededToEncodeLength;
                // used for the case where there is a number of values in this row group that is not divisible by 3
                int leftOverBytes = 0;
                if (valsPerPage % 3 > 0) {
                    leftOverBytes += ((byte[]) fieldInfo.values[1]).length + bytesNeededToEncodeLength;
                }
                if (valsPerPage % 3 > 1) {
                    leftOverBytes += ((byte[]) fieldInfo.values[2]).length + bytesNeededToEncodeLength;
                }
                bytes = new byte[valsPerPage / 3 * totalValLength + leftOverBytes];
            }
            int bytesPerPage = (int) (valsPerPage * (fieldInfo.bitLength / 8.0));
            int bytesWritten = 0;
            for (int z = 0; z < fieldInfo.numberOfPages; z++, bytesWritten = 0) {
                for (int i = 0; i < valsPerPage; i++) {
                    repLevels.writeInteger(0);
                    defLevels.writeInteger(1);
                    if (fieldInfo.values[0] instanceof Boolean) {
                        bytes[currentBooleanByte] |= bitFields[booleanBitCounter.val] & ((boolean) fieldInfo.values[valsWritten % 3] ? allBitsTrue : allBitsFalse);
                        booleanBitCounter.increment();
                        if (booleanBitCounter.val == 0) {
                            currentBooleanByte++;
                        }
                        valsWritten++;
                        if (currentBooleanByte > bytesPerPage) {
                            break;
                        }
                    } else {
                        if (fieldInfo.values[valsWritten % 3] instanceof byte[]) {
                            System.arraycopy(ByteArrayUtil.toByta(((byte[]) fieldInfo.values[valsWritten % 3]).length), 0, bytes, bytesWritten, bytesNeededToEncodeLength);
                            System.arraycopy(fieldInfo.values[valsWritten % 3], 0, bytes, bytesWritten + bytesNeededToEncodeLength, ((byte[]) fieldInfo.values[valsWritten % 3]).length);
                            bytesWritten += ((byte[]) fieldInfo.values[valsWritten % 3]).length + bytesNeededToEncodeLength;
                        } else {
                            System.arraycopy(ByteArrayUtil.toByta(fieldInfo.values[valsWritten % 3]), 0, bytes, i * (fieldInfo.bitLength / 8), fieldInfo.bitLength / 8);
                        }
                        valsWritten++;
                    }
                }
                byte[] fullPage = new byte[2 * 4 * valsPerPage + bytes.length];
                byte[] repLevelBytes = repLevels.getBytes().toByteArray();
                byte[] defLevelBytes = defLevels.getBytes().toByteArray();
                System.arraycopy(bytes, 0, fullPage, 0, bytes.length);
                System.arraycopy(repLevelBytes, 0, fullPage, bytes.length, repLevelBytes.length);
                System.arraycopy(defLevelBytes, 0, fullPage, bytes.length + repLevelBytes.length, defLevelBytes.length);
                w.writeDataPage((props.recordsPerRowGroup / fieldInfo.numberOfPages), fullPage.length, BytesInput.from(fullPage), RLE, RLE, PLAIN);
                currentBooleanByte = 0;
            }
            w.endColumn();
            columnValuesWritten.remove(fieldInfo.name);
            columnValuesWritten.put(fieldInfo.name, valsWritten);
        }
        w.endBlock();
    }
    w.end(new HashMap<String, String>());
    logger.debug("Finished generating parquet file {}", path.getName());
}
Also used : Path(org.apache.hadoop.fs.Path) DirectByteBufferAllocator(org.apache.parquet.bytes.DirectByteBufferAllocator) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetFileWriter(org.apache.parquet.hadoop.ParquetFileWriter) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) RunLengthBitPackingHybridValuesWriter(org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType)
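
The test above interleaves the ParquetFileWriter lifecycle with a lot of value bookkeeping. As a hedged sketch, the bare call sequence it relies on is roughly the following (single column, one row group, one pre-assembled page; the constructor and writeDataPage signatures follow the Drill test and may differ across parquet-mr versions):

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import static org.apache.parquet.column.Encoding.PLAIN;
import static org.apache.parquet.column.Encoding.RLE;

class MinimalParquetFileWrite {
    // pageBytes must already hold the encoded values plus repetition/definition levels,
    // exactly as assembled in generateParquetFile above.
    static void writeSingleColumnRowGroup(Path path, byte[] pageBytes, int recordCount) throws Exception {
        MessageType schema = MessageTypeParser.parseMessageType("message m { required int32 integer; }");
        ParquetFileWriter writer = new ParquetFileWriter(new Configuration(), schema, path);
        // file header (magic bytes)
        writer.start();
        // one row group
        writer.startBlock(recordCount);
        ColumnDescriptor column = schema.getColumnDescription(new String[] { "integer" });
        writer.startColumn(column, recordCount, CompressionCodecName.UNCOMPRESSED);
        writer.writeDataPage(recordCount, pageBytes.length, BytesInput.from(pageBytes), RLE, RLE, PLAIN);
        writer.endColumn();
        writer.endBlock();
        // footer with (empty) key/value metadata
        writer.end(new HashMap<String, String>());
    }
}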

Example 23 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.

Class PageReader, method readCompressedPageV1:

/**
 * Reads a compressed v1 data page or a dictionary page, both of which are compressed
 * in their entirety.
 * @return decompressed Parquet page data
 * @throws IOException if the page cannot be read or decompressed
 */
protected DrillBuf readCompressedPageV1() throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    int inputSize = pageHeader.getCompressed_page_size();
    int outputSize = pageHeader.getUncompressed_page_size();
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = null;
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        inputPageData = dataReader.getNext(inputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);
        timer.reset();
        timer.start();
        start = dataReader.getPos();
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(0, inputSize);
        ByteBuffer output = outputPageData.nioBuffer(0, outputSize);
        decomp.decompress(input, inputSize, output, outputSize);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            logger.trace("Col: {}  readPos: {}  Uncompressed_size: {}  pageData: {}", columnChunkMetaData.toString(), dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
Also used : CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) BytesInputDecompressor(org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor) ByteBuffer(java.nio.ByteBuffer) DrillBuf(io.netty.buffer.DrillBuf)
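
Stripped of Drill's direct-memory bookkeeping, the decompression step above reduces to two CompressionCodecFactory calls. A condensed sketch, with the factory (created elsewhere in Drill) passed in as a parameter:

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

final class PageDecompression {
    private PageDecompression() {
    }

    // input and output are NIO views of the compressed page and of a destination buffer
    // sized from the page header's uncompressed_page_size, as in the reader above.
    static void decompress(CompressionCodecFactory codecFactory, CompressionCodecName codecName,
            ByteBuffer input, int compressedSize,
            ByteBuffer output, int uncompressedSize) throws IOException {
        BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        decomp.decompress(input, compressedSize, output, uncompressedSize);
    }
}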

Example 24 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.

Class AsyncPageReader, method decompressPageV1:

/**
 * Reads a compressed v1 data page or a dictionary page, both of which are compressed
 * in their entirety.
 * @return decompressed Parquet page data
 * @throws IOException if the page cannot be decompressed
 */
protected DrillBuf decompressPageV1(ReadStatus readStatus) throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    PageHeader pageHeader = readStatus.getPageHeader();
    int inputSize = pageHeader.getCompressed_page_size();
    int outputSize = pageHeader.getUncompressed_page_size();
    // TODO: does reporting this number have the same meaning in an async context?
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = readStatus.getPageData();
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        CompressionCodecFactory.BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(0, inputSize);
        ByteBuffer output = outputPageData.nioBuffer(0, outputSize);
        decomp.decompress(input, inputSize, output, outputSize);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            // TODO: see comment on earlier call to getPos()
            logger.trace("Col: {}  readPos: {}  Uncompressed_size: {}  pageData: {}",
                columnChunkMetaData.toString(), dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        readStatus.setPageData(null);
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
Also used : CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory) PageHeader(org.apache.parquet.format.PageHeader) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) ByteBuffer(java.nio.ByteBuffer) DrillBuf(io.netty.buffer.DrillBuf)

Example 25 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project hive by apache.

Class ParquetRecordWriterWrapper, method initializeSerProperties:

private void initializeSerProperties(JobContext job, Properties tableProperties) {
    String blockSize = tableProperties.getProperty(ParquetOutputFormat.BLOCK_SIZE);
    Configuration conf = ContextUtil.getConfiguration(job);
    if (blockSize != null && !blockSize.isEmpty()) {
        LOG.debug("get override parquet.block.size property via tblproperties");
        conf.setInt(ParquetOutputFormat.BLOCK_SIZE, Integer.parseInt(blockSize));
    }
    String enableDictionaryPage = tableProperties.getProperty(ParquetOutputFormat.ENABLE_DICTIONARY);
    if (enableDictionaryPage != null && !enableDictionaryPage.isEmpty()) {
        LOG.debug("get override parquet.enable.dictionary property via tblproperties");
        conf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, Boolean.parseBoolean(enableDictionaryPage));
    }
    String compressionName = tableProperties.getProperty(ParquetOutputFormat.COMPRESSION);
    if (compressionName != null && !compressionName.isEmpty()) {
        // get override compression properties via "tblproperties" clause if it is set
        LOG.debug("get override compression properties via tblproperties");
        CompressionCodecName codecName = CompressionCodecName.fromConf(compressionName);
        conf.set(ParquetOutputFormat.COMPRESSION, codecName.name());
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName)
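
The interesting part of the snippet above is the name round-trip: the tblproperties value is parsed with CompressionCodecName.fromConf and the canonical enum name is written back into the Configuration. A small hedged illustration; the case-insensitive lookup and the null-to-UNCOMPRESSED fallback are behavior of the parquet-mr versions this assumes, so verify against your version:

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

class FromConfDemo {
    public static void main(String[] args) {
        // Lookup is assumed to be case-insensitive on the enum name.
        System.out.println(CompressionCodecName.fromConf("snappy"));  // SNAPPY
        System.out.println(CompressionCodecName.fromConf("GZIP"));    // GZIP
        // A null property is assumed to fall back to UNCOMPRESSED.
        System.out.println(CompressionCodecName.fromConf(null));      // UNCOMPRESSED
    }
}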

Aggregations

CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 25
DrillBuf (io.netty.buffer.DrillBuf): 6
ByteBuffer (java.nio.ByteBuffer): 6
Configuration (org.apache.hadoop.conf.Configuration): 6
Path (org.apache.hadoop.fs.Path): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 5
ConnectorSession (com.facebook.presto.spi.ConnectorSession): 4
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 4
FileSystem (org.apache.hadoop.fs.FileSystem): 4
MessageType (org.apache.parquet.schema.MessageType): 4
TestingConnectorSession (com.facebook.presto.testing.TestingConnectorSession): 3
IOException (java.io.IOException): 3
JobConf (org.apache.hadoop.mapred.JobConf): 3
WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion): 3
ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter): 3
HiveClientConfig (com.facebook.presto.hive.HiveClientConfig): 2
Stopwatch (com.google.common.base.Stopwatch): 2
HashMap (java.util.HashMap): 2
HashSet (java.util.HashSet): 2
List (java.util.List): 2