
Example 6 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.

Class ConvertCSVCommand, method run.

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() == 1, "CSV path is required.");
    if (header != null) {
        // if a header is given on the command line, don't assume one is in the file
        noHeader = true;
    }
    CSVProperties props = new CSVProperties.Builder()
        .delimiter(delimiter)
        .escape(escape)
        .quote(quote)
        .header(header)
        .hasHeader(!noHeader)
        .linesToSkip(linesToSkip)
        .charset(charsetName)
        .build();
    String source = targets.get(0);
    Schema csvSchema;
    if (avroSchemaFile != null) {
        csvSchema = Schemas.fromAvsc(open(avroSchemaFile));
    } else {
        Set<String> required = ImmutableSet.of();
        if (requiredFields != null) {
            required = ImmutableSet.copyOf(requiredFields);
        }
        String filename = new File(source).getName();
        String recordName;
        if (filename.contains(".")) {
            recordName = filename.substring(0, filename.indexOf("."));
        } else {
            recordName = filename;
        }
        csvSchema = AvroCSV.inferNullableSchema(recordName, open(source), props, required);
    }
    long count = 0;
    try (AvroCSVReader<Record> reader = new AvroCSVReader<>(open(source), props, csvSchema, Record.class, true)) {
        CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);
        try (ParquetWriter<Record> writer = AvroParquetWriter.<Record>builder(qualifiedPath(outputPath))
                .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
                .withWriteMode(overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE)
                .withCompressionCodec(codec)
                .withDictionaryEncoding(true)
                .withDictionaryPageSize(dictionaryPageSize)
                .withPageSize(pageSize)
                .withRowGroupSize(rowGroupSize)
                .withDataModel(GenericData.get())
                .withConf(getConf())
                .withSchema(csvSchema)
                .build()) {
            for (Record record : reader) {
                writer.write(record);
                // count records so the error message below can report which one failed
                count += 1;
            }
        } catch (RuntimeException e) {
            throw new RuntimeException("Failed on record " + count, e);
        }
    }
    return 0;
}
Also used : Schema(org.apache.avro.Schema) CSVProperties(org.apache.parquet.cli.csv.CSVProperties) AvroCSVReader(org.apache.parquet.cli.csv.AvroCSVReader) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Record(org.apache.avro.generic.GenericData.Record) File(java.io.File)
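
A minimal stand-alone sketch (not part of parquet-cli) of the same writer setup: an AvroParquetWriter is built with an explicit CompressionCodecName, here SNAPPY, against a placeholder output path and a trivial inline Avro schema. The builder method names come from parquet-avro; the path and schema are assumptions for illustration only.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class WriteWithCodecSketch {

    public static void main(String[] args) throws Exception {
        // placeholder schema with a single string field
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Line\",\"fields\":"
            + "[{\"name\":\"value\",\"type\":\"string\"}]}");
        // placeholder output path; the command above uses qualifiedPath(outputPath) instead
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
                .<GenericRecord>builder(new Path("/tmp/lines.parquet"))
                .withSchema(schema)
                .withDataModel(GenericData.get())
                .withCompressionCodec(CompressionCodecName.SNAPPY) // the knob this page is about
                .build()) {
            GenericRecord record = new GenericData.Record(schema);
            record.put("value", "hello");
            writer.write(record);
        }
    }
}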

Example 7 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.

Class CodecConfig, method getCodec.

public CompressionCodecName getCodec() {
    CompressionCodecName codec;
    Configuration configuration = getConfiguration();
    if (isParquetCompressionSet(configuration)) {
        // explicit parquet config
        codec = getParquetCompressionCodec(configuration);
    } else if (isHadoopCompressionSet()) {
        // from hadoop config
        codec = getHadoopCompressionCodec();
    } else {
        LOG.info("Compression set to false");
        codec = CompressionCodecName.UNCOMPRESSED;
    }
    LOG.info("Compression: {}", codec.name());
    return codec;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName)
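
For reference, a rough sketch of the "explicit parquet config" branch above, assuming the standard "parquet.compression" property that ParquetOutputFormat reads; CompressionCodecName.fromConf maps the configured name to the enum value, falling back to UNCOMPRESSED.

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecFromConfSketch {

    // resolve the codec the way an explicit "parquet.compression" setting would be read
    public static CompressionCodecName resolve(Configuration conf) {
        return CompressionCodecName.fromConf(conf.get("parquet.compression", "UNCOMPRESSED"));
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("parquet.compression", "SNAPPY");
        System.out.println(resolve(conf)); // SNAPPY
    }
}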

Example 8 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.

Class TestParquetMetadataConverter, method createColumnChunkMetaData.

private ColumnChunkMetaData createColumnChunkMetaData() {
    Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
    PrimitiveTypeName t = PrimitiveTypeName.BINARY;
    ColumnPath p = ColumnPath.get("foo");
    CompressionCodecName c = CompressionCodecName.GZIP;
    BinaryStatistics s = new BinaryStatistics();
    ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s, 0, 0, 0, 0, 0);
    return md;
}
Also used : CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) HashSet(java.util.HashSet) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)
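
As a side note (not part of the test), the GZIP value used here also carries the mapping to its Hadoop codec class and file extension; a short sketch, assuming the accessor names on parquet-mr's CompressionCodecName enum:

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecNameMappingSketch {

    public static void main(String[] args) {
        CompressionCodecName gzip = CompressionCodecName.GZIP;
        // Hadoop codec class backing this Parquet codec name
        System.out.println(gzip.getHadoopCompressionCodecClassName()); // org.apache.hadoop.io.compress.GzipCodec
        // file extension associated with the codec
        System.out.println(gzip.getExtension()); // .gz
        // UNCOMPRESSED has no backing Hadoop codec class
        System.out.println(CompressionCodecName.UNCOMPRESSED.getHadoopCompressionCodecClassName()); // null
    }
}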

Example 9 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.

Class AsyncPageReader, method decompressPageV2.

/**
 * Reads a compressed v2 data page which excluded the repetition and definition level
 * sections from compression.
 * @return decompressed Parquet page data
 * @throws IOException
 */
protected DrillBuf decompressPageV2(ReadStatus readStatus) throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    PageHeader pageHeader = readStatus.getPageHeader();
    int inputSize = pageHeader.getCompressed_page_size();
    int repLevelSize = pageHeader.data_page_header_v2.getRepetition_levels_byte_length();
    int defLevelSize = pageHeader.data_page_header_v2.getDefinition_levels_byte_length();
    int compDataOffset = repLevelSize + defLevelSize;
    int outputSize = pageHeader.uncompressed_page_size;
    // TODO: does reporting this number have the same meaning in an async context?
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = readStatus.getPageData();
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        // Write out the uncompressed section
        // Note that the following setBytes call to read the repetition and definition level sections
        // advances readerIndex in inputPageData but not writerIndex in outputPageData.
        outputPageData.setBytes(0, inputPageData, compDataOffset);
        // decompress from the start of compressed data to the end of the input buffer
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        CompressionCodecFactory.BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(compDataOffset, inputSize - compDataOffset);
        ByteBuffer output = outputPageData.nioBuffer(compDataOffset, outputSize - compDataOffset);
        decomp.decompress(input, inputSize - compDataOffset, output, outputSize - compDataOffset);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            logger.trace("Col: {}  readPos: {}  Uncompressed_size: {}  pageData: {}", columnChunkMetaData.toString(), // TODO: see comment on earlier call to getPos()
            dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        readStatus.setPageData(null);
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
Also used : CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory) PageHeader(org.apache.parquet.format.PageHeader) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) ByteBuffer(java.nio.ByteBuffer) DrillBuf(io.netty.buffer.DrillBuf)
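
The same factory pattern can be exercised outside Drill; a rough round-trip sketch, assuming parquet-mr's CodecFactory with its BytesInput-based compressor and decompressor (the page-size argument and the GZIP choice are arbitrary here):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class DecompressorRoundTripSketch {

    public static void main(String[] args) throws Exception {
        byte[] raw = "level sections stay uncompressed, page data does not".getBytes("UTF-8");
        // arbitrary page-size hint for the factory
        CodecFactory factory = new CodecFactory(new Configuration(), 1024 * 1024);
        try {
            // compress, then look up the matching decompressor by CompressionCodecName
            BytesInput compressed = factory.getCompressor(CompressionCodecName.GZIP)
                .compress(BytesInput.from(raw));
            BytesInput restored = factory.getDecompressor(CompressionCodecName.GZIP)
                .decompress(BytesInput.from(compressed.toByteArray()), raw.length);
            System.out.println(new String(restored.toByteArray(), "UTF-8"));
        } finally {
            factory.release();
        }
    }
}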

Example 10 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project drill by apache.

Class PageReader, method readCompressedPageV2.

/**
 * Reads a compressed v2 data page which excluded the repetition and definition level
 * sections from compression.
 * @return decompressed Parquet page data
 * @throws IOException
 */
protected DrillBuf readCompressedPageV2() throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    int inputSize = pageHeader.getCompressed_page_size();
    int repLevelSize = pageHeader.data_page_header_v2.getRepetition_levels_byte_length();
    int defLevelSize = pageHeader.data_page_header_v2.getDefinition_levels_byte_length();
    int compDataOffset = repLevelSize + defLevelSize;
    int outputSize = pageHeader.uncompressed_page_size;
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = null;
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        // Read in both the uncompressed and compressed sections
        inputPageData = dataReader.getNext(inputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);
        timer.reset();
        timer.start();
        start = dataReader.getPos();
        // Write out the uncompressed section
        // Note that the following setBytes call to read the repetition and definition level sections
        // advances readerIndex in inputPageData but not writerIndex in outputPageData.
        outputPageData.setBytes(0, inputPageData, compDataOffset);
        // decompress from the start of compressed data to the end of the input buffer
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(compDataOffset, inputSize - compDataOffset);
        ByteBuffer output = outputPageData.nioBuffer(compDataOffset, outputSize - compDataOffset);
        decomp.decompress(input, inputSize - compDataOffset, output, outputSize - compDataOffset);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            logger.trace("Col: {}  readPos: {}  Uncompressed_size: {}  pageData: {}", columnChunkMetaData.toString(), dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
Also used : CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) BytesInputDecompressor(org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor) ByteBuffer(java.nio.ByteBuffer) DrillBuf(io.netty.buffer.DrillBuf)
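
Both Drill readers rely on the same v2 page layout: the repetition and definition level sections sit uncompressed at the front of the page, so the verbatim copy and the decompression both start at repLevelSize + defLevelSize. A tiny arithmetic sketch with made-up sizes:

public class V2PageOffsetSketch {

    public static void main(String[] args) {
        int repLevelSize = 12;            // repetition_levels_byte_length
        int defLevelSize = 20;            // definition_levels_byte_length
        int compressedPageSize = 512;     // compressed_page_size, includes the level sections
        int uncompressedPageSize = 2048;  // uncompressed_page_size, also includes the level sections

        int compDataOffset = repLevelSize + defLevelSize;
        System.out.println("bytes copied verbatim:    " + compDataOffset);                           // 32
        System.out.println("compressed data length:   " + (compressedPageSize - compDataOffset));    // 480
        System.out.println("decompressed data length: " + (uncompressedPageSize - compDataOffset));  // 2016
    }
}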

Aggregations

CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 25 usages
DrillBuf (io.netty.buffer.DrillBuf): 6 usages
ByteBuffer (java.nio.ByteBuffer): 6 usages
Configuration (org.apache.hadoop.conf.Configuration): 6 usages
Path (org.apache.hadoop.fs.Path): 6 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 5 usages
ConnectorSession (com.facebook.presto.spi.ConnectorSession): 4 usages
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 4 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 4 usages
MessageType (org.apache.parquet.schema.MessageType): 4 usages
TestingConnectorSession (com.facebook.presto.testing.TestingConnectorSession): 3 usages
IOException (java.io.IOException): 3 usages
JobConf (org.apache.hadoop.mapred.JobConf): 3 usages
WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion): 3 usages
ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter): 3 usages
HiveClientConfig (com.facebook.presto.hive.HiveClientConfig): 2 usages
Stopwatch (com.google.common.base.Stopwatch): 2 usages
HashMap (java.util.HashMap): 2 usages
HashSet (java.util.HashSet): 2 usages
List (java.util.List): 2 usages