
Example 1 with ParquetFileWriter

Use of org.apache.parquet.hadoop.ParquetFileWriter in project parquet-mr by apache, in the class MergeCommand, method execute:

@Override
public void execute(CommandLine options) throws Exception {
    // Prepare arguments
    List<String> args = options.getArgList();
    List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
    Path outputFile = new Path(args.get(args.size() - 1));
    // Merge schema and extraMeta
    FileMetaData mergedMeta = mergedMetadata(inputFiles);
    PrintWriter out = new PrintWriter(Main.out, true);
    // Merge data
    ParquetFileWriter writer = new ParquetFileWriter(conf, mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
    writer.start();
    boolean tooSmallFilesMerged = false;
    for (Path input : inputFiles) {
        // look up the file status once per input instead of twice
        long inputLength = input.getFileSystem(conf).getFileStatus(input).getLen();
        if (inputLength < TOO_SMALL_FILE_THRESHOLD) {
            out.format("Warning: file %s is too small, length: %d\n", input, inputLength);
            tooSmallFilesMerged = true;
        }
        writer.appendFile(HadoopInputFile.fromPath(input, conf));
    }
    if (tooSmallFilesMerged) {
        out.println("Warning: you merged too small files. " + "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " + "which usually leads to bad query performance!");
    }
    writer.end(mergedMeta.getKeyValueMetaData());
}
Also used: Path (org.apache.hadoop.fs.Path), ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData), PrintWriter (java.io.PrintWriter)
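
For orientation, a minimal end-to-end sketch of the same merge flow, not taken from the source: the mergeFiles helper and its paths are hypothetical, and for brevity the schema comes from the first input's footer instead of being merged across all inputs as mergedMetadata does above.

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

static void mergeFiles(List<Path> inputs, Path output, Configuration conf) throws IOException {
    // read the schema from the first input's footer (all inputs must share it for appendFile to work)
    MessageType schema;
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(inputs.get(0), conf))) {
        schema = reader.getFooter().getFileMetaData().getSchema();
    }
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, output, ParquetFileWriter.Mode.CREATE);
    writer.start();
    for (Path input : inputs) {
        // appendFile copies row groups as-is, without decoding pages; small inputs
        // therefore still yield small row groups in the merged file (hence the warning above)
        writer.appendFile(HadoopInputFile.fromPath(input, conf));
    }
    // end() writes the footer together with any key/value metadata
    writer.end(Collections.emptyMap());
}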

Example 2 with ParquetFileWriter

Use of org.apache.parquet.hadoop.ParquetFileWriter in project drill by apache, in the class ParquetRecordWriter, method createParquetFileWriter:

private void createParquetFileWriter() throws IOException {
    Path path = new Path(location, prefix + "_" + index + ".parquet");
    // to ensure that our writer was the first to create output file, we create empty file first and fail if file exists
    Path firstCreatedPath = storageStrategy.createFileAndApply(fs, path);
    // since parquet reader supports partitions, it means that several output files may be created
    // if this writer was the one to create table folder, we store only folder and delete it with its content in case of abort
    // if table location was created before, we store only files created by this writer and delete them in case of abort
    addCleanUpLocation(fs, firstCreatedPath);
    // we need to re-apply file permission
    if (useSingleFSBlock) {
        // Passing blockSize creates files with this blockSize instead of filesystem default blockSize.
        // Currently, this is supported only by filesystems included in
        // BLOCK_FS_SCHEMES (ParquetFileWriter.java in parquet-mr), which includes HDFS.
        // For other filesystems, it uses default blockSize configured for the file system.
        parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE, blockSize, 0);
    } else {
        parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE);
    }
    storageStrategy.applyToFile(fs, path);
    parquetFileWriter.start();
}
Also used: Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter)
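
As a side note, here is a hedged sketch of the two constructor forms the useSingleFSBlock branch chooses between; the toy schema, output path, 128 MB figure, and openWriter helper are illustrative stand-ins, not values from Drill.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

static ParquetFileWriter openWriter(boolean useSingleFSBlock) throws IOException {
    Configuration conf = new Configuration();
    MessageType schema = MessageTypeParser.parseMessageType("message m { required int32 id; }"); // toy schema
    Path path = new Path("/tmp/example.parquet"); // hypothetical output path
    ParquetFileWriter writer;
    if (useSingleFSBlock) {
        long blockSize = 128L * 1024 * 1024; // requested FS block size; honored only by block filesystems such as HDFS
        int maxPadding = 0;                  // 0 = never pad to align row groups with block boundaries
        writer = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE, blockSize, maxPadding);
    } else {
        // no size argument: the filesystem's default block size applies
        writer = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE);
    }
    writer.start(); // writes the "PAR1" magic header
    return writer;
}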

Example 3 with ParquetFileWriter

Use of org.apache.parquet.hadoop.ParquetFileWriter in project presto by prestodb, in the class ParquetRecordWriterUtil, method createParquetWriter:

public static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties, boolean compress, ConnectorSession session) throws IOException, ReflectiveOperationException {
    conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes());
    conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes());
    RecordWriter recordWriter = new MapredParquetOutputFormat().getHiveRecordWriter(conf, target, Text.class, compress, properties, Reporter.NULL);
    // Reflectively unwrap Hive's writer chain to reach the underlying ParquetFileWriter,
    // whose stream position is used to report written bytes before the file is closed
    Object realWriter = REAL_WRITER_FIELD.get(recordWriter);
    Object internalWriter = INTERNAL_WRITER_FIELD.get(realWriter);
    ParquetFileWriter fileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internalWriter);
    return new ExtendedRecordWriter() {

        private long length;

        @Override
        public long getWrittenBytes() {
            return length;
        }

        @Override
        public void write(Writable value) throws IOException {
            recordWriter.write(value);
            // the writer's stream position advances only as pages flush, so this can trail rows still buffered
            length = fileWriter.getPos();
        }

        @Override
        public void close(boolean abort) throws IOException {
            recordWriter.close(abort);
            if (!abort) {
                // on a clean close, take the exact final length from the filesystem
                length = target.getFileSystem(conf).getFileStatus(target).getLen();
            }
        }
    };
}
Also used: RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter), ParquetRecordWriter (org.apache.parquet.hadoop.ParquetRecordWriter), ExtendedRecordWriter (com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter), MapredParquetOutputFormat (org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat), ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter), Writable (org.apache.hadoop.io.Writable)
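
The reflection here exists because Hive's MapredParquetOutputFormat does not expose its internal ParquetFileWriter, while Presto wants a live byte count for writer statistics. REAL_WRITER_FIELD, INTERNAL_WRITER_FIELD, and FILE_WRITER_FIELD are presumably java.lang.reflect.Field constants initialized elsewhere in ParquetRecordWriterUtil and made accessible against Hive's private fields.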

Example 4 with ParquetFileWriter

Use of org.apache.parquet.hadoop.ParquetFileWriter in project drill by apache, in the class ParquetRecordWriter, method endRecord:

@Override
public void endRecord() throws IOException {
    consumer.endMessage();
    // we wait until there is at least one record before creating the parquet file
    if (parquetFileWriter == null) {
        Path path = new Path(location, prefix + "_" + index + ".parquet");
        // to ensure that our writer was the first to create output file, we create empty file first and fail if file exists
        Path firstCreatedPath = storageStrategy.createFileAndApply(fs, path);
        // since parquet reader supports partitions, it means that several output files may be created
        // if this writer was the one to create table folder, we store only folder and delete it with its content in case of abort
        // if table location was created before, we store only files created by this writer and delete them in case of abort
        addCleanUpLocation(fs, firstCreatedPath);
        // we need to re-apply file permission
        if (useSingleFSBlock) {
            // Passing blockSize creates files with this blockSize instead of filesystem default blockSize.
            // Currently, this is supported only by filesystems included in
            // BLOCK_FS_SCHEMES (ParquetFileWriter.java in parquet-mr), which includes HDFS.
            // For other filesystems, it uses default blockSize configured for the file system.
            parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE, blockSize, 0);
        } else {
            parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE);
        }
        storageStrategy.applyToFile(fs, path);
        parquetFileWriter.start();
    }
    recordCount++;
    checkBlockSizeReached();
}
Also used: Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter)
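
Two things are worth noting in this variant: the parquetFileWriter == null check delays file creation until the first record arrives (per the comment, presumably so an empty result set never leaves a zero-record file behind), and the creation block is the same code as Example 2's createParquetFileWriter, so this snippet apparently comes from a Drill version before that logic was factored out into its own method.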

Example 5 with ParquetFileWriter

Use of org.apache.parquet.hadoop.ParquetFileWriter in project drill by axbaretto, in the class ParquetRecordWriter, method endRecord. The code is identical, line for line, to Example 4 above (axbaretto/drill is a fork of Apache Drill), so the listing is not repeated here.

Aggregations

ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter): 8
Path (org.apache.hadoop.fs.Path): 7
SchemaPath (org.apache.drill.common.expression.SchemaPath): 3
HashMap (java.util.HashMap): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
DirectByteBufferAllocator (org.apache.parquet.bytes.DirectByteBufferAllocator): 2
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 2
RunLengthBitPackingHybridValuesWriter (org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter): 2
CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 2
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 2
MessageType (org.apache.parquet.schema.MessageType): 2
ExtendedRecordWriter (com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter): 1
PrintWriter (java.io.PrintWriter): 1
ArrayList (java.util.ArrayList): 1
FileStatus (org.apache.hadoop.fs.FileStatus): 1
RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter): 1
MapredParquetOutputFormat (org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat): 1
Writable (org.apache.hadoop.io.Writable): 1
ParquetRecordWriter (org.apache.parquet.hadoop.ParquetRecordWriter): 1