use of org.apache.parquet.hadoop.ParquetFileWriter in project parquet-mr by apache.
the class MergeCommand method execute.
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));
  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);
  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf, mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input : inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n", input, input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }
    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }
  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " + "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " + "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
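The heavy lifting here is writer.appendFile(...), which copies each input's row groups into the output without decoding or re-compacting them, which is exactly why the small-row-group warning exists. A minimal standalone sketch of the same pattern, assuming all inputs share one schema (the command's mergedMetadata helper is what actually merges and validates the footers; the class and method names below are illustrative, not part of parquet-mr):

import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class SimpleParquetMerge {

  // Illustrative helper: merges Parquet files that share a single schema.
  public static void merge(Configuration conf, List<Path> inputs, Path output) throws Exception {
    // Borrow schema and key/value metadata from the first input's footer;
    // the real MergeCommand merges the footers of all inputs instead.
    MessageType schema;
    Map<String, String> keyValueMeta;
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(inputs.get(0), conf))) {
      FileMetaData meta = reader.getFooter().getFileMetaData();
      schema = meta.getSchema();
      keyValueMeta = meta.getKeyValueMetaData();
    }

    ParquetFileWriter writer =
        new ParquetFileWriter(conf, schema, output, ParquetFileWriter.Mode.CREATE);
    writer.start();
    for (Path input : inputs) {
      // Copies the input's row groups without decoding them; small row groups stay small.
      writer.appendFile(HadoopInputFile.fromPath(input, conf));
    }
    writer.end(keyValueMeta);
  }
}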
use of org.apache.parquet.hadoop.ParquetFileWriter in project drill by apache.
the class ParquetRecordWriter method createParquetFileWriter.
private void createParquetFileWriter() throws IOException {
  Path path = new Path(location, prefix + "_" + index + ".parquet");
  // to ensure that our writer was the first to create the output file, we create an empty file first and fail if the file already exists
  Path firstCreatedPath = storageStrategy.createFileAndApply(fs, path);
  // since the parquet reader supports partitions, several output files may be created
  // if this writer was the one to create the table folder, we store only the folder and delete it with its content in case of abort
  // if the table location was created before, we store only the files created by this writer and delete them in case of abort
  addCleanUpLocation(fs, firstCreatedPath);
  // since ParquetFileWriter will overwrite the empty output file created above (append is not supported),
  // we need to re-apply the file permission
  if (useSingleFSBlock) {
    // Passing blockSize creates files with this blockSize instead of the filesystem default blockSize.
    // Currently, this is supported only by filesystems included in
    // BLOCK_FS_SCHEMES (ParquetFileWriter.java in parquet-mr), which includes HDFS.
    // For other filesystems, the default blockSize configured for the file system is used.
    parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE, blockSize, 0);
  } else {
    parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE);
  }
  storageStrategy.applyToFile(fs, path);
  parquetFileWriter.start();
}
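Both branches leave parquetFileWriter in the same state: constructed but not usable until start() is called, and eventually finished with end(...), which writes the footer. A minimal sketch of that lifecycle, using the same six-argument overload as the useSingleFSBlock branch (the schema string and output path below are illustrative only):

import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriterLifecycleSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Illustrative schema and output path.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; optional binary name (UTF8); }");
    Path path = new Path("/tmp/lifecycle-example.parquet");

    // Same overload as the useSingleFSBlock branch: explicit row-group/block size, zero max padding.
    long blockSize = 128L * 1024 * 1024;
    ParquetFileWriter writer =
        new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE, blockSize, 0);

    writer.start();
    // Row groups would be written here between startBlock(...) and endBlock() calls;
    // ending without any should simply produce a footer with zero row groups.
    writer.end(Collections.<String, String>emptyMap());
  }
}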
use of org.apache.parquet.hadoop.ParquetFileWriter in project presto by prestodb.
the class ParquetRecordWriterUtil method createParquetWriter.
public static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties, boolean compress, ConnectorSession session) throws IOException, ReflectiveOperationException {
  conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes());
  conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes());
  RecordWriter recordWriter = new MapredParquetOutputFormat().getHiveRecordWriter(conf, target, Text.class, compress, properties, Reporter.NULL);
  Object realWriter = REAL_WRITER_FIELD.get(recordWriter);
  Object internalWriter = INTERNAL_WRITER_FIELD.get(realWriter);
  ParquetFileWriter fileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internalWriter);
  return new ExtendedRecordWriter() {
    private long length;

    @Override
    public long getWrittenBytes() {
      return length;
    }

    @Override
    public void write(Writable value) throws IOException {
      recordWriter.write(value);
      length = fileWriter.getPos();
    }

    @Override
    public void close(boolean abort) throws IOException {
      recordWriter.close(abort);
      if (!abort) {
        length = target.getFileSystem(conf).getFileStatus(target).getLen();
      }
    }
  };
}
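REAL_WRITER_FIELD, INTERNAL_WRITER_FIELD, and FILE_WRITER_FIELD are pre-resolved java.lang.reflect.Field handles that reach through the Hive record-writer wrappers down to the underlying ParquetFileWriter, so that getPos() can be polled after every write. A generic sketch of how such a handle is typically prepared (the class name below is invented for illustration; the concrete owner classes and field names used by ParquetRecordWriterUtil are not shown in the snippet):

import java.lang.reflect.Field;

final class ReflectiveWriterFields {
  // Pattern behind REAL_WRITER_FIELD and friends: resolve the private field once,
  // force accessibility, and reuse the Field handle for every writer instance.
  static Field privateField(Class<?> owner, String name) {
    try {
      Field f = owner.getDeclaredField(name);
      f.setAccessible(true);
      return f;
    } catch (NoSuchFieldException e) {
      // Fails fast if the wrapped writer's internals change between Hive/parquet-mr versions.
      throw new AssertionError(e);
    }
  }

  private ReflectiveWriterFields() {
  }
}

Note the fallback in close(boolean): getPos() reflects only the bytes the writer has already flushed, so after a successful close the length is re-read from the filesystem to account for the footer written during close.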
use of org.apache.parquet.hadoop.ParquetFileWriter in project drill by apache.
the class ParquetRecordWriter method endRecord.
@Override
public void endRecord() throws IOException {
  consumer.endMessage();
  // we wait until there is at least one record before creating the parquet file
  if (parquetFileWriter == null) {
    Path path = new Path(location, prefix + "_" + index + ".parquet");
    // to ensure that our writer was the first to create the output file, we create an empty file first and fail if the file already exists
    Path firstCreatedPath = storageStrategy.createFileAndApply(fs, path);
    // since the parquet reader supports partitions, several output files may be created
    // if this writer was the one to create the table folder, we store only the folder and delete it with its content in case of abort
    // if the table location was created before, we store only the files created by this writer and delete them in case of abort
    addCleanUpLocation(fs, firstCreatedPath);
    // since ParquetFileWriter will overwrite the empty output file created above (append is not supported),
    // we need to re-apply the file permission
    if (useSingleFSBlock) {
      // Passing blockSize creates files with this blockSize instead of the filesystem default blockSize.
      // Currently, this is supported only by filesystems included in
      // BLOCK_FS_SCHEMES (ParquetFileWriter.java in parquet-mr), which includes HDFS.
      // For other filesystems, the default blockSize configured for the file system is used.
      parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE, blockSize, 0);
    } else {
      parquetFileWriter = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE);
    }
    storageStrategy.applyToFile(fs, path);
    parquetFileWriter.start();
  }
  recordCount++;
  checkBlockSizeReached();
}
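The null check makes writer creation lazy: the ParquetFileWriter, and with it the output file, only comes into existence once the first record has been fully assembled, so an operator that receives no records never leaves an empty file behind. A stripped-down sketch of the same pattern, with the Drill-specific pieces (storageStrategy, consumer, block-size accounting) removed and the class name invented for illustration:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;

// Illustrative class, not Drill's ParquetRecordWriter.
class LazyParquetWriter {
  private final Configuration conf;
  private final MessageType schema;
  private final Path path;
  private ParquetFileWriter writer; // created lazily on the first record

  LazyParquetWriter(Configuration conf, MessageType schema, Path path) {
    this.conf = conf;
    this.schema = schema;
    this.path = path;
  }

  void endRecord() throws IOException {
    if (writer == null) {
      // First record: only now is the output file created on the filesystem.
      writer = new ParquetFileWriter(conf, schema, path, ParquetFileWriter.Mode.OVERWRITE);
      writer.start();
    }
    // ... buffer the record and flush a row group once the block-size threshold is reached ...
  }
}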