Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.
Class ParquetTester, method assertRoundTrip:
void assertRoundTrip(List<ObjectInspector> objectInspectors, Iterable<?>[] writeValues, Iterable<?>[] readValues, List<String> columnNames, List<Type> columnTypes, Optional<MessageType> parquetSchema, boolean singleLevelArray) throws Exception {
    for (WriterVersion version : versions) {
        for (CompressionCodecName compressionCodecName : compressions) {
            for (ConnectorSession session : sessions) {
                try (TempFile tempFile = new TempFile("test", "parquet")) {
                    JobConf jobConf = new JobConf();
                    jobConf.setEnum(COMPRESSION, compressionCodecName);
                    jobConf.setBoolean(ENABLE_DICTIONARY, true);
                    jobConf.setEnum(WRITER_VERSION, version);
                    writeParquetColumn(
                            jobConf,
                            tempFile.getFile(),
                            compressionCodecName,
                            createTableProperties(columnNames, objectInspectors),
                            getStandardStructObjectInspector(columnNames, objectInspectors),
                            getIterators(writeValues),
                            parquetSchema,
                            singleLevelArray);
                    assertFileContents(session, tempFile.getFile(), getIterators(readValues), columnNames, columnTypes);
                }
            }
        }
    }

    // write presto parquet
    for (CompressionCodecName compressionCodecName : writerCompressions) {
        for (ConnectorSession session : sessions) {
            try (TempFile tempFile = new TempFile("test", "parquet")) {
                OptionalInt min = stream(writeValues).mapToInt(Iterables::size).min();
                checkState(min.isPresent());
                writeParquetFileFromPresto(tempFile.getFile(), columnTypes, columnNames, getIterators(readValues), min.getAsInt(), compressionCodecName);
                assertFileContents(session, tempFile.getFile(), getIterators(readValues), columnNames, columnTypes);
            }
        }
    }
}
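The COMPRESSION, ENABLE_DICTIONARY, and WRITER_VERSION constants above are statically imported. A minimal, self-contained sketch of the same JobConf setup, assuming those imports resolve to the org.apache.parquet.hadoop.ParquetOutputFormat configuration keys:

import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

// Sketch only: configures a JobConf the way assertRoundTrip does, with the
// configuration keys written out explicitly (assumed to be the same constants).
public class ParquetJobConfSketch {
    public static JobConf configure(CompressionCodecName codec, WriterVersion version) {
        JobConf jobConf = new JobConf();
        jobConf.setEnum(ParquetOutputFormat.COMPRESSION, codec);          // "parquet.compression"
        jobConf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, true);  // "parquet.enable.dictionary"
        jobConf.setEnum(ParquetOutputFormat.WRITER_VERSION, version);     // "parquet.writer.version"
        return jobConf;
    }
}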
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.
Class ParquetTester, method assertNonHiveWriterRoundTrip:
void assertNonHiveWriterRoundTrip(List<ObjectInspector> objectInspectors, Iterable<?>[] writeValues, Iterable<?>[] readValues, List<String> columnNames, List<Type> columnTypes, org.apache.parquet.schema.MessageType parquetSchema) throws Exception {
    for (WriterVersion version : versions) {
        for (CompressionCodecName compression : compressions) {
            org.apache.parquet.hadoop.metadata.CompressionCodecName compressionCodecName =
                    org.apache.parquet.hadoop.metadata.CompressionCodecName.valueOf(compression.name());
            for (ConnectorSession session : sessions) {
                try (TempFile tempFile = new TempFile("test", "parquet")) {
                    JobConf jobConf = new JobConf();
                    jobConf.setEnum(COMPRESSION, compressionCodecName);
                    jobConf.setBoolean(ENABLE_DICTIONARY, true);
                    jobConf.setEnum(WRITER_VERSION, version);
                    nonHiveParquetWriter(
                            jobConf,
                            tempFile.getFile(),
                            compressionCodecName,
                            getStandardStructObjectInspector(columnNames, objectInspectors),
                            getIterators(writeValues),
                            parquetSchema);
                    assertFileContents(session, tempFile.getFile(), getIterators(readValues), columnNames, columnTypes);
                }
            }
        }
    }
}
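The valueOf(compression.name()) call bridges two enums that happen to share constant names: the unqualified CompressionCodecName imported by ParquetTester and the fully qualified org.apache.parquet variant. A minimal sketch of that name-based conversion, using a hypothetical LegacyCodec enum as a stand-in for the second class:

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

// Sketch only: LegacyCodec is a hypothetical stand-in for any codec enum whose
// constant names match the org.apache.parquet enum; valueOf(name()) does the bridge.
public class CodecNameBridgeSketch {
    enum LegacyCodec { UNCOMPRESSED, SNAPPY, GZIP }

    static CompressionCodecName toParquetCodec(LegacyCodec legacy) {
        return CompressionCodecName.valueOf(legacy.name());
    }

    public static void main(String[] args) {
        System.out.println(toParquetCodec(LegacyCodec.GZIP)); // prints GZIP
    }
}

Note that valueOf throws IllegalArgumentException for a constant name that does not exist in the target enum, so the two enums must stay in sync.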
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.
Class ParquetTester, method assertMaxReadBytes:
void assertMaxReadBytes(List<ObjectInspector> objectInspectors, Iterable<?>[] writeValues, Iterable<?>[] readValues, List<String> columnNames, List<Type> columnTypes, Optional<MessageType> parquetSchema, DataSize maxReadBlockSize) throws Exception {
    WriterVersion version = PARQUET_1_0;
    CompressionCodecName compressionCodecName = UNCOMPRESSED;
    HiveClientConfig config = new HiveClientConfig()
            .setHiveStorageFormat(HiveStorageFormat.PARQUET)
            .setUseParquetColumnNames(false)
            .setParquetMaxReadBlockSize(maxReadBlockSize);
    ConnectorSession session = new TestingConnectorSession(
            new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig(), new CacheConfig()).getSessionProperties());
    try (TempFile tempFile = new TempFile("test", "parquet")) {
        JobConf jobConf = new JobConf();
        jobConf.setEnum(COMPRESSION, compressionCodecName);
        jobConf.setBoolean(ENABLE_DICTIONARY, true);
        jobConf.setEnum(WRITER_VERSION, version);
        writeParquetColumn(
                jobConf,
                tempFile.getFile(),
                compressionCodecName,
                createTableProperties(columnNames, objectInspectors),
                getStandardStructObjectInspector(columnNames, objectInspectors),
                getIterators(writeValues),
                parquetSchema,
                false);
        Iterator<?>[] expectedValues = getIterators(readValues);
        try (ConnectorPageSource pageSource = getFileFormat().createFileFormatReader(session, HDFS_ENVIRONMENT, tempFile.getFile(), columnNames, columnTypes)) {
            assertPageSource(columnTypes, expectedValues, pageSource, Optional.of(getParquetMaxReadBlockSize(session).toBytes()));
            assertFalse(stream(expectedValues).allMatch(Iterator::hasNext));
        }
    }
}
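The maxReadBlockSize parameter is a DataSize, presumably Airlift's io.airlift.units.DataSize (the type HiveClientConfig consumes). A small sketch of how such an argument could be built and converted back to bytes, mirroring the getParquetMaxReadBlockSize(session).toBytes() call above:

import io.airlift.units.DataSize;
import static io.airlift.units.DataSize.Unit.KILOBYTE;

// Sketch only: builds a 16 kB read-block limit and reads it back in bytes.
public class MaxReadBlockSizeSketch {
    public static void main(String[] args) {
        DataSize maxReadBlockSize = new DataSize(16, KILOBYTE);
        System.out.println(maxReadBlockSize.toBytes()); // 16384
    }
}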
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project nifi by apache.
Class PutParquet, method applyCommonConfig:
private void applyCommonConfig(final ParquetWriter.Builder<?, ?> builder, final ProcessContext context, final FlowFile flowFile, final Configuration conf) {
    builder.withConf(conf);

    // Required properties
    final boolean overwrite = context.getProperty(OVERWRITE).asBoolean();
    final ParquetFileWriter.Mode mode = overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE;
    builder.withWriteMode(mode);

    final PropertyDescriptor compressionTypeDescriptor = getPropertyDescriptor(COMPRESSION_TYPE.getName());
    final String compressionTypeValue = context.getProperty(compressionTypeDescriptor).getValue();
    final CompressionCodecName codecName = CompressionCodecName.valueOf(compressionTypeValue);
    builder.withCompressionCodec(codecName);

    if (context.getProperty(ROW_GROUP_SIZE).isSet()) {
        try {
            final Double rowGroupSize = context.getProperty(ROW_GROUP_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (rowGroupSize != null) {
                builder.withRowGroupSize(rowGroupSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + ROW_GROUP_SIZE.getDisplayName(), e);
        }
    }

    if (context.getProperty(PAGE_SIZE).isSet()) {
        try {
            final Double pageSize = context.getProperty(PAGE_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (pageSize != null) {
                builder.withPageSize(pageSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + PAGE_SIZE.getDisplayName(), e);
        }
    }

    if (context.getProperty(DICTIONARY_PAGE_SIZE).isSet()) {
        try {
            final Double dictionaryPageSize = context.getProperty(DICTIONARY_PAGE_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (dictionaryPageSize != null) {
                builder.withDictionaryPageSize(dictionaryPageSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + DICTIONARY_PAGE_SIZE.getDisplayName(), e);
        }
    }

    if (context.getProperty(MAX_PADDING_SIZE).isSet()) {
        try {
            final Double maxPaddingSize = context.getProperty(MAX_PADDING_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (maxPaddingSize != null) {
                builder.withMaxPaddingSize(maxPaddingSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + MAX_PADDING_SIZE.getDisplayName(), e);
        }
    }

    if (context.getProperty(ENABLE_DICTIONARY_ENCODING).isSet()) {
        final boolean enableDictionaryEncoding = context.getProperty(ENABLE_DICTIONARY_ENCODING).asBoolean();
        builder.withDictionaryEncoding(enableDictionaryEncoding);
    }

    if (context.getProperty(ENABLE_VALIDATION).isSet()) {
        final boolean enableValidation = context.getProperty(ENABLE_VALIDATION).asBoolean();
        builder.withValidation(enableValidation);
    }

    if (context.getProperty(WRITER_VERSION).isSet()) {
        final String writerVersionValue = context.getProperty(WRITER_VERSION).getValue();
        builder.withWriterVersion(ParquetProperties.WriterVersion.valueOf(writerVersionValue));
    }
}
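The four data-size blocks above follow the same read-validate-apply pattern. A hypothetical helper (not part of PutParquet) that factors the pattern out could look like the following, with the builder setter passed in as an IntConsumer:

import java.util.function.IntConsumer;

import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.DataUnit;
import org.apache.nifi.processor.ProcessContext;

// Sketch only: applies an optional data-size property to a builder setter.
final class ParquetSizePropertySketch {
    private ParquetSizePropertySketch() {}

    static void applyDataSize(final ProcessContext context, final FlowFile flowFile,
            final PropertyDescriptor descriptor, final IntConsumer setter) {
        if (!context.getProperty(descriptor).isSet()) {
            return;
        }
        try {
            final Double size = context.getProperty(descriptor)
                    .evaluateAttributeExpressions(flowFile)
                    .asDataSize(DataUnit.B);
            if (size != null) {
                setter.accept(size.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + descriptor.getDisplayName(), e);
        }
    }
}

Each size block in applyCommonConfig would then collapse to a single call, for example applyDataSize(context, flowFile, ROW_GROUP_SIZE, builder::withRowGroupSize).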
Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.
Class ConvertCommand, method run:
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() == 1, "A data file is required.");
    String source = targets.get(0);
    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);

    Schema schema;
    if (avroSchemaFile != null) {
        schema = Schemas.fromAvsc(open(avroSchemaFile));
    } else {
        schema = getAvroSchema(source);
    }
    Schema projection = filterSchema(schema, columns);

    Path outPath = qualifiedPath(outputPath);
    FileSystem outFS = outPath.getFileSystem(getConf());
    if (overwrite && outFS.exists(outPath)) {
        console.debug("Deleting output file {} (already exists)", outPath);
        outFS.delete(outPath);
    }

    Iterable<Record> reader = openDataFile(source, projection);
    boolean threw = true;
    long count = 0;
    try {
        try (ParquetWriter<Record> writer = AvroParquetWriter.<Record>builder(qualifiedPath(outputPath))
                .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
                .withConf(getConf())
                .withCompressionCodec(codec)
                .withRowGroupSize(rowGroupSize)
                .withDictionaryPageSize(dictionaryPageSize < 64 ? 64 : dictionaryPageSize)
                .withDictionaryEncoding(dictionaryPageSize != 0)
                .withPageSize(pageSize)
                .withDataModel(GenericData.get())
                .withSchema(projection)
                .build()) {
            for (Record record : reader) {
                writer.write(record);
                count += 1;
            }
        }
        threw = false;
    } catch (RuntimeException e) {
        throw new RuntimeException("Failed on record " + count, e);
    } finally {
        if (reader instanceof Closeable) {
            Closeables.close((Closeable) reader, threw);
        }
    }
    return 0;
}
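Stripped of the CLI plumbing, the writer construction in run() reduces to a standard AvroParquetWriter builder chain. A minimal, self-contained sketch with a placeholder schema and output path (both invented for illustration):

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

// Sketch only: writes one GenericRecord to a GZIP-compressed Parquet file.
public class AvroToParquetSketch {
    public static void main(String[] args) throws Exception {
        Schema schema = SchemaBuilder.record("Example").fields()
                .requiredLong("id")
                .requiredString("name")
                .endRecord();

        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(new Path("/tmp/example.parquet"))
                .withSchema(schema)
                .withDataModel(GenericData.get())
                .withConf(new Configuration())
                .withCompressionCodec(CompressionCodecName.GZIP)
                .withWriterVersion(WriterVersion.PARQUET_1_0)
                .build()) {
            GenericRecord record = new GenericData.Record(schema);
            record.put("id", 1L);
            record.put("name", "example");
            writer.write(record);
        }
    }
}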