Search in sources:

Example 1 with JobPath

use of alluxio.job.plan.transform.format.JobPath in project alluxio by Alluxio.

From the class ParquetWriter, method create:

/**
 * Creates a Parquet writer specifying a row group size.
 *
 * @param schema the schema
 * @param uri the URI to the output
 * @param rowGroupSize the row group size
 * @param enableDictionary whether to enable dictionary
 * @param compressionCodec the compression codec name
 * @return the writer
 * @throws IOException when the underlying Parquet writer cannot be created
 */
public static ParquetWriter create(TableSchema schema, AlluxioURI uri, int rowGroupSize, boolean enableDictionary, String compressionCodec) throws IOException {
    Configuration hadoopConf = ReadWriterUtils.writeThroughConf();
    ParquetSchema parquetSchema = schema.toParquet();
    // Translate the Alluxio URI into a Hadoop-compatible path for the output file.
    JobPath outputPath = new JobPath(uri.getScheme(), uri.getAuthority().toString(), uri.getPath());
    org.apache.parquet.hadoop.ParquetWriter<Record> writer =
        AvroParquetWriter.<Record>builder(HadoopOutputFile.fromPath(outputPath, hadoopConf))
            .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
            .withConf(hadoopConf)
            .withCompressionCodec(CompressionCodecName.fromConf(compressionCodec))
            .withRowGroupSize(rowGroupSize)
            // Dictionary and data pages share the library's default page size.
            .withDictionaryPageSize(org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE)
            .withDictionaryEncoding(enableDictionary)
            .withPageSize(org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE)
            .withDataModel(GenericData.get())
            .withSchema(parquetSchema.getSchema())
            .build();
    return new ParquetWriter(writer);
}
Also used : JobPath(alluxio.job.plan.transform.format.JobPath) Configuration(org.apache.hadoop.conf.Configuration) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter)

Example 2 with JobPath

use of alluxio.job.plan.transform.format.JobPath in project alluxio by Alluxio.

From the class ParquetReader, method create:

/**
 * Creates a parquet reader.
 *
 * @param uri the URI to the input
 * @return the reader
 * @throws IOException when failed to create the reader
 */
public static ParquetReader create(AlluxioURI uri) throws IOException {
    Configuration hadoopConf = ReadWriterUtils.readNoCacheConf();
    // Translate the Alluxio URI into a Hadoop-compatible path for the input file.
    Path parquetPath = new JobPath(uri.getScheme(), uri.getAuthority().toString(), uri.getPath());
    InputFile input = HadoopInputFile.fromPath(parquetPath, hadoopConf);
    org.apache.parquet.hadoop.ParquetReader<Record> delegate =
        AvroParquetReader.<Record>builder(input)
            .disableCompatibility()
            .withDataModel(GenericData.get())
            .withConf(hadoopConf)
            .build();
    // Read the footer once up front so the schema and metadata are available
    // without re-opening the file later.
    ParquetMetadata footer;
    Schema avroSchema;
    try (ParquetFileReader fileReader = new ParquetFileReader(input, ParquetReadOptions.builder().build())) {
        footer = fileReader.getFooter();
        avroSchema = new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
    }
    return new ParquetReader(delegate, avroSchema, footer);
}
Also used : JobPath(alluxio.job.plan.transform.format.JobPath) Path(org.apache.hadoop.fs.Path) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) JobPath(alluxio.job.plan.transform.format.JobPath) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Schema(org.apache.avro.Schema) TableSchema(alluxio.job.plan.transform.format.TableSchema) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) InputFile(org.apache.parquet.io.InputFile) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) Record(org.apache.avro.generic.GenericData.Record)

Aggregations

JobPath (alluxio.job.plan.transform.format.JobPath)2 Configuration (org.apache.hadoop.conf.Configuration)2 TableSchema (alluxio.job.plan.transform.format.TableSchema)1 Schema (org.apache.avro.Schema)1 Record (org.apache.avro.generic.GenericData.Record)1 Path (org.apache.hadoop.fs.Path)1 AvroParquetReader (org.apache.parquet.avro.AvroParquetReader)1 AvroParquetWriter (org.apache.parquet.avro.AvroParquetWriter)1 AvroSchemaConverter (org.apache.parquet.avro.AvroSchemaConverter)1 ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader)1 ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata)1 HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile)1 InputFile (org.apache.parquet.io.InputFile)1