Search in sources :

Example 16 with RecordWriter

use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

In the class ParquetTester, the method writeParquetColumn:

/**
 * Writes the given values as a single-column Parquet file using the Hive
 * MapredParquetOutputFormat, then returns the size of the resulting file.
 *
 * @param jobConf Hadoop job configuration passed to the writer and serde
 * @param outputFile destination file (its final length is the return value)
 * @param compressionCodecName codec; UNCOMPRESSED disables compression
 * @param columnObjectInspector inspector describing the single column "test"
 * @param values row values to write, one per iteration
 * @return the written file's length as a DataSize
 * @throws Exception if writing or serialization fails
 */
private static DataSize writeParquetColumn(JobConf jobConf, File outputFile, CompressionCodecName compressionCodecName, ObjectInspector columnObjectInspector, Iterator<?> values) throws Exception {
    RecordWriter recordWriter = new MapredParquetOutputFormat().getHiveRecordWriter(jobConf, new Path(outputFile.toURI()), Text.class, compressionCodecName != UNCOMPRESSED, createTableProperties("test", columnObjectInspector.getTypeName()), () -> {
    });
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", columnObjectInspector);
    Object row = objectInspector.create();
    List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
    // The serde is row-independent: create and initialize it once, instead of
    // once per record inside the loop as the original did (loop-invariant work).
    ParquetHiveSerDe serde = new ParquetHiveSerDe();
    serde.initialize(jobConf, createTableProperties("test", columnObjectInspector.getTypeName()), null);
    while (values.hasNext()) {
        Object value = values.next();
        objectInspector.setStructFieldData(row, fields.get(0), value);
        Writable record = serde.serialize(row, objectInspector);
        recordWriter.write(record);
    }
    recordWriter.close(false);
    return succinctBytes(outputFile.length());
}
Also used : Path(org.apache.hadoop.fs.Path) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) ParquetHiveSerDe(org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe) Writable(org.apache.hadoop.io.Writable)

Example 17 with RecordWriter

use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

In the class ParquetTestUtils, the method writeParquetColumnHive:

/**
 * Writes the supplied values into {@code file} as a single-column Parquet file
 * via the Hive record-writer path (snappy compression, 256MB blocks, 100KB pages).
 *
 * @param file destination Parquet file
 * @param columnName name of the single column
 * @param nullable whether the column accepts nulls
 * @param type Presto type of the column
 * @param values column values, consumed in order; null entries are written as null
 * @throws Exception on any write or serialization failure
 */
static void writeParquetColumnHive(File file, String columnName, boolean nullable, Type type, Iterator<?> values) throws Exception {
    JobConf config = new JobConf();
    // Set this config to get around the issue of LocalFileSystem not getting registered when running the benchmarks using
    // the standalone jar with all dependencies
    config.set("fs.file.impl", LocalFileSystem.class.getCanonicalName());
    config.setLong(ParquetOutputFormat.BLOCK_SIZE, new DataSize(256, MEGABYTE).toBytes());
    config.setLong(ParquetOutputFormat.PAGE_SIZE, new DataSize(100, KILOBYTE).toBytes());
    config.set(ParquetOutputFormat.COMPRESSION, "snappy");

    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", columnName);
    tableProperties.setProperty("columns.types", getHiveType(type));

    RecordWriter writer = createParquetWriter(nullable, new Path(file.getAbsolutePath()), config, tableProperties, true);
    SettableStructObjectInspector rowInspector = getStandardStructObjectInspector(ImmutableList.of(columnName), getRowObjectInspectors(type));
    Object row = rowInspector.create();
    StructField field = rowInspector.getStructFieldRef(columnName);
    Setter fieldSetter = getSetter(type, rowInspector, row, field);
    Serializer rowSerializer = initializeSerializer(config, tableProperties);

    while (values.hasNext()) {
        Object next = values.next();
        // Nulls are set directly on the struct; non-null values go through the typed setter.
        if (next == null) {
            rowInspector.setStructFieldData(row, field, null);
        } else {
            fieldSetter.set(next);
        }
        writer.write(rowSerializer.serialize(row, rowInspector));
    }
    writer.close(false);
}
Also used : Path(org.apache.hadoop.fs.Path) PrimitiveObjectInspectorFactory.javaByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector) PrimitiveObjectInspectorFactory.javaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector) PrimitiveObjectInspectorFactory.javaTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector) PrimitiveObjectInspectorFactory.javaDateObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector) PrimitiveObjectInspectorFactory.writableTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableTimestampObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector) PrimitiveObjectInspectorFactory.javaFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector) PrimitiveObjectInspectorFactory.javaDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector) PrimitiveObjectInspectorFactory.javaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector) PrimitiveObjectInspectorFactory.writableFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableFloatObjectInspector) 
PrimitiveObjectInspectorFactory.javaShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector) PrimitiveObjectInspectorFactory.writableBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableBooleanObjectInspector) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) PrimitiveObjectInspectorFactory.writableDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableDoubleObjectInspector) PrimitiveObjectInspectorFactory.writableLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableLongObjectInspector) PrimitiveObjectInspectorFactory.writableByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableByteObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector) PrimitiveObjectInspectorFactory.writableHiveCharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableHiveCharObjectInspector) PrimitiveObjectInspectorFactory.writableShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableShortObjectInspector) PrimitiveObjectInspectorFactory.javaBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector) 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) PrimitiveObjectInspectorFactory.writableStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableStringObjectInspector) PrimitiveObjectInspectorFactory.writableIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableIntObjectInspector) Properties(java.util.Properties) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) DataSize(io.airlift.units.DataSize) JobConf(org.apache.hadoop.mapred.JobConf) Serializer(org.apache.hadoop.hive.serde2.Serializer)

Example 18 with RecordWriter

use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

In the class HiveWriteUtils, the method createRcFileWriter:

/**
 * Builds a RecordWriter that appends rows to an RCFile at {@code target}.
 * The returned writer also tracks written bytes: while open it reports the
 * RCFile writer's current length, and after a non-aborted close it reports
 * the final file size from the filesystem.
 *
 * @param target destination path of the RCFile
 * @param conf Hadoop configuration used for filesystem access and codec lookup
 * @param properties table properties; META_TABLE_COLUMNS determines column count
 * @param compress whether to compress with the configured output codec
 * @throws IOException if the underlying RCFile writer cannot be created
 */
private static RecordWriter createRcFileWriter(Path target, JobConf conf, Properties properties, boolean compress) throws IOException {
    int columnCount = properties.getProperty(META_TABLE_COLUMNS).split(",").length;
    RCFileOutputFormat.setColumnNumber(conf, columnCount);

    // Only instantiate a codec when compression is requested; null means "no compression".
    CompressionCodec compressionCodec = compress
            ? ReflectionUtil.newInstance(getOutputCompressorClass(conf, DefaultCodec.class), conf)
            : null;

    RCFile.Writer rcFileWriter = new RCFile.Writer(target.getFileSystem(conf), conf, target, () -> {
    }, compressionCodec);

    return new ExtendedRecordWriter() {

        private long writtenBytes;

        @Override
        public long getWrittenBytes() {
            return writtenBytes;
        }

        @Override
        public void write(Writable value) throws IOException {
            rcFileWriter.append(value);
            writtenBytes = rcFileWriter.getLength();
        }

        @Override
        public void close(boolean abort) throws IOException {
            rcFileWriter.close();
            // On a successful close, re-read the authoritative size from the filesystem.
            if (!abort) {
                writtenBytes = target.getFileSystem(conf).getFileStatus(target).getLen();
            }
        }
    };
}
Also used : RCFile(org.apache.hadoop.hive.ql.io.RCFile) ExtendedRecordWriter(com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter) DateWritable(org.apache.hadoop.hive.serde2.io.DateWritable) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.hive.serde2.io.DoubleWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) LongWritable(org.apache.hadoop.io.LongWritable) ShortWritable(org.apache.hadoop.hive.serde2.io.ShortWritable) ByteWritable(org.apache.hadoop.io.ByteWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) ExtendedRecordWriter(com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) ParquetRecordWriterUtil.createParquetWriter(com.facebook.presto.hive.ParquetRecordWriterUtil.createParquetWriter)

Example 19 with RecordWriter

use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

In the class RcFileTester, the method writeRcFileColumnOld:

/**
 * Writes the given values into an RCFile as a single column named "test"
 * using the legacy (Hive) writer path, then returns the file's size.
 *
 * @param outputFile destination RCFile
 * @param format serde format used to serialize each row
 * @param compression compression to apply to the RCFile
 * @param type Presto type of the column; values are preprocessed per type
 * @param values column values to write
 * @return the written file's length, in most-succinct units
 * @throws Exception on any write or serialization failure
 */
private static DataSize writeRcFileColumnOld(File outputFile, Format format, Compression compression, Type type, Iterator<?> values) throws Exception {
    ObjectInspector columnInspector = getJavaObjectInspector(type);
    RecordWriter writer = createRcFileWriterOld(outputFile, compression, columnInspector);

    SettableStructObjectInspector rowInspector = createSettableStructObjectInspector("test", columnInspector);
    Object row = rowInspector.create();
    List<StructField> structFields = ImmutableList.copyOf(rowInspector.getAllStructFieldRefs());

    Serializer rowSerializer = format.createSerializer();
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", "test");
    tableProperties.setProperty("columns.types", rowInspector.getTypeName());
    rowSerializer.initialize(new JobConf(false), tableProperties);

    while (values.hasNext()) {
        // Legacy writer path requires type-specific preprocessing of each value.
        Object preprocessed = preprocessWriteValueOld(type, values.next());
        rowInspector.setStructFieldData(row, structFields.get(0), preprocessed);
        writer.write(rowSerializer.serialize(row, rowInspector));
    }
    writer.close(false);

    return new DataSize(outputFile.length(), BYTE).convertToMostSuccinctDataSize();
}
Also used : SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) PrimitiveObjectInspectorFactory.javaByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector) PrimitiveObjectInspectorFactory.javaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector) PrimitiveObjectInspectorFactory.javaTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector) PrimitiveObjectInspectorFactory.javaDateObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector) PrimitiveObjectInspectorFactory.javaFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector) PrimitiveObjectInspectorFactory.javaDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector) PrimitiveObjectInspectorFactory.javaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector) PrimitiveObjectInspectorFactory.javaShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) 
SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) PrimitiveObjectInspectorFactory.javaBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) DataSize(io.airlift.units.DataSize) DateWritable(org.apache.hadoop.hive.serde2.io.DateWritable) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) LongWritable(org.apache.hadoop.io.LongWritable) ShortWritable(org.apache.hadoop.hive.serde2.io.ShortWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) ByteWritable(org.apache.hadoop.io.ByteWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) StructObject(org.apache.hadoop.hive.serde2.StructObject) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) Serializer(org.apache.hadoop.hive.serde2.Serializer)

Example 20 with RecordWriter

use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project hive by apache.

In the class Utilities, the method createEmptyBuckets:

/**
 * Check the existence of buckets according to bucket specification. Create empty buckets if
 * needed.
 *
 * @param hconf The definition of the FileSink.
 * @param paths A list of empty buckets to create
 * @param isCompressed whether the created bucket files should be compressed
 * @param tableInfo table descriptor supplying the serde and output format
 * @param reporter The mapreduce reporter object
 * @throws HiveException
 * @throws IOException
 */
static void createEmptyBuckets(Configuration hconf, List<Path> paths, boolean isCompressed, TableDesc tableInfo, Reporter reporter) throws HiveException, IOException {
    // Both branches of the original instanceof check constructed an identical
    // JobConf(hconf); the conditional was dead code and has been collapsed.
    JobConf jc = new JobConf(hconf);
    HiveOutputFormat<?, ?> hiveOutputFormat = null;
    Class<? extends Writable> outputClass = null;
    try {
        AbstractSerDe serde = tableInfo.getSerDeClass().newInstance();
        serde.initialize(hconf, tableInfo.getProperties(), null);
        outputClass = serde.getSerializedClass();
        hiveOutputFormat = HiveFileFormatUtils.getHiveOutputFormat(hconf, tableInfo);
    } catch (SerDeException | InstantiationException | IllegalAccessException e) {
        // All three failure modes are wrapped identically; multi-catch replaces
        // the three duplicated catch blocks.
        throw new HiveException(e);
    }
    for (Path path : paths) {
        Utilities.FILE_OP_LOGGER.trace("creating empty bucket for {}", path);
        // Open and immediately close a writer so an empty, correctly-formatted
        // bucket file exists at each path.
        RecordWriter writer = hiveOutputFormat.getHiveRecordWriter(jc, path, outputClass, isCompressed, tableInfo.getProperties(), reporter);
        writer.close(false);
        LOG.info("created empty bucket for enforcing bucketing at {}", path);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) JobConf(org.apache.hadoop.mapred.JobConf) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)

Aggregations

RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter)24 Writable (org.apache.hadoop.io.Writable)16 Path (org.apache.hadoop.fs.Path)12 StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField)10 BytesWritable (org.apache.hadoop.io.BytesWritable)8 JobConf (org.apache.hadoop.mapred.JobConf)8 FileSystem (org.apache.hadoop.fs.FileSystem)7 SettableStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector)7 Text (org.apache.hadoop.io.Text)6 Properties (java.util.Properties)5 Serializer (org.apache.hadoop.hive.serde2.Serializer)5 SequenceFile (org.apache.hadoop.io.SequenceFile)4 Slice (io.airlift.slice.Slice)3 OutputStream (java.io.OutputStream)3 LongWritable (org.apache.hadoop.io.LongWritable)3 ExtendedRecordWriter (com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter)2 DataSize (io.airlift.units.DataSize)2 File (java.io.File)2 IOException (java.io.IOException)2 Field (java.lang.reflect.Field)2