
Example 1 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project hive by apache.

From the class Utilities, method createEmptyBuckets:

/**
   * Check the existence of buckets according to bucket specification. Create empty buckets if
   * needed.
   *
   * @param hconf the Hadoop configuration, used to construct the JobConf for the record writers
   * @param paths the bucket file paths for which empty files should be created
   * @param conf the FileSinkDesc describing the file sink
   * @param reporter the MapReduce reporter object
   * @throws HiveException
   * @throws IOException
   */
private static void createEmptyBuckets(Configuration hconf, List<Path> paths, FileSinkDesc conf, Reporter reporter) throws HiveException, IOException {
    JobConf jc;
    if (hconf instanceof JobConf) {
        jc = new JobConf(hconf);
    } else {
        // test code path
        jc = new JobConf(hconf);
    }
    HiveOutputFormat<?, ?> hiveOutputFormat = null;
    Class<? extends Writable> outputClass = null;
    boolean isCompressed = conf.getCompressed();
    TableDesc tableInfo = conf.getTableInfo();
    try {
        Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
        serializer.initialize(null, tableInfo.getProperties());
        outputClass = serializer.getSerializedClass();
        hiveOutputFormat = HiveFileFormatUtils.getHiveOutputFormat(hconf, conf.getTableInfo());
    } catch (SerDeException e) {
        throw new HiveException(e);
    } catch (InstantiationException e) {
        throw new HiveException(e);
    } catch (IllegalAccessException e) {
        throw new HiveException(e);
    }
    for (Path path : paths) {
        RecordWriter writer = HiveFileFormatUtils.getRecordWriter(jc, hiveOutputFormat, outputClass, isCompressed, tableInfo.getProperties(), path, reporter);
        writer.close(false);
        LOG.info("created empty bucket for enforcing bucketing at " + path);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) Serializer(org.apache.hadoop.hive.serde2.Serializer)
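
If you adapt this initialization in your own code, the three identical catch blocks can be collapsed with Java 7+ multi-catch. A minimal sketch of the same block, behavior unchanged:

try {
    Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
    serializer.initialize(null, tableInfo.getProperties());
    outputClass = serializer.getSerializedClass();
    hiveOutputFormat = HiveFileFormatUtils.getHiveOutputFormat(hconf, tableInfo);
} catch (SerDeException | InstantiationException | IllegalAccessException e) {
    // all three exception types are wrapped identically, so one handler suffices
    throw new HiveException(e);
}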

Example 2 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

From the class TestOrcPageSourceMemoryTracking, method flushStripe:

private static void flushStripe(RecordWriter recordWriter) {
    try {
        Field writerField = OrcOutputFormat.class.getClassLoader().loadClass(ORC_RECORD_WRITER).getDeclaredField("writer");
        writerField.setAccessible(true);
        Writer writer = (Writer) writerField.get(recordWriter);
        Method flushStripe = WriterImpl.class.getDeclaredMethod("flushStripe");
        flushStripe.setAccessible(true);
        flushStripe.invoke(writer);
    } catch (ReflectiveOperationException e) {
        throw Throwables.propagate(e);
    }
}
Also used: StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) Field(java.lang.reflect.Field) Method(java.lang.reflect.Method) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter)
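
The surrounding test harness is not part of this excerpt, but a plausible driver is sketched below; createOrcRecordWriter and writeTestRows are hypothetical helper names, not APIs shown in the test. Forcing a stripe boundary mid-stream lets the test observe a multi-stripe file without writing large volumes of data:

RecordWriter recordWriter = createOrcRecordWriter(targetFile, jobConf, tableProperties);
writeTestRows(recordWriter, 10_000);   // fill the current stripe with some rows
flushStripe(recordWriter);             // force a stripe boundary via the reflective helper above
writeTestRows(recordWriter, 10_000);   // subsequent rows land in a new stripe
recordWriter.close(false);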

Example 3 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

From the class HiveZeroRowFileCreator, method generateZeroRowFile:

private byte[] generateZeroRowFile(ConnectorSession session, HdfsContext hdfsContext, Properties properties, String serDe, String outputFormatName, HiveCompressionCodec compressionCodec) {
    String tmpDirectoryPath = System.getProperty("java.io.tmpdir");
    String tmpFileName = format("presto-hive-zero-row-file-creator-%s-%s", session.getQueryId(), randomUUID().toString());
    java.nio.file.Path tmpFilePath = Paths.get(tmpDirectoryPath, tmpFileName);
    try {
        Path target = new Path(format("file://%s/%s", tmpDirectoryPath, tmpFileName));
        // https://github.com/prestodb/presto/issues/14401 JSON Format reader does not fetch compression from source system
        JobConf conf = configureCompression(hdfsEnvironment.getConfiguration(hdfsContext, target), outputFormatName.equals(HiveStorageFormat.JSON.getOutputFormat()) ? compressionCodec : NONE);
        if (outputFormatName.equals(HiveStorageFormat.PAGEFILE.getOutputFormat())) {
            createEmptyPageFile(dataSinkFactory, session, target.getFileSystem(conf), target);
            return readAllBytes(tmpFilePath);
        }
        // Some serializers such as Avro set a property in the schema.
        initializeSerializer(conf, properties, serDe);
        // The code below is not a try with resources because RecordWriter is not Closeable.
        RecordWriter recordWriter = HiveWriteUtils.createRecordWriter(target, conf, properties, outputFormatName, session);
        recordWriter.close(false);
        return readAllBytes(tmpFilePath);
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    } finally {
        try {
            deleteIfExists(tmpFilePath);
        } catch (IOException e) {
            log.error(e, "Error deleting temporary file: %s", tmpFilePath);
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) JobConf(org.apache.hadoop.mapred.JobConf)
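
Distilled, the zero-row-file idiom is: point a RecordWriter at a local file:// target, close it without writing a single row, and read the finished bytes back. A minimal sketch, assuming conf, properties, outputFormatName, and session are already in scope; the temp-file naming is illustrative, and HiveWriteUtils.createRecordWriter is the same Presto helper used above:

String tmpDir = System.getProperty("java.io.tmpdir");
java.nio.file.Path local = java.nio.file.Paths.get(tmpDir, "zero-row-" + java.util.UUID.randomUUID());
try {
    // write locally so the finished bytes can be replayed to any number of remote targets
    Path target = new Path("file://" + local);
    RecordWriter writer = HiveWriteUtils.createRecordWriter(target, conf, properties, outputFormatName, session);
    writer.close(false); // no rows written: the file holds only format metadata (header/footer)
    return java.nio.file.Files.readAllBytes(local);
} finally {
    java.nio.file.Files.deleteIfExists(local);
}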

Example 4 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

From the class ParquetRecordWriterUtil, method createParquetWriter:

public static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties, boolean compress, ConnectorSession session) throws IOException, ReflectiveOperationException {
    conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes());
    conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes());
    RecordWriter recordWriter = new MapredParquetOutputFormat().getHiveRecordWriter(conf, target, Text.class, compress, properties, Reporter.NULL);
    Object realWriter = REAL_WRITER_FIELD.get(recordWriter);
    Object internalWriter = INTERNAL_WRITER_FIELD.get(realWriter);
    ParquetFileWriter fileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internalWriter);
    return new ExtendedRecordWriter() {

        private long length;

        @Override
        public long getWrittenBytes() {
            return length;
        }

        @Override
        public void write(Writable value) throws IOException {
            recordWriter.write(value);
            length = fileWriter.getPos();
        }

        @Override
        public void close(boolean abort) throws IOException {
            recordWriter.close(abort);
            if (!abort) {
                length = target.getFileSystem(conf).getFileStatus(target).getLen();
            }
        }
    };
}
Also used: RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) ParquetRecordWriter(org.apache.parquet.hadoop.ParquetRecordWriter) ExtendedRecordWriter(com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) ParquetFileWriter(org.apache.parquet.hadoop.ParquetFileWriter) Writable(org.apache.hadoop.io.Writable)
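
The static REAL_WRITER_FIELD, INTERNAL_WRITER_FIELD, and FILE_WRITER_FIELD handles are declared outside this excerpt. The sketch below shows how they would typically be initialized; the field names "realWriter", "internalWriter", and "parquetFileWriter" are assumptions about private Hive and Parquet writer internals and can change between library versions:

private static final Field REAL_WRITER_FIELD;
private static final Field INTERNAL_WRITER_FIELD;
private static final Field FILE_WRITER_FIELD;

static {
    try {
        // assumed field names: these are private writer internals, not a supported API
        REAL_WRITER_FIELD = ParquetRecordWriterWrapper.class.getDeclaredField("realWriter");
        REAL_WRITER_FIELD.setAccessible(true);
        INTERNAL_WRITER_FIELD = ParquetRecordWriter.class.getDeclaredField("internalWriter");
        INTERNAL_WRITER_FIELD.setAccessible(true);
        // InternalParquetRecordWriter is package-private, so resolve it via the field's type
        FILE_WRITER_FIELD = INTERNAL_WRITER_FIELD.getType().getDeclaredField("parquetFileWriter");
        FILE_WRITER_FIELD.setAccessible(true);
    } catch (ReflectiveOperationException e) {
        throw new ExceptionInInitializerError(e);
    }
}

Assumed imports: Field (java.lang.reflect.Field), ParquetRecordWriterWrapper (org.apache.hadoop.hive.ql.io.parquet.write.ParquetRecordWriterWrapper), ParquetRecordWriter (org.apache.parquet.hadoop.ParquetRecordWriter).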

Example 5 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

From the class ParquetTester, method writeParquetColumn:

private static DataSize writeParquetColumn(JobConf jobConf, File outputFile, CompressionCodecName compressionCodecName, Properties tableProperties, SettableStructObjectInspector objectInspector, Iterator<?>[] valuesByField, Optional<MessageType> parquetSchema, boolean singleLevelArray) throws Exception {
    RecordWriter recordWriter = new TestMapredParquetOutputFormat(parquetSchema, singleLevelArray).getHiveRecordWriter(jobConf, new Path(outputFile.toURI()), Text.class, compressionCodecName != UNCOMPRESSED, tableProperties, () -> {
    });
    Object row = objectInspector.create();
    List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
    while (stream(valuesByField).allMatch(Iterator::hasNext)) {
        for (int field = 0; field < fields.size(); field++) {
            Object value = valuesByField[field].next();
            objectInspector.setStructFieldData(row, fields.get(field), value);
        }
        ParquetHiveSerDe serde = new ParquetHiveSerDe();
        serde.initialize(jobConf, tableProperties, null);
        Writable record = serde.serialize(row, objectInspector);
        recordWriter.write(record);
    }
    recordWriter.close(false);
    return succinctBytes(outputFile.length());
}
Also used: Path(org.apache.hadoop.fs.Path) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) TestMapredParquetOutputFormat(com.facebook.presto.hive.parquet.write.TestMapredParquetOutputFormat) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) ParquetHiveSerDe(org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe) AbstractIterator(com.google.common.collect.AbstractIterator) Iterator(java.util.Iterator) Writable(org.apache.hadoop.io.Writable)
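
A hypothetical invocation from within the same test class, writing one BIGINT column of 100 rows; the "columns"/"columns.types" properties follow the standard Hive serde convention, and the concrete names and values here are illustrative:

Properties tableProperties = new Properties();
tableProperties.setProperty("columns", "x");
tableProperties.setProperty("columns.types", "bigint");

SettableStructObjectInspector inspector = ObjectInspectorFactory.getStandardStructObjectInspector(
        ImmutableList.of("x"),
        ImmutableList.<ObjectInspector>of(PrimitiveObjectInspectorFactory.javaLongObjectInspector));
Iterator<?>[] values = new Iterator<?>[] { LongStream.range(0, 100).boxed().iterator() };

DataSize written = writeParquetColumn(
        new JobConf(), File.createTempFile("parquet-test", ".parquet"), CompressionCodecName.GZIP,
        tableProperties, inspector, values, Optional.empty(), false);

Assumed imports: ObjectInspector and ObjectInspectorFactory (org.apache.hadoop.hive.serde2.objectinspector), PrimitiveObjectInspectorFactory (org.apache.hadoop.hive.serde2.objectinspector.primitive), CompressionCodecName (org.apache.parquet.hadoop.metadata), LongStream (java.util.stream).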

Aggregations

RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter): 24
Writable (org.apache.hadoop.io.Writable): 16
Path (org.apache.hadoop.fs.Path): 12
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 10
BytesWritable (org.apache.hadoop.io.BytesWritable): 8
JobConf (org.apache.hadoop.mapred.JobConf): 8
FileSystem (org.apache.hadoop.fs.FileSystem): 7
SettableStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector): 7
Text (org.apache.hadoop.io.Text): 6
Properties (java.util.Properties): 5
Serializer (org.apache.hadoop.hive.serde2.Serializer): 5
SequenceFile (org.apache.hadoop.io.SequenceFile): 4
Slice (io.airlift.slice.Slice): 3
OutputStream (java.io.OutputStream): 3
LongWritable (org.apache.hadoop.io.LongWritable): 3
ExtendedRecordWriter (com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter): 2
DataSize (io.airlift.units.DataSize): 2
File (java.io.File): 2
IOException (java.io.IOException): 2
Field (java.lang.reflect.Field): 2
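
Across all of these usages the RecordWriter lifecycle has the same shape; a distilled sketch, with jobConf, path, properties, and record assumed to be in scope, and any HiveOutputFormat standing in for the Parquet one shown:

HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
RecordWriter writer = outputFormat.getHiveRecordWriter(
        jobConf, path, Text.class, /* isCompressed */ false, properties, Reporter.NULL);
writer.write(record);  // zero or more Writables produced by the table's serializer
writer.close(false);   // false commits the file; true aborts and discards it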