
Example 6 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

Class AbstractTestOrcReader, method createTempFile.

private static TempFile createTempFile(int nRecords) throws IOException, SerDeException {
    TempFile file = new TempFile();
    RecordWriter writer = createOrcRecordWriter(file.getFile(), ORC_12, CompressionKind.NONE, BIGINT);
    @SuppressWarnings("deprecation") Serializer serde = new OrcSerde();
    // Build a single-column ("test" BIGINT) row holding the value 1L and serialize it once
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);
    objectInspector.setStructFieldData(row, field, 1L);
    Writable record = serde.serialize(row, objectInspector);
    // Write the same serialized record nRecords times, then close without aborting
    for (int i = 0; i < nRecords; i++) {
        writer.write(record);
    }
    writer.close(false);
    return file;
}
Also used: SettableStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector), OrcTester.createSettableStructObjectInspector (com.facebook.presto.orc.OrcTester.createSettableStructObjectInspector), RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter), OrcTester.createOrcRecordWriter (com.facebook.presto.orc.OrcTester.createOrcRecordWriter), StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), OrcSerde (org.apache.hadoop.hive.ql.io.orc.OrcSerde), Writable (org.apache.hadoop.io.Writable), Serializer (org.apache.hadoop.hive.serde2.Serializer)
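
The createOrcRecordWriter helper comes from Presto's OrcTester and is not shown on this page. As a rough sketch of what it plausibly wraps (the class name, table properties, and codec handling below are assumptions, not Presto's actual implementation), an uncompressed ORC RecordWriter can be opened through Hive's OrcOutputFormat directly:

import java.io.File;
import java.io.IOException;
import java.util.Properties;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

final class OrcRecordWriterSketch {
    // Hypothetical stand-in for OrcTester.createOrcRecordWriter: open an uncompressed
    // ORC RecordWriter on the target file using Hive's OrcOutputFormat directly.
    static RecordWriter openOrcRecordWriter(File target) throws IOException {
        JobConf conf = new JobConf();
        Properties tableProperties = new Properties();
        tableProperties.setProperty("columns", "test");
        tableProperties.setProperty("columns.types", "bigint");
        return new OrcOutputFormat().getHiveRecordWriter(
                conf,
                new Path(target.getAbsolutePath()),
                Text.class,
                false,               // isCompressed: mirrors CompressionKind.NONE above
                tableProperties,
                () -> { });          // no-op Progressable, as in the Presto examples
    }
}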

Example 7 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

Class AbstractTestHiveFileFormats, method createTestFile.

public static FileSplit createTestFile(String filePath, HiveStorageFormat storageFormat, HiveCompressionCodec compressionCodec, List<TestColumn> testColumns, int numRows) throws Exception {
    HiveOutputFormat<?, ?> outputFormat = newInstance(storageFormat.getOutputFormat(), HiveOutputFormat.class);
    Serializer serializer = newInstance(storageFormat.getSerDe(), Serializer.class);
    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    serializer.initialize(new Configuration(), tableProperties);
    JobConf jobConf = configureCompression(new JobConf(), compressionCodec);
    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(jobConf, new Path(filePath), Text.class, compressionCodec != HiveCompressionCodec.NONE, tableProperties, () -> {
    });
    try {
        serializer.initialize(new Configuration(), tableProperties);
        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(ImmutableList.copyOf(transform(testColumns, TestColumn::getName)), ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector)));
        Object row = objectInspector.create();
        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }
            Writable record = serializer.serialize(row, objectInspector);
            recordWriter.write(record);
        }
    } finally {
        recordWriter.close(false);
    }
    // todo to test with compression, the file must be renamed with the compression extension
    Path path = new Path(filePath);
    path.getFileSystem(new Configuration()).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Writable (org.apache.hadoop.io.Writable), Properties (java.util.Properties), FileSplit (org.apache.hadoop.mapred.FileSplit), SettableStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector), RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter), StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), Slice (io.airlift.slice.Slice), SerDeUtils.serializeObject (com.facebook.presto.hive.util.SerDeUtils.serializeObject), JobConf (org.apache.hadoop.mapred.JobConf), File (java.io.File), Serializer (org.apache.hadoop.hive.serde2.Serializer)
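
The newInstance helper is another Presto test utility that is not shown here. A minimal sketch of the reflective instantiation it presumably performs (the helper below is hypothetical and may differ from Presto's implementation):

import static java.lang.Thread.currentThread;

final class NewInstanceSketch {
    // Hypothetical helper: load a class by name (e.g. the storage format's
    // OutputFormat or SerDe class) and instantiate it with its no-arg constructor.
    static <T> T newInstance(String className, Class<T> superType) throws ReflectiveOperationException {
        Class<?> clazz = Class.forName(className, true, currentThread().getContextClassLoader());
        return superType.cast(clazz.getConstructor().newInstance());
    }
}

Under that assumption, storageFormat.getOutputFormat() and storageFormat.getSerDe() only need to supply fully qualified class names.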

Example 8 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

Class TestOrcBatchPageSourceMemoryTracking, method flushStripe.

private static void flushStripe(RecordWriter recordWriter) {
    try {
        // The ORC RecordWriter keeps its underlying Writer in a private "writer" field;
        // reach in via reflection since neither the class nor the field is public
        Field writerField = OrcOutputFormat.class.getClassLoader().loadClass(ORC_RECORD_WRITER).getDeclaredField("writer");
        writerField.setAccessible(true);
        Writer writer = (Writer) writerField.get(recordWriter);
        // Invoke the non-public flushStripe() to force the current stripe to be written out
        Method flushStripe = WriterImpl.class.getDeclaredMethod("flushStripe");
        flushStripe.setAccessible(true);
        flushStripe.invoke(writer);
    } catch (ReflectiveOperationException e) {
        throw new RuntimeException(e);
    }
}
Also used: StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), Field (java.lang.reflect.Field), Method (java.lang.reflect.Method), Writer (org.apache.hadoop.hive.ql.io.orc.Writer), RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter)
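
ORC_RECORD_WRITER is a string constant naming the non-public RecordWriter implementation nested inside OrcOutputFormat. A plausible definition, assuming the Hive class layout this reflection code targets (verify against the Hive version actually on the classpath):

// Hypothetical constant: the private RecordWriter implementation nested in OrcOutputFormat,
// whose "writer" field holds the org.apache.hadoop.hive.ql.io.orc.Writer flushed above.
private static final String ORC_RECORD_WRITER = "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat$OrcRecordWriter";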

Example 9 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project presto by prestodb.

Class TestOrcBatchPageSourceMemoryTracking, method createTestFile.

public static FileSplit createTestFile(String filePath, HiveOutputFormat<?, ?> outputFormat, Serializer serializer, String compressionCodec, List<TestColumn> testColumns, int numRows, int stripeRows) throws Exception {
    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    serializer.initialize(CONFIGURATION, tableProperties);
    JobConf jobConf = new JobConf();
    if (compressionCodec != null) {
        CompressionCodec codec = new CompressionCodecFactory(CONFIGURATION).getCodecByName(compressionCodec);
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
    }
    RecordWriter recordWriter = createRecordWriter(new Path(filePath), CONFIGURATION);
    try {
        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(ImmutableList.copyOf(transform(testColumns, TestColumn::getName)), ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector)));
        Object row = objectInspector.create();
        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }
            Writable record = serializer.serialize(row, objectInspector);
            recordWriter.write(record);
            if (rowNumber % stripeRows == stripeRows - 1) {
                flushStripe(recordWriter);
            }
        }
    } finally {
        recordWriter.close(false);
    }
    Path path = new Path(filePath);
    path.getFileSystem(CONFIGURATION).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
Also used: Path (org.apache.hadoop.fs.Path), Writable (org.apache.hadoop.io.Writable), Properties (java.util.Properties), FileSplit (org.apache.hadoop.mapred.FileSplit), SettableStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector), RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter), StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), Slice (io.airlift.slice.Slice), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), JobConf (org.apache.hadoop.mapred.JobConf), SequenceFile (org.apache.hadoop.io.SequenceFile), File (java.io.File), OrcFile (org.apache.hadoop.hive.ql.io.orc.OrcFile)
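
The COMPRESS_CODEC and COMPRESS_TYPE keys used above are presumably the MapReduce FileOutputFormat constants. Spelled out explicitly, the compression branch amounts to something like the sketch below (the codec class name is purely illustrative):

import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;
import static org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS_CODEC;
import static org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS_TYPE;

final class CompressionConfSketch {
    // Sketch: configure block compression with an explicitly named codec,
    // assuming COMPRESS_CODEC/COMPRESS_TYPE are the FileOutputFormat constants.
    static JobConf compressedJobConf(String codecClassName) {
        JobConf jobConf = new JobConf();
        jobConf.set(COMPRESS_CODEC, codecClassName);  // e.g. "org.apache.hadoop.io.compress.GzipCodec"
        jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
        return jobConf;
    }
}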

Example 10 with RecordWriter

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter in project hive by apache.

Class HiveHFileOutputFormat, method getHiveRecordWriter.

@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, final Progressable progressable) throws IOException {
    String hbaseTableName = jc.get(HBaseSerDe.HBASE_TABLE_NAME);
    if (hbaseTableName == null) {
        hbaseTableName = tableProperties.getProperty(hive_metastoreConstants.META_TABLE_NAME);
        hbaseTableName = hbaseTableName.toLowerCase();
        if (hbaseTableName.startsWith(HBaseStorageHandler.DEFAULT_PREFIX)) {
            hbaseTableName = hbaseTableName.substring(HBaseStorageHandler.DEFAULT_PREFIX.length());
        }
    }
    jc.set(OUTPUT_TABLE_NAME_CONF_KEY, hbaseTableName);
    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }
    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);
    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), progressable);
    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, Cell> fileWriter = getFileWriter(tac);
    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position.  However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }
    return new RecordWriter() {

        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the hfiles file(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = taskAttemptOutputdir;
                FileStatus[] files = null;
                for (; ; ) {
                    try {
                        files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    } catch (FileNotFoundException fnf) {
                        LOG.debug("File doesn't exist {} ", srcDir, fnf);
                        break;
                    }
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                    if (files[0].isFile()) {
                        throw new IOException("No family directories found in " + taskAttemptOutputdir + ". " + "The last component in hfile path should match column family name " + columnFamilyName);
                    }
                }
                if (files != null) {
                    for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                        fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
                    }
                }
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            } catch (FileNotFoundException fnf) {
                // Ignore....
                LOG.debug("File doesn't exist.", fnf);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields.
            String s = text.toString();
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));
            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparatorImpl());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}
Also used: InterruptedIOException (java.io.InterruptedIOException), FileStatus (org.apache.hadoop.fs.FileStatus), KeyValue (org.apache.hadoop.hbase.KeyValue), FileNotFoundException (java.io.FileNotFoundException), Writable (org.apache.hadoop.io.Writable), ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter), FileSystem (org.apache.hadoop.fs.FileSystem), List (java.util.List), Job (org.apache.hadoop.mapreduce.Job), Cell (org.apache.hadoop.hbase.Cell), Path (org.apache.hadoop.fs.Path), FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter), CellComparatorImpl (org.apache.hadoop.hbase.CellComparatorImpl), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), TreeMap (java.util.TreeMap), Map (java.util.Map), SortedMap (java.util.SortedMap)
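
To drive this output format, the caller has to supply the target family path either on the JobConf or in the table properties. A hedged usage sketch of minimal table properties, assuming "hfile.family.path" is the key behind HFILE_FAMILY_PATH (as in Hive's HBase bulk-load documentation) and using placeholder paths and columns:

import java.util.Properties;

final class HFilePropertiesSketch {
    // Sketch: minimal table properties for HiveHFileOutputFormat.getHiveRecordWriter.
    // The last path component ("cf") doubles as the HBase column family name,
    // and the first declared column is interpreted as the row key.
    static Properties hfileTableProperties() {
        Properties tableProperties = new Properties();
        tableProperties.setProperty("hfile.family.path", "/tmp/hbase_hfiles/cf");  // assumed key for HFILE_FAMILY_PATH
        tableProperties.setProperty("columns", "rowkey,c1,c2");
        tableProperties.setProperty("columns.types", "string,string,string");
        return tableProperties;
    }
}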

Aggregations

RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter): 24 usages
Writable (org.apache.hadoop.io.Writable): 16 usages
Path (org.apache.hadoop.fs.Path): 12 usages
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 10 usages
BytesWritable (org.apache.hadoop.io.BytesWritable): 8 usages
JobConf (org.apache.hadoop.mapred.JobConf): 8 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 7 usages
SettableStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector): 7 usages
Text (org.apache.hadoop.io.Text): 6 usages
Properties (java.util.Properties): 5 usages
Serializer (org.apache.hadoop.hive.serde2.Serializer): 5 usages
SequenceFile (org.apache.hadoop.io.SequenceFile): 4 usages
Slice (io.airlift.slice.Slice): 3 usages
OutputStream (java.io.OutputStream): 3 usages
LongWritable (org.apache.hadoop.io.LongWritable): 3 usages
ExtendedRecordWriter (com.facebook.presto.hive.RecordFileWriter.ExtendedRecordWriter): 2 usages
DataSize (io.airlift.units.DataSize): 2 usages
File (java.io.File): 2 usages
IOException (java.io.IOException): 2 usages
Field (java.lang.reflect.Field): 2 usages