Example 1 with ArrowWrapperWritable

Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache.

The class LlapArrowRecordWriter, method write:

@Override
public void write(K key, V value) throws IOException {
    ArrowWrapperWritable arrowWrapperWritable = (ArrowWrapperWritable) value;
    // First batch: capture the schema root and allocator from the writable and create the stream writer lazily.
    if (arrowStreamWriter == null) {
        vectorSchemaRoot = arrowWrapperWritable.getVectorSchemaRoot();
        arrowStreamWriter = new ArrowStreamWriter(vectorSchemaRoot, null, out);
        allocator = arrowWrapperWritable.getAllocator();
        this.out.setAllocator(allocator);
        rootVector = arrowWrapperWritable.getRootVector();
    } else {
        // We need to set the row count for the current vector
        // since root is reused by the stream writer.
        vectorSchemaRoot.setRowCount(rootVector.getValueCount());
    }
    arrowStreamWriter.writeBatch();
}
Also used: ArrowWrapperWritable(org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable), ArrowStreamWriter(org.apache.arrow.vector.ipc.ArrowStreamWriter)
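
For context, a minimal sketch of what the writer above consumes: an ArrowWrapperWritable wrapping a populated Arrow VectorSchemaRoot. The class name, schema, and column here are illustrative assumptions; only the one-argument ArrowWrapperWritable constructor (also used in Example 3) and standard Arrow vector APIs are used, and the record writer that would receive the writable is assumed to be obtained elsewhere (e.g. via LlapOutputFormatService as in Examples 4 and 5).

import java.util.Collections;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable;

public class ArrowWrapperWritableSketch {

    // Builds a single-column batch; the "id" column is a hypothetical example, not a Hive-defined schema.
    public static ArrowWrapperWritable buildSingleColumnBatch(BufferAllocator allocator, int rowCount) {
        Schema schema = new Schema(Collections.singletonList(
                Field.nullable("id", new ArrowType.Int(32, true))));
        VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator);
        IntVector idVector = (IntVector) root.getVector("id");
        idVector.allocateNew(rowCount);
        for (int i = 0; i < rowCount; i++) {
            idVector.setSafe(i, i);
        }
        idVector.setValueCount(rowCount);
        root.setRowCount(rowCount);
        // Same one-argument constructor as in Example 3.
        return new ArrowWrapperWritable(root);
    }

    public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE)) {
            ArrowWrapperWritable writable = buildSingleColumnBatch(allocator, 3);
            // A real producer would now hand the writable to a record writer, e.g.:
            // recordWriter.write(null, writable);
            System.out.println("rows in batch: " + writable.getVectorSchemaRoot().getRowCount());
            // Release the Arrow buffers before the allocator closes.
            writable.getVectorSchemaRoot().close();
        }
    }
}

Note that the writer in Example 1 caches the VectorSchemaRoot, allocator, and root vector from the first writable it sees and reuses the stream writer afterwards, so a producer is expected to refill the same root for subsequent batches rather than build a new one each time.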

Example 2 with ArrowWrapperWritable

Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache.

The class FileSinkOperator, method closeOp:

@Override
public void closeOp(boolean abort) throws HiveException {
    row_count.set(numRows);
    LOG.info(toString() + ": records written - " + numRows);
    if ("spark".equalsIgnoreCase(HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE))) {
        SparkMetricUtils.updateSparkRecordsWrittenMetrics(runTimeNumRows);
    }
    if (!bDynParts && !filesCreated) {
        boolean isTez = "tez".equalsIgnoreCase(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE));
        Class<?> clazz = conf.getTableInfo().getOutputFileFormatClass();
        boolean isStreaming = StreamingOutputFormat.class.isAssignableFrom(clazz);
        // allow empty file generation for MM/ACID tables as a quick and dirty workaround for HIVE-22941
        if (!isTez || isStreaming || (this.isInsertOverwrite && (conf.isMmTable() || conf.isFullAcidTable()))) {
            createBucketFiles(fsp);
        }
    }
    lastProgressReport = System.currentTimeMillis();
    if (!abort) {
        // A batching SerDe (e.g. ThriftJDBCBinarySerDe) buffers rows and serializes a whole batch at once,
        // so flush whatever is still buffered here
        // (the size of the buffer is kept track of in the ThriftJDBCBinarySerDe).
        if (conf.isUsingBatchingSerDe()) {
            try {
                recordValue = serializer.serialize(null, inputObjInspectors[0]);
                if (null != fpaths) {
                    rowOutWriters = fpaths.outWriters;
                    rowOutWriters[0].write(recordValue);
                } else if (recordValue instanceof ArrowWrapperWritable) {
                    // LLAP Arrow output goes through this batching-SerDe path even for 0-row outputs,
                    // i.e. we need to write a 0 size batch to signal EOS to the consumer
                    for (FSPaths fsPaths : valToPaths.values()) {
                        for (RecordWriter writer : fsPaths.outWriters) {
                            writer.write(recordValue);
                        }
                    }
                }
            } catch (SerDeException | IOException e) {
                throw new HiveException(e);
            }
        }
        List<Path> commitPaths = new ArrayList<>();
        for (FSPaths fsp : valToPaths.values()) {
            List<Path> deleteDeltas = new ArrayList<Path>();
            fsp.closeWriters(abort, deleteDeltas);
            // The record writers/updaters keep accumulated statistics, which will be aggregated in case of spray writers.
            if (conf.isGatherStats() && isCollectRWStats) {
                if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable() || conf.isCompactionTable()) {
                    for (int idx = 0; idx < fsp.outWriters.length; idx++) {
                        RecordWriter outWriter = fsp.outWriters[idx];
                        if (outWriter != null) {
                            SerDeStats stats = ((StatsProvidingRecordWriter) outWriter).getStats();
                            if (stats != null) {
                                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
                            }
                        }
                    }
                } else {
                    for (int i = 0; i < fsp.updaters.length; i++) {
                        if (fsp.updaters[i] != null) {
                            SerDeStats stats = fsp.updaters[i].getStats();
                            if (stats != null) {
                                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
                                fsp.addToStat(StatsSetupConst.INSERT_COUNT, stats.getInsertCount());
                                fsp.addToStat(StatsSetupConst.UPDATE_COUNT, stats.getUpdateCount());
                                fsp.addToStat(StatsSetupConst.DELETE_COUNT, stats.getDeleteCount());
                            }
                        }
                    }
                }
            }
            if (isNativeTable()) {
                fsp.commit(fs, commitPaths, deleteDeltas);
            }
            if ("spark".equals(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE))) {
                SparkMetricUtils.updateSparkBytesWrittenMetrics(LOG, fs, fsp.finalPaths);
            }
        }
        if (conf.isMmTable() || conf.isDirectInsert()) {
            boolean isDelete = AcidUtils.Operation.DELETE.equals(conf.getAcidOperation());
            Utilities.writeCommitManifest(commitPaths, specPath, fs, originalTaskId, conf.getTableWriteId(), conf.getStatementId(), unionPath, conf.getInsertOverwrite(), bDynParts, dynamicPartitionSpecs, conf.getStaticSpec(), isDelete);
        }
        // Only publish stats if this operator's flag was set to gather stats
        if (conf.isGatherStats()) {
            publishStats();
        }
    } else {
        // Abort path: Hadoop always calls close() even if an exception was thrown in map() or reduce().
        for (FSPaths fsp : valToPaths.values()) {
            fsp.abortWritersAndUpdaters(fs, abort, !autoDelete && isNativeTable() && !conf.isMmTable() && !conf.isDirectInsert());
        }
    }
    fsp = prevFsp = null;
    super.closeOp(abort);
}
Also used: Path(org.apache.hadoop.fs.Path), HiveException(org.apache.hadoop.hive.ql.metadata.HiveException), ArrayList(java.util.ArrayList), ArrowWrapperWritable(org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable), IOException(java.io.IOException), StatsProvidingRecordWriter(org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter)

Example 3 with ArrowWrapperWritable

Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache.

The class LlapArrowRowRecordReader, method next:

@Override
public boolean next(NullWritable key, Row value) throws IOException {
    Preconditions.checkArgument(value != null);
    boolean hasNext = false;
    ArrowWrapperWritable batchData = (ArrowWrapperWritable) data;
    if ((batchSize == 0) || (rowIndex == batchSize)) {
        // This is either the first batch or we've used up the current batch buffer
        batchSize = 0;
        rowIndex = 0;
        // we should keep trying until we get a batch with some data or reader.next() returns false
        while (batchSize == 0 && (hasNext = reader.next(key, data))) {
            List<FieldVector> vectors = batchData.getVectorSchemaRoot().getFieldVectors();
            // hasNext implies there is some column in the batch
            Preconditions.checkState(vectors.size() > 0);
            // All the vectors have the same length,
            // we can get the number of rows from the first vector
            batchSize = vectors.get(0).getValueCount();
        }
        if (hasNext) {
            // There is another batch to buffer
            try {
                ArrowWrapperWritable wrapper = new ArrowWrapperWritable(batchData.getVectorSchemaRoot());
                currentBatch = (Object[][]) serde.deserialize(wrapper);
                StructObjectInspector rowOI = (StructObjectInspector) serde.getObjectInspector();
                setRowFromStruct(value, currentBatch[rowIndex], rowOI);
            } catch (Exception e) {
                LOG.error("Failed to fetch Arrow batch", e);
                throw new RuntimeException(e);
            }
        }
    // There were no more batches AND
    // this is either the first batch or we've used up the current batch buffer:
    // fall through and return false.
    } else if (rowIndex < batchSize) {
        // Take a row from the current buffered batch
        hasNext = true;
        StructObjectInspector rowOI = null;
        try {
            rowOI = (StructObjectInspector) serde.getObjectInspector();
        } catch (SerDeException e) {
            throw new RuntimeException(e);
        }
        setRowFromStruct(value, currentBatch[rowIndex], rowOI);
    }
    // Always inc the batch buffer index
    // If we return false, it is just a noop
    rowIndex++;
    return hasNext;
}
Also used: FieldVector(org.apache.arrow.vector.FieldVector), ArrowWrapperWritable(org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable), IOException(java.io.IOException), SerDeException(org.apache.hadoop.hive.serde2.SerDeException), StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
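
A hedged sketch of the consumer side that the reader above feeds: walking the field vectors of a buffered ArrowWrapperWritable. The helper name and printing logic are illustrative; it relies only on accessors already visible in these snippets (getVectorSchemaRoot, getFieldVectors, getValueCount) plus the standard Arrow getField/isNull/getObject getters.

import java.util.List;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable;

public final class ArrowBatchDump {

    private ArrowBatchDump() {
    }

    // Prints every cell of an already-populated batch, one row per line.
    public static void dump(ArrowWrapperWritable wrapper) {
        VectorSchemaRoot root = wrapper.getVectorSchemaRoot();
        List<FieldVector> vectors = root.getFieldVectors();
        if (vectors.isEmpty()) {
            // No columns at all: nothing to print.
            return;
        }
        // All vectors in a batch have the same length, as Example 3 relies on.
        // A schema-only (EOS) batch simply yields a row count of 0 and prints nothing.
        int rowCount = vectors.get(0).getValueCount();
        for (int row = 0; row < rowCount; row++) {
            StringBuilder line = new StringBuilder();
            for (FieldVector vector : vectors) {
                line.append(vector.getField().getName())
                    .append('=')
                    .append(vector.isNull(row) ? "null" : vector.getObject(row))
                    .append(' ');
            }
            System.out.println(line.toString().trim());
        }
    }
}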

Example 4 with ArrowWrapperWritable

Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache.

The class VectorFileSinkArrowOperator, method closeOp:

@Override
protected void closeOp(boolean abort) throws HiveException {
    try {
        if (!wroteData) {
            // Send a schema only batch to signal EOS with no data written
            ArrowWrapperWritable writable = converter.emptyBatch();
            if (recordWriter == null) {
                recordWriter = LlapOutputFormatService.get().getWriter(this.attemptId);
            }
            recordWriter.write(null, writable);
        }
    } catch (Exception e) {
        LOG.error("Failed to write Arrow stream schema");
        throw new RuntimeException(e);
    } finally {
        try {
            // Close the recordWriter with null Reporter
            recordWriter.close(null);
        } catch (Exception e) {
            LOG.error("Failed to close Arrow stream");
            throw new RuntimeException(e);
        }
    }
}
Also used: ArrowWrapperWritable(org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable), HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)

Example 5 with ArrowWrapperWritable

Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache.

The class VectorFileSinkArrowOperator, method process:

@Override
public void process(Object data, int tag) throws HiveException {
    // ArrowStreamReader expects at least the schema metadata, if this op writes no data,
    // we need to send the schema to close the stream gracefully
    VectorizedRowBatch batch = (VectorizedRowBatch) data;
    try {
        if (recordWriter == null) {
            recordWriter = LlapOutputFormatService.get().getWriter(this.attemptId);
        }
        // Convert the VectorizedRowBatch to a handle for the Arrow batch
        ArrowWrapperWritable writable = converter.serializeBatch(batch, true);
        // Pass the handle to the LlapOutputFormatService recordWriter
        recordWriter.write(null, writable);
        this.wroteData = true;
    } catch (Exception e) {
        LOG.error("Failed to convert VectorizedRowBatch to Arrow batch");
        throw new RuntimeException(e);
    }
}
Also used: VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), ArrowWrapperWritable(org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable), HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
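
Taken together, Examples 4 and 5 implement a simple write protocol: emit one ArrowWrapperWritable per converted batch, emit a single schema-only batch if nothing was written, then close the writer. Below is a minimal sketch of that protocol, assuming the writer and the pre-built writables are supplied by the caller; the RecordWriter type parameters and the helper name are assumptions for illustration.

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordWriter;

public final class ArrowWriteProtocolSketch {

    private ArrowWriteProtocolSketch() {
    }

    // Writes every batch, falls back to a schema-only batch when there is no data, and always closes the writer.
    public static void writeAll(RecordWriter<NullWritable, ArrowWrapperWritable> writer,
                                List<ArrowWrapperWritable> batches,
                                ArrowWrapperWritable schemaOnlyBatch) throws IOException {
        boolean wroteData = false;
        try {
            for (ArrowWrapperWritable batch : batches) {
                // Example 5: one writable handle per converted VectorizedRowBatch.
                writer.write(null, batch);
                wroteData = true;
            }
            if (!wroteData) {
                // Example 4: a schema-only batch lets the reader close the stream gracefully.
                writer.write(null, schemaOnlyBatch);
            }
        } finally {
            // Example 4: close with a null Reporter.
            writer.close(null);
        }
    }
}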

Aggregations

ArrowWrapperWritable (org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable) 6
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 3
IOException (java.io.IOException) 2
ArrayList (java.util.ArrayList) 2
FieldVector (org.apache.arrow.vector.FieldVector) 2
ImmutableMap (com.google.common.collect.ImmutableMap) 1
Lists (com.google.common.collect.Lists) 1
SQLException (java.sql.SQLException) 1
Statement (java.sql.Statement) 1
Arrays (java.util.Arrays) 1
List (java.util.List) 1
Map (java.util.Map) 1
UUID (java.util.UUID) 1
Collectors (java.util.stream.Collectors) 1
BufferAllocator (org.apache.arrow.memory.BufferAllocator) 1
ArrowStreamWriter (org.apache.arrow.vector.ipc.ArrowStreamWriter) 1
MultiSet (org.apache.commons.collections4.MultiSet) 1
HashMultiSet (org.apache.commons.collections4.multiset.HashMultiSet) 1
Path (org.apache.hadoop.fs.Path) 1
CalendarUtils (org.apache.hadoop.hive.common.type.CalendarUtils) 1