Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache: class LlapArrowRecordWriter, method write.
@Override
public void write(K key, V value) throws IOException {
  ArrowWrapperWritable arrowWrapperWritable = (ArrowWrapperWritable) value;
  if (arrowStreamWriter == null) {
    vectorSchemaRoot = arrowWrapperWritable.getVectorSchemaRoot();
    arrowStreamWriter = new ArrowStreamWriter(vectorSchemaRoot, null, out);
    allocator = arrowWrapperWritable.getAllocator();
    this.out.setAllocator(allocator);
    rootVector = arrowWrapperWritable.getRootVector();
  } else {
    // We need to set the row count for the current vector
    // since the root is reused by the stream writer.
    vectorSchemaRoot.setRowCount(rootVector.getValueCount());
  }
  arrowStreamWriter.writeBatch();
}
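The key pattern here is that one VectorSchemaRoot is created on the first write and then reused for every subsequent batch, with the row count refreshed before each writeBatch() call. Below is a minimal standalone sketch of the same pattern using only the public Arrow Java APIs, outside Hive; the file path, schema, and class names are illustrative assumptions, not anything from the Hive source.

import java.io.FileOutputStream;
import java.util.Collections;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

public class ReusedRootStreamSketch {
  public static void main(String[] args) throws Exception {
    Schema schema = new Schema(Collections.singletonList(
        Field.nullable("id", new ArrowType.Int(32, true))));
    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator);
         FileOutputStream out = new FileOutputStream("/tmp/batches.arrow");
         ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) {
      writer.start();
      IntVector ids = (IntVector) root.getVector("id");
      for (int batch = 0; batch < 3; batch++) {
        ids.allocateNew(4);
        for (int i = 0; i < 4; i++) {
          ids.setSafe(i, batch * 4 + i);
        }
        // The root is reused across batches, so the row count must be set every time,
        // mirroring the setRowCount() call in LlapArrowRecordWriter.write above.
        root.setRowCount(4);
        writer.writeBatch();
      }
      writer.end();
    }
  }
}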
Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache: class FileSinkOperator, method closeOp.
@Override
public void closeOp(boolean abort) throws HiveException {
  row_count.set(numRows);
  LOG.info(toString() + ": records written - " + numRows);
  if ("spark".equalsIgnoreCase(HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE))) {
    SparkMetricUtils.updateSparkRecordsWrittenMetrics(runTimeNumRows);
  }
  if (!bDynParts && !filesCreated) {
    boolean isTez = "tez".equalsIgnoreCase(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE));
    Class<?> clazz = conf.getTableInfo().getOutputFileFormatClass();
    boolean isStreaming = StreamingOutputFormat.class.isAssignableFrom(clazz);
    // Allow empty file generation for mm/acid tables as a quick and dirty workaround for HIVE-22941.
    if (!isTez || isStreaming || (this.isInsertOverwrite && (conf.isMmTable() || conf.isFullAcidTable()))) {
      createBucketFiles(fsp);
    }
  }
  lastProgressReport = System.currentTimeMillis();
  if (!abort) {
    // Flush any rows still buffered by the batching SerDe
    // (the size of the buffer is kept track of in the ThriftJDBCBinarySerDe).
    if (conf.isUsingBatchingSerDe()) {
      try {
        recordValue = serializer.serialize(null, inputObjInspectors[0]);
        if (null != fpaths) {
          rowOutWriters = fpaths.outWriters;
          rowOutWriters[0].write(recordValue);
        } else if (recordValue instanceof ArrowWrapperWritable) {
          // Empty result set: write a zero-size batch to signal EOS to the consumer.
          for (FSPaths fsPaths : valToPaths.values()) {
            for (RecordWriter writer : fsPaths.outWriters) {
              writer.write(recordValue);
            }
          }
        }
      } catch (SerDeException | IOException e) {
        throw new HiveException(e);
      }
    }
    List<Path> commitPaths = new ArrayList<>();
    for (FSPaths fsp : valToPaths.values()) {
      List<Path> deleteDeltas = new ArrayList<Path>();
      fsp.closeWriters(abort, deleteDeltas);
      // Accumulate statistics, which will be aggregated in case of spray writers.
      if (conf.isGatherStats() && isCollectRWStats) {
        if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable() || conf.isCompactionTable()) {
          for (int idx = 0; idx < fsp.outWriters.length; idx++) {
            RecordWriter outWriter = fsp.outWriters[idx];
            if (outWriter != null) {
              SerDeStats stats = ((StatsProvidingRecordWriter) outWriter).getStats();
              if (stats != null) {
                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
              }
            }
          }
        } else {
          for (int i = 0; i < fsp.updaters.length; i++) {
            if (fsp.updaters[i] != null) {
              SerDeStats stats = fsp.updaters[i].getStats();
              if (stats != null) {
                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
                fsp.addToStat(StatsSetupConst.INSERT_COUNT, stats.getInsertCount());
                fsp.addToStat(StatsSetupConst.UPDATE_COUNT, stats.getUpdateCount());
                fsp.addToStat(StatsSetupConst.DELETE_COUNT, stats.getDeleteCount());
              }
            }
          }
        }
      }
      if (isNativeTable()) {
        fsp.commit(fs, commitPaths, deleteDeltas);
      }
      if ("spark".equals(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE))) {
        SparkMetricUtils.updateSparkBytesWrittenMetrics(LOG, fs, fsp.finalPaths);
      }
    }
    if (conf.isMmTable() || conf.isDirectInsert()) {
      boolean isDelete = AcidUtils.Operation.DELETE.equals(conf.getAcidOperation());
      Utilities.writeCommitManifest(commitPaths, specPath, fs, originalTaskId, conf.getTableWriteId(),
          conf.getStatementId(), unionPath, conf.getInsertOverwrite(), bDynParts, dynamicPartitionSpecs,
          conf.getStaticSpec(), isDelete);
    }
    // Only publish stats if this operator's flag was set to gather stats.
    if (conf.isGatherStats()) {
      publishStats();
    }
  } else {
    // Abort path, e.g. when an exception was thrown in map() or reduce().
    for (FSPaths fsp : valToPaths.values()) {
      fsp.abortWritersAndUpdaters(fs, abort,
          !autoDelete && isNativeTable() && !conf.isMmTable() && !conf.isDirectInsert());
    }
  }
  fsp = prevFsp = null;
  super.closeOp(abort);
}
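The ArrowWrapperWritable branch above broadcasts a zero-size batch to every writer when the result set is empty. A short reader-side sketch of why that matters: a consumer driving an ArrowStreamReader keeps calling loadNextBatch() until the stream ends, and a zero-row batch lets it finish cleanly without ever seeing data. This is a minimal sketch using plain Arrow APIs; the file path and class name are illustrative assumptions.

import java.io.FileInputStream;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamReader;

public class EosAwareConsumerSketch {
  public static void main(String[] args) throws Exception {
    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         FileInputStream in = new FileInputStream("/tmp/batches.arrow");
         ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
      VectorSchemaRoot root = reader.getVectorSchemaRoot();
      while (reader.loadNextBatch()) {
        if (root.getRowCount() == 0) {
          // A zero-row batch carries no data; treat it as an end-of-stream marker.
          break;
        }
        System.out.println("received batch with " + root.getRowCount() + " rows");
      }
    }
  }
}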
Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache: class LlapArrowRowRecordReader, method next.
@Override
public boolean next(NullWritable key, Row value) throws IOException {
  Preconditions.checkArgument(value != null);
  boolean hasNext = false;
  ArrowWrapperWritable batchData = (ArrowWrapperWritable) data;
  if ((batchSize == 0) || (rowIndex == batchSize)) {
    // This is either the first batch or we've used up the current batch buffer.
    batchSize = 0;
    rowIndex = 0;
    // Keep trying until we get a batch with some data or reader.next() returns false.
    while (batchSize == 0 && (hasNext = reader.next(key, data))) {
      List<FieldVector> vectors = batchData.getVectorSchemaRoot().getFieldVectors();
      // hasNext implies there is at least one column in the batch.
      Preconditions.checkState(vectors.size() > 0);
      // All the vectors have the same length, so we can get the number of rows
      // from the first vector.
      batchSize = vectors.get(0).getValueCount();
    }
    if (hasNext) {
      // There is another batch to buffer.
      try {
        ArrowWrapperWritable wrapper = new ArrowWrapperWritable(batchData.getVectorSchemaRoot());
        currentBatch = (Object[][]) serde.deserialize(wrapper);
        StructObjectInspector rowOI = (StructObjectInspector) serde.getObjectInspector();
        setRowFromStruct(value, currentBatch[rowIndex], rowOI);
      } catch (Exception e) {
        LOG.error("Failed to fetch Arrow batch", e);
        throw new RuntimeException(e);
      }
    }
    // Otherwise there were no more batches and the buffer is exhausted,
    // so hasNext stays false and we fall through to return it.
  } else if (rowIndex < batchSize) {
    // Take a row from the currently buffered batch.
    hasNext = true;
    StructObjectInspector rowOI = null;
    try {
      rowOI = (StructObjectInspector) serde.getObjectInspector();
    } catch (SerDeException e) {
      throw new RuntimeException(e);
    }
    setRowFromStruct(value, currentBatch[rowIndex], rowOI);
  }
  // Always advance the batch buffer index; if we return false this is a no-op.
  rowIndex++;
  return hasNext;
}
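A quick sketch of how a caller typically drives a mapred RecordReader like the one above: createKey()/createValue() allocate reusable holders and next() is called until it returns false, with each call either copying one row out of the buffered Arrow batch or pulling and deserializing the next batch first. The reader is assumed to be an already-constructed RecordReader<NullWritable, Row>, for example one obtained from an InputFormat's getRecordReader(); the helper class and method names are illustrative.

import java.io.IOException;
import org.apache.hadoop.hive.llap.Row;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

public class RowReaderLoopSketch {
  static long countRows(RecordReader<NullWritable, Row> reader) throws IOException {
    NullWritable key = reader.createKey();
    Row row = reader.createValue();
    long count = 0;
    // Each next() call fills 'row' with the next record, refetching a batch when needed.
    while (reader.next(key, row)) {
      count++;
    }
    reader.close();
    return count;
  }
}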
Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache: class VectorFileSinkArrowOperator, method closeOp.
@Override
protected void closeOp(boolean abort) throws HiveException {
  try {
    if (!wroteData) {
      // Send a schema-only batch to signal EOS when no data was written.
      ArrowWrapperWritable writable = converter.emptyBatch();
      if (recordWriter == null) {
        recordWriter = LlapOutputFormatService.get().getWriter(this.attemptId);
      }
      recordWriter.write(null, writable);
    }
  } catch (Exception e) {
    LOG.error("Failed to write Arrow stream schema");
    throw new RuntimeException(e);
  } finally {
    try {
      // Close the recordWriter with a null Reporter.
      recordWriter.close(null);
    } catch (Exception e) {
      LOG.error("Failed to close Arrow stream");
      throw new RuntimeException(e);
    }
  }
}
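The converter.emptyBatch() call exists so that a consumer always receives at least the schema, even when the operator produced no rows. A related, simpler illustration of the idea with plain Arrow APIs is a stream that carries only the schema header and the end-of-stream marker, with no record batches in between; note that the Hive code above instead writes an empty batch through its record writer, so this is only a sketch of the underlying stream framing, and the schema and class name are illustrative assumptions.

import java.io.ByteArrayOutputStream;
import java.util.Collections;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

public class SchemaOnlyStreamSketch {
  public static void main(String[] args) throws Exception {
    Schema schema = new Schema(Collections.singletonList(
        Field.nullable("id", new ArrowType.Int(32, true))));
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator);
         ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) {
      writer.start();  // writes the schema metadata
      writer.end();    // writes the end-of-stream marker; no writeBatch() in between
    }
    System.out.println("schema-only stream is " + out.size() + " bytes");
  }
}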
Use of org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable in project hive by apache: class VectorFileSinkArrowOperator, method process.
@Override
public void process(Object data, int tag) throws HiveException {
  // ArrowStreamReader expects at least the schema metadata; if this op writes no data,
  // we need to send the schema to close the stream gracefully.
  VectorizedRowBatch batch = (VectorizedRowBatch) data;
  try {
    if (recordWriter == null) {
      recordWriter = LlapOutputFormatService.get().getWriter(this.attemptId);
    }
    // Convert the VectorizedRowBatch to a handle for the Arrow batch.
    ArrowWrapperWritable writable = converter.serializeBatch(batch, true);
    // Pass the handle to the LlapOutputFormatService recordWriter.
    recordWriter.write(null, writable);
    this.wroteData = true;
  } catch (Exception e) {
    LOG.error("Failed to convert VectorizedRowBatch to Arrow batch");
    throw new RuntimeException(e);
  }
}
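For context on the input side, the data argument is a VectorizedRowBatch, Hive's columnar row container. Inside Hive these batches come from the vectorized execution pipeline rather than user code, but a hand-built single-column batch, shown below as a minimal sketch with illustrative names and values, makes the shape of what serializeBatch() consumes concrete.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class TinyBatchSketch {
  static VectorizedRowBatch makeBatch() {
    VectorizedRowBatch batch = new VectorizedRowBatch(1);  // one column
    LongColumnVector col = new LongColumnVector();
    batch.cols[0] = col;
    for (int i = 0; i < 3; i++) {
      col.vector[i] = i * 10L;
    }
    batch.size = 3;  // number of valid rows in the batch
    return batch;
  }
}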