use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class HiveInputFormat method wrapForLlap.
public static InputFormat<WritableComparable, Writable> wrapForLlap(
    InputFormat<WritableComparable, Writable> inputFormat, Configuration conf,
    PartitionDesc part) throws HiveException {
  if (!HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_ENABLED, LlapProxy.isDaemon())) {
    // LLAP not enabled, no-op.
    return inputFormat;
  }
  String ifName = inputFormat.getClass().getCanonicalName();
  boolean isSupported = inputFormat instanceof LlapWrappableInputFormatInterface;
  boolean isCacheOnly = inputFormat instanceof LlapCacheOnlyInputFormatInterface;
  boolean isVectorized = Utilities.getIsVectorized(conf);
  if (!isVectorized) {
    // Pretend it's vectorized if the non-vector wrapper is enabled.
    isVectorized = HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_NONVECTOR_WRAPPER_ENABLED)
        && (Utilities.getPlanPath(conf) != null);
  }
  boolean isSerdeBased = false;
  if (isVectorized && !isSupported
      && HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_ENCODE_ENABLED)) {
    // See if we can use re-encoding to read the format through the IO elevator.
    isSupported = isSerdeBased = checkInputFormatForLlapEncode(conf, ifName);
  }
  if ((!isSupported || !isVectorized) && !isCacheOnly) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Not using llap for " + ifName + ": supported = " + isSupported
          + ", vectorized = " + isVectorized + ", cache only = " + isCacheOnly);
    }
    return inputFormat;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Processing " + ifName);
  }
  @SuppressWarnings("unchecked")
  LlapIo<VectorizedRowBatch> llapIo = LlapProxy.getIo();
  if (llapIo == null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Not using LLAP IO because it is not initialized");
    }
    return inputFormat;
  }
  Deserializer serde = null;
  if (isSerdeBased) {
    if (part == null) {
      if (isCacheOnly) {
        LOG.info("Using cache only because there's no partition spec for SerDe-based IF");
        injectLlapCaches(inputFormat, llapIo);
      } else {
        LOG.info("Not using LLAP IO because there's no partition spec for SerDe-based IF");
      }
      return inputFormat;
    }
    serde = findSerDeForLlapSerDeIf(conf, part);
  }
  if (isSupported && isVectorized) {
    InputFormat<?, ?> wrappedIf = llapIo.getInputFormat(inputFormat, serde);
    // null means we cannot wrap; the cause is logged inside.
    if (wrappedIf != null) {
      return castInputFormat(wrappedIf);
    }
  }
  if (isCacheOnly) {
    injectLlapCaches(inputFormat, llapIo);
  }
  return inputFormat;
}
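For the SerDe-based path, findSerDeForLlapSerDeIf has to hand the LLAP IO layer a fully initialized Deserializer built from the partition's table metadata. Below is a minimal sketch of that initialization pattern, assuming imports for PartitionDesc, TableDesc, Deserializer, ReflectionUtils and Configuration; the helper name is hypothetical and this is not Hive's actual implementation, only the Deserializer calls also used in the other snippets on this page.

// Illustrative sketch only: build a Deserializer from a PartitionDesc the way a
// SerDe-based LLAP wrapper needs one. The helper name is hypothetical.
private static Deserializer createPartitionDeserializer(Configuration conf, PartitionDesc part)
    throws Exception {
  TableDesc tableDesc = part.getTableDesc();
  // Instantiate the configured SerDe class and initialize it with the table properties
  // (column names, column types, serialization format, ...).
  Deserializer serde = ReflectionUtils.newInstance(tableDesc.getDeserializerClass(), conf);
  serde.initialize(conf, tableDesc.getProperties());
  return serde;
}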
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class DynamicValueRegistryTez method init.
@Override
public void init(RegistryConf conf) throws Exception {
  RegistryConfTez rct = (RegistryConfTez) conf;
  for (String inputSourceName : rct.baseWork.getInputSourceToRuntimeValuesInfo().keySet()) {
    LOG.info("Runtime value source: " + inputSourceName);
    LogicalInput runtimeValueInput = rct.inputs.get(inputSourceName);
    RuntimeValuesInfo runtimeValuesInfo =
        rct.baseWork.getInputSourceToRuntimeValuesInfo().get(inputSourceName);
    // Set up the deserializer/object inspector for the incoming data source.
    Deserializer deserializer =
        ReflectionUtils.newInstance(runtimeValuesInfo.getTableDesc().getDeserializerClass(), null);
    deserializer.initialize(rct.conf, runtimeValuesInfo.getTableDesc().getProperties());
    ObjectInspector inspector = deserializer.getObjectInspector();
    // Set up column expressions for the dynamic values using this input.
    List<ExprNodeEvaluator> colExprEvaluators = new ArrayList<ExprNodeEvaluator>();
    for (ExprNodeDesc expr : runtimeValuesInfo.getColExprs()) {
      ExprNodeEvaluator exprEval = ExprNodeEvaluatorFactory.get(expr, null);
      exprEval.initialize(inspector);
      colExprEvaluators.add(exprEval);
    }
    runtimeValueInput.start();
    List<Input> inputList = new ArrayList<Input>();
    inputList.add(runtimeValueInput);
    rct.processorContext.waitForAllInputsReady(inputList);
    KeyValueReader kvReader = (KeyValueReader) runtimeValueInput.getReader();
    long rowCount = 0;
    while (kvReader.next()) {
      Object row = deserializer.deserialize((Writable) kvReader.getCurrentValue());
      rowCount++;
      for (int colIdx = 0; colIdx < colExprEvaluators.size(); ++colIdx) {
        // Evaluate each expression and save the result to the value registry.
        ExprNodeEvaluator eval = colExprEvaluators.get(colIdx);
        Object val = eval.evaluate(row);
        setValue(runtimeValuesInfo.getDynamicValueIDs().get(colIdx), val);
      }
    }
    // For now, expecting a single row (min/max, aggregated bloom filter) or no rows.
    if (rowCount == 0) {
      LOG.debug("No input rows from " + inputSourceName + ", filling dynamic values with nulls");
      for (int colIdx = 0; colIdx < colExprEvaluators.size(); ++colIdx) {
        setValue(runtimeValuesInfo.getDynamicValueIDs().get(colIdx), null);
      }
    } else if (rowCount > 1) {
      throw new IllegalStateException("Expected 0 or 1 rows from " + inputSourceName
          + ", got " + rowCount);
    }
  }
}
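The snippet above follows the standard Deserializer lifecycle: instantiate, initialize with the table properties, then turn each incoming Writable into a row object that an ObjectInspector can walk. The standalone sketch below shows that same lifecycle with LazySimpleSerDe on delimited text; the schema and sample row are made up for illustration and are not taken from Hive.

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.serdeConstants;
import org.apache.hadoop.io.Text;

public class DeserializerLifecycleSketch {
  public static void main(String[] args) throws Exception {
    // Table properties: two columns, comma-delimited text.
    Properties props = new Properties();
    props.setProperty(serdeConstants.LIST_COLUMNS, "name,age");
    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string:int");
    props.setProperty(serdeConstants.FIELD_DELIM, ",");

    Deserializer deserializer = new LazySimpleSerDe();
    deserializer.initialize(new Configuration(), props);

    // Deserialize one Writable row and read it through the ObjectInspector.
    StructObjectInspector oi = (StructObjectInspector) deserializer.getObjectInspector();
    Object row = deserializer.deserialize(new Text("alice,42"));
    // Convert the lazy row to plain Java objects, e.g. [alice, 42].
    System.out.println(
        ObjectInspectorUtils.copyToStandardObject(row, oi, ObjectInspectorCopyOption.JAVA));
  }
}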
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class VectorMapOperator method process.
@Override
public void process(Writable value) throws HiveException {
  // A mapper can span multiple files/partitions.
  // The VectorPartitionContext needs to be changed if the input file changes.
  ExecMapperContext context = getExecContext();
  if (context != null && context.inputFileChanged()) {
    // Clean up the child operators if the input file has changed.
    cleanUpInputFileChanged();
  }
  if (!oneRootOperator.getDone()) {
    /*
     * 3 different kinds of vectorized reading are supported:
     *
     * 1) Read the Vectorized Input File Format which returns VectorizedRowBatch as the row.
     *
     * 2) Read using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
     *
     * 3) Read using the regular partition deserializer to get the row object and assign
     *    the row object into the VectorizedRowBatch with VectorAssignRow.
     */
    try {
      if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {
        if (!deliverVectorizedRowBatch(value)) {
          // Operator tree is now done.
          return;
        }
      } else if (value instanceof VectorizedRowBatch) {
        /*
         * Clear out any rows we may have processed in row-mode for the current partition.
         */
        if (!flushDeserializerBatch()) {
          // Operator tree is now done.
          return;
        }
        if (!deliverVectorizedRowBatch(value)) {
          // Operator tree is now done.
          return;
        }
      } else {
        /*
         * We have a "regular" single row from the Input File Format reader that we will need
         * to deserialize.
         */
        Preconditions.checkState(currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE
            || currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE);
        if (deserializerBatch.size == deserializerBatch.DEFAULT_SIZE) {
          numRows += deserializerBatch.size;
          /*
           * Feed the current full batch to the operator tree.
           */
          batchCounter++;
          oneRootOperator.process(deserializerBatch, 0);
          /*
           * Only reset the current data columns. Not any data columns defaulted to NULL
           * because they are not present in the partition, and not partition columns.
           */
          for (int c = 0; c < currentDataColumnCount; c++) {
            ColumnVector colVector = deserializerBatch.cols[c];
            if (colVector != null) {
              colVector.reset();
              colVector.init();
            }
          }
          deserializerBatch.selectedInUse = false;
          deserializerBatch.size = 0;
          deserializerBatch.endOfFile = false;
          if (oneRootOperator.getDone()) {
            setDone(true);
            return;
          }
        }
        /*
         * Do the {vector|row} deserialization of the one row into the VectorizedRowBatch.
         */
        switch (currentReadType) {
        case VECTOR_DESERIALIZE:
          {
            BinaryComparable binComp = (BinaryComparable) value;
            currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength());
            // Deserialize and append the new row using the current batch size as the index.
            try {
              currentVectorDeserializeRow.deserialize(deserializerBatch, deserializerBatch.size++);
            } catch (Exception e) {
              throw new HiveException("\nDeserializeRead detail: "
                  + currentVectorDeserializeRow.getDetailedReadPositionString(), e);
            }
          }
          break;
        case ROW_DESERIALIZE:
          {
            Object deserialized = currentPartDeserializer.deserialize(value);
            // Note: Regardless of what the Input File Format returns, we have determined
            // with VectorAppendRow.initConversion that only currentDataColumnCount columns
            // have values we want.
            //
            // Any extra columns needed by the table schema were set to repeating null
            // in the batch by setupPartitionContextVars.
            // Convert the input row to standard objects.
            List<Object> standardObjects = new ArrayList<Object>();
            ObjectInspectorUtils.copyToStandardObject(standardObjects, deserialized,
                currentPartRawRowObjectInspector, ObjectInspectorCopyOption.WRITABLE);
            if (standardObjects.size() < currentDataColumnCount) {
              throw new HiveException("Input File Format returned row with too few columns");
            }
            // Append the deserialized standard object row using the current batch size
            // as the index.
            currentVectorAssign.assignRow(deserializerBatch, deserializerBatch.size++,
                standardObjects, currentDataColumnCount);
          }
          break;
        default:
          throw new RuntimeException("Unexpected vector MapOperator read type "
              + currentReadType.name());
        }
      }
    } catch (Exception e) {
      throw new HiveException("Hive Runtime Error while processing row", e);
    }
  }
}
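The ROW_DESERIALIZE branch is where the Deserializer actually does its work: the partition SerDe turns the Writable into a row object, ObjectInspectorUtils normalizes it, and VectorAssignRow writes it into the batch. The condensed sketch below isolates just that hand-off; the enclosing helper is hypothetical, while the fields and calls are the ones used in process() above.

// Hypothetical helper isolating the ROW_DESERIALIZE hand-off shown above.
// currentPartDeserializer, currentPartRawRowObjectInspector, currentVectorAssign,
// deserializerBatch and currentDataColumnCount are the fields used in process().
private void appendRowDeserialized(Writable value) throws HiveException {
  try {
    // 1) Let the partition's Deserializer turn the Writable into a (possibly lazy) row object.
    Object deserialized = currentPartDeserializer.deserialize(value);
    // 2) Copy the struct fields into standard writable objects.
    List<Object> standardObjects = new ArrayList<Object>();
    ObjectInspectorUtils.copyToStandardObject(standardObjects, deserialized,
        currentPartRawRowObjectInspector, ObjectInspectorCopyOption.WRITABLE);
    // 3) Assign the row into the VectorizedRowBatch at the next free slot.
    currentVectorAssign.assignRow(deserializerBatch, deserializerBatch.size++,
        standardObjects, currentDataColumnCount);
  } catch (Exception e) {
    throw new HiveException("Row-mode deserialization failed", e);
  }
}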
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class MapOperator method getConvertedOI.
// Returns the mapping from table descriptor to the expected table OI.
/**
 * Traverse all the partitions for a table, and get the OI for the table.
 * Note that a conversion is required if any of the partition OIs is different
 * from the table OI. For example, if the query references table T (partitions P1, P2),
 * and P1's schema is the same as T, whereas P2's schema is different from T, conversion
 * might be needed for both P1 and P2, since a SettableOI might be needed for T.
 */
private Map<TableDesc, StructObjectInspector> getConvertedOI(Map<String, Configuration> tableToConf)
    throws HiveException {
  Map<TableDesc, StructObjectInspector> tableDescOI =
      new HashMap<TableDesc, StructObjectInspector>();
  Set<TableDesc> identityConverterTableDesc = new HashSet<TableDesc>();
  try {
    Map<ObjectInspector, Boolean> oiSettableProperties = new HashMap<ObjectInspector, Boolean>();
    for (Path onefile : conf.getPathToAliases().keySet()) {
      PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile);
      TableDesc tableDesc = pd.getTableDesc();
      Configuration hconf = tableToConf.get(tableDesc.getTableName());
      Deserializer partDeserializer = pd.getDeserializer(hconf);
      StructObjectInspector partRawRowObjectInspector;
      boolean isAcid = AcidUtils.isTablePropertyTransactional(tableDesc.getProperties());
      if (Utilities.isSchemaEvolutionEnabled(hconf, isAcid)
          && Utilities.isInputFileFormatSelfDescribing(pd)) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        partRawRowObjectInspector = (StructObjectInspector) tblDeserializer.getObjectInspector();
      } else {
        partRawRowObjectInspector = (StructObjectInspector) partDeserializer.getObjectInspector();
      }
      StructObjectInspector tblRawRowObjectInspector = tableDescOI.get(tableDesc);
      if ((tblRawRowObjectInspector == null) || (identityConverterTableDesc.contains(tableDesc))) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        tblRawRowObjectInspector = (StructObjectInspector) ObjectInspectorConverters.getConvertedOI(
            partRawRowObjectInspector, tblDeserializer.getObjectInspector(), oiSettableProperties);
        if (identityConverterTableDesc.contains(tableDesc)) {
          if (!partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
            identityConverterTableDesc.remove(tableDesc);
          }
        } else if (partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
          identityConverterTableDesc.add(tableDesc);
        }
        tableDescOI.put(tableDesc, tblRawRowObjectInspector);
      }
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
  return tableDescOI;
}
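The converted OI produced here is only half of the story: at read time a matching converter adapts each partition row into the table's shape. Below is a hedged sketch of how getConvertedOI pairs with ObjectInspectorConverters.getConverter; the helper and its arguments are illustrative stand-ins (assuming the usual serde2 imports, including ObjectInspectorConverters.Converter), not code from MapOperator.

// Illustrative sketch: read a partition row through an OI converted toward the table schema.
static Object toTableShape(Deserializer partDeserializer, Deserializer tblDeserializer,
    Writable rawValue) throws Exception {
  ObjectInspector partOI = partDeserializer.getObjectInspector();
  ObjectInspector tableOI = tblDeserializer.getObjectInspector();
  // Same idea as in getConvertedOI: produce an OI with settable fields where conversion is needed.
  ObjectInspector convertedOI = ObjectInspectorConverters.getConvertedOI(partOI, tableOI);
  // A converter from the partition shape to the table-compatible shape; identity if they match.
  Converter partToTable = ObjectInspectorConverters.getConverter(partOI, convertedOI);
  Object partRow = partDeserializer.deserialize(rawValue);
  return partToTable.convert(partRow);
}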
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class MapOperator method populateVirtualColumnValues.
public static Object[] populateVirtualColumnValues(ExecMapperContext ctx, List<VirtualColumn> vcs,
    Object[] vcValues, Deserializer deserializer) {
  if (vcs == null) {
    return vcValues;
  }
  if (vcValues == null) {
    vcValues = new Object[vcs.size()];
  }
  for (int i = 0; i < vcs.size(); i++) {
    switch (vcs.get(i)) {
    case FILENAME:
      if (ctx.inputFileChanged()) {
        vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
      }
      break;
    case BLOCKOFFSET:
      {
        long current = ctx.getIoCxt().getCurrentBlockStart();
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case ROWOFFSET:
      {
        long current = ctx.getIoCxt().getCurrentRow();
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case RAWDATASIZE:
      {
        long current = 0L;
        SerDeStats stats = deserializer.getSerDeStats();
        if (stats != null) {
          current = stats.getRawDataSize();
        }
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case ROWID:
      if (ctx.getIoCxt().getRecordIdentifier() == null) {
        vcValues[i] = null;
      } else {
        if (vcValues[i] == null) {
          vcValues[i] = new Object[RecordIdentifier.Field.values().length];
        }
        RecordIdentifier.StructInfo.toArray(ctx.getIoCxt().getRecordIdentifier(),
            (Object[]) vcValues[i]);
        // Clear the record identifier so we don't accidentally cache the value; this shouldn't
        // happen since the IO layer either knows how to produce ROW__ID or not - but to be safe.
        ctx.getIoCxt().setRecordIdentifier(null);
      }
      break;
    }
  }
  return vcValues;
}
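A hedged sketch of a call site for this method follows; ctx and partDeserializer are stand-ins for the ExecMapperContext and partition Deserializer the enclosing operator already holds, and only the RAWDATASIZE column actually consults the Deserializer (via getSerDeStats()).

// Hypothetical call site: refresh virtual column values for the current row.
List<VirtualColumn> vcs = Arrays.asList(
    VirtualColumn.FILENAME, VirtualColumn.BLOCKOFFSET, VirtualColumn.RAWDATASIZE);
Object[] vcValues = null;
// Reuse the returned array across rows; entries are updated in place where possible.
vcValues = MapOperator.populateVirtualColumnValues(ctx, vcs, vcValues, partDeserializer);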