Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
The class VectorMapOperator, method process.
@Override
public void process(Writable value) throws HiveException {
  // A mapper can span multiple files/partitions.
  // The VectorPartitionContext needs to be changed if the input file has changed.
  ExecMapperContext context = getExecContext();
  if (context != null && context.inputFileChanged()) {
    // The child operators clean up if the input file has changed.
    cleanUpInputFileChanged();
  }
  if (!oneRootOperator.getDone()) {
    /*
     * 3 different kinds of vectorized reading supported:
     *
     * 1) Read the Vectorized Input File Format which returns VectorizedRowBatch as the row.
     *
     * 2) Read using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
     *
     * 3) And read using the regular partition deserializer to get the row object and assign
     *    the row object into the VectorizedRowBatch with VectorAssignRow.
     */
    try {
      if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {
        /*
         * The Vectorized Input File Format reader has already set the partition column
         * values, reset and filled in the batch, etc.
         *
         * We pass the VectorizedRowBatch through here.
         */
        batchCounter++;
        if (value != null) {
          numRows += ((VectorizedRowBatch) value).size;
        }
        oneRootOperator.process(value, 0);
        if (oneRootOperator.getDone()) {
          setDone(true);
          return;
        }
      } else {
        /*
         * We have "regular" single rows from the Input File Format reader that we will need
         * to deserialize.
         */
        Preconditions.checkState(
            currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE
            || currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE);
        if (deserializerBatch.size == deserializerBatch.DEFAULT_SIZE) {
          numRows += deserializerBatch.size;
          /*
           * Feed current full batch to operator tree.
           */
          batchCounter++;
          oneRootOperator.process(deserializerBatch, 0);
          /*
           * Only reset the current data columns. Not any data columns defaulted to NULL
           * because they are not present in the partition, and not partition columns.
           */
          for (int c = 0; c < currentDataColumnCount; c++) {
            ColumnVector colVector = deserializerBatch.cols[c];
            if (colVector != null) {
              colVector.reset();
              colVector.init();
            }
          }
          deserializerBatch.selectedInUse = false;
          deserializerBatch.size = 0;
          deserializerBatch.endOfFile = false;
          if (oneRootOperator.getDone()) {
            setDone(true);
            return;
          }
        }
        /*
         * Do the {vector|row} deserialization of the one row into the VectorizedRowBatch.
         */
        switch (currentReadType) {
        case VECTOR_DESERIALIZE:
          {
            BinaryComparable binComp = (BinaryComparable) value;
            currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength());
            // Deserialize and append new row using the current batch size as the index.
            try {
              currentVectorDeserializeRow.deserialize(deserializerBatch, deserializerBatch.size++);
            } catch (Exception e) {
              throw new HiveException(
                  "\nDeserializeRead detail: " + currentVectorDeserializeRow.getDetailedReadPositionString(), e);
            }
          }
          break;
        case ROW_DESERIALIZE:
          {
            Object deserialized = currentPartDeserializer.deserialize(value);
            // Note: Regardless of what the Input File Format returns, we have determined
            // with VectorAppendRow.initConversion that only currentDataColumnCount columns
            // have values we want.
            //
            // Any extra columns needed by the table schema were set to repeating null
            // in the batch by setupPartitionContextVars.
            // Convert input row to standard objects.
            List<Object> standardObjects = new ArrayList<Object>();
            ObjectInspectorUtils.copyToStandardObject(standardObjects, deserialized,
                currentPartRawRowObjectInspector, ObjectInspectorCopyOption.WRITABLE);
            if (standardObjects.size() < currentDataColumnCount) {
              throw new HiveException("Input File Format returned row with too few columns");
            }
            // Append the deserialized standard object row using the current batch size
            // as the index.
            currentVectorAssign.assignRow(deserializerBatch, deserializerBatch.size++,
                standardObjects, currentDataColumnCount);
          }
          break;
        default:
          throw new RuntimeException("Unexpected vector MapOperator read type " + currentReadType.name());
        }
      }
    } catch (Exception e) {
      throw new HiveException("Hive Runtime Error while processing row ", e);
    }
  }
}
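The ROW_DESERIALIZE branch above boils down to the plain Deserializer contract: initialize the SerDe with the table schema, hand it one Writable row, and copy the result into standard objects. The following is a minimal, self-contained sketch of that contract using LazySimpleSerDe; the class name RowDeserializeSketch, the column names, the delimiter, and the sample line are made up for illustration and are not part of the Hive code above.

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.io.Text;

public class RowDeserializeSketch {
  public static void main(String[] args) throws Exception {
    // Table schema the SerDe is initialized with (illustrative values).
    Properties schema = new Properties();
    schema.setProperty(serdeConstants.LIST_COLUMNS, "id,name");
    schema.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,string");
    schema.setProperty(serdeConstants.FIELD_DELIM, ",");

    Deserializer deserializer = new LazySimpleSerDe();
    // Same initialization helper the Hive code paths use; no partition-level overrides here.
    SerDeUtils.initializeSerDe(deserializer, new Configuration(), schema, null);

    // One "regular" row from an input format, as a Writable.
    Object row = deserializer.deserialize(new Text("7,hive"));

    // Convert the lazy row into standard (Writable) objects, as the ROW_DESERIALIZE path does
    // before assigning the values into the VectorizedRowBatch.
    List<Object> standardObjects = new ArrayList<Object>();
    ObjectInspectorUtils.copyToStandardObject(standardObjects, row,
        deserializer.getObjectInspector(), ObjectInspectorCopyOption.WRITABLE);
    System.out.println(standardObjects); // e.g. [7, hive]
  }
}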
Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
The class OpProcFactory, method pushFilterToStorageHandler.
/**
 * Attempts to push a predicate down into a storage handler. For
 * native tables, this is a no-op.
 *
 * @param tableScanOp table scan against which predicate applies
 *
 * @param originalPredicate predicate to be pushed down
 *
 * @param owi object walk info
 *
 * @param hiveConf Hive configuration
 *
 * @return portion of predicate which needs to be evaluated
 * by Hive as a post-filter, or null if it was possible
 * to push down the entire predicate
 */
private static ExprNodeGenericFuncDesc pushFilterToStorageHandler(TableScanOperator tableScanOp,
    ExprNodeGenericFuncDesc originalPredicate, OpWalkerInfo owi, HiveConf hiveConf) {
  TableScanDesc tableScanDesc = tableScanOp.getConf();
  Table tbl = tableScanDesc.getTableMetadata();
  if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTINDEXFILTER)) {
    // attach the original predicate to the table scan operator for index
    // optimizations that require the pushed predicate before pcr & later
    // optimizations are applied
    tableScanDesc.setFilterExpr(originalPredicate);
  }
  if (!tbl.isNonNative()) {
    return originalPredicate;
  }
  HiveStorageHandler storageHandler = tbl.getStorageHandler();
  if (!(storageHandler instanceof HiveStoragePredicateHandler)) {
    // The storage handler does not provide predicate decomposition
    // support, so we'll implement the entire filter in Hive. However,
    // we still provide the full predicate to the storage handler in
    // case it wants to do any of its own prefiltering.
    tableScanDesc.setFilterExpr(originalPredicate);
    return originalPredicate;
  }
  HiveStoragePredicateHandler predicateHandler = (HiveStoragePredicateHandler) storageHandler;
  JobConf jobConf = new JobConf(owi.getParseContext().getConf());
  Utilities.setColumnNameList(jobConf, tableScanOp);
  Utilities.setColumnTypeList(jobConf, tableScanOp);
  Utilities.copyTableJobPropertiesToConf(Utilities.getTableDesc(tbl), jobConf);
  Deserializer deserializer = tbl.getDeserializer();
  HiveStoragePredicateHandler.DecomposedPredicate decomposed =
      predicateHandler.decomposePredicate(jobConf, deserializer, originalPredicate);
  if (decomposed == null) {
    // not able to push anything down
    if (LOG.isDebugEnabled()) {
      LOG.debug("No pushdown possible for predicate: " + originalPredicate.getExprString());
    }
    return originalPredicate;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Original predicate: " + originalPredicate.getExprString());
    if (decomposed.pushedPredicate != null) {
      LOG.debug("Pushed predicate: " + decomposed.pushedPredicate.getExprString());
    }
    if (decomposed.residualPredicate != null) {
      LOG.debug("Residual predicate: " + decomposed.residualPredicate.getExprString());
    }
  }
  tableScanDesc.setFilterExpr(decomposed.pushedPredicate);
  tableScanDesc.setFilterObject(decomposed.pushedPredicateObject);
  return decomposed.residualPredicate;
}
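For a storage handler that wants to take part in this pushdown, the hook is HiveStoragePredicateHandler.decomposePredicate, which receives the table's Deserializer along with the predicate, exactly as invoked above. Below is a hedged sketch of a hypothetical handler (the class name PushEverythingStorageHandler is invented) that claims the whole predicate and leaves no residual for Hive to post-filter; a real handler would inspect the expression tree and split it into pushed and residual parts.

import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.JobConf;

public class PushEverythingStorageHandler extends DefaultStorageHandler
    implements HiveStoragePredicateHandler {

  @Override
  public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer,
      ExprNodeDesc predicate) {
    if (!(predicate instanceof ExprNodeGenericFuncDesc)) {
      // Returning null tells the optimizer nothing can be pushed down.
      return null;
    }
    DecomposedPredicate decomposed = new DecomposedPredicate();
    // Claim the full predicate for the storage layer ...
    decomposed.pushedPredicate = (ExprNodeGenericFuncDesc) predicate;
    // ... and leave nothing for Hive to evaluate as a post-filter.
    decomposed.residualPredicate = null;
    return decomposed;
  }
}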
Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
The class PlanUtils, method getTableDesc.
/**
 * Generate a table descriptor from a createTableDesc.
 */
public static TableDesc getTableDesc(CreateTableDesc crtTblDesc, String cols, String colTypes) {
  TableDesc ret;
  // Resolve storage handler (if any)
  try {
    HiveStorageHandler storageHandler = null;
    if (crtTblDesc.getStorageHandler() != null) {
      storageHandler = HiveUtils.getStorageHandler(SessionState.getSessionConf(),
          crtTblDesc.getStorageHandler());
    }
    Class<? extends Deserializer> serdeClass = LazySimpleSerDe.class;
    String separatorCode = Integer.toString(Utilities.ctrlaCode);
    String columns = cols;
    String columnTypes = colTypes;
    boolean lastColumnTakesRestOfTheLine = false;
    if (storageHandler != null) {
      serdeClass = storageHandler.getSerDeClass();
    } else if (crtTblDesc.getSerName() != null) {
      serdeClass = JavaUtils.loadClass(crtTblDesc.getSerName());
    }
    if (crtTblDesc.getFieldDelim() != null) {
      separatorCode = crtTblDesc.getFieldDelim();
    }
    ret = getTableDesc(serdeClass, separatorCode, columns, columnTypes,
        lastColumnTakesRestOfTheLine, false);
    // set other table properties
    Properties properties = ret.getProperties();
    if (crtTblDesc.getStorageHandler() != null) {
      properties.setProperty(
          org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE,
          crtTblDesc.getStorageHandler());
    }
    if (crtTblDesc.getCollItemDelim() != null) {
      properties.setProperty(serdeConstants.COLLECTION_DELIM, crtTblDesc.getCollItemDelim());
    }
    if (crtTblDesc.getMapKeyDelim() != null) {
      properties.setProperty(serdeConstants.MAPKEY_DELIM, crtTblDesc.getMapKeyDelim());
    }
    if (crtTblDesc.getFieldEscape() != null) {
      properties.setProperty(serdeConstants.ESCAPE_CHAR, crtTblDesc.getFieldEscape());
    }
    if (crtTblDesc.getLineDelim() != null) {
      properties.setProperty(serdeConstants.LINE_DELIM, crtTblDesc.getLineDelim());
    }
    if (crtTblDesc.getNullFormat() != null) {
      properties.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, crtTblDesc.getNullFormat());
    }
    if (crtTblDesc.getTableName() != null && crtTblDesc.getDatabaseName() != null) {
      properties.setProperty(
          org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_NAME,
          crtTblDesc.getTableName());
    }
    if (crtTblDesc.getTblProps() != null) {
      properties.putAll(crtTblDesc.getTblProps());
    }
    if (crtTblDesc.getSerdeProps() != null) {
      properties.putAll(crtTblDesc.getSerdeProps());
    }
    // replace the default input & output file format with those found in
    // crtTblDesc
    Class<? extends InputFormat> in_class;
    if (storageHandler != null) {
      in_class = storageHandler.getInputFormatClass();
    } else {
      in_class = JavaUtils.loadClass(crtTblDesc.getInputFormat());
    }
    Class<? extends OutputFormat> out_class;
    if (storageHandler != null) {
      out_class = storageHandler.getOutputFormatClass();
    } else {
      out_class = JavaUtils.loadClass(crtTblDesc.getOutputFormat());
    }
    ret.setInputFileFormatClass(in_class);
    ret.setOutputFileFormatClass(out_class);
  } catch (ClassNotFoundException e) {
    throw new RuntimeException("Unable to find class in getTableDesc: " + e.getMessage(), e);
  } catch (HiveException e) {
    throw new RuntimeException("Error loading storage handler in getTableDesc: " + e.getMessage(), e);
  }
  return ret;
}
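Most of this method is assembling the SerDe Properties that later initialize the table's Deserializer. The rough sketch below illustrates the kind of property set that ends up on the TableDesc for a delimited table; the keys come from serdeConstants and hive_metastoreConstants as used above, while the class name, column names, and delimiter values are invented for illustration and this is an approximation, not the exact output of getTableDesc.

import java.util.Properties;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.serde.serdeConstants;

public class TableDescPropertiesSketch {
  /** Builds an illustrative property set resembling what getTableDesc attaches to the TableDesc. */
  public static Properties delimitedTableProperties() {
    Properties properties = new Properties();
    // Column schema (getTableDesc derives these from the cols/colTypes arguments).
    properties.setProperty(serdeConstants.LIST_COLUMNS, "id,tags,attrs");
    properties.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,array<string>,map<string,string>");
    // Delimiters corresponding to the CreateTableDesc fields checked above.
    properties.setProperty(serdeConstants.FIELD_DELIM, "\t");
    properties.setProperty(serdeConstants.COLLECTION_DELIM, ",");
    properties.setProperty(serdeConstants.MAPKEY_DELIM, ":");
    properties.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "\\N");
    // Table identity, as set when both table name and database name are present.
    properties.setProperty(hive_metastoreConstants.META_TABLE_NAME, "demo_table");
    return properties;
  }
}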
Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
The class MetaStoreUtils, method getDeserializer.
/**
 * getDeserializer
 *
 * Get the Deserializer for a partition.
 *
 * @param conf
 *          - hadoop config
 * @param part
 *          the partition
 * @param table the table
 * @return
 *   Returns instantiated deserializer by looking up class name of deserializer stored in
 *   storage descriptor of passed in partition. Also, initializes the deserializer with
 *   schema of partition.
 * @exception MetaException
 *              if any problems instantiating the Deserializer
 */
public static Deserializer getDeserializer(Configuration conf,
    org.apache.hadoop.hive.metastore.api.Partition part,
    org.apache.hadoop.hive.metastore.api.Table table) throws MetaException {
  String lib = part.getSd().getSerdeInfo().getSerializationLib();
  try {
    Deserializer deserializer = ReflectionUtil.newInstance(
        conf.getClassByName(lib).asSubclass(Deserializer.class), conf);
    SerDeUtils.initializeSerDe(deserializer, conf, MetaStoreUtils.getTableMetadata(table),
        MetaStoreUtils.getPartitionMetadata(part, table));
    return deserializer;
  } catch (RuntimeException e) {
    throw e;
  } catch (Exception e) {
    LOG.error("error in initSerDe: " + e.getClass().getName() + " " + e.getMessage(), e);
    throw new MetaException(e.getClass().getName() + " " + e.getMessage());
  }
}
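Stripped of the metastore plumbing, the pattern is: resolve the SerDe class named in the storage descriptor, instantiate it reflectively against the Configuration, and initialize it with the table/partition schema. A hedged sketch of just that pattern follows; the class name, the hard-coded serialization lib, and the placeholder schema are invented here rather than taken from a Partition object, and ReflectionUtil is assumed to be hive-common's org.apache.hive.common.util.ReflectionUtil.

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hive.common.util.ReflectionUtil;

public class ReflectiveDeserializerSketch {
  public static Deserializer forSerializationLib(Configuration conf) throws Exception {
    // In getDeserializer above this comes from part.getSd().getSerdeInfo().getSerializationLib().
    String lib = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe";

    // Placeholder schema; MetaStoreUtils derives the real one from the table and partition.
    Properties tableSchema = new Properties();
    tableSchema.setProperty(serdeConstants.LIST_COLUMNS, "id,name");
    tableSchema.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,string");

    // Look up the class by name, check it is a Deserializer, and instantiate it reflectively.
    Deserializer deserializer = ReflectionUtil.newInstance(
        conf.getClassByName(lib).asSubclass(Deserializer.class), conf);
    SerDeUtils.initializeSerDe(deserializer, conf, tableSchema, null);
    return deserializer;
  }
}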
Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
The class MapOperator, method populateVirtualColumnValues.
public static Object[] populateVirtualColumnValues(ExecMapperContext ctx, List<VirtualColumn> vcs,
    Object[] vcValues, Deserializer deserializer) {
  if (vcs == null) {
    return vcValues;
  }
  if (vcValues == null) {
    vcValues = new Object[vcs.size()];
  }
  for (int i = 0; i < vcs.size(); i++) {
    switch (vcs.get(i)) {
    case FILENAME:
      if (ctx.inputFileChanged()) {
        vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
      }
      break;
    case BLOCKOFFSET:
      {
        long current = ctx.getIoCxt().getCurrentBlockStart();
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case ROWOFFSET:
      {
        long current = ctx.getIoCxt().getCurrentRow();
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case RAWDATASIZE:
      long current = 0L;
      SerDeStats stats = deserializer.getSerDeStats();
      if (stats != null) {
        current = stats.getRawDataSize();
      }
      LongWritable old = (LongWritable) vcValues[i];
      if (old == null) {
        old = new LongWritable(current);
        vcValues[i] = old;
        continue;
      }
      if (current != old.get()) {
        old.set(current);
      }
      break;
    case ROWID:
      if (ctx.getIoCxt().getRecordIdentifier() == null) {
        vcValues[i] = null;
      } else {
        if (vcValues[i] == null) {
          vcValues[i] = new Object[RecordIdentifier.Field.values().length];
        }
        RecordIdentifier.StructInfo.toArray(ctx.getIoCxt().getRecordIdentifier(),
            (Object[]) vcValues[i]);
        // So we don't accidentally cache the value; shouldn't happen since the IO layer
        // either knows how to produce ROW__ID or not - but to be safe.
        ctx.getIoCxt().setRecordIdentifier(null);
      }
      break;
    }
  }
  return vcValues;
}
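The LongWritable handling above repeats one pattern per virtual column: keep a single mutable Writable, create it lazily, and only mutate it when the underlying value changes, so no per-row allocation reaches the operator tree. Below is a minimal sketch of that pattern for the RAWDATASIZE case, factored into its own holder; the class and method names are hypothetical, only Deserializer.getSerDeStats() and SerDeStats.getRawDataSize() come from the code above.

import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.io.LongWritable;

/** Reusable holder for a RAWDATASIZE-style virtual column value. */
final class RawDataSizeColumn {
  private LongWritable value;

  /** Refreshes the held value from the SerDe's statistics, if the SerDe tracks them. */
  LongWritable update(Deserializer deserializer) {
    SerDeStats stats = deserializer.getSerDeStats();
    long current = (stats == null) ? 0L : stats.getRawDataSize();
    if (value == null) {
      // Lazily create the Writable on first use, mirroring the null check above.
      value = new LongWritable(current);
    } else if (current != value.get()) {
      // Mutate in place instead of allocating a new LongWritable per row.
      value.set(current);
    }
    return value;
  }
}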