Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
The class MapOperator, method initObjectInspector.
private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx,
    StructObjectInspector tableRowOI) throws Exception {
  PartitionDesc pd = opCtx.partDesc;
  TableDesc td = pd.getTableDesc();
  // Use table properties in case of unpartitioned tables,
  // and the union of table properties and partition properties, with partition
  // taking precedence, in the case of partitioned tables
  Properties overlayedProps =
      SerDeUtils.createOverlayedProperties(td.getProperties(), pd.getProperties());
  Map<String, String> partSpec = pd.getPartSpec();
  opCtx.tableName = String.valueOf(overlayedProps.getProperty("name"));
  opCtx.partName = String.valueOf(partSpec);
  opCtx.deserializer = pd.getDeserializer(hconf);
  StructObjectInspector partRawRowObjectInspector;
  boolean isAcid = AcidUtils.isTablePropertyTransactional(td.getProperties());
  if (Utilities.isSchemaEvolutionEnabled(hconf, isAcid) && Utilities.isInputFileFormatSelfDescribing(pd)) {
    partRawRowObjectInspector = tableRowOI;
  } else {
    partRawRowObjectInspector = (StructObjectInspector) opCtx.deserializer.getObjectInspector();
  }
  opCtx.partTblObjectInspectorConverter =
      ObjectInspectorConverters.getConverter(partRawRowObjectInspector, tableRowOI);
  // Next check if this table has partitions and if so
  // get the list of partition names as well as allocate
  // the serdes for the partition columns
  String pcols = overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
  if (pcols != null && pcols.length() > 0) {
    String[] partKeys = pcols.trim().split("/");
    String pcolTypes = overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
    String[] partKeyTypes = pcolTypes.trim().split(":");
    if (partKeys.length > partKeyTypes.length) {
      throw new HiveException("Internal error : partKeys length, " + partKeys.length +
          " greater than partKeyTypes length, " + partKeyTypes.length);
    }
    List<String> partNames = new ArrayList<String>(partKeys.length);
    Object[] partValues = new Object[partKeys.length];
    List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(partKeys.length);
    for (int i = 0; i < partKeys.length; i++) {
      String key = partKeys[i];
      partNames.add(key);
      ObjectInspector oi = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
          TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));
      // Partitions do not exist for this table
      if (partSpec == null) {
        // for partitionless table, initialize partValue to null
        partValues[i] = null;
      } else {
        partValues[i] = ObjectInspectorConverters.getConverter(
            PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi).convert(partSpec.get(key));
      }
      partObjectInspectors.add(oi);
    }
    opCtx.rowWithPart = new Object[] { null, partValues };
    opCtx.partObjectInspector =
        ObjectInspectorFactory.getStandardStructObjectInspector(partNames, partObjectInspectors);
  }
  // The op may not be a TableScanOperator (for example it can be a Select);
  // in that case the rowOI need not be amended with virtual columns
  if (opCtx.op instanceof TableScanOperator) {
    TableScanOperator tsOp = (TableScanOperator) opCtx.op;
    TableScanDesc tsDesc = tsOp.getConf();
    if (tsDesc != null && tsDesc.hasVirtualCols()) {
      opCtx.vcs = tsDesc.getVirtualCols();
      opCtx.vcValues = new Object[opCtx.vcs.size()];
      opCtx.vcsObjectInspector = VirtualColumn.getVCSObjectInspector(opCtx.vcs);
      if (opCtx.isPartitioned()) {
        opCtx.rowWithPartAndVC = Arrays.copyOfRange(opCtx.rowWithPart, 0, 3);
      } else {
        opCtx.rowWithPartAndVC = new Object[2];
      }
    }
  }
  if (!opCtx.hasVC() && !opCtx.isPartitioned()) {
    opCtx.rowObjectInspector = tableRowOI;
    return opCtx;
  }
  List<StructObjectInspector> inspectors = new ArrayList<StructObjectInspector>();
  inspectors.add(tableRowOI);
  if (opCtx.isPartitioned()) {
    inspectors.add(opCtx.partObjectInspector);
  }
  if (opCtx.hasVC()) {
    inspectors.add(opCtx.vcsObjectInspector);
  }
  opCtx.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(inspectors);
  return opCtx;
}
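
To make the partition-handling path above more concrete, here is a minimal sketch (not taken from the Hive source) that walks the same steps for a hypothetical table partitioned by ds (string) and hr (int). The property values, the partition spec contents, and the variable names are illustrative assumptions; the factory and converter classes are the same serde2 ones already used by initObjectInspector, and imports are assumed to match that class.

// Hypothetical values, mirroring what the metastore puts into the overlayed
// properties for a table partitioned by ds (string) and hr (int).
String pcols = "ds/hr";              // META_TABLE_PARTITION_COLUMNS ("/"-separated)
String pcolTypes = "string:int";     // META_TABLE_PARTITION_COLUMN_TYPES (":"-separated)
Map<String, String> partSpec = new HashMap<String, String>();
partSpec.put("ds", "2017-01-01");
partSpec.put("hr", "11");

String[] partKeys = pcols.trim().split("/");
String[] partKeyTypes = pcolTypes.trim().split(":");
List<String> partNames = new ArrayList<String>(partKeys.length);
List<ObjectInspector> partOIs = new ArrayList<ObjectInspector>(partKeys.length);
Object[] partValues = new Object[partKeys.length];
for (int i = 0; i < partKeys.length; i++) {
  partNames.add(partKeys[i]);
  ObjectInspector oi = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
      TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));
  // partition values arrive as strings and are converted to the declared writable type
  partValues[i] = ObjectInspectorConverters.getConverter(
      PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi).convert(partSpec.get(partKeys[i]));
  partOIs.add(oi);
}
StructObjectInspector partOI =
    ObjectInspectorFactory.getStandardStructObjectInspector(partNames, partOIs);

In the method above, this struct inspector plays the role of opCtx.partObjectInspector, which is later unioned with the table row inspector (and the virtual-column inspector, if any) to form opCtx.rowObjectInspector.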
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
The class JoinUtil, method initSpillTables.
public static TableDesc[] initSpillTables(JoinDesc conf, boolean noFilter) {
  int tagLen = conf.getTagLength();
  Map<Byte, List<ExprNodeDesc>> exprs = conf.getExprs();
  TableDesc[] spillTableDesc = new TableDesc[tagLen];
  for (int tag = 0; tag < exprs.size(); tag++) {
    List<ExprNodeDesc> valueCols = exprs.get((byte) tag);
    int columnSize = valueCols.size();
    StringBuilder colNames = new StringBuilder();
    StringBuilder colTypes = new StringBuilder();
    if (columnSize <= 0) {
      continue;
    }
    for (int k = 0; k < columnSize; k++) {
      String newColName = tag + "_VALUE_" + k; // any name, it does not matter.
      colNames.append(newColName);
      colNames.append(',');
      colTypes.append(valueCols.get(k).getTypeString());
      colTypes.append(',');
    }
    if (!noFilter) {
      colNames.append("filtered");
      colNames.append(',');
      colTypes.append(TypeInfoFactory.shortTypeInfo.getTypeName());
      colTypes.append(',');
    }
    // remove the last ','
    colNames.setLength(colNames.length() - 1);
    colTypes.setLength(colTypes.length() - 1);
    TableDesc tblDesc = new TableDesc(
        SequenceFileInputFormat.class, HiveSequenceFileOutputFormat.class,
        Utilities.makeProperties(
            org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode,
            org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMNS, colNames.toString(),
            org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMN_TYPES, colTypes.toString(),
            serdeConstants.SERIALIZATION_LIB, LazyBinarySerDe.class.getName()));
    spillTableDesc[tag] = tblDesc;
  }
  return spillTableDesc;
}
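
As a rough illustration of the column-name and column-type strings this method builds, here is a sketch assuming a hypothetical tag 1 with one int and one string value column and noFilter == false; the concrete column types are made up for the example.

StringBuilder colNames = new StringBuilder();
StringBuilder colTypes = new StringBuilder();
colNames.append("1_VALUE_0").append(',');
colTypes.append("int").append(',');
colNames.append("1_VALUE_1").append(',');
colTypes.append("string").append(',');
// the extra "filtered" column carries the join filter tag, stored as Hive's short type
colNames.append("filtered").append(',');
colTypes.append(TypeInfoFactory.shortTypeInfo.getTypeName()).append(',');
// remove the last ',' exactly as the method above does
colNames.setLength(colNames.length() - 1);
colTypes.setLength(colTypes.length() - 1);
// colNames.toString() -> "1_VALUE_0,1_VALUE_1,filtered"
// colTypes.toString() -> "int,string," followed by the short type name

These strings end up in the spill TableDesc as LIST_COLUMNS and LIST_COLUMN_TYPES, with LazyBinarySerDe recorded as the serialization library.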
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
The class JoinUtil, method getSpillSerDe.
public static AbstractSerDe getSpillSerDe(byte alias, TableDesc[] spillTableDesc,
    JoinDesc conf, boolean noFilter) {
  TableDesc desc = getSpillTableDesc(alias, spillTableDesc, conf, noFilter);
  if (desc == null) {
    return null;
  }
  AbstractSerDe sd = (AbstractSerDe) ReflectionUtil.newInstance(desc.getDeserializerClass(), null);
  try {
    SerDeUtils.initializeSerDe(sd, null, desc.getProperties(), null);
  } catch (SerDeException e) {
    e.printStackTrace();
    return null;
  }
  return sd;
}
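
A hedged usage sketch for the two JoinUtil helpers above: joinDesc stands in for whatever JoinDesc the caller already has, and the tag value is arbitrary.

// Build spill table descriptors for every join tag, then fetch the serde for tag 1.
TableDesc[] spillTableDesc = JoinUtil.initSpillTables(joinDesc, false /* noFilter */);
AbstractSerDe spillSerDe = JoinUtil.getSpillSerDe((byte) 1, spillTableDesc, joinDesc, false);
// spillSerDe should be the serialization library recorded in the spill TableDesc
// (LazyBinarySerDe per initSpillTables); it is null if tag 1 has no value columns
// or if initialization failed (see the catch block above).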
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
The class MapJoinOperator, method generateMapMetaData.
public void generateMapMetaData() throws HiveException {
  try {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    AbstractSerDe keySerializer =
        (AbstractSerDe) ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null);
    MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false);
    for (int pos = 0; pos < order.length; pos++) {
      if (pos == posBigTable) {
        continue;
      }
      TableDesc valueTableDesc;
      if (conf.getNoOuterJoin()) {
        valueTableDesc = conf.getValueTblDescs().get(pos);
      } else {
        valueTableDesc = conf.getValueFilteredTblDescs().get(pos);
      }
      AbstractSerDe valueSerDe =
          (AbstractSerDe) ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
      MapJoinObjectSerDeContext valueContext =
          new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos));
      mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContext);
    }
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
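
The key and value serdes above are created with the same two-step pattern. A minimal sketch of that pattern in isolation, where tableDesc is a placeholder for conf.getKeyTblDesc() or one of the value table descriptors:

try {
  // Instantiate the serde class recorded in the TableDesc, then initialize it
  // from the descriptor's properties.
  AbstractSerDe serDe =
      (AbstractSerDe) ReflectionUtil.newInstance(tableDesc.getDeserializerClass(), null);
  SerDeUtils.initializeSerDe(serDe, null, tableDesc.getProperties(), null);
} catch (SerDeException e) {
  throw new HiveException(e);
}

generateMapMetaData then wraps each initialized serde in a MapJoinObjectSerDeContext, with the boolean flag (false for the key, hasFilter(pos) for values) signalling whether rows for that position carry a filter tag.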
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
The class IndexWhereProcessor, method process.
/**
 * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
    Object... nodeOutputs) throws SemanticException {
  TableScanOperator operator = (TableScanOperator) nd;
  List<Node> opChildren = operator.getChildren();
  TableScanDesc operatorDesc = operator.getConf();
  if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
    return null;
  }
  List<Index> indexes = tsToIndices.get(operator);
  ExprNodeDesc predicate = operatorDesc.getFilterExpr();
  IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
  ParseContext pctx = context.getParseContext();
  LOG.info("Processing predicate for index optimization");
  if (predicate == null) {
    LOG.info("null predicate pushed down");
    return null;
  }
  LOG.info(predicate.getExprString());
  // check if we have indexes on all partitions in this table scan
  Set<Partition> queryPartitions;
  try {
    queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
    if (queryPartitions == null) {
      // partitions not covered
      return null;
    }
  } catch (HiveException e) {
    LOG.error("Fatal Error: problem accessing metastore", e);
    throw new SemanticException(e);
  }
  // we can only process MapReduce tasks to check input size
  if (!context.getCurrentTask().isMapRedTask()) {
    return null;
  }
  MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
  // get potential reentrant index queries from each index
  Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
  // make sure we have an index on the table being scanned
  TableDesc tblDesc = operator.getTableDesc();
  Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
  for (Index indexOnTable : indexes) {
    if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
      List<Index> newType = new ArrayList<Index>();
      newType.add(indexOnTable);
      indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
    } else {
      indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
    }
  }
  // choose the index type with the most indexes of the same type on the table
  // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
  List<Index> bestIndexes = indexesByType.values().iterator().next();
  for (List<Index> indexTypes : indexesByType.values()) {
    if (bestIndexes.size() < indexTypes.size()) {
      bestIndexes = indexTypes;
    }
  }
  // rewrite index queries for the chosen index type
  HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
  tmpQueryContext.setQueryPartitions(queryPartitions);
  rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
  List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
  if (indexTasks != null && indexTasks.size() > 0) {
    queryContexts.put(bestIndexes.get(0), tmpQueryContext);
  }
  // choose an index rewrite to use
  if (queryContexts.size() > 0) {
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    Index chosenIndex = queryContexts.keySet().iterator().next();
    // modify the parse context to use indexing
    // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
    HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
    // prepare the map reduce job to use indexing
    MapWork work = currentTask.getWork().getMapWork();
    work.setInputformat(queryContext.getIndexInputFormat());
    work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
    // modify inputs based on index query
    Set<ReadEntity> inputs = pctx.getSemanticInputs();
    inputs.addAll(queryContext.getAdditionalSemanticInputs());
    List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
    // add dependencies so index query runs first
    insertIndexQuery(pctx, context, chosenRewrite);
  }
  return null;
}
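
The index-type selection in the middle of process can be illustrated on its own. Below is a sketch with plain strings standing in for metastore Index objects; the handler class names and index names are purely illustrative.

// Group indexes by handler class, then keep the largest group.
Map<String, List<String>> indexesByType = new HashMap<String, List<String>>();
indexesByType.put("org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler",
    Arrays.asList("idx_key_compact", "idx_value_compact"));
indexesByType.put("org.apache.hadoop.hive.ql.index.bitmap.BitmapIndexHandler",
    Arrays.asList("idx_key_bitmap"));
List<String> bestIndexes = indexesByType.values().iterator().next();
for (List<String> sameType : indexesByType.values()) {
  if (bestIndexes.size() < sameType.size()) {
    bestIndexes = sameType;
  }
}
// bestIndexes now holds the two compact-handler indexes, so only that handler's
// rewrite would be attempted.

Whichever group is largest wins; the TODO comments in the method above note that a cost-based choice would fit better here.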