Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class DDLTask, method describeTable.
/**
* Write the description of a table to a file.
*
* @param db
* The database in question.
* @param descTbl
* This is the table we're interested in.
* @return Returns 0 when execution succeeds and above 0 if it fails.
* @throws HiveException
* Throws this exception if an unexpected error occurs.
* @throws MetaException
*/
private int describeTable(Hive db, DescTableDesc descTbl) throws HiveException, MetaException {
String colPath = descTbl.getColumnPath();
String tableName = descTbl.getTableName();
// describe the table - populate the output stream
Table tbl = db.getTable(tableName, false);
if (tbl == null) {
throw new HiveException(ErrorMsg.INVALID_TABLE, tableName);
}
Partition part = null;
if (descTbl.getPartSpec() != null) {
part = db.getPartition(tbl, descTbl.getPartSpec(), false);
if (part == null) {
throw new HiveException(ErrorMsg.INVALID_PARTITION, StringUtils.join(descTbl.getPartSpec().keySet(), ','), tableName);
}
tbl = part.getTable();
}
DataOutputStream outStream = getOutputStream(descTbl.getResFile());
try {
LOG.debug("DDLTask: got data for {}", tableName);
List<FieldSchema> cols = null;
List<ColumnStatisticsObj> colStats = null;
Deserializer deserializer = tbl.getDeserializer(true);
if (deserializer instanceof AbstractSerDe) {
String errorMsgs = ((AbstractSerDe) deserializer).getConfigurationErrors();
if (errorMsgs != null && !errorMsgs.isEmpty()) {
throw new SQLException(errorMsgs);
}
}
if (colPath.equals(tableName)) {
cols = (part == null || tbl.getTableType() == TableType.VIRTUAL_VIEW) ? tbl.getCols() : part.getCols();
if (!descTbl.isFormatted()) {
cols.addAll(tbl.getPartCols());
}
if (tbl.isPartitioned() && part == null) {
// No partition specified for the partitioned table, let's fetch all.
Map<String, String> tblProps = tbl.getParameters() == null ? new HashMap<String, String>() : tbl.getParameters();
Map<String, Long> valueMap = new HashMap<>();
Map<String, Boolean> stateMap = new HashMap<>();
for (String stat : StatsSetupConst.supportedStats) {
valueMap.put(stat, 0L);
stateMap.put(stat, true);
}
PartitionIterable parts = new PartitionIterable(db, tbl, null, conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
int numParts = 0;
for (Partition partition : parts) {
Map<String, String> props = partition.getParameters();
Boolean state = StatsSetupConst.areBasicStatsUptoDate(props);
for (String stat : StatsSetupConst.supportedStats) {
stateMap.put(stat, stateMap.get(stat) && state);
if (props != null && props.get(stat) != null) {
valueMap.put(stat, valueMap.get(stat) + Long.parseLong(props.get(stat)));
}
}
numParts++;
}
for (String stat : StatsSetupConst.supportedStats) {
StatsSetupConst.setBasicStatsState(tblProps, Boolean.toString(stateMap.get(stat)));
tblProps.put(stat, valueMap.get(stat).toString());
}
tblProps.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(numParts));
tbl.setParameters(tblProps);
}
} else {
if (descTbl.isFormatted()) {
// When a column name is specified in the DESCRIBE TABLE DDL, colPath will
// be table_name.column_name
String colName = colPath.split("\\.")[1];
String[] dbTab = Utilities.getDbTableName(tableName);
List<String> colNames = new ArrayList<String>();
colNames.add(colName.toLowerCase());
if (null == part) {
if (tbl.isPartitioned()) {
Map<String, String> tblProps = tbl.getParameters() == null ? new HashMap<String, String>() : tbl.getParameters();
if (tbl.isPartitionKey(colNames.get(0))) {
FieldSchema partCol = tbl.getPartColByName(colNames.get(0));
cols = Collections.singletonList(partCol);
PartitionIterable parts = new PartitionIterable(db, tbl, null, conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
ColumnInfo ci = new ColumnInfo(partCol.getName(), TypeInfoUtils.getTypeInfoFromTypeString(partCol.getType()), null, false);
ColStatistics cs = StatsUtils.getColStatsForPartCol(ci, parts, conf);
ColumnStatisticsData data = new ColumnStatisticsData();
ColStatistics.Range r = cs.getRange();
StatObjectConverter.fillColumnStatisticsData(partCol.getType(), data,
    r == null ? null : r.minValue, r == null ? null : r.maxValue,
    r == null ? null : r.minValue, r == null ? null : r.maxValue,
    r == null ? null : r.minValue.toString(), r == null ? null : r.maxValue.toString(),
    cs.getNumNulls(), cs.getCountDistint(), null,
    cs.getAvgColLen(), cs.getAvgColLen(), cs.getNumTrues(), cs.getNumFalses());
ColumnStatisticsObj cso = new ColumnStatisticsObj(partCol.getName(), partCol.getType(), data);
colStats = Collections.singletonList(cso);
StatsSetupConst.setColumnStatsState(tblProps, colNames);
} else {
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
List<String> parts = db.getPartitionNames(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), (short) -1);
AggrStats aggrStats = db.getAggrColStatsFor(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), colNames, parts);
colStats = aggrStats.getColStats();
if (parts.size() == aggrStats.getPartsFound()) {
StatsSetupConst.setColumnStatsState(tblProps, colNames);
} else {
StatsSetupConst.removeColumnStatsState(tblProps, colNames);
}
}
tbl.setParameters(tblProps);
} else {
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
colStats = db.getTableColumnStatistics(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), colNames);
}
} else {
List<String> partitions = new ArrayList<String>();
partitions.add(part.getName());
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
colStats = db.getPartitionColumnStatistics(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), partitions, colNames).get(part.getName());
}
} else {
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
}
}
PrimaryKeyInfo pkInfo = null;
ForeignKeyInfo fkInfo = null;
UniqueConstraint ukInfo = null;
NotNullConstraint nnInfo = null;
DefaultConstraint dInfo = null;
CheckConstraint cInfo = null;
if (descTbl.isExt() || descTbl.isFormatted()) {
pkInfo = db.getPrimaryKeys(tbl.getDbName(), tbl.getTableName());
fkInfo = db.getForeignKeys(tbl.getDbName(), tbl.getTableName());
ukInfo = db.getUniqueConstraints(tbl.getDbName(), tbl.getTableName());
nnInfo = db.getNotNullConstraints(tbl.getDbName(), tbl.getTableName());
dInfo = db.getDefaultConstraints(tbl.getDbName(), tbl.getTableName());
cInfo = db.getCheckConstraints(tbl.getDbName(), tbl.getTableName());
}
fixDecimalColumnTypeName(cols);
// In case the query is served by HiveServer2, don't pad it with spaces,
// as HiveServer2 output is consumed by JDBC/ODBC clients.
boolean isOutputPadded = !SessionState.get().isHiveServerQuery();
formatter.describeTable(outStream, colPath, tableName, tbl, part, cols, descTbl.isFormatted(), descTbl.isExt(), isOutputPadded, colStats, pkInfo, fkInfo, ukInfo, nnInfo, dInfo, cInfo);
LOG.debug("DDLTask: written data for {}", tableName);
} catch (SQLException e) {
throw new HiveException(e, ErrorMsg.GENERIC_ERROR, tableName);
} finally {
IOUtils.closeStream(outStream);
}
return 0;
}
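The snippets on this page read only a handful of ColStatistics accessors. The following is a minimal sketch, not part of Hive, that renders one ColStatistics object into a one-line summary for debugging; the class name ColStatsPrinter is hypothetical, and it assumes only getters already used elsewhere on this page (getColumnName, getCountDistint, getNumNulls, getAvgColLen, getNumTrues, getNumFalses, getRange).

import org.apache.hadoop.hive.ql.plan.ColStatistics;

// Hypothetical helper: summarizes a ColStatistics instance, e.g. one of the
// objects built for describeTable above.
public final class ColStatsPrinter {

  private ColStatsPrinter() {
  }

  public static String summarize(ColStatistics cs) {
    StringBuilder sb = new StringBuilder();
    sb.append(cs.getColumnName())
        .append(": ndv=").append(cs.getCountDistint()) // Hive spells this accessor "getCountDistint"
        .append(", nulls=").append(cs.getNumNulls())
        .append(", avgColLen=").append(cs.getAvgColLen())
        .append(", trues=").append(cs.getNumTrues())
        .append(", falses=").append(cs.getNumFalses());
    ColStatistics.Range r = cs.getRange();
    if (r != null) {
      // Range exposes its bounds as public fields, as used in HiveRelJsonImpl below.
      sb.append(", min=").append(r.minValue).append(", max=").append(r.maxValue);
    }
    return sb.toString();
  }
}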
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class RelOptHiveTable, method getColStat.
/**
* Note: DOES NOT CHECK txn stats.
*/
public List<ColStatistics> getColStat(List<Integer> projIndxLst, boolean allowMissingStats) {
List<ColStatistics> colStatsBldr = Lists.newArrayList();
Set<Integer> projIndxSet = new HashSet<>(projIndxLst);
for (Integer i : projIndxLst) {
if (i >= noOfNonVirtualCols) {
projIndxSet.remove(i);
} else if (hiveColStatsMap.get(i) != null) {
colStatsBldr.add(hiveColStatsMap.get(i));
projIndxSet.remove(i);
}
}
if (!projIndxSet.isEmpty()) {
LOG.info("Calculating column statistics for {}, projIndxSet: {}, allowMissingStats: {}", name, projIndxLst, allowMissingStats);
updateColStats(projIndxSet, allowMissingStats);
for (Integer i : projIndxSet) {
colStatsBldr.add(hiveColStatsMap.get(i));
}
}
return colStatsBldr;
}
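Callers pass the projection indexes of the columns they need; a typical invocation asks for every non-virtual column, as HiveRelJsonImpl does further down this page. A minimal usage sketch, assuming only methods shown on this page (getColStat, getNoOfNonVirtualCols); the wrapper class name is hypothetical.

import java.util.List;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
import org.apache.hadoop.hive.ql.plan.ColStatistics;

// Hypothetical example: fetch statistics for all non-virtual columns and print their nDVs.
public final class ColStatExample {
  public static void printNdvs(RelOptHiveTable table) {
    // allowMissingStats=true tolerates columns without persisted statistics.
    List<ColStatistics> colStats =
        table.getColStat(ImmutableBitSet.range(0, table.getNoOfNonVirtualCols()).asList(), true);
    for (ColStatistics cs : colStats) {
      System.out.println(cs.getColumnName() + " -> ndv " + cs.getCountDistint());
    }
  }
}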
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class HiveRelJsonImpl, method explain_.
// ~ Methods ------------------------------------------------------------------
@Override
protected void explain_(RelNode rel, List<Pair<String, Object>> values) {
super.explain_(rel, values);
RelMetadataQuery mq = rel.getCluster().getMetadataQuery();
Map<String, Object> map = (Map<String, Object>) relList.get(relList.size() - 1);
map.put("rowCount", mq.getRowCount(rel));
if (rel.getInputs().size() == 0) {
// This is a leaf, we will print the average row size and schema
map.put("avgRowSize", mq.getAverageRowSize(rel));
map.put("rowType", relJson.toJson(rel.getRowType()));
// We also include partition columns information
RelOptHiveTable table = (RelOptHiveTable) rel.getTable();
List<Object> list = jsonBuilder.list();
list.addAll(table.getHiveTableMD().getPartColNames());
if (!list.isEmpty()) {
map.put("partitionColumns", list);
}
// We also include column stats
List<ColStatistics> colStats = table.getColStat(ImmutableBitSet.range(0, table.getNoOfNonVirtualCols()).asList(), true);
list = jsonBuilder.list();
for (ColStatistics cs : colStats) {
final Map<String, Object> csMap = jsonBuilder.map();
csMap.put("name", cs.getColumnName());
csMap.put("ndv", cs.getCountDistint());
if (cs.getRange() != null) {
csMap.put("minValue", cs.getRange().minValue);
csMap.put("maxValue", cs.getRange().maxValue);
}
list.add(csMap);
}
if (!list.isEmpty()) {
map.put("colStats", list);
}
}
}
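For a leaf RelNode, the resulting JSON entry therefore carries rowCount, avgRowSize, rowType, an optional partitionColumns list, and a colStats list whose elements record name, ndv, and, when a range is known, minValue and maxValue.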
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class TezCompiler, method markSemiJoinForDPP.
private void markSemiJoinForDPP(OptimizeTezProcContext procCtx) throws SemanticException {
// Stores the Tablescan operators processed to avoid redoing them.
Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
for (ReduceSinkOperator rs : map.keySet()) {
SemiJoinBranchInfo sjInfo = map.get(rs);
TableScanOperator ts = sjInfo.getTsOp();
if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
continue;
}
// A TS can have multiple branches due to DPP or semijoin optimization.
// Use DFS to traverse all the branches until RS or DPP is hit.
Deque<Operator<?>> deque = new LinkedList<>();
deque.add(ts);
while (!deque.isEmpty()) {
Operator<?> op = deque.pollLast();
if (op instanceof AppMasterEventOperator && ((AppMasterEventOperator) op).getConf() instanceof DynamicPruningEventDesc) {
// DPP. Now look up nDVs on both sides to see the selectivity.
// <Parent Ops>-SEL-GB1-RS1-GB2-RS2
SelectOperator selOp = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
try {
// Get nDVs on Semijoin edge side
Statistics stats = selOp.getStatistics();
if (stats == null) {
// No stats found on semijoin edge, do nothing
break;
}
String selCol = ExprNodeDescUtils.extractColName(selOp.getConf().getColList().get(0));
ColStatistics colStatisticsSJ = stats.getColumnStatisticsFromColName(selCol);
if (colStatisticsSJ == null) {
// No column stats found for semijoin edge
break;
}
long nDVs = colStatisticsSJ.getCountDistint();
if (nDVs > 0) {
// Lookup nDVs on TS side.
RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
// TODO Handle multi column semi-joins as part of HIVE-23934
ExprNodeDesc tsExpr = rti.getTargetColumns().get(0);
FilterOperator fil = (FilterOperator) (ts.getChildOperators().get(0));
Statistics filStats = fil.getStatistics();
if (filStats == null) {
// No stats found on target, do nothing
break;
}
String colName = ExprNodeDescUtils.extractColName(tsExpr);
ColStatistics colStatisticsTarget = filStats.getColumnStatisticsFromColName(colName);
if (colStatisticsTarget == null) {
// No column stats found on target
break;
}
long nDVsOfTS = colStatisticsTarget.getCountDistint();
double nDVsOfTSFactored = nDVsOfTS * procCtx.conf.getFloatVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR);
if ((long) nDVsOfTSFactored > nDVs) {
if (LOG.isDebugEnabled()) {
LOG.debug("nDVs = " + nDVs + ", nDVsOfTS = " + nDVsOfTS + " and nDVsOfTSFactored = " + nDVsOfTSFactored + "Adding semijoin branch from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
}
sjInfo.setShouldRemove(false);
}
}
} catch (NullPointerException e) {
// Do nothing
if (LOG.isDebugEnabled()) {
LOG.debug("Caught NPE in markSemiJoinForDPP from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
}
}
break;
}
if (op instanceof TerminalOperator) {
// Done with this branch
continue;
}
deque.addAll(op.getChildOperators());
}
}
}
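The decision at the core of markSemiJoinForDPP reduces to one comparison: the semijoin branch survives (setShouldRemove(false) is called) only when the target-side nDV, scaled by TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR, still exceeds the nDV observed on the semijoin edge. A standalone sketch of that check; the class and method names are hypothetical.

// Hypothetical distillation of the selectivity test above.
public final class SemiJoinDppHeuristic {
  // Returns true when the semijoin branch should be kept, mirroring
  // the "(long) nDVsOfTSFactored > nDVs" comparison in markSemiJoinForDPP.
  public static boolean keepSemiJoinBranch(long nDVsOnSemiJoinEdge, long nDVsOnTableScanSide, float dppFactor) {
    double nDVsOfTSFactored = nDVsOnTableScanSide * dppFactor;
    return (long) nDVsOfTSFactored > nDVsOnSemiJoinEdge;
  }
}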
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class StatsUtils, method extractColumnStates.
private static List<String> extractColumnStates(Table table, List<String> columns, ColumnStatsList colStatsCache, List<ColStatistics> columnStats) {
if (colStatsCache == null) {
return columns;
}
List<String> neededColsToRetrieve = new ArrayList<>(columns.size());
for (String colName : columns) {
ColStatistics colStats = colStatsCache.getColStats().get(colName);
if (colStats == null) {
neededColsToRetrieve.add(colName);
LOG.debug("Stats for column {} in table {} could not be retrieved from cache", colName, table.getCompleteName());
} else {
columnStats.add(colStats);
LOG.debug("Stats for column {} in table {} retrieved from cache", colName, table.getCompleteName());
}
}
return neededColsToRetrieve;
}
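The method doubles as a filter: cache hits are appended to the caller-supplied columnStats list, while only the cache misses are returned, so the caller can fetch just the remaining columns from the metastore in a single follow-up call.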