Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
The class ColumnStatsUpdateTask, method constructColumnStatsFromInput.
private ColumnStatistics constructColumnStatsFromInput() throws SemanticException, MetaException {
  // If we are replicating the stats, we don't need to construct them again.
  if (work.getColStats() != null) {
    ColumnStatistics colStats = work.getColStats();
    LOG.debug("Got stats through replication for " + colStats.getStatsDesc().getDbName() + "." + colStats.getStatsDesc().getTableName());
    return colStats;
  }
  String dbName = work.dbName();
  String tableName = work.getTableName();
  String partName = work.getPartName();
  String colName = work.getColName();
  String columnType = work.getColType();
  ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
  // The grammar prohibits more than one column, so we are guaranteed to have
  // only one element in this list.
  statsObj.setColName(colName);
  statsObj.setColType(columnType);
  ColumnStatisticsData statsData = new ColumnStatisticsData();
  if (columnType.equalsIgnoreCase("long") || columnType.equalsIgnoreCase("tinyint") || columnType.equalsIgnoreCase("smallint") || columnType.equalsIgnoreCase("int") || columnType.equalsIgnoreCase("bigint")) {
    LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
    longStats.setNumNullsIsSet(false);
    longStats.setNumDVsIsSet(false);
    longStats.setLowValueIsSet(false);
    longStats.setHighValueIsSet(false);
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        longStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("numDVs")) {
        longStats.setNumDVs(Long.parseLong(value));
      } else if (fName.equals("lowValue")) {
        longStats.setLowValue(Long.parseLong(value));
      } else if (fName.equals("highValue")) {
        longStats.setHighValue(Long.parseLong(value));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setLongStats(longStats);
    statsObj.setStatsData(statsData);
  } else if (columnType.equalsIgnoreCase("double") || columnType.equalsIgnoreCase("float")) {
    DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
    doubleStats.setNumNullsIsSet(false);
    doubleStats.setNumDVsIsSet(false);
    doubleStats.setLowValueIsSet(false);
    doubleStats.setHighValueIsSet(false);
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        doubleStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("numDVs")) {
        doubleStats.setNumDVs(Long.parseLong(value));
      } else if (fName.equals("lowValue")) {
        doubleStats.setLowValue(Double.parseDouble(value));
      } else if (fName.equals("highValue")) {
        doubleStats.setHighValue(Double.parseDouble(value));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setDoubleStats(doubleStats);
    statsObj.setStatsData(statsData);
  } else if (columnType.equalsIgnoreCase("string") || columnType.toLowerCase().startsWith("char") || columnType.toLowerCase().startsWith("varchar")) {
    // string, char(x), and varchar(x) types
    StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
    stringStats.setMaxColLenIsSet(false);
    stringStats.setAvgColLenIsSet(false);
    stringStats.setNumNullsIsSet(false);
    stringStats.setNumDVsIsSet(false);
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        stringStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("numDVs")) {
        stringStats.setNumDVs(Long.parseLong(value));
      } else if (fName.equals("avgColLen")) {
        stringStats.setAvgColLen(Double.parseDouble(value));
      } else if (fName.equals("maxColLen")) {
        stringStats.setMaxColLen(Long.parseLong(value));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setStringStats(stringStats);
    statsObj.setStatsData(statsData);
  } else if (columnType.equalsIgnoreCase("boolean")) {
    BooleanColumnStatsData booleanStats = new BooleanColumnStatsData();
    booleanStats.setNumNullsIsSet(false);
    booleanStats.setNumTruesIsSet(false);
    booleanStats.setNumFalsesIsSet(false);
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        booleanStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("numTrues")) {
        booleanStats.setNumTrues(Long.parseLong(value));
      } else if (fName.equals("numFalses")) {
        booleanStats.setNumFalses(Long.parseLong(value));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setBooleanStats(booleanStats);
    statsObj.setStatsData(statsData);
  } else if (columnType.equalsIgnoreCase("binary")) {
    BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
    binaryStats.setNumNullsIsSet(false);
    binaryStats.setAvgColLenIsSet(false);
    binaryStats.setMaxColLenIsSet(false);
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        binaryStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("avgColLen")) {
        binaryStats.setAvgColLen(Double.parseDouble(value));
      } else if (fName.equals("maxColLen")) {
        binaryStats.setMaxColLen(Long.parseLong(value));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setBinaryStats(binaryStats);
    statsObj.setStatsData(statsData);
  } else if (columnType.toLowerCase().startsWith("decimal")) {
    // decimal(a,b) types
    DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
    decimalStats.setNumNullsIsSet(false);
    decimalStats.setNumDVsIsSet(false);
    decimalStats.setLowValueIsSet(false);
    decimalStats.setHighValueIsSet(false);
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        decimalStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("numDVs")) {
        decimalStats.setNumDVs(Long.parseLong(value));
      } else if (fName.equals("lowValue")) {
        BigDecimal d = new BigDecimal(value);
        decimalStats.setLowValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
      } else if (fName.equals("highValue")) {
        BigDecimal d = new BigDecimal(value);
        decimalStats.setHighValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setDecimalStats(decimalStats);
    statsObj.setStatsData(statsData);
  } else if (columnType.equalsIgnoreCase("date")) {
    DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        dateStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("numDVs")) {
        dateStats.setNumDVs(Long.parseLong(value));
      } else if (fName.equals("lowValue")) {
        // Date high/low values are stored as longs in the stats DB, but users may set them
        // using either the date format (yyyy-mm-dd) or the numeric format (days since epoch).
        dateStats.setLowValue(readDateValue(value));
      } else if (fName.equals("highValue")) {
        dateStats.setHighValue(readDateValue(value));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setDateStats(dateStats);
    statsObj.setStatsData(statsData);
  } else if (columnType.equalsIgnoreCase("timestamp")) {
    TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
    Map<String, String> mapProp = work.getMapProp();
    for (Entry<String, String> entry : mapProp.entrySet()) {
      String fName = entry.getKey();
      String value = entry.getValue();
      if (fName.equals("numNulls")) {
        timestampStats.setNumNulls(Long.parseLong(value));
      } else if (fName.equals("numDVs")) {
        timestampStats.setNumDVs(Long.parseLong(value));
      } else if (fName.equals("lowValue")) {
        timestampStats.setLowValue(readTimestampValue(value));
      } else if (fName.equals("highValue")) {
        timestampStats.setHighValue(readTimestampValue(value));
      } else {
        throw new SemanticException("Unknown stat");
      }
    }
    statsData.setTimestampStats(timestampStats);
    statsObj.setStatsData(statsData);
  } else {
    throw new SemanticException("Unsupported type");
  }
  ColumnStatisticsDesc statsDesc = getColumnStatsDesc(dbName, tableName, partName, partName == null);
  ColumnStatistics colStat = new ColumnStatistics();
  colStat.setStatsDesc(statsDesc);
  colStat.addToStatsObj(statsObj);
  colStat.setEngine(Constants.HIVE_ENGINE);
  return colStat;
}
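For orientation, here is a minimal hedged sketch of how a ColumnStatistics object built this way could be pushed to the metastore. The updateTableColumnStatistics and updatePartitionColumnStatistics calls exist on IMetaStoreClient; the surrounding wiring (the conf and colStats variables, the cleanup) is illustrative and not taken from ColumnStatsUpdateTask.

// Hedged sketch, not the task's actual persistence path. Assumes a HiveConf
// `conf` and a ColumnStatistics `colStats` as returned by
// constructColumnStatsFromInput() above.
IMetaStoreClient msc = new HiveMetaStoreClient(conf);
try {
  if (colStats.getStatsDesc().isIsTblLevel()) {
    msc.updateTableColumnStatistics(colStats);
  } else {
    msc.updatePartitionColumnStatistics(colStats);
  }
} finally {
  msc.close();
}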
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
The class StatsUpdaterThread, method getExistingStatsToUpdate.
private List<String> getExistingStatsToUpdate(ColumnStatistics existingStats, Map<String, String> params, boolean isTxnValid) {
  boolean hasAnyAccurate = isTxnValid && StatsSetupConst.areBasicStatsUptoDate(params);
  List<String> colsToUpdate = new ArrayList<>();
  for (ColumnStatisticsObj obj : existingStats.getStatsObj()) {
    String col = obj.getColName();
    if (!hasAnyAccurate || !StatsSetupConst.areColumnStatsUptoDate(params, col)) {
      colsToUpdate.add(col);
    }
  }
  return colsToUpdate;
}
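For reference, a small sketch of the up-to-date checks this helper relies on. The JSON shape of COLUMN_STATS_ACCURATE follows StatsSetupConst; the sample table parameters are made up.

// Illustrative params map; COLUMN_STATS_ACCURATE is a JSON blob maintained by Hive.
Map<String, String> params = new HashMap<>();
params.put(StatsSetupConst.COLUMN_STATS_ACCURATE,
    "{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\"}}");
StatsSetupConst.areBasicStatsUptoDate(params);        // true
StatsSetupConst.areColumnStatsUptoDate(params, "a");  // true: "a" is skipped
StatsSetupConst.areColumnStatsUptoDate(params, "b");  // false: "b" lands in colsToUpdate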
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
The class ColStatsProcessor, method constructColumnStatsFromPackedRows.
private boolean constructColumnStatsFromPackedRows(Table tbl, List<ColumnStatistics> stats, long maxNumStats) throws HiveException, MetaException, IOException {
  String partName = null;
  List<String> colName = colStatDesc.getColName();
  List<String> colType = colStatDesc.getColType();
  boolean isTblLevel = colStatDesc.isTblLevel();
  InspectableObject packedRow;
  long numStats = 0;
  while ((packedRow = ftOp.getNextRow()) != null) {
    if (packedRow.oi.getCategory() != ObjectInspector.Category.STRUCT) {
      throw new HiveException("Unexpected object type encountered while unpacking row");
    }
    final List<ColumnStatisticsObj> statsObjs = new ArrayList<>();
    final StructObjectInspector soi = (StructObjectInspector) packedRow.oi;
    final List<? extends StructField> fields = soi.getAllStructFieldRefs();
    final List<Object> values = soi.getStructFieldsDataAsList(packedRow.o);
    // Partition columns are appended at the end; we only care about the stats columns.
    int pos = 0;
    for (int i = 0; i < colName.size(); i++) {
      String columnName = colName.get(i);
      String columnType = colType.get(i);
      PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(columnType);
      List<ColumnStatsField> columnStatsFields = ColumnStatsType.getColumnStats(typeInfo);
      try {
        ColumnStatisticsObj statObj = ColumnStatisticsObjTranslator.readHiveColumnStatistics(columnName, columnType, columnStatsFields, pos, fields, values);
        statsObjs.add(statObj);
        numStats++;
      } catch (Exception e) {
        if (isStatsReliable) {
          throw new HiveException("Statistics collection failed while (hive.stats.reliable)", e);
        } else {
          LOG.debug("Because {} is infinite or NaN, we skip stats.", columnName, e);
        }
      }
      pos += columnStatsFields.size();
    }
    if (!statsObjs.isEmpty()) {
      if (!isTblLevel) {
        List<FieldSchema> partColSchema = tbl.getPartCols();
        List<String> partVals = new ArrayList<>();
        // Iterate over partition columns to figure out the partition name.
        for (int i = pos; i < pos + partColSchema.size(); i++) {
          Object partVal = ((PrimitiveObjectInspector) fields.get(i).getFieldObjectInspector()).getPrimitiveJavaObject(values.get(i));
          // The value could be null for the default partition.
          partVals.add(partVal == null ? this.conf.getVar(ConfVars.DEFAULTPARTITIONNAME) : partVal.toString());
        }
        partName = Warehouse.makePartName(partColSchema, partVals);
      }
      ColumnStatisticsDesc statsDesc = buildColumnStatsDesc(tbl, partName, isTblLevel);
      ColumnStatistics colStats = new ColumnStatistics();
      colStats.setStatsDesc(statsDesc);
      colStats.setStatsObj(statsObjs);
      colStats.setEngine(Constants.HIVE_ENGINE);
      stats.add(colStats);
      if (numStats >= maxNumStats) {
        return false;
      }
    }
  }
  ftOp.clearFetchContext();
  return true;
}
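As an aside, the partition name assembled above comes from Warehouse.makePartName, which escapes the values and joins key=value pairs with "/". A minimal sketch with hypothetical columns and values (note that makePartName throws MetaException):

// Hedged example; the schema and values are made up.
List<FieldSchema> partCols = Arrays.asList(
    new FieldSchema("ds", "string", null),
    new FieldSchema("hr", "string", null));
String name = Warehouse.makePartName(partCols, Arrays.asList("2024-01-01", "00"));
// name is "ds=2024-01-01/hr=00"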
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
The class TestTxnCommands, method testParallelInsertStats.
@Test
public void testParallelInsertStats() throws Exception {
  final int TASK_COUNT = 4;
  String tableName = "mm_table";
  List<ColumnStatisticsObj> stats;
  IMetaStoreClient msClient = prepareParallelTest(tableName, 0);
  String[] queries = new String[TASK_COUNT];
  for (int i = 0; i < queries.length; ++i) {
    queries[i] = String.format("insert into %s (a) values (" + i + ")", tableName);
  }
  runParallelQueries(queries);
  // Verify stats are either invalid, or valid and correct.
  stats = getTxnTableStats(msClient, tableName);
  boolean hasStats = 0 != stats.size();
  if (hasStats) {
    verifyLongStats(TASK_COUNT, 0, TASK_COUNT - 1, stats);
  }
  runStatementOnDriver(String.format("insert into %s (a) values (" + TASK_COUNT + ")", tableName));
  if (!hasStats) {
    // Stats should still be invalid if they were invalid.
    stats = getTxnTableStats(msClient, tableName);
    Assert.assertEquals(0, stats.size());
  }
  // Stats should be valid after analyze.
  runStatementOnDriver(String.format("analyze table %s compute statistics for columns", tableName));
  verifyLongStats(TASK_COUNT + 1, 0, TASK_COUNT, getTxnTableStats(msClient, tableName));
}
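The verifyLongStats helper is not shown on this page. A plausible reconstruction, assuming its arguments are the expected numDVs, low value, and high value for the single long column:

// Hedged reconstruction of the test helper; the real implementation may differ.
private void verifyLongStats(int dvCount, int min, int max, List<ColumnStatisticsObj> stats) {
  Assert.assertEquals(1, stats.size());
  LongColumnStatsData data = stats.get(0).getStatsData().getLongStats();
  Assert.assertEquals(min, data.getLowValue());
  Assert.assertEquals(max, data.getHighValue());
  Assert.assertEquals(dvCount, data.getNumDVs());
}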
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
The class TestTxnCommands, method testTxnStatsOnOff.
@Test
public void testTxnStatsOnOff() throws Exception {
  String tableName = "mm_table";
  hiveConf.setBoolean("hive.stats.autogather", true);
  hiveConf.setBoolean("hive.stats.column.autogather", true);
  // Need to close the thread-local Hive object so that the configuration change is reflected to HMS.
  Hive.closeCurrent();
  runStatementOnDriver("drop table if exists " + tableName);
  runStatementOnDriver(String.format("create table %s (a int) stored as orc " + "TBLPROPERTIES ('transactional'='true', 'transactional_properties'='insert_only')", tableName));
  runStatementOnDriver(String.format("insert into %s (a) values (1)", tableName));
  IMetaStoreClient msClient = new HiveMetaStoreClient(hiveConf);
  List<ColumnStatisticsObj> stats = getTxnTableStats(msClient, tableName);
  Assert.assertEquals(1, stats.size());
  runStatementOnDriver(String.format("insert into %s (a) values (1)", tableName));
  stats = getTxnTableStats(msClient, tableName);
  Assert.assertEquals(1, stats.size());
  msClient.close();
  hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), false);
  msClient = new HiveMetaStoreClient(hiveConf);
  // Even though the stats are valid in the metastore, txn stats are disabled.
  stats = getTxnTableStats(msClient, tableName);
  Assert.assertEquals(0, stats.size());
  msClient.close();
  hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), true);
  msClient = new HiveMetaStoreClient(hiveConf);
  stats = getTxnTableStats(msClient, tableName);
  // Now the stats are visible again.
  Assert.assertEquals(1, stats.size());
  msClient.close();
  hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), false);
  // Need to close the thread-local Hive object so that the configuration change is reflected to HMS.
  Hive.closeCurrent();
  // Running the query with stats disabled will cause the stats in the metastore itself to become invalid.
  runStatementOnDriver(String.format("insert into %s (a) values (1)", tableName));
  hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), true);
  msClient = new HiveMetaStoreClient(hiveConf);
  stats = getTxnTableStats(msClient, tableName);
  Assert.assertEquals(0, stats.size());
  msClient.close();
}
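Likewise, getTxnTableStats is not shown here. A hedged sketch, assuming it fetches the stats for the test table's single column "a" from the default database:

// Hedged reconstruction; the database name, column list, and exact signature are assumptions.
public static List<ColumnStatisticsObj> getTxnTableStats(IMetaStoreClient msClient, String tableName) throws TException {
  return msClient.getTableColumnStatistics("default", tableName,
      Collections.singletonList("a"), Constants.HIVE_ENGINE);
}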