use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class Driver method doAuthorization.
/**
* Do authorization using the post-semantic-analysis information in the semantic analyzer.
* The original command is also passed so that the authorization interface can provide
* more useful information in logs.
* @param op HiveOperation being authorized
* @param sem BaseSemanticAnalyzer used to parse the input query
* @param command input query
* @throws HiveException
* @throws AuthorizationException
*/
public static void doAuthorization(HiveOperation op, BaseSemanticAnalyzer sem, String command) throws HiveException, AuthorizationException {
SessionState ss = SessionState.get();
Hive db = sem.getDb();
Set<ReadEntity> additionalInputs = new HashSet<ReadEntity>();
for (Entity e : sem.getInputs()) {
if (e.getType() == Entity.Type.PARTITION) {
additionalInputs.add(new ReadEntity(e.getTable()));
}
}
Set<WriteEntity> additionalOutputs = new HashSet<WriteEntity>();
for (WriteEntity e : sem.getOutputs()) {
if (e.getType() == Entity.Type.PARTITION) {
additionalOutputs.add(new WriteEntity(e.getTable(), e.getWriteType()));
}
}
// The following union operation returns a view that iterates over the
// first set once and then, in order, over each element of the second set
// that is not contained in the first. It therefore replaces nothing in the
// first set, which preserves the WriteType of the WriteEntity objects in
// the first set for the outputs list.
Set<ReadEntity> inputs = Sets.union(sem.getInputs(), additionalInputs);
Set<WriteEntity> outputs = Sets.union(sem.getOutputs(), additionalOutputs);
if (ss.isAuthorizationModeV2()) {
// get mapping of tables to columns used
ColumnAccessInfo colAccessInfo = sem.getColumnAccessInfo();
// colAccessInfo is set only when sem is a SemanticAnalyzer
Map<String, List<String>> selectTab2Cols = colAccessInfo != null ? colAccessInfo.getTableToColumnAccessMap() : null;
Map<String, List<String>> updateTab2Cols = sem.getUpdateColumnAccessInfo() != null ? sem.getUpdateColumnAccessInfo().getTableToColumnAccessMap() : null;
doAuthorizationV2(ss, op, inputs, outputs, command, selectTab2Cols, updateTab2Cols);
return;
}
if (op == null) {
throw new HiveException("Operation should not be null");
}
HiveAuthorizationProvider authorizer = ss.getAuthorizer();
if (op.equals(HiveOperation.CREATEDATABASE)) {
authorizer.authorize(op.getInputRequiredPrivileges(), op.getOutputRequiredPrivileges());
} else if (op.equals(HiveOperation.CREATETABLE_AS_SELECT) || op.equals(HiveOperation.CREATETABLE)) {
authorizer.authorize(db.getDatabase(SessionState.get().getCurrentDatabase()), null, HiveOperation.CREATETABLE_AS_SELECT.getOutputRequiredPrivileges());
} else {
if (op.equals(HiveOperation.IMPORT)) {
ImportSemanticAnalyzer isa = (ImportSemanticAnalyzer) sem;
if (!isa.existsTable()) {
authorizer.authorize(db.getDatabase(SessionState.get().getCurrentDatabase()), null, HiveOperation.CREATETABLE_AS_SELECT.getOutputRequiredPrivileges());
}
}
}
if (outputs != null && outputs.size() > 0) {
for (WriteEntity write : outputs) {
if (write.isDummy() || write.isPathType()) {
continue;
}
if (write.getType() == Entity.Type.DATABASE) {
if (!op.equals(HiveOperation.IMPORT)) {
// We skip DB check for import here because we already handle it above
// as a CTAS check.
authorizer.authorize(write.getDatabase(), null, op.getOutputRequiredPrivileges());
}
continue;
}
if (write.getType() == WriteEntity.Type.PARTITION) {
Partition part = db.getPartition(write.getTable(), write.getPartition().getSpec(), false);
if (part != null) {
authorizer.authorize(write.getPartition(), null, op.getOutputRequiredPrivileges());
continue;
}
}
if (write.getTable() != null) {
authorizer.authorize(write.getTable(), null, op.getOutputRequiredPrivileges());
}
}
}
if (inputs != null && inputs.size() > 0) {
Map<Table, List<String>> tab2Cols = new HashMap<Table, List<String>>();
Map<Partition, List<String>> part2Cols = new HashMap<Partition, List<String>>();
// determine if partition level privileges should be checked for input tables
Map<String, Boolean> tableUsePartLevelAuth = new HashMap<String, Boolean>();
for (ReadEntity read : inputs) {
if (read.isDummy() || read.isPathType() || read.getType() == Entity.Type.DATABASE) {
continue;
}
Table tbl = read.getTable();
if ((read.getPartition() != null) || (tbl != null && tbl.isPartitioned())) {
String tblName = tbl.getTableName();
if (tableUsePartLevelAuth.get(tblName) == null) {
boolean usePartLevelPriv = (tbl.getParameters().get("PARTITION_LEVEL_PRIVILEGE") != null && ("TRUE".equalsIgnoreCase(tbl.getParameters().get("PARTITION_LEVEL_PRIVILEGE"))));
if (usePartLevelPriv) {
tableUsePartLevelAuth.put(tblName, Boolean.TRUE);
} else {
tableUsePartLevelAuth.put(tblName, Boolean.FALSE);
}
}
}
}
// column authorization is checked through table scan operators.
getTablePartitionUsedColumns(op, sem, tab2Cols, part2Cols, tableUsePartLevelAuth);
// cache the results for table authorization
Set<String> tableAuthChecked = new HashSet<String>();
for (ReadEntity read : inputs) {
// if the read is not direct, we do not need to check its authorization.
if (read.isDummy() || read.isPathType() || !read.isDirect()) {
continue;
}
if (read.getType() == Entity.Type.DATABASE) {
authorizer.authorize(read.getDatabase(), op.getInputRequiredPrivileges(), null);
continue;
}
Table tbl = read.getTable();
if (tbl.isView() && sem instanceof SemanticAnalyzer) {
tab2Cols.put(tbl, sem.getColumnAccessInfo().getTableToColumnAccessMap().get(tbl.getCompleteName()));
}
if (read.getPartition() != null) {
Partition partition = read.getPartition();
tbl = partition.getTable();
// use partition level authorization
if (Boolean.TRUE.equals(tableUsePartLevelAuth.get(tbl.getTableName()))) {
List<String> cols = part2Cols.get(partition);
if (cols != null && cols.size() > 0) {
authorizer.authorize(partition.getTable(), partition, cols, op.getInputRequiredPrivileges(), null);
} else {
authorizer.authorize(partition, op.getInputRequiredPrivileges(), null);
}
continue;
}
}
// table-level authorization; skipped if the table was already checked or if partition-level authorization applies to it
if (tbl != null && !tableAuthChecked.contains(tbl.getTableName()) && !(Boolean.TRUE.equals(tableUsePartLevelAuth.get(tbl.getTableName())))) {
List<String> cols = tab2Cols.get(tbl);
if (cols != null && cols.size() > 0) {
authorizer.authorize(tbl, null, cols, op.getInputRequiredPrivileges(), null);
} else {
authorizer.authorize(tbl, op.getInputRequiredPrivileges(), null);
}
tableAuthChecked.add(tbl.getTableName());
}
}
}
}
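The subtle point in this method is the Sets.union behaviour described in the comment near the top. Below is a minimal, self-contained Guava sketch, with a hypothetical Entry class standing in for WriteEntity (equality is by name only, as with entities that differ only in WriteType); it is an illustration, not Hive code.
import com.google.common.collect.Sets;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;

// Stand-in for WriteEntity: two entries with the same name compare as equal
// even when their writeType differs.
final class Entry {
  final String name;
  final String writeType;
  Entry(String name, String writeType) { this.name = name; this.writeType = writeType; }
  @Override public boolean equals(Object o) {
    return o instanceof Entry && ((Entry) o).name.equals(name);
  }
  @Override public int hashCode() { return Objects.hash(name); }
  @Override public String toString() { return name + "/" + writeType; }
}

public class UnionOrderDemo {
  public static void main(String[] args) {
    Set<Entry> original = new HashSet<>();
    original.add(new Entry("db.tbl", "INSERT_OVERWRITE"));
    Set<Entry> additional = new HashSet<>();
    additional.add(new Entry("db.tbl", "DDL_NO_LOCK"));
    additional.add(new Entry("db.other", "INSERT"));
    // Sets.union iterates the first set, then only those elements of the
    // second set that are not contained in the first, so the duplicate keeps
    // the first set's writeType.
    for (Entry e : Sets.union(original, additional)) {
      System.out.println(e);  // db.tbl/INSERT_OVERWRITE, then db.other/INSERT
    }
  }
}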
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class ArchiveUtils method conflictingArchiveNameOrNull.
/**
* Determines whether one can insert into the partition(s), or whether there is a
* conflict with an archive. A conflict arises either because the partition is
* itself archived, or because it would be created inside an existing archive.
* The second case covers a partition that does not exist yet but would sit
* inside an archive if it did. That case is tricky to check: we need to find at
* least one existing partition under the parent directory; if that partition is
* archived and its archiving level shows that the archival covered the directory
* the new partition would live in, we cannot insert, otherwise we can.
* This method works both for full specifications and for partial ones - in the
* second case it checks whether any partition that could possibly match the
* specification is inside an archive.
*
* @param db - Hive object
* @param tbl - table the partition belongs to
* @param partSpec - partition specification, with possible nulls in the case of
* dynamic partition inserts
* @return null if the partition can be inserted into, otherwise the name of the
* colliding archive
* @throws HiveException
*/
public static String conflictingArchiveNameOrNull(Hive db, Table tbl, LinkedHashMap<String, String> partSpec) throws HiveException {
List<FieldSchema> partKeys = tbl.getPartitionKeys();
int partSpecLevel = 0;
for (FieldSchema partKey : partKeys) {
if (!partSpec.containsKey(partKey.getName())) {
break;
}
partSpecLevel++;
}
if (partSpecLevel != partSpec.size()) {
throw new HiveException("partspec " + partSpec + " is wrong for table " + tbl.getTableName());
}
Map<String, String> spec = new HashMap<String, String>(partSpec);
List<String> reversedKeys = new LinkedList<String>();
for (FieldSchema fs : tbl.getPartCols()) {
if (spec.containsKey(fs.getName())) {
reversedKeys.add(0, fs.getName());
}
}
for (String rk : reversedKeys) {
List<Partition> parts = db.getPartitions(tbl, spec, (short) 1);
if (parts.size() != 0) {
Partition p = parts.get(0);
if (!isArchived(p)) {
// no archiving was done at this or any upper level
return null;
} else if (getArchivingLevel(p) > spec.size()) {
// archiving was done at a deeper level than this spec, so there is no conflict
return null;
} else {
return getPartialName(p, getArchivingLevel(p));
}
}
spec.remove(rk);
}
return null;
}
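The delicate part of this method is the order in which it probes for an existing partition when the specification is partial. The sketch below is a self-contained illustration of that probing loop, with made-up partition columns and the metastore lookup replaced by a print (a LinkedHashMap is used for the working spec only to keep the printed order stable; the real method uses a HashMap).
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

public class ArchiveProbeOrderDemo {
  public static void main(String[] args) {
    List<String> partCols = Arrays.asList("ds", "hr", "min");   // table partition keys
    LinkedHashMap<String, String> partSpec = new LinkedHashMap<>();
    partSpec.put("ds", "2017-01-01");
    partSpec.put("hr", "12");
    Map<String, String> spec = new LinkedHashMap<>(partSpec);
    List<String> reversedKeys = new LinkedList<>();
    for (String col : partCols) {
      if (spec.containsKey(col)) {
        reversedKeys.add(0, col);   // deepest specified key ends up first
      }
    }
    for (String rk : reversedKeys) {
      // This is where the real method calls db.getPartitions(tbl, spec, 1)
      // and reports a conflict if the first match is archived at or above
      // the current spec depth.
      System.out.println("probe with spec " + spec);
      spec.remove(rk);
    }
    // Output:
    //   probe with spec {ds=2017-01-01, hr=12}
    //   probe with spec {ds=2017-01-01}
  }
}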
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class StatsUtils method collectStatistics.
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats) throws HiveException {
Statistics stats = new Statistics();
float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
if (!table.isPartitioned()) {
long ds = getDataSize(conf, table);
long nr = getNumRows(conf, schema, neededColumns, table, ds);
stats.setNumRows(nr);
List<ColStatistics> colStats = Lists.newArrayList();
if (fetchColStats) {
colStats = getTableColumnStats(table, schema, neededColumns);
long betterDS = getDataSizeFromColumnStats(nr, colStats);
ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
}
stats.setDataSize(ds);
// infer whether any column can be a primary key based on column statistics
inferAndSetPrimaryKey(stats.getNumRows(), colStats);
stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
stats.addToColumnStats(colStats);
} else if (partList != null) {
// For partitioned tables, get the size of all the partitions after pruning
// the partitions that are not required
long nr = 0;
long ds = 0;
List<Long> rowCounts = Lists.newArrayList();
List<Long> dataSizes = Lists.newArrayList();
if (fetchPartStats) {
rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
nr = getSumIgnoreNegatives(rowCounts);
ds = getSumIgnoreNegatives(dataSizes);
if (ds <= 0) {
dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
ds = getSumIgnoreNegatives(dataSizes);
}
}
// if the data size is still unknown, fall back to the file system for per-partition file sizes
if (ds <= 0) {
dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
}
ds = getSumIgnoreNegatives(dataSizes);
ds = (long) (ds * deserFactor);
int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
if (avgRowSize > 0) {
setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
nr = getSumIgnoreNegatives(rowCounts);
ds = getSumIgnoreNegatives(dataSizes);
// a row count of -1 means that the statistics from the metastore are not reliable
if (nr <= 0) {
nr = ds / avgRowSize;
}
}
if (nr == 0) {
nr = 1;
}
stats.addToNumRows(nr);
stats.addToDataSize(ds);
// if at least one partition does not have a row count, mark the basic stats state as PARTIAL
if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
stats.setBasicStatsState(State.PARTIAL);
}
if (fetchColStats) {
List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
for (Partition part : partList.getNotDeniedPartns()) {
partNames.add(part.getName());
}
neededColumns = processNeededColumns(schema, neededColumns);
AggrStats aggrStats = null;
// if there are no needed columns or no partition names, skip connecting to the metastore.
if (neededColumns.size() > 0 && partNames.size() > 0) {
aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColumns, partNames);
}
if (null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0) {
// There are some partitions with no state (or we didn't fetch any state).
// Update the stats with empty list to reflect that in the
// state/initialize structures.
List<ColStatistics> emptyStats = Lists.newArrayList();
// add partition column stats
addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, emptyStats);
stats.addToColumnStats(emptyStats);
stats.addToDataSize(getDataSizeFromColumnStats(nr, emptyStats));
stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
} else {
List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
if (colStats.size() != neededColumns.size()) {
LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" + " retrieve for " + colStats.size() + " columns");
}
List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());
addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, columnStats);
long betterDS = getDataSizeFromColumnStats(nr, columnStats);
stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
// infer whether any column can be a primary key based on column statistics
inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
stats.addToColumnStats(columnStats);
State colState = deriveStatType(columnStats, referencedColumns);
if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
LOG.debug("Column stats requested for : " + partNames.size() + " partitions. " + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
colState = State.PARTIAL;
}
stats.setColumnStatsState(colState);
}
}
}
return stats;
}
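When neither the metastore row counts nor the column statistics yield a usable estimate, the method above falls back to the data size divided by an average row width derived from the schema. A tiny sketch of that arithmetic, with made-up numbers:
public class RowEstimateDemo {
  public static void main(String[] args) {
    long fileSize = 64L * 1024 * 1024;   // 64 MB of files across the pruned partitions
    float deserFactor = 1.0f;            // hive.stats.deserialization.factor
    int avgRowSize = 100;                // estimated from the table schema

    long ds = (long) (fileSize * deserFactor);
    long nr = ds / avgRowSize;           // used only when the metastore count <= 0
    if (nr == 0) {
      nr = 1;                            // never report zero rows
    }
    System.out.println("estimated rows = " + nr);   // estimated rows = 671088
  }
}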
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class StatsNoJobTask method aggregateStats.
private int aggregateStats(ExecutorService threadPool, Hive db) {
int ret = 0;
try {
Collection<Partition> partitions = null;
if (work.getPrunedPartitionList() == null) {
partitions = getPartitionsList();
} else {
partitions = work.getPrunedPartitionList().getPartitions();
}
// non-partitioned table
if (partitions == null) {
org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
Map<String, String> parameters = tTable.getParameters();
try {
Path dir = new Path(tTable.getSd().getLocation());
long numRows = 0;
long rawDataSize = 0;
long fileSize = 0;
long numFiles = 0;
FileSystem fs = dir.getFileSystem(conf);
FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
boolean statsAvailable = false;
for (FileStatus file : fileList) {
if (!file.isDir()) {
InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(table.getInputFormatClass(), jc);
InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table.getDataLocation().toString() });
if (file.getLen() == 0) {
numFiles += 1;
statsAvailable = true;
} else {
org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
StatsProvidingRecordReader statsRR;
if (recordReader instanceof StatsProvidingRecordReader) {
statsRR = (StatsProvidingRecordReader) recordReader;
numRows += statsRR.getStats().getRowCount();
rawDataSize += statsRR.getStats().getRawDataSize();
fileSize += file.getLen();
numFiles += 1;
statsAvailable = true;
}
recordReader.close();
}
}
}
if (statsAvailable) {
parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
EnvironmentContext environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
db.alterTable(tableFullName, new Table(tTable), environmentContext);
String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']';
LOG.debug(msg);
console.printInfo(msg);
} else {
String msg = "Table " + tableFullName + " does not provide stats.";
LOG.debug(msg);
}
} catch (Exception e) {
console.printInfo("[Warning] could not update stats for " + tableFullName + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
}
} else {
// Partitioned table
for (Partition partn : partitions) {
threadPool.execute(new StatsCollection(partn));
}
LOG.debug("Stats collection waiting for threadpool to shutdown..");
shutdownAndAwaitTermination(threadPool);
LOG.debug("Stats collection threadpool shutdown successful.");
ret = updatePartitions(db);
}
} catch (Exception e) {
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = -1;
}
}
// a return value of 0 indicates success; anything else indicates failure
return ret;
}
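For the partitioned case, the per-partition work is fanned out onto the thread pool and the task then waits for the pool to drain. Below is a minimal sketch of that pattern with plain java.util.concurrent; the pool size, partition names, and timeout are illustrative, and Hive's shutdownAndAwaitTermination performs essentially the shutdown/await/force-cancel sequence shown.
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class StatsFanOutDemo {
  public static void main(String[] args) throws InterruptedException {
    // Stand-ins for the pruned partitions and the StatsCollection runnables.
    List<String> partitions = Arrays.asList("ds=2017-01-01", "ds=2017-01-02");

    ExecutorService threadPool = Executors.newFixedThreadPool(2);
    for (String partn : partitions) {
      threadPool.execute(() -> System.out.println("collecting stats for " + partn));
    }
    // Equivalent of shutdownAndAwaitTermination: stop accepting new work,
    // wait for in-flight tasks, then force-cancel anything still running.
    threadPool.shutdown();
    if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
      threadPool.shutdownNow();
    }
  }
}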
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class StatsNoJobTask method updatePartitions.
private int updatePartitions(Hive db) throws InvalidOperationException, HiveException {
if (!partUpdates.isEmpty()) {
List<Partition> updatedParts = Lists.newArrayList(partUpdates.values());
if (updatedParts.contains(null) && work.isStatsReliable()) {
LOG.debug("Stats requested to be reliable. Empty stats found and hence failing the task.");
return -1;
} else {
LOG.debug("Bulk updating partitions..");
EnvironmentContext environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
db.alterPartitions(tableFullName, Lists.newArrayList(partUpdates.values()), environmentContext);
LOG.debug("Bulk updated " + partUpdates.values().size() + " partitions.");
}
}
return 0;
}
Aggregations