use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class StatsUtils method getFileSizeForPartitions.
/**
* Find the bytes on disks occupied by list of partitions
* @param conf
* - hive conf
* @param parts
* - partition list
* @return sizes of partitions
*/
public static List<Long> getFileSizeForPartitions(final HiveConf conf, List<Partition> parts) {
LOG.info("Number of partitions : " + parts.size());
ArrayList<Future<Long>> futures = new ArrayList<>();
int threads = Math.max(1, conf.getIntVar(ConfVars.METASTORE_FS_HANDLER_THREADS_COUNT));
final ExecutorService pool = Executors.newFixedThreadPool(threads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Partitions-Size-%d").build());
final ArrayList<Long> sizes = new ArrayList<>(parts.size());
for (final Partition part : parts) {
final Path path = part.getDataLocation();
futures.add(pool.submit(new Callable<Long>() {
@Override
public Long call() throws Exception {
try {
LOG.debug("Partition path : " + path);
FileSystem fs = path.getFileSystem(conf);
return fs.getContentSummary(path).getLength();
} catch (IOException e) {
return 0L;
}
}
}));
}
try {
for (int i = 0; i < futures.size(); i++) {
sizes.add(i, futures.get(i).get());
}
} catch (InterruptedException | ExecutionException e) {
LOG.warn("Exception in processing files ", e);
} finally {
pool.shutdownNow();
}
return sizes;
}
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class StatsUtils method getRangePartitionColumn.
private static Range getRangePartitionColumn(PartitionIterable partitions, String partColName, String colType, String defaultPartName) {
Range range = null;
String partVal;
String colTypeLowerCase = colType.toLowerCase();
if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME) || colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME) || colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME) || colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)) {
long min = Long.MAX_VALUE;
long max = Long.MIN_VALUE;
for (Partition partition : partitions) {
partVal = partition.getSpec().get(partColName);
if (partVal.equals(defaultPartName)) {
// partition column value is null.
continue;
} else {
long value = Long.parseLong(partVal);
min = Math.min(min, value);
max = Math.max(max, value);
}
}
range = new Range(min, max);
} else if (colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME) || colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
double min = Double.MAX_VALUE;
double max = Double.MIN_VALUE;
for (Partition partition : partitions) {
partVal = partition.getSpec().get(partColName);
if (partVal.equals(defaultPartName)) {
// partition column value is null.
continue;
} else {
double value = Double.parseDouble(partVal);
min = Math.min(min, value);
max = Math.max(max, value);
}
}
range = new Range(min, max);
} else if (colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
double min = Double.MAX_VALUE;
double max = Double.MIN_VALUE;
for (Partition partition : partitions) {
partVal = partition.getSpec().get(partColName);
if (partVal.equals(defaultPartName)) {
// partition column value is null.
continue;
} else {
double value = new BigDecimal(partVal).doubleValue();
min = Math.min(min, value);
max = Math.max(max, value);
}
}
range = new Range(min, max);
} else {
// Columns statistics for complex datatypes are not supported yet
return null;
}
return range;
}
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class DDLTask method alterTableAlterPart.
/**
* Alter partition column type in a table
*
* @param db
* Database to rename the partition.
* @param alterPartitionDesc
* change partition column type.
* @return Returns 0 when execution succeeds and above 0 if it fails.
* @throws HiveException
*/
private int alterTableAlterPart(Hive db, AlterTableAlterPartDesc alterPartitionDesc) throws HiveException {
Table tbl = db.getTable(alterPartitionDesc.getTableName(), true);
String tabName = alterPartitionDesc.getTableName();
// This is checked by DDLSemanticAnalyzer
assert (tbl.isPartitioned());
List<FieldSchema> newPartitionKeys = new ArrayList<FieldSchema>();
// with a non null value before trying to alter the partition column type.
try {
Set<Partition> partitions = db.getAllPartitionsOf(tbl);
int colIndex = -1;
for (FieldSchema col : tbl.getTTable().getPartitionKeys()) {
colIndex++;
if (col.getName().compareTo(alterPartitionDesc.getPartKeySpec().getName()) == 0) {
break;
}
}
if (colIndex == -1 || colIndex == tbl.getTTable().getPartitionKeys().size()) {
throw new HiveException("Cannot find partition column " + alterPartitionDesc.getPartKeySpec().getName());
}
TypeInfo expectedType = TypeInfoUtils.getTypeInfoFromTypeString(alterPartitionDesc.getPartKeySpec().getType());
ObjectInspector outputOI = TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(expectedType);
Converter converter = ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, outputOI);
// For all the existing partitions, check if the value can be type casted to a non-null object
for (Partition part : partitions) {
if (part.getName().equals(conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME))) {
continue;
}
try {
String value = part.getValues().get(colIndex);
Object convertedValue = converter.convert(value);
if (convertedValue == null) {
throw new HiveException(" Converting from " + TypeInfoFactory.stringTypeInfo + " to " + expectedType + " for value : " + value + " resulted in NULL object");
}
} catch (Exception e) {
throw new HiveException("Exception while converting " + TypeInfoFactory.stringTypeInfo + " to " + expectedType + " for value : " + part.getValues().get(colIndex));
}
}
} catch (Exception e) {
throw new HiveException("Exception while checking type conversion of existing partition values to " + alterPartitionDesc.getPartKeySpec() + " : " + e.getMessage());
}
for (FieldSchema col : tbl.getTTable().getPartitionKeys()) {
if (col.getName().compareTo(alterPartitionDesc.getPartKeySpec().getName()) == 0) {
newPartitionKeys.add(alterPartitionDesc.getPartKeySpec());
} else {
newPartitionKeys.add(col);
}
}
tbl.getTTable().setPartitionKeys(newPartitionKeys);
try {
db.alterTable(tabName, tbl, null);
} catch (InvalidOperationException e) {
throw new HiveException(e, ErrorMsg.GENERIC_ERROR, "Unable to alter " + tabName);
}
work.getInputs().add(new ReadEntity(tbl));
// We've already locked the table as the input, don't relock it as the output.
addIfAbsentByName(new WriteEntity(tbl, WriteEntity.WriteType.DDL_NO_LOCK));
return 0;
}
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class DDLTask method archive.
private int archive(Hive db, AlterTableSimpleDesc simpleDesc, DriverContext driverContext) throws HiveException {
Table tbl = db.getTable(simpleDesc.getTableName());
if (tbl.getTableType() != TableType.MANAGED_TABLE) {
throw new HiveException("ARCHIVE can only be performed on managed tables");
}
Map<String, String> partSpec = simpleDesc.getPartSpec();
PartSpecInfo partSpecInfo = PartSpecInfo.create(tbl, partSpec);
List<Partition> partitions = db.getPartitions(tbl, partSpec);
Path originalDir = null;
// to keep backward compatibility
if (partitions.isEmpty()) {
throw new HiveException("No partition matches the specification");
} else if (partSpecInfo.values.size() != tbl.getPartCols().size()) {
// for partial specifications we need partitions to follow the scheme
for (Partition p : partitions) {
if (partitionInCustomLocation(tbl, p)) {
String message = String.format("ARCHIVE cannot run for partition " + "groups with custom locations like %s", p.getLocation());
throw new HiveException(message);
}
}
originalDir = partSpecInfo.createPath(tbl);
} else {
Partition p = partitions.get(0);
// partition can be archived if during recovery
if (ArchiveUtils.isArchived(p)) {
originalDir = new Path(getOriginalLocation(p));
} else {
originalDir = p.getDataLocation();
}
}
Path intermediateArchivedDir = new Path(originalDir.getParent(), originalDir.getName() + INTERMEDIATE_ARCHIVED_DIR_SUFFIX);
Path intermediateOriginalDir = new Path(originalDir.getParent(), originalDir.getName() + INTERMEDIATE_ORIGINAL_DIR_SUFFIX);
console.printInfo("intermediate.archived is " + intermediateArchivedDir.toString());
console.printInfo("intermediate.original is " + intermediateOriginalDir.toString());
String archiveName = "data.har";
FileSystem fs = null;
try {
fs = originalDir.getFileSystem(conf);
} catch (IOException e) {
throw new HiveException(e);
}
URI archiveUri = (new Path(originalDir, archiveName)).toUri();
URI originalUri = ArchiveUtils.addSlash(originalDir.toUri());
ArchiveUtils.HarPathHelper harHelper = new ArchiveUtils.HarPathHelper(conf, archiveUri, originalUri);
// if they are different, we throw an error
for (Partition p : partitions) {
if (ArchiveUtils.isArchived(p)) {
if (ArchiveUtils.getArchivingLevel(p) != partSpecInfo.values.size()) {
String name = ArchiveUtils.getPartialName(p, ArchiveUtils.getArchivingLevel(p));
String m = String.format("Conflict with existing archive %s", name);
throw new HiveException(m);
} else {
throw new HiveException("Partition(s) already archived");
}
}
}
boolean recovery = false;
if (pathExists(intermediateArchivedDir) || pathExists(intermediateOriginalDir)) {
recovery = true;
console.printInfo("Starting recovery after failed ARCHIVE");
}
// to use as the move operation that created it is atomic.
if (!pathExists(intermediateArchivedDir) && !pathExists(intermediateOriginalDir)) {
// First create the archive in a tmp dir so that if the job fails, the
// bad files don't pollute the filesystem
Path tmpPath = new Path(driverContext.getCtx().getExternalTmpPath(originalDir), "partlevel");
console.printInfo("Creating " + archiveName + " for " + originalDir.toString());
console.printInfo("in " + tmpPath);
console.printInfo("Please wait... (this may take a while)");
// Create the Hadoop archive
int ret = 0;
try {
int maxJobNameLen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH);
String jobname = String.format("Archiving %s@%s", tbl.getTableName(), partSpecInfo.getName());
jobname = Utilities.abbreviate(jobname, maxJobNameLen - 6);
conf.set(MRJobConfig.JOB_NAME, jobname);
HadoopArchives har = new HadoopArchives(conf);
List<String> args = new ArrayList<String>();
args.add("-archiveName");
args.add(archiveName);
args.add("-p");
args.add(originalDir.toString());
args.add(tmpPath.toString());
ret = ToolRunner.run(har, args.toArray(new String[0]));
} catch (Exception e) {
throw new HiveException(e);
}
if (ret != 0) {
throw new HiveException("Error while creating HAR");
}
// the partition directory. e.g. .../hr=12-intermediate-archived
try {
console.printInfo("Moving " + tmpPath + " to " + intermediateArchivedDir);
if (pathExists(intermediateArchivedDir)) {
throw new HiveException("The intermediate archive directory already exists.");
}
fs.rename(tmpPath, intermediateArchivedDir);
} catch (IOException e) {
throw new HiveException("Error while moving tmp directory");
}
} else {
if (pathExists(intermediateArchivedDir)) {
console.printInfo("Intermediate archive directory " + intermediateArchivedDir + " already exists. Assuming it contains an archived version of the partition");
}
}
// if the move hasn't been made already
if (!pathExists(intermediateOriginalDir)) {
console.printInfo("Moving " + originalDir + " to " + intermediateOriginalDir);
moveDir(fs, originalDir, intermediateOriginalDir);
} else {
console.printInfo(intermediateOriginalDir + " already exists. " + "Assuming it contains the original files in the partition");
}
// Move the intermediate archived directory to the original parent directory
if (!pathExists(originalDir)) {
console.printInfo("Moving " + intermediateArchivedDir + " to " + originalDir);
moveDir(fs, intermediateArchivedDir, originalDir);
} else {
console.printInfo(originalDir + " already exists. " + "Assuming it contains the archived version of the partition");
}
// Record this change in the metastore
try {
for (Partition p : partitions) {
URI originalPartitionUri = ArchiveUtils.addSlash(p.getDataLocation().toUri());
URI harPartitionDir = harHelper.getHarUri(originalPartitionUri);
StringBuilder authority = new StringBuilder();
if (harPartitionDir.getUserInfo() != null) {
authority.append(harPartitionDir.getUserInfo()).append("@");
}
authority.append(harPartitionDir.getHost());
if (harPartitionDir.getPort() != -1) {
authority.append(":").append(harPartitionDir.getPort());
}
Path harPath = new Path(harPartitionDir.getScheme(), authority.toString(), // make in Path to ensure no slash at the end
harPartitionDir.getPath());
setArchived(p, harPath, partSpecInfo.values.size());
db.alterPartition(simpleDesc.getTableName(), p, null);
}
} catch (Exception e) {
throw new HiveException("Unable to change the partition info for HAR", e);
}
// will not be deleted. The user will run ARCHIVE again to clear this up
if (pathExists(intermediateOriginalDir)) {
deleteDir(intermediateOriginalDir);
}
if (recovery) {
console.printInfo("Recovery after ARCHIVE succeeded");
}
return 0;
}
use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
the class DDLTask method dropTable.
private void dropTable(Hive db, Table tbl, DropTableDesc dropTbl) throws HiveException {
// This is a true DROP TABLE
if (tbl != null) {
if (tbl.isView()) {
if (!dropTbl.getExpectView()) {
if (dropTbl.getIfExists()) {
return;
}
if (dropTbl.getExpectMaterializedView()) {
throw new HiveException("Cannot drop a view with DROP MATERIALIZED VIEW");
} else {
throw new HiveException("Cannot drop a view with DROP TABLE");
}
}
} else if (tbl.isMaterializedView()) {
if (!dropTbl.getExpectMaterializedView()) {
if (dropTbl.getIfExists()) {
return;
}
if (dropTbl.getExpectView()) {
throw new HiveException("Cannot drop a materialized view with DROP VIEW");
} else {
throw new HiveException("Cannot drop a materialized view with DROP TABLE");
}
}
} else {
if (dropTbl.getExpectView()) {
if (dropTbl.getIfExists()) {
return;
}
throw new HiveException("Cannot drop a base table with DROP VIEW");
} else if (dropTbl.getExpectMaterializedView()) {
if (dropTbl.getIfExists()) {
return;
}
throw new HiveException("Cannot drop a base table with DROP MATERIALIZED VIEW");
}
}
}
ReplicationSpec replicationSpec = dropTbl.getReplicationSpec();
if ((tbl != null) && replicationSpec.isInReplicationScope()) {
/**
* DROP TABLE FOR REPLICATION behaves differently from DROP TABLE IF EXISTS - it more closely
* matches a DROP TABLE IF OLDER THAN(x) semantic.
*
* Ideally, commands executed under the scope of replication need to be idempotent and resilient
* to repeats. What can happen, sometimes, is that a drone processing a replication task can
* have been abandoned for not returning in time, but still execute its task after a while,
* which should not result in it mucking up data that has been impressed later on. So, for eg.,
* if we create partition P1, followed by droppping it, followed by creating it yet again,
* the replication of that drop should not drop the newer partition if it runs after the destination
* object is already in the newer state.
*
* Thus, we check the replicationSpec.allowEventReplacementInto to determine whether or not we can
* drop the object in question(will return false if object is newer than the event, true if not)
*
* In addition, since DROP TABLE FOR REPLICATION can result in a table not being dropped, while DROP
* TABLE will always drop the table, and the included partitions, DROP TABLE FOR REPLICATION must
* do one more thing - if it does not drop the table because the table is in a newer state, it must
* drop the partitions inside it that are older than this event. To wit, DROP TABLE FOR REPL
* acts like a recursive DROP TABLE IF OLDER.
*/
if (!replicationSpec.allowEventReplacementInto(tbl)) {
// any partitions inside that are older.
if (tbl.isPartitioned()) {
PartitionIterable partitions = new PartitionIterable(db, tbl, null, conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
for (Partition p : Iterables.filter(partitions, replicationSpec.allowEventReplacementInto())) {
db.dropPartition(tbl.getDbName(), tbl.getTableName(), p.getValues(), true);
}
}
// table is newer, leave it be.
return;
}
}
// drop the table
db.dropTable(dropTbl.getTableName(), dropTbl.getIfPurge());
if (tbl != null) {
// Remove from cache if it is a materialized view
if (tbl.isMaterializedView()) {
HiveMaterializedViewsRegistry.get().dropMaterializedView(tbl);
}
// We have already locked the table in DDLSemanticAnalyzer, don't do it again here
addIfAbsentByName(new WriteEntity(tbl, WriteEntity.WriteType.DDL_NO_LOCK));
}
}
Aggregations