use of org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol in project hive by apache.
the class MoveTask method updatePartitionBucketSortColumns.
/**
* Alters the bucketing and/or sorting columns of the partition provided they meet some
* validation criteria, e.g. the number of buckets matches the number of files, and the
* columns are not partition columns
* @param table
* @param partn
* @param bucketCols
* @param numBuckets
* @param sortCols
* @throws IOException
* @throws InvalidOperationException
* @throws HiveException
*/
private void updatePartitionBucketSortColumns(Hive db, Table table, Partition partn,
    List<BucketCol> bucketCols, int numBuckets, List<SortCol> sortCols)
    throws IOException, InvalidOperationException, HiveException {
boolean updateBucketCols = false;
if (bucketCols != null) {
FileSystem fileSys = partn.getDataLocation().getFileSystem(conf);
FileStatus[] fileStatus = HiveStatsUtils.getFileStatusRecurse(partn.getDataLocation(), 1, fileSys);
// The inference is only trusted if the number of files matches the number of buckets, i.e. one file per bucket.
if (fileStatus.length == numBuckets) {
List<String> newBucketCols = new ArrayList<String>();
updateBucketCols = true;
for (BucketCol bucketCol : bucketCols) {
if (bucketCol.getIndexes().get(0) < partn.getCols().size()) {
newBucketCols.add(partn.getCols().get(bucketCol.getIndexes().get(0)).getName());
} else {
// The index refers to a partition column; bucketing on a partition column is not valid, so skip the update.
updateBucketCols = false;
break;
}
}
if (updateBucketCols) {
partn.getBucketCols().clear();
partn.getBucketCols().addAll(newBucketCols);
partn.getTPartition().getSd().setNumBuckets(numBuckets);
}
}
}
boolean updateSortCols = false;
if (sortCols != null) {
List<Order> newSortCols = new ArrayList<Order>();
updateSortCols = true;
for (SortCol sortCol : sortCols) {
if (sortCol.getIndexes().get(0) < partn.getCols().size()) {
newSortCols.add(new Order(partn.getCols().get(sortCol.getIndexes().get(0)).getName(), sortCol.getSortOrder() == '+' ? BaseSemanticAnalyzer.HIVE_COLUMN_ORDER_ASC : BaseSemanticAnalyzer.HIVE_COLUMN_ORDER_DESC));
} else {
// The index refers to a partition column; sorting on a partition column is not valid, so skip the update.
updateSortCols = false;
break;
}
}
if (updateSortCols) {
partn.getSortCols().clear();
partn.getSortCols().addAll(newSortCols);
}
}
if (updateBucketCols || updateSortCols) {
db.alterPartition(table.getDbName(), table.getTableName(), partn, null);
}
}
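The update above hinges on a simple validation: each inferred bucket/sort column index must resolve to one of the partition's data columns, and any index at or beyond partn.getCols().size() points at a partition column and aborts the update. Below is a minimal standalone sketch of that index-to-name mapping; the BucketColResolver class, the example column names, and the plain String/Integer types are invented stand-ins for Hive's FieldSchema and BucketCol objects.
import java.util.ArrayList;
import java.util.List;

public class BucketColResolver {

  /**
   * Maps inferred column indexes to data column names, or returns null if any index
   * points past the data columns (i.e. at a partition column), mirroring the
   * validation performed in updatePartitionBucketSortColumns.
   */
  static List<String> resolve(List<Integer> inferredIndexes, List<String> dataColumns) {
    List<String> resolved = new ArrayList<>();
    for (int index : inferredIndexes) {
      if (index >= dataColumns.size()) {
        // The index refers to a partition column, so the metadata update must be skipped.
        return null;
      }
      resolved.add(dataColumns.get(index));
    }
    return resolved;
  }

  public static void main(String[] args) {
    List<String> dataCols = List.of("user_id", "event_time", "payload");
    System.out.println(resolve(List.of(0, 1), dataCols)); // [user_id, event_time]
    System.out.println(resolve(List.of(0, 3), dataCols)); // null: index 3 falls outside the data columns
  }
}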
use of org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol in project hive by apache.
the class BucketingSortingOpProcFactory method findBucketingSortingColumns.
/**
* For each expression, check if it represents a column known to be bucketed/sorted.
*
* The methods setBucketingColsIfComplete and setSortingColsIfComplete should be used to assign
* the values of newBucketCols and newSortCols as the bucketing/sorting columns of this operator
* because these arrays may contain nulls indicating that the output of this operator is not
* bucketed/sorted.
*
* @param exprs - list of expressions
* @param colInfos - list of column infos
* @param bucketCols - list of bucketed columns from the input
* @param sortCols - list of sorted columns from the input
* @param newBucketCols - an array of bucket columns which should be the same length as
* bucketCols, updated such that the bucketed column(s) at index i in bucketCols become
* the bucketed column(s) at index i of newBucketCols in the output
* @param newSortCols - an array of sort columns which should be the same length as
* sortCols, updated such that the sorted column(s) at index i in sortCols become
* the sorted column(s) at index i of newSortCols in the output
* @param colInfosOffset - the expressions are known to be represented by column infos
* beginning at this index
*/
private static void findBucketingSortingColumns(List<ExprNodeDesc> exprs, List<ColumnInfo> colInfos,
    List<BucketCol> bucketCols, List<SortCol> sortCols, BucketCol[] newBucketCols,
    SortCol[] newSortCols, int colInfosOffset) {
for (int cnt = 0; cnt < exprs.size(); cnt++) {
ExprNodeDesc expr = exprs.get(cnt);
// Anything more complex than a plain column reference voids any bucketing/sorting assumptions for this position.
if (!(expr instanceof ExprNodeColumnDesc)) {
continue;
}
ExprNodeColumnDesc columnExpr = (ExprNodeColumnDesc) expr;
int colInfosIndex = cnt + colInfosOffset;
if (newBucketCols != null) {
int bucketIndex = indexOfColName(bucketCols, columnExpr.getColumn());
if (bucketIndex != -1) {
if (newBucketCols[bucketIndex] == null) {
newBucketCols[bucketIndex] = new BucketCol();
}
newBucketCols[bucketIndex].addAlias(colInfos.get(colInfosIndex).getInternalName(), colInfosIndex);
}
}
if (newSortCols != null) {
int sortIndex = indexOfColName(sortCols, columnExpr.getColumn());
if (sortIndex != -1) {
if (newSortCols[sortIndex] == null) {
newSortCols[sortIndex] = new SortCol(sortCols.get(sortIndex).getSortOrder());
}
newSortCols[sortIndex].addAlias(colInfos.get(colInfosIndex).getInternalName(), colInfosIndex);
}
}
}
}
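findBucketingSortingColumns relies on positional bookkeeping: newBucketCols and newSortCols start out as arrays of nulls with the same length as the input lists, and a slot is only filled when the corresponding column is found among the operator's expressions, so a surviving null means the output cannot be assumed bucketed/sorted on that column. A simplified, self-contained sketch of that idea, assuming plain strings in place of ExprNodeColumnDesc/ColumnInfo and a bare output position in place of the BucketCol/SortCol alias tracking:
import java.util.Arrays;
import java.util.List;

public class PositionalColumnTracking {

  /**
   * For each input bucket column, records the output position at which it reappears,
   * leaving null when the select list drops it (so the output can no longer be
   * assumed bucketed on that column).
   */
  static Integer[] track(List<String> inputBucketCols, List<String> selectedCols) {
    Integer[] newPositions = new Integer[inputBucketCols.size()];
    for (int outPos = 0; outPos < selectedCols.size(); outPos++) {
      int bucketIndex = inputBucketCols.indexOf(selectedCols.get(outPos));
      if (bucketIndex != -1) {
        newPositions[bucketIndex] = outPos;
      }
    }
    return newPositions;
  }

  public static void main(String[] args) {
    // "key" reappears at output position 1; "value" is dropped, so its slot stays null.
    System.out.println(Arrays.toString(track(List.of("key", "value"), List.of("other", "key"))));
  }
}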
use of org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol in project hive by apache.
the class BucketingSortingOpProcFactory method extractTraits.
static void extractTraits(BucketingSortingCtx bctx, ReduceSinkOperator rop, Operator<?> childop) throws SemanticException {
List<ExprNodeDesc> outputValues = Collections.emptyList();
if (childop instanceof SelectOperator) {
SelectDesc select = ((SelectOperator) childop).getConf();
outputValues = ExprNodeDescUtils.backtrack(select.getColList(), childop, rop);
}
if (outputValues.isEmpty()) {
return;
}
// Go through the set of partition columns, and find their representatives in the values
// These represent the bucketed columns
List<BucketCol> bucketCols = extractBucketCols(rop, outputValues);
// Go through the set of key columns, and find their representatives in the values
// These represent the sorted columns
List<SortCol> sortCols = extractSortCols(rop, outputValues);
List<ColumnInfo> colInfos = childop.getSchema().getSignature();
if (!bucketCols.isEmpty()) {
List<BucketCol> newBucketCols = getNewBucketCols(bucketCols, colInfos);
bctx.setBucketedCols(childop, newBucketCols);
}
if (!sortCols.isEmpty()) {
List<SortCol> newSortCols = getNewSortCols(sortCols, colInfos);
bctx.setSortedCols(childop, newSortCols);
}
}
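extractTraits only proceeds when the child is a SelectOperator whose output expressions can be backtracked to the ReduceSinkOperator, i.e. each select expression resolves to a column produced by the parent. A rough illustration of that resolution step under the simplifying assumption of a plain alias-to-column map (ExprNodeDescUtils.backtrack works on real expression trees; the names used here are made up):
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class BacktrackSketch {

  /**
   * Resolves each select-list column to the parent column it reads, returning an empty
   * list if any column cannot be resolved (mirroring how extractTraits bails out when
   * outputValues is empty).
   */
  static List<String> backtrack(List<String> selectCols, Map<String, String> aliasToParentCol) {
    List<String> parentCols = new ArrayList<>();
    for (String col : selectCols) {
      String parent = aliasToParentCol.get(col);
      if (parent == null) {
        return List.of();
      }
      parentCols.add(parent);
    }
    return parentCols;
  }

  public static void main(String[] args) {
    System.out.println(backtrack(List.of("_col0", "_col1"),
        Map.of("_col0", "KEY.reducesinkkey0", "_col1", "VALUE._col0")));
  }
}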
use of org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol in project hive by apache.
the class BucketingSortingOpProcFactory method extractBucketCols.
static List<BucketCol> extractBucketCols(ReduceSinkOperator rop, List<ExprNodeDesc> outputValues) {
List<BucketCol> bucketCols = new ArrayList<BucketCol>();
for (ExprNodeDesc partitionCol : rop.getConf().getPartitionCols()) {
if (!(partitionCol instanceof ExprNodeColumnDesc)) {
return Collections.emptyList();
}
int index = ExprNodeDescUtils.indexOf(partitionCol, outputValues);
if (index < 0) {
return Collections.emptyList();
}
bucketCols.add(new BucketCol(((ExprNodeColumnDesc) partitionCol).getColumn(), index));
}
// Reaching this point means every partition column was found among the values; otherwise the data is not bucketed and an empty list was returned above.
return bucketCols;
}
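The matching in extractBucketCols is all-or-nothing: every partition column of the ReduceSinkOperator must be a plain column reference that is found among the output values, otherwise an empty list is returned and the data is not treated as bucketed. A small sketch of the same rule, assuming string column names instead of ExprNodeDesc and returning output positions instead of BucketCol objects:
import java.util.ArrayList;
import java.util.List;

public class ExtractBucketColsSketch {

  /**
   * All-or-nothing matching: every partition column must be found among the output
   * values, otherwise the data cannot be assumed bucketed and an empty list is returned.
   */
  static List<Integer> extractBucketPositions(List<String> partitionCols, List<String> outputValues) {
    List<Integer> positions = new ArrayList<>();
    for (String partitionCol : partitionCols) {
      int index = outputValues.indexOf(partitionCol);
      if (index < 0) {
        return List.of();
      }
      positions.add(index);
    }
    return positions;
  }

  public static void main(String[] args) {
    List<String> outputs = List.of("_col0", "_col1", "_col2");
    System.out.println(extractBucketPositions(List.of("_col2", "_col0"), outputs)); // [2, 0]
    System.out.println(extractBucketPositions(List.of("_col3"), outputs));          // []
  }
}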
use of org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol in project hive by apache.
the class MoveTask method execute.
@Override
public int execute(DriverContext driverContext) {
try {
if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) {
return 0;
}
Hive db = getHive();
// Do any hive related operations like moving tables and files
// to appropriate locations
LoadFileDesc lfd = work.getLoadFileWork();
if (lfd != null) {
Path targetPath = lfd.getTargetDir();
Path sourcePath = lfd.getSourcePath();
moveFile(sourcePath, targetPath, lfd.getIsDfsDir());
}
// Multi-file load is for dynamic partitions when some partitions do not
// need to merge and they can simply be moved to the target directory.
LoadMultiFilesDesc lmfd = work.getLoadMultiFilesWork();
if (lmfd != null) {
boolean isDfsDir = lmfd.getIsDfsDir();
int i = 0;
while (i < lmfd.getSourceDirs().size()) {
Path srcPath = lmfd.getSourceDirs().get(i);
Path destPath = lmfd.getTargetDirs().get(i);
FileSystem fs = destPath.getFileSystem(conf);
if (!fs.exists(destPath.getParent())) {
fs.mkdirs(destPath.getParent());
}
moveFile(srcPath, destPath, isDfsDir);
i++;
}
}
// Next we do this for tables and partitions
LoadTableDesc tbd = work.getLoadTableWork();
if (tbd != null) {
StringBuilder mesg = new StringBuilder("Loading data to table ").append(tbd.getTable().getTableName());
if (tbd.getPartitionSpec().size() > 0) {
mesg.append(" partition (");
Map<String, String> partSpec = tbd.getPartitionSpec();
for (String key : partSpec.keySet()) {
mesg.append(key).append('=').append(partSpec.get(key)).append(", ");
}
mesg.setLength(mesg.length() - 2);
mesg.append(')');
}
String mesg_detail = " from " + tbd.getSourcePath();
console.printInfo(mesg.toString(), mesg_detail);
Table table = db.getTable(tbd.getTable().getTableName());
if (work.getCheckFileFormat()) {
// Get all files from the src directory
FileStatus[] dirs;
ArrayList<FileStatus> files;
// source filesystem
FileSystem srcFs;
try {
srcFs = tbd.getSourcePath().getFileSystem(conf);
dirs = srcFs.globStatus(tbd.getSourcePath());
files = new ArrayList<FileStatus>();
for (int i = 0; (dirs != null && i < dirs.length); i++) {
files.addAll(Arrays.asList(srcFs.listStatus(dirs[i].getPath(), FileUtils.HIDDEN_FILES_PATH_FILTER)));
// Only a sample file is needed for the format check, so stop once at least one file has been found.
if (files.size() > 0) {
break;
}
}
} catch (IOException e) {
throw new HiveException("addFiles: filesystem error in check phase", e);
}
// handle file format check for table level
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
boolean flag = true;
// dynamic partition context is null
if (tbd.getDPCtx() == null) {
if (tbd.getPartitionSpec() == null || tbd.getPartitionSpec().isEmpty()) {
// Check if the file format of the file matches that of the table.
flag = HiveFileFormatUtils.checkInputFormat(srcFs, conf, tbd.getTable().getInputFileFormatClass(), files);
} else {
// Check if the file format of the file matches that of the partition
Partition oldPart = db.getPartition(table, tbd.getPartitionSpec(), false);
if (oldPart == null) {
// this means we have just created a table and are specifying partition in the
// load statement (without pre-creating the partition), in which case lets use
// table input format class. inheritTableSpecs defaults to true so when a new
// partition is created later it will automatically inherit input format
// from table object
flag = HiveFileFormatUtils.checkInputFormat(srcFs, conf, tbd.getTable().getInputFileFormatClass(), files);
} else {
flag = HiveFileFormatUtils.checkInputFormat(srcFs, conf, oldPart.getInputFormatClass(), files);
}
}
if (!flag) {
throw new HiveException("Wrong file format. Please check the file's format.");
}
} else {
LOG.warn("Skipping file format check as dpCtx is not null");
}
}
}
// Create a data container
DataContainer dc = null;
if (tbd.getPartitionSpec().size() == 0) {
dc = new DataContainer(table.getTTable());
db.loadTable(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getReplace(), work.isSrcLocal(),
    isSkewedStoredAsDirs(tbd), work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID,
    hasFollowingStatsTask());
if (work.getOutputs() != null) {
DDLTask.addIfAbsentByName(new WriteEntity(table, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
}
} else {
LOG.info("Partition is: " + tbd.getPartitionSpec().toString());
// Check if the bucketing and/or sorting columns were inferred
List<BucketCol> bucketCols = null;
List<SortCol> sortCols = null;
int numBuckets = -1;
Task task = this;
String path = tbd.getSourcePath().toUri().toString();
// Walk up the single-parent chain of tasks to find the map reduce task that produced the data being moved (either standard, local, or a merge).
while (task.getParentTasks() != null && task.getParentTasks().size() == 1) {
task = (Task) task.getParentTasks().get(0);
// If it was a merge task or a local map reduce task, nothing can be inferred
if (task instanceof MergeFileTask || task instanceof MapredLocalTask) {
break;
}
// If it was a map reduce task, look up the bucketed/sorted columns inferred for the directory this move task is moving.
if (task instanceof MapRedTask) {
MapredWork work = (MapredWork) task.getWork();
MapWork mapWork = work.getMapWork();
bucketCols = mapWork.getBucketedColsByDirectory().get(path);
sortCols = mapWork.getSortedColsByDirectory().get(path);
if (work.getReduceWork() != null) {
numBuckets = work.getReduceWork().getNumReduceTasks();
}
if (bucketCols != null || sortCols != null) {
// Columns can only have been inferred if this was the final map reduce task (the one containing the file sink operator that writes the final output).
assert work.isFinalMapRed();
}
break;
}
// A preceding MoveTask can occur when a conditional merge was planned but the condition for merging is not met, see GenMRFileSink1; in that case follow its source path.
if (task instanceof MoveTask) {
if (((MoveTask) task).getWork().getLoadFileWork() != null) {
path = ((MoveTask) task).getWork().getLoadFileWork().getSourcePath().toUri().toString();
}
}
}
// deal with dynamic partitions
DynamicPartitionCtx dpCtx = tbd.getDPCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
// dynamic partitions
List<LinkedHashMap<String, String>> dps = Utilities.getFullDPSpecs(conf, dpCtx);
console.printInfo(System.getProperty("line.separator"));
long startTime = System.currentTimeMillis();
// load the list of DP partitions and return the list of partition specs
// TODO: In a follow-up to HIVE-1361, we should refactor loadDynamicPartitions
// to use Utilities.getFullDPSpecs() to get the list of full partSpecs.
// After that check the number of DPs created to not exceed the limit and
// iterate over it and call loadPartition() here.
// The reason we don't do inside HIVE-1361 is the latter is large and we
// want to isolate any potential issue it may introduce.
Map<Map<String, String>, Partition> dp = db.loadDynamicPartitions(tbd.getSourcePath(),
    tbd.getTable().getTableName(), tbd.getPartitionSpec(), tbd.getReplace(), dpCtx.getNumDPCols(),
    isSkewedStoredAsDirs(tbd), work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID,
    SessionState.get().getTxnMgr().getCurrentTxnId(), hasFollowingStatsTask(),
    work.getLoadTableWork().getWriteType());
// publish DP columns to its subscribers
if (dps != null && dps.size() > 0) {
pushFeed(FeedType.DYNAMIC_PARTITIONS, dp.values());
}
String loadTime = "\t Time taken to load dynamic partitions: " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds";
console.printInfo(loadTime);
LOG.info(loadTime);
if (dp.size() == 0 && conf.getBoolVar(HiveConf.ConfVars.HIVE_ERROR_ON_EMPTY_PARTITION)) {
throw new HiveException("This query creates no partitions." + " To turn off this error, set hive.error.on.empty.partition=false.");
}
startTime = System.currentTimeMillis();
// For each partition created, update the inferred bucketing/sorting metadata if available and register it as a WriteEntity for the post-exec hooks.
for (Map.Entry<Map<String, String>, Partition> entry : dp.entrySet()) {
Partition partn = entry.getValue();
if (bucketCols != null || sortCols != null) {
updatePartitionBucketSortColumns(db, table, partn, bucketCols, numBuckets, sortCols);
}
WriteEntity enty = new WriteEntity(partn, getWriteType(tbd, work.getLoadTableWork().getWriteType()));
if (work.getOutputs() != null) {
DDLTask.addIfAbsentByName(enty, work.getOutputs());
}
// For dynamic partitions the WriteEntity is only created at runtime, so the query plan's outputs must also be updated here for post-exec hooks to see it.
if (queryPlan.getOutputs() == null) {
queryPlan.setOutputs(new LinkedHashSet<WriteEntity>());
}
queryPlan.getOutputs().add(enty);
// update columnar lineage for each partition
dc = new DataContainer(table.getTTable(), partn.getTPartition());
// Don't set lineage on delete as we don't have all the columns
if (SessionState.get() != null && work.getLoadTableWork().getWriteType() != AcidUtils.Operation.DELETE && work.getLoadTableWork().getWriteType() != AcidUtils.Operation.UPDATE) {
SessionState.get().getLineageState().setLineage(tbd.getSourcePath(), dc, table.getCols());
}
LOG.info("\tLoading partition " + entry.getKey());
}
console.printInfo("\t Time taken for adding to write entity : " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
// reset data container to prevent it being added again.
dc = null;
} else {
// static partitions
List<String> partVals = MetaStoreUtils.getPvals(table.getPartCols(), tbd.getPartitionSpec());
db.validatePartitionNameCharacters(partVals);
db.loadPartition(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getPartitionSpec(),
    tbd.getReplace(), tbd.getInheritTableSpecs(), isSkewedStoredAsDirs(tbd), work.isSrcLocal(),
    work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID, hasFollowingStatsTask());
Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
if (bucketCols != null || sortCols != null) {
updatePartitionBucketSortColumns(db, table, partn, bucketCols, numBuckets, sortCols);
}
dc = new DataContainer(table.getTTable(), partn.getTPartition());
// add this partition to post-execution hook
if (work.getOutputs() != null) {
DDLTask.addIfAbsentByName(new WriteEntity(partn, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
}
}
}
if (SessionState.get() != null && dc != null) {
// If we are doing an update or a delete the number of columns in the table will not
// match the number of columns in the file sink. For update there will be one too many
// (because of the ROW__ID), and in the case of the delete there will be just the
// ROW__ID, which we don't need to worry about from a lineage perspective.
List<FieldSchema> tableCols = null;
switch(work.getLoadTableWork().getWriteType()) {
case DELETE:
case UPDATE:
// Pass an empty list as no columns will be written to the file.
// TODO I should be able to make this work for update
tableCols = new ArrayList<FieldSchema>();
break;
default:
tableCols = table.getCols();
break;
}
SessionState.get().getLineageState().setLineage(tbd.getSourcePath(), dc, tableCols);
}
releaseLocks(tbd);
}
return 0;
} catch (Exception e) {
console.printError("Failed with exception " + e.getMessage(), "\n" + StringUtils.stringifyException(e));
setException(e);
return (1);
}
}
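The bucketing/sorting inference in execute() walks the single-parent chain of tasks until it finds the MapRedTask that wrote the data, following a preceding MoveTask's source path along the way and giving up at merge or local tasks. The sketch below mirrors only that traversal; the nested Task/MoveTask/MapRedTask classes and their fields (parents, sourcePath, bucketedDir) are invented stand-ins, not Hive's real task API.
import java.util.List;

public class AncestorWalkSketch {

  static class Task { List<Task> parents; }
  static class MergeOrLocalTask extends Task { }
  static class MoveTask extends Task { String sourcePath; }
  static class MapRedTask extends Task { String bucketedDir; }

  /**
   * Walks single-parent chains upward, re-pointing the path whenever a MoveTask is
   * passed, and reports whether the path was written by a MapRedTask that carries
   * bucketing metadata for it.
   */
  static boolean writtenByMapRedTask(Task task, String path) {
    while (task.parents != null && task.parents.size() == 1) {
      task = task.parents.get(0);
      if (task instanceof MergeOrLocalTask) {
        return false; // nothing can be inferred past a merge or local map reduce task
      }
      if (task instanceof MapRedTask) {
        return path.equals(((MapRedTask) task).bucketedDir);
      }
      if (task instanceof MoveTask && ((MoveTask) task).sourcePath != null) {
        path = ((MoveTask) task).sourcePath; // follow the directory the files were moved from
      }
    }
    return false;
  }

  public static void main(String[] args) {
    MapRedTask mr = new MapRedTask();
    mr.bucketedDir = "/tmp/stage-1";
    MoveTask intermediateMove = new MoveTask();
    intermediateMove.sourcePath = "/tmp/stage-1";
    intermediateMove.parents = List.of((Task) mr);
    Task finalMove = new Task();
    finalMove.parents = List.of((Task) intermediateMove);
    System.out.println(writtenByMapRedTask(finalMove, "/tmp/stage-2")); // true: the path is re-pointed at /tmp/stage-1
  }
}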