Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class GenMapRedUtils, method createMergeTask.
/**
 * Create a block-level merge task for RCFile tables or a stripe-level merge
 * task for ORC tables.
 *
 * @param fsInputDesc descriptor of the FileSink whose output directory is to be merged
 * @param finalName final output path of the merged files
 * @param hasDynamicPartitions whether the query uses dynamic partitioning
 * @param ctx compilation context used to create the merge operator
 * @return a MergeFileWork (as MapWork) describing the merge
 * @throws SemanticException if the table is stored as neither RCFile nor ORCFile
 */
public static MapWork createMergeTask(FileSinkDesc fsInputDesc, Path finalName,
    boolean hasDynamicPartitions, CompilationOpContext ctx) throws SemanticException {
  Path inputDir = fsInputDesc.getFinalDirName();
  TableDesc tblDesc = fsInputDesc.getTableInfo();
  List<Path> inputDirs = new ArrayList<Path>(1);
  ArrayList<String> inputDirstr = new ArrayList<String>(1);
  // add the input dir only when there is neither dynamic partitioning nor
  // list bucketing stored as sub-directories; in those cases the merge input
  // directories are resolved later
  if (!hasDynamicPartitions && !GenMapRedUtils.isSkewedStoredAsDirs(fsInputDesc)) {
    inputDirs.add(inputDir);
  }
  inputDirstr.add(inputDir.toString());
  // internal input format class for CombineHiveInputFormat
  final Class<? extends InputFormat> internalIFClass;
  if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
    internalIFClass = RCFileBlockMergeInputFormat.class;
  } else if (tblDesc.getInputFileFormatClass().equals(OrcInputFormat.class)) {
    internalIFClass = OrcFileStripeMergeInputFormat.class;
  } else {
    throw new SemanticException("createMergeTask called on a table with file"
        + " format other than RCFile or ORCFile");
  }
  // create the merge file work
  MergeFileWork work = new MergeFileWork(inputDirs, finalName, hasDynamicPartitions,
      tblDesc.getInputFileFormatClass().getName());
  LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
  pathToAliases.put(inputDir, inputDirstr);
  work.setMapperCannotSpanPartns(true);
  work.setPathToAliases(pathToAliases);
  PartitionDesc pDesc = new PartitionDesc(tblDesc, null);
  pDesc.setInputFileFormatClass(internalIFClass);
  work.addPathToPartitionInfo(inputDir, pDesc);
  work.setListBucketingCtx(fsInputDesc.getLbCtx());
  // create alias to work which contains the merge operator
  LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork =
      new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
  Operator<? extends OperatorDesc> mergeOp = null;
  final FileMergeDesc fmd;
  if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
    fmd = new RCFileMergeDesc();
  } else {
    fmd = new OrcFileMergeDesc();
  }
  fmd.setDpCtx(fsInputDesc.getDynPartCtx());
  fmd.setOutputPath(finalName);
  fmd.setHasDynamicPartitions(work.hasDynamicPartitions());
  fmd.setListBucketingAlterTableConcatenate(work.isListBucketingAlterTableConcatenate());
  int lbLevel = work.getListBucketingCtx() == null ? 0
      : work.getListBucketingCtx().calculateListBucketingLevel();
  fmd.setListBucketingDepth(lbLevel);
  mergeOp = OperatorFactory.get(ctx, fmd);
  aliasToWork.put(inputDir.toString(), mergeOp);
  work.setAliasToWork(aliasToWork);
  return work;
}
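The PartitionDesc wiring above is the heart of this example: the merge input directory is mapped to a partition descriptor whose input format is swapped for the internal merge format. A minimal sketch of that pattern, where tblDesc, mergeDir, and work are hypothetical placeholders rather than names from the original code:

PartitionDesc partDesc = new PartitionDesc(tblDesc, null);            // null partition spec: a table-level descriptor
partDesc.setInputFileFormatClass(RCFileBlockMergeInputFormat.class);  // internal format used by CombineHiveInputFormat
work.addPathToPartitionInfo(mergeDir, partDesc);                      // associate the merge input path with the descriptor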
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class GenMapRedUtils, method replaceMapWork.
/**
 * Replace the Map-side operator tree associated with targetAlias in
 * target with the Map-side operator tree associated with sourceAlias in source.
 *
 * @param sourceAlias alias in source whose operator tree is moved into target
 * @param targetAlias alias in target whose operator tree is replaced
 * @param source MapWork that provides the replacement operator tree
 * @param target MapWork whose operator tree is replaced
 */
public static void replaceMapWork(String sourceAlias, String targetAlias,
    MapWork source, MapWork target) {
  Map<Path, ArrayList<String>> sourcePathToAliases = source.getPathToAliases();
  Map<Path, PartitionDesc> sourcePathToPartitionInfo = source.getPathToPartitionInfo();
  Map<String, Operator<? extends OperatorDesc>> sourceAliasToWork = source.getAliasToWork();
  Map<String, PartitionDesc> sourceAliasToPartnInfo = source.getAliasToPartnInfo();
  LinkedHashMap<Path, ArrayList<String>> targetPathToAliases = target.getPathToAliases();
  LinkedHashMap<Path, PartitionDesc> targetPathToPartitionInfo = target.getPathToPartitionInfo();
  Map<String, Operator<? extends OperatorDesc>> targetAliasToWork = target.getAliasToWork();
  Map<String, PartitionDesc> targetAliasToPartnInfo = target.getAliasToPartnInfo();
  if (!sourceAliasToWork.containsKey(sourceAlias) || !targetAliasToWork.containsKey(targetAlias)) {
    // nothing to do if there is no operator tree associated with sourceAlias
    // in source or with targetAlias in target.
    return;
  }
  if (sourceAliasToWork.size() > 1) {
    // if there are multiple aliases in source, we do not know how to merge.
    return;
  }
  // Remove unnecessary information from target
  targetAliasToWork.remove(targetAlias);
  targetAliasToPartnInfo.remove(targetAlias);
  List<Path> pathsToRemove = new ArrayList<>();
  for (Entry<Path, ArrayList<String>> entry : targetPathToAliases.entrySet()) {
    ArrayList<String> aliases = entry.getValue();
    aliases.remove(targetAlias);
    if (aliases.isEmpty()) {
      pathsToRemove.add(entry.getKey());
    }
  }
  for (Path pathToRemove : pathsToRemove) {
    targetPathToAliases.remove(pathToRemove);
    targetPathToPartitionInfo.remove(pathToRemove);
  }
  // Add new information from source to target
  targetAliasToWork.put(sourceAlias, sourceAliasToWork.get(sourceAlias));
  targetAliasToPartnInfo.putAll(sourceAliasToPartnInfo);
  targetPathToPartitionInfo.putAll(sourcePathToPartitionInfo);
  List<Path> pathsToAdd = new ArrayList<>();
  for (Entry<Path, ArrayList<String>> entry : sourcePathToAliases.entrySet()) {
    ArrayList<String> aliases = entry.getValue();
    if (aliases.contains(sourceAlias)) {
      pathsToAdd.add(entry.getKey());
    }
  }
  for (Path pathToAdd : pathsToAdd) {
    if (!targetPathToAliases.containsKey(pathToAdd)) {
      targetPathToAliases.put(pathToAdd, new ArrayList<String>());
    }
    targetPathToAliases.get(pathToAdd).add(sourceAlias);
  }
  target.setPathToAliases(targetPathToAliases);
  target.setPathToPartitionInfo(targetPathToPartitionInfo);
}
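As a usage sketch (sourceWork, targetWork, and the alias strings below are assumptions, not names from the original example), the method would be invoked like this:

// move the operator tree behind "src_table" in sourceWork over the tree behind "merged_output" in targetWork
GenMapRedUtils.replaceMapWork("src_table", "merged_output", sourceWork, targetWork);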
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class MapOperator, method setChildren.
public void setChildren(Configuration hconf) throws Exception {
  List<Operator<? extends OperatorDesc>> children =
      new ArrayList<Operator<? extends OperatorDesc>>();
  Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);
  Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);
  for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
    Path onefile = entry.getKey();
    List<String> aliases = entry.getValue();
    PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);
    TableDesc tableDesc = partDesc.getTableDesc();
    Configuration newConf = tableNameToConf.get(tableDesc.getTableName());
    for (String alias : aliases) {
      Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
      if (isLogDebugEnabled) {
        LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
      }
      Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(onefile.toString());
      if (contexts == null) {
        opCtxMap.put(onefile.toString(), contexts = new LinkedHashMap<Operator<?>, MapOpCtx>());
      }
      if (contexts.containsKey(op)) {
        continue;
      }
      MapOpCtx context = new MapOpCtx(alias, op, partDesc);
      StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc());
      contexts.put(op, initObjectInspector(newConf, context, tableRowOI));
      if (!children.contains(op)) {
        op.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
        op.getParentOperators().add(this);
        children.add(op);
      }
    }
  }
  initOperatorContext(children);
  // we found all the operators that we are supposed to process.
  setChildOperators(children);
}
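The loop above resolves each input path to its PartitionDesc before initializing the per-alias operators. A hedged sketch of that lookup step, where mapWork and inputPath are assumed variables rather than names from the original code:

PartitionDesc partDesc = mapWork.getPathToPartitionInfo().get(inputPath); // descriptor for this input path
TableDesc tableDesc = partDesc.getTableDesc();                            // table-level schema backing the partition
String deserializer = partDesc.getDeserializerClassName();                // SerDe used to build the row ObjectInspector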
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class MapOperator, method cloneConfsForNestedColPruning.
/**
* For each source table, combine the nested column pruning information from all its
* table scan descriptors and set it in a configuration copy. This is necessary since
* the configuration property "READ_NESTED_COLUMN_PATH_CONF_STR" is set on a per-table
* basis, so we can't just use a single configuration for all the tables.
*/
private Map<String, Configuration> cloneConfsForNestedColPruning(Configuration hconf) {
  Map<String, Configuration> tableNameToConf = new HashMap<>();
  for (Map.Entry<Path, ArrayList<String>> e : conf.getPathToAliases().entrySet()) {
    List<String> aliases = e.getValue();
    if (aliases == null || aliases.isEmpty()) {
      continue;
    }
    String tableName = conf.getPathToPartitionInfo().get(e.getKey()).getTableName();
    for (String alias : aliases) {
      Operator<?> rootOp = conf.getAliasToWork().get(alias);
      if (!(rootOp instanceof TableScanOperator)) {
        continue;
      }
      TableScanDesc tableScanDesc = ((TableScanOperator) rootOp).getConf();
      List<String> nestedColumnPaths = tableScanDesc.getNeededNestedColumnPaths();
      if (nestedColumnPaths == null || nestedColumnPaths.isEmpty()) {
        continue;
      }
      if (!tableNameToConf.containsKey(tableName)) {
        Configuration clonedConf = new Configuration(hconf);
        clonedConf.unset(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
        tableNameToConf.put(tableName, clonedConf);
      }
      Configuration newConf = tableNameToConf.get(tableName);
      ColumnProjectionUtils.appendNestedColumnPaths(newConf, nestedColumnPaths);
    }
  }
  // Assign tables without nested column pruning info to the default conf
  for (PartitionDesc pd : conf.getPathToPartitionInfo().values()) {
    if (!tableNameToConf.containsKey(pd.getTableName())) {
      tableNameToConf.put(pd.getTableName(), hconf);
    }
  }
  for (PartitionDesc pd : conf.getAliasToPartnInfo().values()) {
    if (!tableNameToConf.containsKey(pd.getTableName())) {
      tableNameToConf.put(pd.getTableName(), hconf);
    }
  }
  return tableNameToConf;
}
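The per-table cloning boils down to copying the Configuration, clearing the nested-column property, and appending only the paths needed by that table's scans. A minimal sketch of that step; the pathsForTable values are hypothetical nested column paths, not taken from the original code:

Configuration cloned = new Configuration(hconf);                            // copy so other tables keep their own settings
cloned.unset(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);       // drop any inherited nested-column paths
List<String> pathsForTable = Arrays.asList("address.city", "address.zip");  // hypothetical nested column paths
ColumnProjectionUtils.appendNestedColumnPaths(cloned, pathsForTable);       // register only this table's paths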
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class SplitGrouper, method schemaEvolved.
private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean groupAcrossFiles,
    MapWork work) throws IOException {
  boolean retval = false;
  Path path = ((FileSplit) s).getPath();
  PartitionDesc pd = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
      work.getPathToPartitionInfo(), path, cache);
  String currentDeserializerClass = pd.getDeserializerClassName();
  Class<?> currentInputFormatClass = pd.getInputFileFormatClass();
  Class<?> previousInputFormatClass = null;
  String previousDeserializerClass = null;
  if (prevSplit != null) {
    Path prevPath = ((FileSplit) prevSplit).getPath();
    if (!groupAcrossFiles) {
      return !path.equals(prevPath);
    }
    PartitionDesc prevPD = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
        work.getPathToPartitionInfo(), prevPath, cache);
    previousDeserializerClass = prevPD.getDeserializerClassName();
    previousInputFormatClass = prevPD.getInputFileFormatClass();
  }
  if ((currentInputFormatClass != previousInputFormatClass)
      || (!currentDeserializerClass.equals(previousDeserializerClass))) {
    retval = true;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Adding split " + path + " to src new group? " + retval);
  }
  return retval;
}
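The split-level check reduces to resolving each split's path to its PartitionDesc and comparing input format and SerDe. A hedged sketch of that comparison, with split, prevPd, work, and cache assumed to be in scope:

Path splitPath = ((FileSplit) split).getPath();
PartitionDesc pd = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
    work.getPathToPartitionInfo(), splitPath, cache);
boolean sameSchema = pd.getInputFileFormatClass().equals(prevPd.getInputFileFormatClass())
    && pd.getDeserializerClassName().equals(prevPd.getDeserializerClassName());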