use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class Utilities method reworkMapRedWork.
/**
 * The check here is not very clean: it first loops over all input formats and
 * collects the ones that extend ReworkMapredInputFormat into a set, and then
 * iterates over that set and calls rework() on each one.
 *
 * Technically all of this could be avoided if all of Hive's input formats
 * shared a common interface. In today's Hive and Hadoop that is not possible,
 * because many of the input formats Hive uses live in Hadoop's code base, and
 * most of Hadoop's input formats only implement the plain InputFormat
 * interface.
 *
 * @param task
 * @param reworkMapredWork
 * @param conf
 * @throws SemanticException
 */
public static void reworkMapRedWork(Task<? extends Serializable> task,
    boolean reworkMapredWork, HiveConf conf) throws SemanticException {
  if (reworkMapredWork && (task instanceof MapRedTask)) {
    try {
      MapredWork mapredWork = ((MapRedTask) task).getWork();
      Set<Class<? extends InputFormat>> reworkInputFormats =
          new HashSet<Class<? extends InputFormat>>();
      for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
        Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
        if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
          reworkInputFormats.add(inputFormatCls);
        }
      }
      if (reworkInputFormats.size() > 0) {
        for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
          ReworkMapredInputFormat inst =
              (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
          inst.rework(conf, mapredWork);
        }
      }
    } catch (IOException e) {
      throw new SemanticException(e);
    }
  }
}
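A minimal caller-side sketch of how this helper might be invoked over a query's root tasks. This is an illustration, not code from the snippet above: the rootTasks variable and the "hive.rework.mapredwork" property name are assumptions made for the example.

// Hypothetical caller: let rework-aware input formats adjust the MapredWork
// of each root task before execution.
boolean rework = conf.getBoolean("hive.rework.mapredwork", false); // property name assumed
for (Task<? extends Serializable> rootTask : rootTasks) {          // rootTasks is hypothetical
  Utilities.reworkMapRedWork(rootTask, rework, conf);
}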
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class Utilities method createDummyFileForEmptyTable.
@SuppressWarnings("rawtypes")
private static Path createDummyFileForEmptyTable(JobConf job, MapWork work,
    Path hiveScratchDir, String alias) throws Exception {
  TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
  if (tableDesc.isNonNative()) {
    // a non-native table is not backed by files on the filesystem, so we
    // cannot create an empty file for it.
    return null;
  }
  Properties props = tableDesc.getProperties();
  HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, tableDesc);
  Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false);
  if (LOG.isInfoEnabled()) {
    LOG.info("Changed input file for alias " + alias + " to " + newPath);
  }
  // update the work
  LinkedHashMap<Path, ArrayList<String>> pathToAliases = work.getPathToAliases();
  ArrayList<String> newList = new ArrayList<String>();
  newList.add(alias);
  pathToAliases.put(newPath, newList);
  work.setPathToAliases(pathToAliases);
  PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
  work.addPathToPartitionInfo(newPath, pDesc);
  return newPath;
}
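A small illustrative sketch of what the returned dummy path is used for. This is not Hive's actual caller: the method is private to Utilities (so this would only compile inside that class), and mapWork, scratchDir and the alias "t1" are hypothetical.

// After the call, the empty-table alias is routed through a zero-row dummy
// file, so the MR job still gets a (single, empty) split for it.
Path dummy = createDummyFileForEmptyTable(job, mapWork, scratchDir, "t1");
if (dummy != null) {
  assert mapWork.getPathToAliases().get(dummy).contains("t1");
  assert mapWork.getPathToPartitionInfo().containsKey(dummy);
}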
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class HiveIndexedInputFormat method doGetSplits.
public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException {
  super.init(job);
  Path[] dirs = FileInputFormat.getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  JobConf newjob = new JobConf(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // for each dir, get the InputFormat and do getSplits.
  PartitionDesc part;
  for (Path dir : dirs) {
    part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
        IOPrepareCache.get().allocatePartitionDescMap(), true);
    // create a new InputFormat instance only if this is the first time this
    // class is seen (instances are cached)
    Class inputFormatClass = part.getInputFileFormatClass();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);
    FileInputFormat.setInputPaths(newjob, dir);
    newjob.setInputFormat(inputFormat.getClass());
    InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
    for (InputSplit is : iss) {
      result.add(new HiveInputSplit(is, inputFormatClass.getName()));
    }
  }
  return result.toArray(new HiveInputSplit[result.size()]);
}
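Note that numSplits / dirs.length is integer division, so the per-directory hint can be 0 when there are more input directories than requested splits; Hadoop's mapred FileInputFormat treats numSplits only as a hint and tolerates 0, but a defensive variant could clamp it. A sketch of that alternative (not the code above):

// Alternative sketch: never pass a 0 hint to the delegate input format.
int perDirHint = Math.max(1, numSplits / dirs.length);
InputSplit[] iss = inputFormat.getSplits(newjob, perDirHint);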
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class TableBasedIndexHandler method generateIndexBuildTaskList.
@Override
public List<Task<?>> generateIndexBuildTaskList(org.apache.hadoop.hive.ql.metadata.Table baseTbl,
    org.apache.hadoop.hive.metastore.api.Index index, List<Partition> indexTblPartitions,
    List<Partition> baseTblPartitions, org.apache.hadoop.hive.ql.metadata.Table indexTbl,
    Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws HiveException {
  try {
    TableDesc desc = Utilities.getTableDesc(indexTbl);
    List<Partition> newBaseTblPartitions = new ArrayList<Partition>();
    List<Task<?>> indexBuilderTasks = new ArrayList<Task<?>>();
    if (!baseTbl.isPartitioned()) {
      // the table has no partitions, so build the index for the whole table
      Task<?> indexBuilder = getIndexBuilderMapRedTask(inputs, outputs, index, false,
          new PartitionDesc(desc, null), indexTbl.getTableName(),
          new PartitionDesc(Utilities.getTableDesc(baseTbl), null), baseTbl.getTableName(),
          indexTbl.getDbName());
      indexBuilderTasks.add(indexBuilder);
    } else {
      // the base table is partitioned: match each index partition with the
      // corresponding base-table partition by name
      for (int i = 0; i < indexTblPartitions.size(); i++) {
        Partition indexPart = indexTblPartitions.get(i);
        Partition basePart = null;
        for (int j = 0; j < baseTblPartitions.size(); j++) {
          if (baseTblPartitions.get(j).getName().equals(indexPart.getName())) {
            basePart = baseTblPartitions.get(j);
            newBaseTblPartitions.add(baseTblPartitions.get(j));
            break;
          }
        }
        if (basePart == null) {
          throw new RuntimeException("Partitions of base table and index table are inconsistent.");
        }
        // for each partition, spawn a map-reduce task.
        Task<?> indexBuilder = getIndexBuilderMapRedTask(inputs, outputs, index, true,
            new PartitionDesc(indexPart), indexTbl.getTableName(),
            new PartitionDesc(basePart), baseTbl.getTableName(), indexTbl.getDbName());
        indexBuilderTasks.add(indexBuilder);
      }
    }
    return indexBuilderTasks;
  } catch (Exception e) {
    throw new SemanticException(e);
  }
}
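The nested loop above matches index and base partitions by name in O(n*m). A sketch of an equivalent lookup-based alternative, shown only to make the matching step explicit (this is not the code Hive uses):

// Build a name -> partition index once, then match each index partition in O(1).
Map<String, Partition> baseByName = new HashMap<String, Partition>();
for (Partition p : baseTblPartitions) {
  baseByName.put(p.getName(), p);
}
Partition basePart = baseByName.get(indexPart.getName());
if (basePart == null) {
  throw new RuntimeException("Partitions of base table and index table are inconsistent.");
}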
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class CombineHiveInputFormat method getSplits.
/**
* Create Hive splits based on CombineFileSplit.
*/
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
  init(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  Path[] paths = getInputPaths(job);
  List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
  List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
  int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM,
      (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
  // numThreads can be 0 when there are no input paths; in that case,
  // Executors.newFixedThreadPool would fail, so skip the check.
  if (numThreads > 0) {
    try {
      Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
      for (int i = 0; i < paths.length; i++) {
        if (nonCombinablePathIndices.contains(i)) {
          nonCombinablePaths.add(paths[i]);
        } else {
          combinablePaths.add(paths[i]);
        }
      }
    } catch (Exception e) {
      LOG.error("Error checking non-combinable path", e);
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
      throw new IOException(e);
    }
  }
  // Store the previous value of the path specification
  String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
  if (LOG.isDebugEnabled()) {
    LOG.debug("The received input paths are: [" + oldPaths + "] against the property "
        + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
  }
  // Process the normal (non-combinable) splits
  if (nonCombinablePaths.size() > 0) {
    FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
    InputSplit[] splits = super.getSplits(job, numSplits);
    for (InputSplit split : splits) {
      result.add(split);
    }
  }
  // Process the combine splits
  if (combinablePaths.size() > 0) {
    FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[combinablePaths.size()]));
    Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null
        ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
    InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
    for (InputSplit split : splits) {
      result.add(split);
    }
  }
  // Restore the original path specification in case some application depends
  // on the original value being set.
  if (oldPaths != null) {
    job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
  }
  // clear work from the ThreadLocal after splits are generated, in case the thread is reused in a pool.
  Utilities.clearWorkMapForConf(job);
  LOG.info("Number of all splits " + result.size());
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
  return result.toArray(new InputSplit[result.size()]);
}
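The numThreads > 0 guard above exists because java.util.concurrent.Executors.newFixedThreadPool rejects a pool size of zero. A tiny standalone illustration of that behavior:

import java.util.concurrent.Executors;

public class ZeroThreadPoolDemo {
  public static void main(String[] args) {
    try {
      // throws IllegalArgumentException: a fixed pool must have at least one thread
      Executors.newFixedThreadPool(0);
    } catch (IllegalArgumentException e) {
      System.out.println("newFixedThreadPool(0) failed: " + e);
    }
  }
}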