use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project haivvreo by jghoman.
the class AvroGenericRecordReader method getSchema.
/**
 * Attempt to retrieve the reader schema. Haivvreo has a couple of opportunities
 * to provide this, depending on whether we're simply selecting data or
 * running an MR job.
 * @return Reader schema for the Avro object, or null if it has not been provided.
 * @throws HaivvreoException
 */
private Schema getSchema(JobConf job, FileSplit split) throws HaivvreoException, IOException {
  FileSystem fs = split.getPath().getFileSystem(job);
  // Inside of a MR job, we can pull out the actual properties
  if (HaivvreoUtils.insideMRJob(job)) {
    MapWork mapWork = Utilities.getMapWork(job);
    // Iterate over the path -> partition descriptor map to find the partition
    // that matches our input split.
    for (Map.Entry<String, PartitionDesc> pathsAndParts : mapWork.getPathToPartitionInfo().entrySet()) {
      String partitionPath = pathsAndParts.getKey();
      if (pathIsInPartition(split.getPath().makeQualified(fs), partitionPath)) {
        if (LOG.isInfoEnabled())
          LOG.info("Matching partition " + partitionPath + " with input split " + split);
        Properties props = pathsAndParts.getValue().getProperties();
        if (props.containsKey(HaivvreoUtils.SCHEMA_LITERAL) || props.containsKey(HaivvreoUtils.SCHEMA_URL)) {
          return HaivvreoUtils.determineSchemaOrThrowException(job, props);
        } else {
          // If it's not in this property, it won't be in any others
          return null;
        }
      }
    }
    if (LOG.isInfoEnabled())
      LOG.info("Unable to match filesplit " + split + " with a partition.");
  }
  // In "select * from table" situations (non-MR), Haivvreo can add the schema
  // to the job. It's safe to read it from the job conf here since this isn't
  // *actually* a mapred job, so the global state is confined to this process.
  String s = job.get(AvroSerDe.HAIVVREO_SCHEMA);
  if (s != null) {
    LOG.info("Found the avro schema in the job: " + s);
    return Schema.parse(s);
  }
  // No more places to get the schema from. Give up. May have to re-encode later.
  return null;
}
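The pathIsInPartition helper referenced above is not shown in this snippet. A minimal sketch of what such a check could look like, assuming the partition path is matched as a prefix of the qualified split path (a reconstruction, not Haivvreo's verbatim code):
// Hedged sketch of the pathIsInPartition check used in getSchema, assuming a
// simple prefix match. If the split path carries no scheme, the scheme is
// stripped from the partition path before comparing.
private boolean pathIsInPartition(Path splitPath, String partitionPath) {
  boolean schemeless = splitPath.toUri().getScheme() == null;
  if (schemeless) {
    String path = new Path(partitionPath).toUri().getPath();
    return splitPath.toString().startsWith(path);
  }
  return splitPath.toString().startsWith(partitionPath);
}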
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class TestHCatMultiOutputFormat method getTableData.
/**
 * Method to fetch table data
 *
 * @param table table name
 * @param database database name
 * @return list of rows with columns in comma-separated form
 * @throws Exception if any error occurs
 */
private List<String> getTableData(String table, String database) throws Exception {
  QueryState queryState = new QueryState(null);
  HiveConf conf = queryState.getConf();
  conf.addResource("hive-site.xml");
  ArrayList<String> results = new ArrayList<String>();
  ArrayList<String> temp = new ArrayList<String>();
  Hive hive = Hive.get(conf);
  org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
  FetchWork work;
  if (!tbl.getPartCols().isEmpty()) {
    List<Partition> partitions = hive.getPartitions(tbl);
    List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
    List<Path> partLocs = new ArrayList<Path>();
    TableDesc tableDesc = Utilities.getTableDesc(tbl);
    for (Partition part : partitions) {
      partLocs.add(part.getDataLocation());
      partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
    }
    work = new FetchWork(partLocs, partDesc, tableDesc);
    work.setLimit(100);
  } else {
    work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
  }
  FetchTask task = new FetchTask();
  task.setWork(work);
  task.initialize(queryState, null, null, new CompilationOpContext());
  task.fetch(temp);
  for (String str : temp) {
    results.add(str.replace("\t", ","));
  }
  return results;
}
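A hypothetical call site for this helper inside a test assertion (table and database names here are illustrative, not from the original test):
// Hypothetical usage: fetch rows from a test table and compare against the
// expected comma-separated rows.
List<String> rows = getTableData("test_table", "default");
assertEquals(expectedRows.size(), rows.size());
assertTrue(rows.containsAll(expectedRows));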
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class VectorizedRowBatchCtx method getPartitionValues.
public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, Configuration hiveConf, FileSplit split, Object[] partitionValues) throws IOException {
  Map<Path, PartitionDesc> pathToPartitionInfo = Utilities.getMapWork(hiveConf).getPathToPartitionInfo();
  PartitionDesc partDesc = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, split.getPath(), IOPrepareCache.get().getPartitionDescMap());
  getPartitionValues(vrbCtx, partDesc, partitionValues);
}
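A plausible call site for this static helper, mirroring how a vectorized record reader might obtain the partition values for its split (this assumes VectorizedRowBatchCtx exposes the partition column count; conf and fileSplit are illustrative names):
// Hypothetical call site inside a vectorized record reader: size the array by
// the number of partition columns, then let the helper fill it from the
// partition spec that matches this split.
Object[] partitionValues = new Object[vrbCtx.getPartitionColumnCount()];
VectorizedRowBatchCtx.getPartitionValues(vrbCtx, conf, fileSplit, partitionValues);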
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class MapJoinProcessor method genMapJoinLocalWork.
/**
 * Generate the MapRed local work for the given map-join operator.
 *
 * @param newWork
 *          the MapredWork being rewritten
 * @param mapJoinOp
 *          map-join operator for which local work needs to be generated.
 * @param bigTablePos
 *          position of the big table among the join's parent operators
 * @throws SemanticException
 */
private static void genMapJoinLocalWork(MapredWork newWork, MapJoinOperator mapJoinOp, int bigTablePos) throws SemanticException {
  // keep the small table aliases to avoid a concurrent modification exception
  ArrayList<String> smallTableAliasList = new ArrayList<String>();
  // create a new MapredLocalWork
  MapredLocalWork newLocalWork = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : newWork.getMapWork().getAliasToWork().entrySet()) {
    String alias = entry.getKey();
    Operator<? extends OperatorDesc> op = entry.getValue();
    // if the table scan is for the big table, skip it;
    // trace down the operator tree from the table scan operator
    Operator<? extends OperatorDesc> parentOp = op;
    Operator<? extends OperatorDesc> childOp = op.getChildOperators().get(0);
    while ((childOp != null) && (!childOp.equals(mapJoinOp))) {
      parentOp = childOp;
      assert parentOp.getChildOperators().size() == 1;
      childOp = parentOp.getChildOperators().get(0);
    }
    if (childOp == null) {
      throw new SemanticException("Cannot find join op by tracing down the table scan operator tree");
    }
    // skip the big table position
    int i = childOp.getParentOperators().indexOf(parentOp);
    if (i == bigTablePos) {
      continue;
    }
    // set alias to work and put into smallTableAliasList
    newLocalWork.getAliasToWork().put(alias, op);
    smallTableAliasList.add(alias);
    // get the input paths and remove this alias from pathToAlias,
    // because this file will be fetched by the fetch operator
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = newWork.getMapWork().getPathToAliases();
    // record all the input paths for this alias
    HashSet<Path> pathSet = new HashSet<>();
    HashSet<Path> emptyPath = new HashSet<>();
    for (Map.Entry<Path, ArrayList<String>> entry2 : pathToAliases.entrySet()) {
      Path path = entry2.getKey();
      ArrayList<String> list = entry2.getValue();
      if (list.contains(alias)) {
        // add to the path set
        pathSet.add(path);
        // remove this alias from the alias list
        list.remove(alias);
        if (list.size() == 0) {
          emptyPath.add(path);
        }
      }
    }
    // remove the paths with which no alias is associated
    for (Path path : emptyPath) {
      newWork.getMapWork().removePathToAlias(path);
    }
    // create fetch work
    FetchWork fetchWork = null;
    List<Path> partDir = new ArrayList<Path>();
    List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
    for (Path tablePath : pathSet) {
      PartitionDesc partitionDesc = newWork.getMapWork().getPathToPartitionInfo().get(tablePath);
      // create fetch work for a non-partitioned table
      if (partitionDesc.getPartSpec() == null || partitionDesc.getPartSpec().size() == 0) {
        fetchWork = new FetchWork(tablePath, partitionDesc.getTableDesc());
        break;
      }
      // if the table is partitioned, add partDir and partitionDesc
      partDir.add(tablePath);
      partDesc.add(partitionDesc);
    }
    // create fetch work for a partitioned table
    if (fetchWork == null) {
      TableDesc table = newWork.getMapWork().getAliasToPartnInfo().get(alias).getTableDesc();
      fetchWork = new FetchWork(partDir, partDesc, table);
    }
    // set alias to fetch work
    newLocalWork.getAliasToFetchWork().put(alias, fetchWork);
  }
  // remove the small table aliases from aliasToWork to avoid concurrent modification
  for (String alias : smallTableAliasList) {
    newWork.getMapWork().getAliasToWork().remove(alias);
  }
  // set up local work
  newWork.getMapWork().setMapRedLocalWork(newLocalWork);
  // remove the reducer
  newWork.setReduceWork(null);
}
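The chain walk from each table scan down to the map-join could be factored into a helper; a sketch under the same single-child assumption the loop above makes (the helper name is hypothetical):
// Hypothetical helper isolating the walk above: follow the single-child chain
// from the table scan until the target map-join is found, returning the
// join's immediate parent, or null if the chain ends without reaching it.
private static Operator<? extends OperatorDesc> findParentOfJoin(Operator<? extends OperatorDesc> scanOp, MapJoinOperator joinOp) {
  Operator<? extends OperatorDesc> parent = scanOp;
  Operator<? extends OperatorDesc> child = scanOp.getChildOperators().get(0);
  while (child != null && !child.equals(joinOp)) {
    parent = child;
    assert parent.getChildOperators().size() == 1;
    child = parent.getChildOperators().get(0);
  }
  return child == null ? null : parent;
}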
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class GenMapRedUtils method createMRWorkForMergingFiles.
/**
 * Create a MapWork based on the input path, the top operator and the input
 * table descriptor.
 *
 * @param conf
 * @param topOp
 *          the table scan operator that is the root of the MapReduce task.
 * @param fsDesc
 *          the file sink descriptor that serves as the input to this merge task.
 * @return the MapWork for the merge task
 */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
  ArrayList<String> aliases = new ArrayList<String>();
  Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getFinalDirName());
  String inputDirStr = inputDir.toString().intern();
  TableDesc tblDesc = fsDesc.getTableInfo();
  // dummy alias: just use the input path
  aliases.add(inputDirStr);
  // construct the default MapredWork
  MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
  MapWork cplan = cMrPlan.getMapWork();
  cplan.addPathToAlias(inputDir, aliases);
  cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
  cplan.getAliasToWork().put(inputDirStr, topOp);
  cplan.setMapperCannotSpanPartns(true);
  return cplan;
}
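Passing null as the partition spec in new PartitionDesc(tblDesc, null) is what marks the merge input as unpartitioned; a short illustration of how downstream code (such as genMapJoinLocalWork above) distinguishes the two cases:
// Illustration: a PartitionDesc built with a null partition spec is treated
// as a non-partitioned input by consumers that test getPartSpec().
PartitionDesc pd = new PartitionDesc(tblDesc, null);
boolean unpartitioned = pd.getPartSpec() == null || pd.getPartSpec().isEmpty();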