Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class CombineHiveRecordReader, method extractSinglePartSpec.
private PartitionDesc extractSinglePartSpec(CombineHiveInputSplit hsplit) throws IOException {
  PartitionDesc part = null;
  Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = new HashMap<>();
  for (Path path : hsplit.getPaths()) {
    PartitionDesc otherPart = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
        pathToPartInfo, path, cache);
    LOG.debug("Found spec for " + path + " " + otherPart + " from " + pathToPartInfo);
    if (part == null) {
      part = otherPart;
    } else if (otherPart != part) {
      // Assume we should have the exact same object.
      // TODO: we could also compare the schema and SerDe, and pass only those to the call
      //       instead; most of the time these would be the same and LLAP IO can handle that.
      LOG.warn("Multiple partitions found; not going to pass a part spec to LLAP IO: {"
          + part.getPartSpec() + "} and {" + otherPart.getPartSpec() + "}");
      return null;
    }
  }
  return part;
}
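Note that the comparison above is reference identity, not equals(): LLAP IO only receives a partition spec when every path in the combined split resolves to the very same PartitionDesc instance. A minimal sketch of that contract; the map contents and paths are hypothetical:

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;

Map<Path, PartitionDesc> pathToPartInfo = new HashMap<>();
PartitionDesc shared = new PartitionDesc(Utilities.defaultTd, null);
// Two files of the same partition share one PartitionDesc object, so the
// identity check (otherPart != part) passes for a split covering both.
pathToPartInfo.put(new Path("/warehouse/t/part=1/file0"), shared);
pathToPartInfo.put(new Path("/warehouse/t/part=1/file1"), shared);
// A second partition contributes a distinct object; a combined split spanning
// both partitions would make extractSinglePartSpec return null.
pathToPartInfo.put(new Path("/warehouse/t/part=2/file0"),
    new PartitionDesc(Utilities.defaultTd, null));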
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class SymbolicInputFormat, method rework.
public void rework(HiveConf job, MapredWork work) throws IOException {
  Map<Path, PartitionDesc> pathToParts = work.getMapWork().getPathToPartitionInfo();
  List<Path> toRemovePaths = new ArrayList<>();
  Map<Path, PartitionDesc> toAddPathToPart = new HashMap<>();
  Map<Path, ArrayList<String>> pathToAliases = work.getMapWork().getPathToAliases();
  for (Map.Entry<Path, PartitionDesc> pathPartEntry : pathToParts.entrySet()) {
    Path path = pathPartEntry.getKey();
    PartitionDesc partDesc = pathPartEntry.getValue();
    // this path points to a symlink path
    if (partDesc.getInputFileFormatClass().equals(SymlinkTextInputFormat.class)) {
      // change to TextInputFormat
      partDesc.setInputFileFormatClass(TextInputFormat.class);
      FileSystem fileSystem = path.getFileSystem(job);
      FileStatus fStatus = fileSystem.getFileStatus(path);
      FileStatus[] symlinks = null;
      if (!fStatus.isDir()) {
        symlinks = new FileStatus[] { fStatus };
      } else {
        symlinks = fileSystem.listStatus(path, FileUtils.HIDDEN_FILES_PATH_FILTER);
      }
      toRemovePaths.add(path);
      ArrayList<String> aliases = pathToAliases.remove(path);
      for (FileStatus symlink : symlinks) {
        BufferedReader reader = null;
        try {
          reader = new BufferedReader(new InputStreamReader(fileSystem.open(symlink.getPath())));
          partDesc.setInputFileFormatClass(TextInputFormat.class);
          String line;
          while ((line = reader.readLine()) != null) {
            // no check for the line? How to check?
            // if the line is invalid for any reason, the job will fail.
            FileStatus[] matches = fileSystem.globStatus(new Path(line));
            for (FileStatus fileStatus : matches) {
              Path schemaLessPath = Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath());
              StringInternUtils.internUriStringsInPath(schemaLessPath);
              toAddPathToPart.put(schemaLessPath, partDesc);
              pathToAliases.put(schemaLessPath, aliases);
            }
          }
        } finally {
          org.apache.hadoop.io.IOUtils.closeStream(reader);
        }
      }
    }
  }
  for (Map.Entry<Path, PartitionDesc> toAdd : toAddPathToPart.entrySet()) {
    work.getMapWork().addPathToPartitionInfo(toAdd.getKey(), toAdd.getValue());
  }
  for (Path toRemove : toRemovePaths) {
    work.getMapWork().removePathToPartitionInfo(toRemove);
  }
}
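Each symlink file read above is a plain-text manifest whose lines are data locations, possibly globs. A minimal self-contained sketch of that expansion step, using try-with-resources in place of the explicit closeStream(); the helper name and paths are hypothetical, not part of Hive:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: expand one symlink manifest into scheme-less data paths,
// one glob per line.
static List<Path> expandSymlinkManifest(FileSystem fs, Path manifest) throws IOException {
  List<Path> result = new ArrayList<>();
  try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(fs.open(manifest), StandardCharsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
      // globStatus returns null when a non-glob path does not exist; skip such
      // lines here, whereas rework() itself performs no check and would fail.
      FileStatus[] matches = fs.globStatus(new Path(line.trim()));
      if (matches == null) {
        continue;
      }
      for (FileStatus match : matches) {
        result.add(Path.getPathWithoutSchemeAndAuthority(match.getPath()));
      }
    }
  }
  return result;
}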
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class HiveFileFormatUtils, method populateNewPartitionDesc.
private static Map<Path, PartitionDesc> populateNewPartitionDesc(
    Map<Path, PartitionDesc> pathToPartitionInfo) {
  Map<Path, PartitionDesc> newPathToPartitionInfo = new HashMap<>();
  for (Map.Entry<Path, PartitionDesc> entry : pathToPartitionInfo.entrySet()) {
    PartitionDesc partDesc = entry.getValue();
    Path pathOnly = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
    newPathToPartitionInfo.put(pathOnly, partDesc);
  }
  return newPathToPartitionInfo;
}
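A quick illustration of what the scheme-and-authority stripping buys, with a hypothetical path: a key stored as a full URI and a key built from a bare path compare equal after normalization.

import org.apache.hadoop.fs.Path;

Path full = new Path("hdfs://namenode:8020/warehouse/t/part=1");
Path bare = Path.getPathWithoutSchemeAndAuthority(full);
// bare.toString() is "/warehouse/t/part=1", so a scheme-less lookup key
// built from a split path can now hit this map entry.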
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class HiveFileFormatUtils, method getPartitionDescFromPathRecursively.
public static PartitionDesc getPartitionDescFromPathRecursively(
    Map<Path, PartitionDesc> pathToPartitionInfo, Path dir,
    Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cacheMap,
    boolean ignoreSchema) throws IOException {
  PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir);
  if (part == null
      && (ignoreSchema
          || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim().equals(""))
          || FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) {
    Map<Path, PartitionDesc> newPathToPartitionInfo = null;
    if (cacheMap != null) {
      newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
    }
    if (newPathToPartitionInfo == null) {
      // still null: build the scheme-less map and cache it for later calls
      newPathToPartitionInfo = populateNewPartitionDesc(pathToPartitionInfo);
      if (cacheMap != null) {
        cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
      }
    }
    part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
  }
  if (part != null) {
    return part;
  } else {
    throw new IOException("cannot find dir = " + dir.toString()
        + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet());
  }
}
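A usage sketch, with hypothetical map contents and paths, inside a method that throws IOException: the direct lookup misses because the split path carries a scheme while the plan key does not, and the scheme-less fallback then succeeds; cacheMap memoizes the normalized map across calls.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;

PartitionDesc partDesc = new PartitionDesc(Utilities.defaultTd, null);
Map<Path, PartitionDesc> pathToPartitionInfo = new HashMap<>();
pathToPartitionInfo.put(new Path("/warehouse/t/part=1"), partDesc); // scheme-less plan key
Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = new HashMap<>();
PartitionDesc found = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
    pathToPartitionInfo,
    new Path("hdfs://namenode:8020/warehouse/t/part=1"), // split path with a scheme
    cache,
    false); // ignoreSchema not needed here: the plan keys already contain no scheme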
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class TestPlan, method testPlan.
public void testPlan() throws Exception {
  final String F1 = "#affiliations";
  final String F2 = "friends[0].friendid";
  try {
    // initialize a complete map reduce configuration
    ExprNodeDesc expr1 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, F1, "", false);
    ExprNodeDesc expr2 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, F2, "", false);
    ExprNodeDesc filterExpr =
        TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("==", expr1, expr2);
    FilterDesc filterCtx = new FilterDesc(filterExpr, false);
    Operator<FilterDesc> op = OperatorFactory.get(new CompilationOpContext(), FilterDesc.class);
    op.setConf(filterCtx);
    ArrayList<String> aliasList = new ArrayList<>();
    aliasList.add("a");
    LinkedHashMap<Path, ArrayList<String>> pa = new LinkedHashMap<>();
    pa.put(new Path("/tmp/testfolder"), aliasList);
    TableDesc tblDesc = Utilities.defaultTd;
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder"), partDesc);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> ao = new LinkedHashMap<>();
    ao.put("a", op);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToAliases(pa);
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    mrwork.getMapWork().setAliasToWork(ao);
    JobConf job = new JobConf(TestPlan.class);
    // serialize the configuration once ..
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    SerializationUtilities.serializePlan(mrwork, baos);
    baos.close();
    String v1 = baos.toString();
    // store into configuration
    job.set("fs.default.name", "file:///");
    Utilities.setMapRedWork(job, mrwork, new Path(System.getProperty("java.io.tmpdir")
        + File.separator + System.getProperty("user.name") + File.separator + "hive"));
    MapredWork mrwork2 = Utilities.getMapRedWork(job);
    Utilities.clearWork(job);
    // over here we should have some checks of the deserialized object against
    // the original object
    // System.out.println(v1);
    // serialize again
    baos.reset();
    SerializationUtilities.serializePlan(mrwork2, baos);
    baos.close();
    // verify that the two are equal
    assertEquals(v1, baos.toString());
  } catch (Exception excp) {
    excp.printStackTrace();
    throw excp;
  }
  System.out.println("Serialization/Deserialization of plan successful");
}
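The same stability check can be factored into a standalone helper, sketched below. The deserializePlan(InputStream, Class) call is assumed from its use elsewhere in Hive; treat the exact signature as an assumption rather than a confirmed API.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.plan.MapredWork;

// Hypothetical helper, not part of TestPlan: serialize, round-trip through a
// byte stream, re-serialize, and require byte-for-byte identical output.
static void assertPlanRoundTrips(MapredWork work) {
  ByteArrayOutputStream first = new ByteArrayOutputStream();
  SerializationUtilities.serializePlan(work, first);
  MapredWork copy = SerializationUtilities.deserializePlan(
      new ByteArrayInputStream(first.toByteArray()), MapredWork.class); // assumed signature
  ByteArrayOutputStream second = new ByteArrayOutputStream();
  SerializationUtilities.serializePlan(copy, second);
  if (!first.toString().equals(second.toString())) {
    throw new AssertionError("plan serialization is not stable");
  }
}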