
Example 46 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

Class CombineHiveRecordReader, method extractSinglePartSpec.

private PartitionDesc extractSinglePartSpec(CombineHiveInputSplit hsplit) throws IOException {
    PartitionDesc part = null;
    Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = new HashMap<>();
    for (Path path : hsplit.getPaths()) {
        PartitionDesc otherPart = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartInfo, path, cache);
        LOG.debug("Found spec for " + path + " " + otherPart + " from " + pathToPartInfo);
        if (part == null) {
            part = otherPart;
        } else if (otherPart != part) {
            // Assume we should have the exact same object.
            // TODO: we could also compare the schema and SerDe, and pass only those to the call
            //       instead; most of the time these would be the same and LLAP IO can handle that.
            LOG.warn("Multiple partitions found; not going to pass a part spec to LLAP IO: {" + part.getPartSpec() + "} and {" + otherPart.getPartSpec() + "}");
            return null;
        }
    }
    return part;
}
Also used: Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Map(java.util.Map)
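
The check above compares PartitionDesc objects by identity (otherPart != part), trusting the inline comment's assumption that every path of one partition maps to the exact same object. Below is a minimal, self-contained sketch of that assumption; the paths and descriptors are made up rather than taken from a real CombineHiveInputSplit.

import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;

public class SinglePartSpecSketch {
    public static void main(String[] args) {
        // Hypothetical pathToPartInfo: two buckets of one partition share a single
        // PartitionDesc instance, a third path belongs to another partition.
        PartitionDesc partA = new PartitionDesc(Utilities.defaultTd, null);
        PartitionDesc partB = new PartitionDesc(Utilities.defaultTd, null);
        Map<Path, PartitionDesc> pathToPartInfo = new LinkedHashMap<>();
        pathToPartInfo.put(new Path("/warehouse/t/ds=1/bucket_0"), partA);
        pathToPartInfo.put(new Path("/warehouse/t/ds=1/bucket_1"), partA);
        pathToPartInfo.put(new Path("/warehouse/t/ds=2/bucket_0"), partB);
        // extractSinglePartSpec compares by object identity, so the first two paths
        // count as one partition spec, while partB does not, even though partA and
        // partB were built from the same table descriptor.
        System.out.println(pathToPartInfo.get(new Path("/warehouse/t/ds=1/bucket_0"))
            == pathToPartInfo.get(new Path("/warehouse/t/ds=1/bucket_1"))); // true
        System.out.println(pathToPartInfo.get(new Path("/warehouse/t/ds=1/bucket_0"))
            == pathToPartInfo.get(new Path("/warehouse/t/ds=2/bucket_0"))); // false
    }
}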

Example 47 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

Class SymbolicInputFormat, method rework.

public void rework(HiveConf job, MapredWork work) throws IOException {
    Map<Path, PartitionDesc> pathToParts = work.getMapWork().getPathToPartitionInfo();
    List<Path> toRemovePaths = new ArrayList<>();
    Map<Path, PartitionDesc> toAddPathToPart = new HashMap<>();
    Map<Path, ArrayList<String>> pathToAliases = work.getMapWork().getPathToAliases();
    for (Map.Entry<Path, PartitionDesc> pathPartEntry : pathToParts.entrySet()) {
        Path path = pathPartEntry.getKey();
        PartitionDesc partDesc = pathPartEntry.getValue();
        // this path points to a symlink path
        if (partDesc.getInputFileFormatClass().equals(SymlinkTextInputFormat.class)) {
            // change to TextInputFormat
            partDesc.setInputFileFormatClass(TextInputFormat.class);
            FileSystem fileSystem = path.getFileSystem(job);
            FileStatus fStatus = fileSystem.getFileStatus(path);
            FileStatus[] symlinks = null;
            if (!fStatus.isDir()) {
                symlinks = new FileStatus[] { fStatus };
            } else {
                symlinks = fileSystem.listStatus(path, FileUtils.HIDDEN_FILES_PATH_FILTER);
            }
            toRemovePaths.add(path);
            ArrayList<String> aliases = pathToAliases.remove(path);
            for (FileStatus symlink : symlinks) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fileSystem.open(symlink.getPath())));
                    partDesc.setInputFileFormatClass(TextInputFormat.class);
                    String line;
                    while ((line = reader.readLine()) != null) {
                        // There is no validation of the line; if it is invalid
                        // for any reason, the job will fail.
                        FileStatus[] matches = fileSystem.globStatus(new Path(line));
                        for (FileStatus fileStatus : matches) {
                            Path schemaLessPath = Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath());
                            StringInternUtils.internUriStringsInPath(schemaLessPath);
                            toAddPathToPart.put(schemaLessPath, partDesc);
                            pathToAliases.put(schemaLessPath, aliases);
                        }
                    }
                } finally {
                    org.apache.hadoop.io.IOUtils.closeStream(reader);
                }
            }
        }
    }
    for (Entry<Path, PartitionDesc> toAdd : toAddPathToPart.entrySet()) {
        work.getMapWork().addPathToPartitionInfo(toAdd.getKey(), toAdd.getValue());
    }
    for (Path toRemove : toRemovePaths) {
        work.getMapWork().removePathToPartitionInfo(toRemove);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) BufferedReader(java.io.BufferedReader) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Map(java.util.Map)
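
The heart of rework() is the inner loop: each line of a symlink file is treated as a glob, expanded with globStatus, and re-keyed by its scheme-less path. Here is a small hedged sketch of just that expansion step against a local file; the symlink file path and its contents are hypothetical.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SymlinkExpandSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical symlink file: each line names (or globs) the real data files.
        Path symlink = new Path("file:///tmp/symlink_dir/link.txt");
        FileSystem fs = symlink.getFileSystem(conf);
        try (BufferedReader reader =
                 new BufferedReader(new InputStreamReader(fs.open(symlink)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // globStatus returns null when the pattern has no glob characters
                // and the path does not exist; rework() above assumes valid lines.
                FileStatus[] matches = fs.globStatus(new Path(line));
                if (matches == null) {
                    continue;
                }
                for (FileStatus match : matches) {
                    // Strip scheme/authority exactly as rework() does, so the
                    // MapWork path maps use the same keys the split lookup uses.
                    Path keyPath = Path.getPathWithoutSchemeAndAuthority(match.getPath());
                    System.out.println(line + " -> " + keyPath);
                }
            }
        }
    }
}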

Example 48 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

Class HiveFileFormatUtils, method populateNewPartitionDesc.

private static Map<Path, PartitionDesc> populateNewPartitionDesc(Map<Path, PartitionDesc> pathToPartitionInfo) {
    Map<Path, PartitionDesc> newPathToPartitionInfo = new HashMap<>();
    for (Map.Entry<Path, PartitionDesc> entry : pathToPartitionInfo.entrySet()) {
        PartitionDesc partDesc = entry.getValue();
        Path pathOnly = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
        newPathToPartitionInfo.put(pathOnly, partDesc);
    }
    return newPathToPartitionInfo;
}
Also used: Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)
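
populateNewPartitionDesc only rewrites the map keys; the PartitionDesc values are reused as-is. For reference, a tiny sketch (with a made-up path) of what Path.getPathWithoutSchemeAndAuthority strips from a fully qualified key:

import org.apache.hadoop.fs.Path;

public class StripSchemeSketch {
    public static void main(String[] args) {
        Path qualified = new Path("hdfs://namenode:8020/warehouse/t/ds=2021-01-01");
        // Drops "hdfs://namenode:8020", keeping only the path component.
        System.out.println(Path.getPathWithoutSchemeAndAuthority(qualified));
        // Prints: /warehouse/t/ds=2021-01-01
    }
}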

Example 49 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

Class HiveFileFormatUtils, method getPartitionDescFromPathRecursively.

public static PartitionDesc getPartitionDescFromPathRecursively(Map<Path, PartitionDesc> pathToPartitionInfo, Path dir, Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cacheMap, boolean ignoreSchema) throws IOException {
    PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir);
    if (part == null && (ignoreSchema || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim().equals("")) || FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) {
        Map<Path, PartitionDesc> newPathToPartitionInfo = null;
        if (cacheMap != null) {
            newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
        }
        if (newPathToPartitionInfo == null) {
            // still null
            newPathToPartitionInfo = populateNewPartitionDesc(pathToPartitionInfo);
            if (cacheMap != null) {
                cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
            }
        }
        part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
    }
    if (part != null) {
        return part;
    } else {
        throw new IOException("cannot find dir = " + dir.toString() + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet());
    }
}
Also used: Path(org.apache.hadoop.fs.Path) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) IOException(java.io.IOException)
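
A short caller-side sketch of the method above, with an invented pathToPartitionInfo: a dir that matches a key resolves to its PartitionDesc, an unmatched dir surfaces as an IOException rather than a null return, and the cacheMap (keyed by the whole pathToPartitionInfo map) lets repeated lookups reuse the scheme-stripped copy.

import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;

public class RecursiveLookupSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical plan state; in practice this comes from MapWork.getPathToPartitionInfo().
        Map<Path, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();
        pathToPartitionInfo.put(new Path("/warehouse/t/ds=1"),
            new PartitionDesc(Utilities.defaultTd, null));
        // Shared across lookups so the scheme-stripped copy is built at most once per plan map.
        Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = new HashMap<>();

        // A dir that matches a key resolves to its PartitionDesc.
        PartitionDesc hit = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
            pathToPartitionInfo, new Path("/warehouse/t/ds=1"), cache, true);
        System.out.println("hit: " + hit);

        // A dir with no matching key does not return null; it is reported as an IOException.
        try {
            HiveFileFormatUtils.getPartitionDescFromPathRecursively(
                pathToPartitionInfo, new Path("/warehouse/other/ds=1"), cache, true);
        } catch (IOException e) {
            System.out.println("miss: " + e.getMessage());
        }
    }
}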

Example 50 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

Class TestPlan, method testPlan.

public void testPlan() throws Exception {
    final String F1 = "#affiliations";
    final String F2 = "friends[0].friendid";
    try {
        // initialize a complete map reduce configuration
        ExprNodeDesc expr1 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, F1, "", false);
        ExprNodeDesc expr2 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, F2, "", false);
        ExprNodeDesc filterExpr = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("==", expr1, expr2);
        FilterDesc filterCtx = new FilterDesc(filterExpr, false);
        Operator<FilterDesc> op = OperatorFactory.get(new CompilationOpContext(), FilterDesc.class);
        op.setConf(filterCtx);
        ArrayList<String> aliasList = new ArrayList<String>();
        aliasList.add("a");
        LinkedHashMap<Path, ArrayList<String>> pa = new LinkedHashMap<>();
        pa.put(new Path("/tmp/testfolder"), aliasList);
        TableDesc tblDesc = Utilities.defaultTd;
        PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
        LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
        pt.put(new Path("/tmp/testfolder"), partDesc);
        LinkedHashMap<String, Operator<? extends OperatorDesc>> ao = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
        ao.put("a", op);
        MapredWork mrwork = new MapredWork();
        mrwork.getMapWork().setPathToAliases(pa);
        mrwork.getMapWork().setPathToPartitionInfo(pt);
        mrwork.getMapWork().setAliasToWork(ao);
        JobConf job = new JobConf(TestPlan.class);
        // serialize the configuration once ..
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        SerializationUtilities.serializePlan(mrwork, baos);
        baos.close();
        String v1 = baos.toString();
        // store into configuration
        job.set("fs.default.name", "file:///");
        Utilities.setMapRedWork(job, mrwork, new Path(System.getProperty("java.io.tmpdir") + File.separator + System.getProperty("user.name") + File.separator + "hive"));
        MapredWork mrwork2 = Utilities.getMapRedWork(job);
        Utilities.clearWork(job);
        // over here we should have some checks of the deserialized object against
        // the original object
        // System.out.println(v1);
        // serialize again
        baos.reset();
        SerializationUtilities.serializePlan(mrwork2, baos);
        baos.close();
        // verify that the two are equal
        assertEquals(v1, baos.toString());
    } catch (Exception excp) {
        excp.printStackTrace();
        throw excp;
    }
    System.out.println("Serialization/Deserialization of plan successful");
}
Also used: Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) ByteArrayOutputStream(java.io.ByteArrayOutputStream) LinkedHashMap(java.util.LinkedHashMap) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JobConf(org.apache.hadoop.mapred.JobConf)
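
The test round-trips the plan through a JobConf and compares the serialized strings. A minimal sketch of just the serialization step, assuming SerializationUtilities accepts an essentially empty MapredWork (TestPlan above serializes a fully populated one):

import java.io.ByteArrayOutputStream;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.plan.MapredWork;

public class PlanSerializationSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical empty plan; TestPlan above populates path-to-alias,
        // path-to-partition and alias-to-work maps before serializing.
        MapredWork work = new MapredWork();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        SerializationUtilities.serializePlan(work, baos);
        baos.close();
        // TestPlan's assertEquals relies on this serialization being deterministic:
        // serializing the round-tripped plan again must produce the same string.
        System.out.println("serialized plan: " + baos.size() + " bytes");
    }
}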

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 58 usages
Path (org.apache.hadoop.fs.Path): 47 usages
ArrayList (java.util.ArrayList): 31 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 27 usages
LinkedHashMap (java.util.LinkedHashMap): 19 usages
HashMap (java.util.HashMap): 14 usages
Map (java.util.Map): 13 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 13 usages
JobConf (org.apache.hadoop.mapred.JobConf): 13 usages
IOException (java.io.IOException): 11 usages
Properties (java.util.Properties): 10 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 10 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 10 usages
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 10 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8 usages
Configuration (org.apache.hadoop.conf.Configuration): 7 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 7 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 7 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 7 usages