
Example 26 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project haivvreo by jghoman.

the class AvroGenericRecordReader method getSchema.

/**
   * Attempt to retrieve the reader schema.  Haivvreo has a couple of opportunities
   * to provide this, depending on whether we're just selecting data
   * or running within an MR job.
   * @return  Reader schema for the Avro object, or null if it has not been provided.
   * @throws HaivvreoException
   */
private Schema getSchema(JobConf job, FileSplit split) throws HaivvreoException, IOException {
    FileSystem fs = split.getPath().getFileSystem(job);
    // Inside an MR job, we can pull out the actual properties
    if (HaivvreoUtils.insideMRJob(job)) {
        MapWork mapWork = Utilities.getMapWork(job);
        // Iterate over the path -> PartitionDesc entries to find the partition that matches our input split.
        for (Map.Entry<String, PartitionDesc> pathsAndParts : mapWork.getPathToPartitionInfo().entrySet()) {
            String partitionPath = pathsAndParts.getKey();
            if (pathIsInPartition(split.getPath().makeQualified(fs), partitionPath)) {
                if (LOG.isInfoEnabled())
                    LOG.info("Matching partition " + partitionPath + " with input split " + split);
                Properties props = pathsAndParts.getValue().getProperties();
                if (props.containsKey(HaivvreoUtils.SCHEMA_LITERAL) || props.containsKey(HaivvreoUtils.SCHEMA_URL)) {
                    return HaivvreoUtils.determineSchemaOrThrowException(job, props);
                } else
                    // If it's not in this property, it won't be in any others
                    return null;
            }
        }
        if (LOG.isInfoEnabled())
            LOG.info("Unable to match filesplit " + split + " with a partition.");
    }
    // In "select * from table" situations (non-MR), Haivvreo can add things to the job
    // It's safe to add this to the job since it's not *actually* a mapred job.
    // Here the global state is confined to just this process.
    String s = job.get(AvroSerDe.HAIVVREO_SCHEMA);
    if (s != null) {
        LOG.info("Found the avro schema in the job: " + s);
        return Schema.parse(s);
    }
    // No more places to get the schema from. Give up.  May have to re-encode later.
    return null;
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Properties(java.util.Properties) Map(java.util.Map)
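The schema returned here serves as Avro's reader (expected) schema. As a rough illustration, here is a minimal, hypothetical sketch (not haivvreo code; it assumes the same job and split variables and uses Avro's GenericDatumReader, GenericRecord, DataFileReader and FsInput classes) of how such a reader schema could be applied when opening the Avro container file:

Schema readerSchema = getSchema(job, split);
// If a reader schema was provided, Avro resolves the file's writer schema against it;
// otherwise the writer schema embedded in the file is used as-is.
GenericDatumReader<GenericRecord> datumReader = (readerSchema == null)
    ? new GenericDatumReader<GenericRecord>()
    : new GenericDatumReader<GenericRecord>(null, readerSchema);
DataFileReader<GenericRecord> fileReader =
    new DataFileReader<GenericRecord>(new FsInput(split.getPath(), job), datumReader);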

Example 27 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class TestHCatMultiOutputFormat method getTableData.

/**
   * Method to fetch table data
   *
   * @param table table name
   * @param database database name
   * @return list of rows, with column values comma-separated
   * @throws Exception if any error occurs
   */
private List<String> getTableData(String table, String database) throws Exception {
    QueryState queryState = new QueryState(null);
    HiveConf conf = queryState.getConf();
    conf.addResource("hive-site.xml");
    ArrayList<String> results = new ArrayList<String>();
    ArrayList<String> temp = new ArrayList<String>();
    Hive hive = Hive.get(conf);
    org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
    FetchWork work;
    if (!tbl.getPartCols().isEmpty()) {
        List<Partition> partitions = hive.getPartitions(tbl);
        List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
        List<Path> partLocs = new ArrayList<Path>();
        TableDesc tableDesc = Utilities.getTableDesc(tbl);
        for (Partition part : partitions) {
            partLocs.add(part.getDataLocation());
            partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
        }
        work = new FetchWork(partLocs, partDesc, tableDesc);
        work.setLimit(100);
    } else {
        work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
    }
    FetchTask task = new FetchTask();
    task.setWork(work);
    task.initialize(queryState, null, null, new CompilationOpContext());
    task.fetch(temp);
    for (String str : temp) {
        results.add(str.replace("\t", ","));
    }
    return results;
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) ArrayList(java.util.ArrayList) QueryState(org.apache.hadoop.hive.ql.QueryState) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) Hive(org.apache.hadoop.hive.ql.metadata.Hive) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) HiveConf(org.apache.hadoop.hive.conf.HiveConf) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
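A hypothetical usage of this helper from a test method (the table and database names are made up, and the JUnit Assert calls are only illustrative):

List<String> rows = getTableData("employee", "default");
// each entry is one row, with tab-separated columns rewritten as comma-separated values
Assert.assertFalse(rows.isEmpty());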

Example 28 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class VectorizedRowBatchCtx method getPartitionValues.

public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, Configuration hiveConf, FileSplit split, Object[] partitionValues) throws IOException {
    Map<Path, PartitionDesc> pathToPartitionInfo = Utilities.getMapWork(hiveConf).getPathToPartitionInfo();
    PartitionDesc partDesc = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, split.getPath(), IOPrepareCache.get().getPartitionDescMap());
    getPartitionValues(vrbCtx, partDesc, partitionValues);
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)
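A hypothetical sketch (not Hive code) of how a vectorized record reader might combine this helper with the other VectorizedRowBatchCtx methods (getPartitionColumnCount, createVectorizedRowBatch, addPartitionColsToBatch) to stamp the partition values into each batch:

static VectorizedRowBatch newBatchWithPartitionCols(VectorizedRowBatchCtx rbCtx, JobConf jobConf, FileSplit split) throws IOException {
    // resolve the partition values for this split's partition ...
    Object[] partitionValues = new Object[rbCtx.getPartitionColumnCount()];
    VectorizedRowBatchCtx.getPartitionValues(rbCtx, jobConf, split, partitionValues);
    // ... and copy them into the partition columns of a fresh batch
    VectorizedRowBatch batch = rbCtx.createVectorizedRowBatch();
    rbCtx.addPartitionColsToBatch(batch, partitionValues);
    return batch;
}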

Example 29 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class MapJoinProcessor method genMapJoinLocalWork.

/**
   * Generate the MapRed Local Work for the given map-join operator
   *
   * @param newWork
   * @param mapJoinOp
   *          map-join operator for which local work needs to be generated.
   * @param bigTablePos
   * @throws SemanticException
   */
private static void genMapJoinLocalWork(MapredWork newWork, MapJoinOperator mapJoinOp, int bigTablePos) throws SemanticException {
    // keep the small table alias to avoid concurrent modification exception
    ArrayList<String> smallTableAliasList = new ArrayList<String>();
    // create a new  MapredLocalWork
    MapredLocalWork newLocalWork = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : newWork.getMapWork().getAliasToWork().entrySet()) {
        String alias = entry.getKey();
        Operator<? extends OperatorDesc> op = entry.getValue();
        // if the table scan is for the big table, then skip it
        // trace down the operator tree from the table scan operator
        Operator<? extends OperatorDesc> parentOp = op;
        Operator<? extends OperatorDesc> childOp = op.getChildOperators().get(0);
        while ((childOp != null) && (!childOp.equals(mapJoinOp))) {
            parentOp = childOp;
            assert parentOp.getChildOperators().size() == 1;
            childOp = parentOp.getChildOperators().get(0);
        }
        if (childOp == null) {
            throw new SemanticException("Cannot find join op by tracing down the table scan operator tree");
        }
        // skip the big table pos
        int i = childOp.getParentOperators().indexOf(parentOp);
        if (i == bigTablePos) {
            continue;
        }
        // set alias to work and put into smallTableAliasList
        newLocalWork.getAliasToWork().put(alias, op);
        smallTableAliasList.add(alias);
        // get input path and remove this alias from pathToAlias
        // because this file will be fetched by fetch operator
        LinkedHashMap<Path, ArrayList<String>> pathToAliases = newWork.getMapWork().getPathToAliases();
        // keep record all the input path for this alias
        HashSet<Path> pathSet = new HashSet<>();
        HashSet<Path> emptyPath = new HashSet<>();
        for (Map.Entry<Path, ArrayList<String>> entry2 : pathToAliases.entrySet()) {
            Path path = entry2.getKey();
            ArrayList<String> list = entry2.getValue();
            if (list.contains(alias)) {
                // add to path set
                pathSet.add(path);
                //remove this alias from the alias list
                list.remove(alias);
                if (list.size() == 0) {
                    emptyPath.add(path);
                }
            }
        }
        //remove the path, with which no alias associates
        for (Path path : emptyPath) {
            newWork.getMapWork().removePathToAlias(path);
        }
        // create fetch work
        FetchWork fetchWork = null;
        List<Path> partDir = new ArrayList<Path>();
        List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
        for (Path tablePath : pathSet) {
            PartitionDesc partitionDesc = newWork.getMapWork().getPathToPartitionInfo().get(tablePath);
            // create fetchwork for non partitioned table
            if (partitionDesc.getPartSpec() == null || partitionDesc.getPartSpec().size() == 0) {
                fetchWork = new FetchWork(tablePath, partitionDesc.getTableDesc());
                break;
            }
            // if the table is partitioned, add partDir and partitionDesc
            partDir.add(tablePath);
            partDesc.add(partitionDesc);
        }
        // create fetchwork for partitioned table
        if (fetchWork == null) {
            TableDesc table = newWork.getMapWork().getAliasToPartnInfo().get(alias).getTableDesc();
            fetchWork = new FetchWork(partDir, partDesc, table);
        }
        // set alias to fetch work
        newLocalWork.getAliasToFetchWork().put(alias, fetchWork);
    }
    // remove small table aliases from aliasToWork to avoid concurrent modification
    for (String alias : smallTableAliasList) {
        newWork.getMapWork().getAliasToWork().remove(alias);
    }
    // set up local work
    newWork.getMapWork().setMapRedLocalWork(newLocalWork);
    // remove reducer
    newWork.setReduceWork(null);
}
Also used : LateralViewJoinOperator(org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ScriptOperator(org.apache.hadoop.hive.ql.exec.ScriptOperator) Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HashSet(java.util.HashSet)
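The heart of this method is the walk from each table scan down to the map-join operator. A hypothetical standalone helper (not Hive code) that captures just that tracing step, returning the join's immediate parent on a branch, or null if the join is never reached:

static Operator<? extends OperatorDesc> findParentOfMapJoin(Operator<? extends OperatorDesc> tableScan, MapJoinOperator mapJoinOp) {
    Operator<? extends OperatorDesc> parent = tableScan;
    Operator<? extends OperatorDesc> child = tableScan.getChildOperators().get(0);
    // follow the single-child chain until the map-join (or the end of the branch) is reached
    while (child != null && !child.equals(mapJoinOp)) {
        parent = child;
        child = parent.getChildOperators().isEmpty() ? null : parent.getChildOperators().get(0);
    }
    return child == null ? null : parent;
}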

Example 30 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMapRedUtils method createMRWorkForMergingFiles.

/**
   * Create the MapWork for a file-merge task, based on the input path, the top
   * operator and the input table descriptor.
   *
   * @param conf
   * @param topOp
   *          the table scan operator that is the root of the merge task.
   * @param fsDesc
   *          the file sink descriptor that serves as the input to this merge task.
   * @return the MapWork for the merge task
   */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
    ArrayList<String> aliases = new ArrayList<String>();
    Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getFinalDirName());
    String inputDirStr = inputDir.toString().intern();
    TableDesc tblDesc = fsDesc.getTableInfo();
    // dummy alias: just use the input path
    aliases.add(inputDirStr);
    // constructing the default MapredWork
    MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
    MapWork cplan = cMrPlan.getMapWork();
    cplan.addPathToAlias(inputDir, aliases);
    cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
    cplan.getAliasToWork().put(inputDirStr, topOp);
    cplan.setMapperCannotSpanPartns(true);
    return cplan;
}
Also used : Path(org.apache.hadoop.fs.Path) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
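Since the dummy alias is just the input directory string, the three plan maps this method fills are all keyed consistently by that path. A hypothetical caller-side sketch (variable names are illustrative) of the wiring it produces:

MapWork merge = createMRWorkForMergingFiles(conf, tableScanOp, fileSinkDesc);
Path inputDir = fileSinkDesc.getFinalDirName();
// the dummy alias is the input directory string itself
assert merge.getPathToAliases().get(inputDir).contains(inputDir.toString());
// the PartitionDesc wraps the table descriptor of the files being merged, with no partition spec
assert merge.getPathToPartitionInfo().get(inputDir).getTableDesc() == fileSinkDesc.getTableInfo();
// the table scan operator processes the merge input under that alias
assert merge.getAliasToWork().get(inputDir.toString()) == tableScanOp;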

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc) 58
Path (org.apache.hadoop.fs.Path) 47
ArrayList (java.util.ArrayList) 31
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 27
LinkedHashMap (java.util.LinkedHashMap) 19
HashMap (java.util.HashMap) 14
Map (java.util.Map) 13
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 13
JobConf (org.apache.hadoop.mapred.JobConf) 13
IOException (java.io.IOException) 11
Properties (java.util.Properties) 10
Operator (org.apache.hadoop.hive.ql.exec.Operator) 10
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 10
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 10
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork) 10
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 8
Configuration (org.apache.hadoop.conf.Configuration) 7
FileSystem (org.apache.hadoop.fs.FileSystem) 7
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator) 7
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 7