Example 31 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in the Apache Hive project.

From class GenMapRedUtils, method setUnionPlan.

private static void setUnionPlan(GenMRProcContext opProcCtx, boolean local, Task<? extends Serializable> currTask, GenMRUnionCtx uCtx, boolean mergeTask) throws SemanticException {
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    if (currTopOp != null) {
        // A top operator is pending: splice it into the current task's plan,
        // unless this task has already consumed it and we are not merging.
        String currAliasId = opProcCtx.getCurrAliasId();
        if (mergeTask || !opProcCtx.isSeenOp(currTask, currTopOp)) {
            setTaskPlan(currAliasId, currTopOp, currTask, local, opProcCtx);
        }
        opProcCtx.setCurrTopOp(null);
    } else {
        // No pending top operator: register the union's intermediate tmp
        // dirs as map-work inputs, one alias per tmp dir.
        List<String> taskTmpDirLst = uCtx.getTaskTmpDir();
        if (taskTmpDirLst != null && !taskTmpDirLst.isEmpty()) {
            List<TableDesc> tt_descLst = uCtx.getTTDesc();
            assert !taskTmpDirLst.isEmpty() && !tt_descLst.isEmpty();
            assert taskTmpDirLst.size() == tt_descLst.size();
            int size = taskTmpDirLst.size();
            assert !local;
            List<TableScanOperator> topOperators = uCtx.getListTopOperators();
            MapredWork plan = (MapredWork) currTask.getWork();
            for (int pos = 0; pos < size; pos++) {
                String taskTmpDir = taskTmpDirLst.get(pos);
                TableDesc tt_desc = tt_descLst.get(pos);
                MapWork mWork = plan.getMapWork();
                if (mWork.getPathToAliases().get(taskTmpDir) == null) {
                    taskTmpDir = taskTmpDir.intern();
                    Path taskTmpDirPath = StringInternUtils.internUriStringsInPath(new Path(taskTmpDir));
                    mWork.removePathToAlias(taskTmpDirPath);
                    mWork.addPathToAlias(taskTmpDirPath, taskTmpDir);
                    mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null));
                    mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
                }
            }
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
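
The registration pattern above hinges on MapWork keeping parallel structures in sync for every input path: pathToAliases, pathToPartitionInfo, and aliasToWork. A minimal sketch of that pattern, using only APIs that appear in these examples and Utilities.defaultTd as a stand-in for the intermediate TableDesc (the class and method names below are my own, not Hive's):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class UnionTmpDirSketch {
    static void registerTmpDir(MapWork mWork, String taskTmpDir) {
        Path tmpPath = new Path(taskTmpDir);
        // Stand-in for the TableDesc describing the union's intermediate rows.
        TableDesc ttDesc = Utilities.defaultTd;
        // The tmp dir string doubles as its own alias, as in setUnionPlan.
        mWork.addPathToAlias(tmpPath, taskTmpDir);
        // A null partition spec registers the path as unpartitioned input.
        mWork.addPathToPartitionInfo(tmpPath, new PartitionDesc(ttDesc, null));
    }
}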

Example 32 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in the Apache Hive project.

From class GenMapRedUtils, method setTaskPlan.

/**
   * Set up the map work for the given alias and path.
   *
   * @param path
   *          the input path for the alias
   * @param alias
   *          current alias
   * @param topOp
   *          the top operator of the stack
   * @param plan
   *          current plan
   * @param local
   *          whether to add to map-reduce or local work
   * @param tt_desc
   *          table descriptor
   * @throws SemanticException
   */
public static void setTaskPlan(Path path, String alias, Operator<? extends OperatorDesc> topOp, MapWork plan, boolean local, TableDesc tt_desc) throws SemanticException {
    if (path == null || alias == null) {
        return;
    }
    if (topOp instanceof TableScanOperator) {
        try {
            Utilities.addSchemaEvolutionToTableScanOperator((StructObjectInspector) tt_desc.getDeserializer().getObjectInspector(), (TableScanOperator) topOp);
        } catch (Exception e) {
            throw new SemanticException(e);
        }
    }
    if (!local) {
        plan.addPathToAlias(path, alias);
        plan.addPathToPartitionInfo(path, new PartitionDesc(tt_desc, null));
        plan.getAliasToWork().put(alias, topOp);
    } else {
        // populate local work if needed
        MapredLocalWork localPlan = plan.getMapRedLocalWork();
        if (localPlan == null) {
            localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        }
        assert localPlan.getAliasToWork().get(alias) == null;
        assert localPlan.getAliasToFetchWork().get(alias) == null;
        localPlan.getAliasToWork().put(alias, topOp);
        localPlan.getAliasToFetchWork().put(alias, new FetchWork(new Path(alias), tt_desc));
        plan.setMapRedLocalWork(localPlan);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LinkedHashMap(java.util.LinkedHashMap)
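
A hedged usage sketch of the non-local branch (my own, not from Hive): passing null for topOp keeps the example self-contained, where a real caller would supply the TableScanOperator at the top of the operator stack; the path and alias are invented.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.MapWork;

public class SetTaskPlanSketch {
    public static void main(String[] args) throws SemanticException {
        MapWork plan = new MapWork();
        Path scanPath = new Path("/warehouse/t1"); // hypothetical path
        // local=false takes the map-reduce branch: path -> alias -> PartitionDesc.
        GenMapRedUtils.setTaskPlan(scanPath, "t1", null, plan, false, Utilities.defaultTd);
        // The path now resolves to a PartitionDesc built from defaultTd.
        System.out.println(plan.getPathToPartitionInfo().get(scanPath));
    }
}

With local=true the same call would instead bind the alias to a FetchWork inside the plan's MapredLocalWork, as the method body above shows.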

Example 33 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in the Apache Hive project.

From class TestInputOutputFormat, method createMockExecutionEnvironment.

/**
   * Create a mock execution environment with enough detail that
   * ORC, vectorization, HiveInputFormat, and CombineHiveInputFormat don't
   * explode.
   * @param workDir a local filesystem work directory
   * @param warehouseDir a mock filesystem warehouse directory
   * @param tableName the table name
   * @param objectInspector object inspector for the row
   * @param isVectorized whether the job should run vectorized
   * @param partitions the number of partitions to create
   * @return a JobConf that contains the necessary information
   * @throws IOException
   * @throws HiveException
   */
JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, String tableName, ObjectInspector objectInspector, boolean isVectorized, int partitions) throws IOException, HiveException {
    JobConf conf = new JobConf();
    Utilities.clearWorkMap(conf);
    conf.set("hive.exec.plan", workDir.toString());
    conf.set("mapred.job.tracker", "local");
    String isVectorizedString = Boolean.toString(isVectorized);
    conf.set("hive.vectorized.execution.enabled", isVectorizedString);
    conf.set(Utilities.VECTOR_MODE, isVectorizedString);
    conf.set(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, isVectorizedString);
    conf.set("fs.mock.impl", MockFileSystem.class.getName());
    conf.set("mapred.mapper.class", ExecMapper.class.getName());
    Path root = new Path(warehouseDir, tableName);
    // clean out previous contents
    ((MockFileSystem) root.getFileSystem(conf)).clear();
    // build partition strings
    String[] partPath = new String[partitions];
    StringBuilder buffer = new StringBuilder();
    for (int p = 0; p < partitions; ++p) {
        partPath[p] = new Path(root, "p=" + p).toString();
        if (p != 0) {
            buffer.append(',');
        }
        buffer.append(partPath[p]);
    }
    conf.set("mapred.input.dir", buffer.toString());
    StringBuilder columnIds = new StringBuilder();
    StringBuilder columnNames = new StringBuilder();
    StringBuilder columnTypes = new StringBuilder();
    StructObjectInspector structOI = (StructObjectInspector) objectInspector;
    List<? extends StructField> fields = structOI.getAllStructFieldRefs();
    int numCols = fields.size();
    for (int i = 0; i < numCols; ++i) {
        if (i != 0) {
            columnIds.append(',');
            columnNames.append(',');
            columnTypes.append(',');
        }
        columnIds.append(i);
        columnNames.append(fields.get(i).getFieldName());
        columnTypes.append(fields.get(i).getFieldObjectInspector().getTypeName());
    }
    conf.set("hive.io.file.readcolumn.ids", columnIds.toString());
    conf.set("partition_columns", "p");
    conf.set(serdeConstants.LIST_COLUMNS, columnNames.toString());
    conf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypes.toString());
    MockFileSystem fs = (MockFileSystem) warehouseDir.getFileSystem(conf);
    fs.clear();
    Properties tblProps = new Properties();
    tblProps.put("name", tableName);
    tblProps.put("serialization.lib", OrcSerde.class.getName());
    tblProps.put("columns", columnNames.toString());
    tblProps.put("columns.types", columnTypes.toString());
    TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps);
    MapWork mapWork = new MapWork();
    mapWork.setVectorMode(isVectorized);
    if (isVectorized) {
        VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx();
        vectorizedRowBatchCtx.init(structOI, new String[0]);
        mapWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx);
    }
    mapWork.setUseBucketizedHiveInputFormat(false);
    LinkedHashMap<Path, ArrayList<String>> aliasMap = new LinkedHashMap<>();
    ArrayList<String> aliases = new ArrayList<String>();
    aliases.add(tableName);
    LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>();
    for (int p = 0; p < partitions; ++p) {
        Path path = new Path(partPath[p]);
        aliasMap.put(path, aliases);
        LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>();
        PartitionDesc part = new PartitionDesc(tbl, partSpec);
        if (isVectorized) {
            part.setVectorPartitionDesc(VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false));
        }
        partMap.put(path, part);
    }
    mapWork.setPathToAliases(aliasMap);
    mapWork.setPathToPartitionInfo(partMap);
    // write the plan out
    FileSystem localFs = FileSystem.getLocal(conf).getRaw();
    Path mapXml = new Path(workDir, "map.xml");
    localFs.delete(mapXml, true);
    FSDataOutputStream planStream = localFs.create(mapXml);
    SerializationUtilities.serializePlan(mapWork, planStream);
    conf.setBoolean(Utilities.HAS_MAP_WORK, true);
    planStream.close();
    return conf;
}
Also used: Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) Properties(java.util.Properties) LinkedHashMap(java.util.LinkedHashMap) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf) ExecMapper(org.apache.hadoop.hive.ql.exec.mr.ExecMapper) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
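
A hedged follow-up sketch (not part of TestInputOutputFormat): once the plan has been written out, the per-partition PartitionDesc entries can be read back through Utilities.getMapWork, the same call Example 35 below uses; PartitionDesc.getTableName() is assumed here for the printout.

import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.mapred.JobConf;

public class PlanReadBackSketch {
    static void dumpPartitionInfo(JobConf conf) {
        // Deserializes the MapWork serialized under the "hive.exec.plan" dir.
        MapWork mapWork = Utilities.getMapWork(conf);
        for (Map.Entry<Path, PartitionDesc> entry : mapWork.getPathToPartitionInfo().entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue().getTableName());
        }
    }
}

Called with the JobConf returned by createMockExecutionEnvironment, this would print one line per p=N partition directory.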

Example 34 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in the Apache Hive project.

From class TestSymlinkTextInputFormat, method setUp.

@Override
protected void setUp() throws IOException {
    conf = new Configuration();
    job = new JobConf(conf);
    TableDesc tblDesc = Utilities.defaultTd;
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder"), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Utilities.setMapRedWork(job, mrwork, new Path("/tmp/" + System.getProperty("user.name"), "hive"));
    fileSystem = FileSystem.getLocal(conf);
    testDir = new Path(System.getProperty("test.tmp.dir", System.getProperty("user.dir", new File(".").getAbsolutePath())) + "/TestSymlinkTextInputFormat");
    reporter = Reporter.NULL;
    fileSystem.delete(testDir, true);
    dataDir1 = new Path(testDir, "datadir1");
    dataDir2 = new Path(testDir, "datadir2");
    symlinkDir = new Path(testDir, "symlinkdir");
}
Also used: Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) File(java.io.File) LinkedHashMap(java.util.LinkedHashMap)
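
For context, a hedged sketch of the symlink convention this setUp prepares for (my own illustration, assuming SymlinkTextInputFormat's documented behavior): each file under the symlink directory is a plain text file whose lines are paths to the actual data files.

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SymlinkFileSketch {
    // Writes one "symlink" file listing the given data files, one path per line.
    static void writeSymlinkFile(FileSystem fs, Path symlinkDir, Path... dataFiles)
            throws IOException {
        Path link = new Path(symlinkDir, "symlink.txt"); // hypothetical file name
        try (OutputStreamWriter out =
                new OutputStreamWriter(fs.create(link), StandardCharsets.UTF_8)) {
            for (Path dataFile : dataFiles) {
                out.write(dataFile.toString() + "\n");
            }
        }
    }
}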

Example 35 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in the Apache Hive project.

From class TestCombineHiveInputFormat, method testAvoidSplitCombination.

public void testAvoidSplitCombination() throws Exception {
    Configuration conf = new Configuration();
    JobConf job = new JobConf(conf);
    TableDesc tblDesc = Utilities.defaultTd;
    tblDesc.setInputFileFormatClass(TestSkipCombineInputFormat.class);
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder1"), partDesc);
    pt.put(new Path("/tmp/testfolder2"), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Path mapWorkPath = new Path("/tmp/" + System.getProperty("user.name"), "hive");
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    try {
        Path[] paths = new Path[2];
        paths[0] = new Path("/tmp/testfolder1");
        paths[1] = new Path("/tmp/testfolder2");
        CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(CombineHiveInputFormat.class, conf);
        combineInputFormat.pathToPartitionInfo = Utilities.getMapWork(conf).getPathToPartitionInfo();
        Set results = combineInputFormat.getNonCombinablePathIndices(job, paths, 2);
        assertEquals("Should have both path indices in the results set", 2, results.size());
    } finally {
        // Cleanup the mapwork path
        FileSystem.get(conf).delete(mapWorkPath, true);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) Set(java.util.Set) Configuration(org.apache.hadoop.conf.Configuration) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) LinkedHashMap(java.util.LinkedHashMap)
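
TestSkipCombineInputFormat itself is not shown in this example; a plausible reconstruction (an assumption, not the actual Hive test class) is an input format implementing CombineHiveInputFormat.AvoidSplitCombination, the callback getNonCombinablePathIndices consults:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

public class SkipCombineTextInputFormat extends TextInputFormat
        implements CombineHiveInputFormat.AvoidSplitCombination {
    @Override
    public boolean shouldSkipCombine(Path path, Configuration conf) throws IOException {
        // Always opt out: every path served by this format is reported as
        // non-combinable.
        return true;
    }
}

Because shouldSkipCombine returns true for both /tmp/testfolder1 and /tmp/testfolder2, the test expects both path indices in the result set.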

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 58 uses
Path (org.apache.hadoop.fs.Path): 47 uses
ArrayList (java.util.ArrayList): 31 uses
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 27 uses
LinkedHashMap (java.util.LinkedHashMap): 19 uses
HashMap (java.util.HashMap): 14 uses
Map (java.util.Map): 13 uses
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 13 uses
JobConf (org.apache.hadoop.mapred.JobConf): 13 uses
IOException (java.io.IOException): 11 uses
Properties (java.util.Properties): 10 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10 uses
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 10 uses
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 10 uses
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 10 uses
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8 uses
Configuration (org.apache.hadoop.conf.Configuration): 7 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 7 uses
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 7 uses
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 7 uses