Example 1 with PartitionKeySampler

Use of org.apache.hadoop.hive.ql.exec.PartitionKeySampler in project hive by apache.

From the handleSampling method of the ExecDriver class:

private void handleSampling(Context context, MapWork mWork, JobConf job) throws Exception {
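    // total-order sampling operates on a single table scan, hence exactly one alias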
    assert mWork.getAliasToWork().keySet().size() == 1;
    String alias = mWork.getAliases().get(0);
    Operator<?> topOp = mWork.getAliasToWork().get(alias);
    PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);
    ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();
    List<Path> inputPaths = mWork.getPaths();
    Path tmpPath = context.getExternalTmpPath(inputPaths.get(0));
    Path partitionFile = new Path(tmpPath, ".partitions");
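    // point the total-order partitioner at the file the sampler will write below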
    ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
    PartitionKeySampler sampler = new PartitionKeySampler();
    if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
        console.printInfo("Use sampling data created in previous MR");
        // merges sampling data from previous MR and make partition keys for total sort
        for (Path path : inputPaths) {
            FileSystem fs = path.getFileSystem(job);
            for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
                sampler.addSampleFile(status.getPath(), job);
            }
        }
    } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
        console.printInfo("Creating sampling data..");
        assert topOp instanceof TableScanOperator;
        TableScanOperator ts = (TableScanOperator) topOp;
        FetchWork fetchWork;
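        // a non-partitioned table is read from its single input path;
        // a partitioned table also needs the per-partition descriptors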
        if (!partDesc.isPartitioned()) {
            assert inputPaths.size() == 1;
            fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
        } else {
            fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
        }
        fetchWork.setSource(ts);
        // random sampling
        FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, job, ts);
        try {
            ts.initialize(job, new ObjectInspector[] { fetcher.getOutputObjectInspector() });
            OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
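            // drain the fetcher: each row flows through the table scan's
            // operator tree into the sampler collector set above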
            while (fetcher.pushRow()) {
            }
        } finally {
            fetcher.clearFetchContext();
        }
    } else {
        throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
    }
    sampler.writePartitionKeys(partitionFile, job);
}
Also used :

Path (org.apache.hadoop.fs.Path)
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector)
FileStatus (org.apache.hadoop.fs.FileStatus)
PartitionKeySampler (org.apache.hadoop.hive.ql.exec.PartitionKeySampler)
FileSystem (org.apache.hadoop.fs.FileSystem)
FetchWork (org.apache.hadoop.hive.ql.plan.FetchWork)
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)
FetchOperator (org.apache.hadoop.hive.ql.exec.FetchOperator)
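
For orientation, here is a standalone sketch (not from the Hive source) that isolates the SAMPLING_ON_PREV_MR branch above. The class name, file-system paths, and the bare JobConf are hypothetical placeholders; in ExecDriver they come from the Context and the MapWork:

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.PartitionKeySampler;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;

public class PrevMrSamplingSketch {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // hypothetical locations; handleSampling derives these from MapWork and Context
        List<Path> inputPaths = Arrays.asList(new Path("/tmp/prev-mr-output"));
        Path partitionFile = new Path("/tmp/hive-scratch/.partitions");
        // register the partition file with the total-order partitioner
        ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
        PartitionKeySampler sampler = new PartitionKeySampler();
        for (Path path : inputPaths) {
            FileSystem fs = path.getFileSystem(job);
            // pick up the ".sampling*" files emitted by the previous MR stage
            for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
                sampler.addSampleFile(status.getPath(), job);
            }
        }
        // merge the samples and write the final partition keys
        sampler.writePartitionKeys(partitionFile, job);
    }
}

The only PartitionKeySampler calls used are the ones appearing in handleSampling itself: addSampleFile to merge the samples from the previous stage, and writePartitionKeys to emit the key boundaries that the total-order partitioner, configured via setTotalOrderPartitionFile, will read.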
