Use of org.apache.hadoop.hive.ql.exec.PartitionKeySampler in project hive by apache: the handleSampling method of the class ExecDriver. The method builds partition keys for a total-order sort, either by merging sample files left by a previous MR job or by randomly sampling the input, and writes them to the partition file registered with the job.
private void handleSampling(Context context, MapWork mWork, JobConf job) throws Exception {
  assert mWork.getAliasToWork().keySet().size() == 1;
  String alias = mWork.getAliases().get(0);
  Operator<?> topOp = mWork.getAliasToWork().get(alias);
  PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);
  ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();
  List<Path> inputPaths = mWork.getPaths();
  Path tmpPath = context.getExternalTmpPath(inputPaths.get(0));
  Path partitionFile = new Path(tmpPath, ".partitions");
  ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
  PartitionKeySampler sampler = new PartitionKeySampler();
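  // Two ways to obtain the sample: reuse sample files written by a previous
  // MR job, or fetch a fresh random sample from the input right now.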
  if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
    console.printInfo("Use sampling data created in previous MR");
    // merges sampling data from previous MR and makes partition keys for total sort
    for (Path path : inputPaths) {
      FileSystem fs = path.getFileSystem(job);
      for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
        sampler.addSampleFile(status.getPath(), job);
      }
    }
  } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
    console.printInfo("Creating sampling data..");
    assert topOp instanceof TableScanOperator;
    TableScanOperator ts = (TableScanOperator) topOp;
    FetchWork fetchWork;
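    // A non-partitioned table reads from a single path; a partitioned table
    // also needs the per-partition descriptors.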
    if (!partDesc.isPartitioned()) {
      assert inputPaths.size() == 1;
      fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
    } else {
      fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
    }
    fetchWork.setSource(ts);
    // random sampling
    FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, job, ts);
    try {
      ts.initialize(job, new ObjectInspector[] { fetcher.getOutputObjectInspector() });
      OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
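      // pushRow() drives rows from the fetcher through the table scan; the
      // sampler, installed as the children's collector above, receives them.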
      while (fetcher.pushRow()) { }
    } finally {
      fetcher.clearFetchContext();
    }
  } else {
    throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
  }
  sampler.writePartitionKeys(partitionFile, job);
}
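For reference, a minimal sketch of the SAMPLING_ON_PREV_MR path in isolation, using only the PartitionKeySampler and shim calls that appear above. The class and helper name (SamplingSketch, mergeSamplesAndWriteKeys) are hypothetical; inputPaths, partitionFile, and the JobConf are assumed to be supplied by the caller.

import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.PartitionKeySampler;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;

public class SamplingSketch {
  // Hypothetical helper: merge sample files left by a previous MR job and
  // write the resulting partition keys for the total-order sort.
  static void mergeSamplesAndWriteKeys(List<Path> inputPaths, Path partitionFile,
      JobConf job) throws Exception {
    // Register the partition file with the job's total-order partitioner.
    ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
    PartitionKeySampler sampler = new PartitionKeySampler();
    for (Path path : inputPaths) {
      FileSystem fs = path.getFileSystem(job);
      // ".sampling*" files are the samples the previous job left behind.
      for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
        sampler.addSampleFile(status.getPath(), job);
      }
    }
    // Derive partition keys from the merged samples and persist them.
    sampler.writePartitionKeys(partitionFile, job);
  }
}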