Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class DynamicPartitionPruner, method applyFilterToPartitions:
@SuppressWarnings("rawtypes")
private void applyFilterToPartitions(Converter converter, ExprNodeEvaluator eval, String columnName, Set<Object> values) throws HiveException {
  Object[] row = new Object[1];
  Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
  while (it.hasNext()) {
    Path p = it.next();
    PartitionDesc desc = work.getPathToPartitionInfo().get(p);
    Map<String, String> spec = desc.getPartSpec();
    if (spec == null) {
      throw new IllegalStateException("No partition spec found in dynamic pruning");
    }
    String partValueString = spec.get(columnName);
    if (partValueString == null) {
      throw new IllegalStateException("Could not find partition value for column: " + columnName);
    }
    Object partValue = converter.convert(partValueString);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Converted partition value: " + partValue + " original (" + partValueString + ")");
    }
    row[0] = partValue;
    partValue = eval.evaluate(row);
    if (LOG.isDebugEnabled()) {
      LOG.debug("part key expr applied: " + partValue);
    }
    if (!values.contains(partValue)) {
      LOG.info("Pruning path: " + p);
      it.remove();
      // work.removePathToPartitionInfo(p);
      work.removePathToAlias(p);
    }
  }
}
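
Below is a minimal, hypothetical sketch of the same pruning idea using only plain JDK types (no Hive classes): each path maps to a partition spec, the shape PartitionDesc#getPartSpec supplies above, and any path whose partition value is not in the surviving value set is removed. Class, path, and column names are illustrative only.

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

// Hypothetical, simplified sketch (not Hive code): prune paths whose partition-spec
// value for a column is absent from the set of values that survived the filter.
public class PartitionPruningSketch {
  public static void main(String[] args) {
    // path -> partition spec, as PartitionDesc#getPartSpec returns per path
    Map<String, Map<String, String>> pathToPartSpec = new HashMap<>();
    pathToPartSpec.put("/warehouse/t/ds=2024-01-01", Collections.singletonMap("ds", "2024-01-01"));
    pathToPartSpec.put("/warehouse/t/ds=2024-01-02", Collections.singletonMap("ds", "2024-01-02"));

    // values produced on the source side of the dynamic pruning event
    Set<String> surviving = new HashSet<>();
    surviving.add("2024-01-02");

    Iterator<Map.Entry<String, Map<String, String>>> it = pathToPartSpec.entrySet().iterator();
    while (it.hasNext()) {
      Map.Entry<String, Map<String, String>> entry = it.next();
      String partValue = entry.getValue().get("ds");
      if (!surviving.contains(partValue)) {
        System.out.println("Pruning path: " + entry.getKey());
        it.remove();  // mirrors it.remove() plus work.removePathToAlias(p) above
      }
    }
    System.out.println("Remaining paths: " + pathToPartSpec.keySet());
  }
}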
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class HiveContextAwareRecordReader, method doNext:
public boolean doNext(K key, V value) throws IOException {
  if (this.isSorted) {
    if (this.getIOContext().shouldEndBinarySearch() || (!this.getIOContext().useSorted() && this.wasUsingSortedSearch)) {
      beginLinearSearch();
      this.wasUsingSortedSearch = false;
      this.getIOContext().setEndBinarySearch(false);
    }
    if (this.getIOContext().useSorted()) {
      if (this.genericUDFClassName == null && this.getIOContext().getGenericUDFClassName() != null) {
        setGenericUDFClassName(this.getIOContext().getGenericUDFClassName());
      }
      if (this.getIOContext().isBinarySearching()) {
        // Proceed with a binary search
        if (this.getIOContext().getComparison() != null) {
          switch (this.getIOContext().getComparison()) {
            case GREATER:
            case EQUAL:
              // Indexes have only one entry per value; we could go linear from here, but if we
              // want to use this for any sorted table, we'll need to continue the search
              rangeEnd = previousPosition;
              break;
            case LESS:
              rangeStart = previousPosition;
              break;
            default:
              break;
          }
        }
        long position = (rangeStart + rangeEnd) / 2;
        sync(position);
        long newPosition = getSyncedPosition();
        // matching rows must be in the final block, so we can end the binary search.
        if (newPosition == previousPosition || newPosition >= splitEnd) {
          this.getIOContext().setBinarySearching(false);
          sync(rangeStart);
        }
        previousPosition = newPosition;
      } else if (foundAllTargets()) {
        // Found all possible rows which will not be filtered
        return false;
      }
    }
  }
  try {
    /**
     * When starting to read a new file, check for header and footer rows.
     * If the file contains a header, skip the header lines before reading the records.
     * If the file contains a footer, use a FooterBuffer to remove the footer lines
     * at the end of the table file.
     **/
    if (this.ioCxtRef.getCurrentBlockStart() == 0) {
      // Check if the table file has a header to skip.
      footerBuffer = null;
      Path filePath = this.ioCxtRef.getInputPath();
      PartitionDesc part = null;
      try {
        if (pathToPartitionInfo == null) {
          pathToPartitionInfo = Utilities.getMapWork(jobConf).getPathToPartitionInfo();
        }
        part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, filePath, IOPrepareCache.get().getPartitionDescMap());
      } catch (AssertionError ae) {
        LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + " because " + ae.getMessage());
        part = null;
      } catch (Exception e) {
        LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + " because " + e.getMessage());
        part = null;
      }
      TableDesc table = (part == null) ? null : part.getTableDesc();
      if (table != null) {
        headerCount = Utilities.getHeaderCount(table);
        footerCount = Utilities.getFooterCount(table, jobConf);
      }
      // If the input contains a header, skip it.
      if (!Utilities.skipHeader(recordReader, headerCount, (WritableComparable) key, (Writable) value)) {
        return false;
      }
      if (footerCount > 0) {
        footerBuffer = new FooterBuffer();
        if (!footerBuffer.initializeBuffer(jobConf, recordReader, footerCount, (WritableComparable) key, (Writable) value)) {
          return false;
        }
      }
    }
    if (footerBuffer == null) {
      // Table files don't have footer rows.
      return recordReader.next(key, value);
    } else {
      return footerBuffer.updateBuffer(jobConf, recordReader, (WritableComparable) key, (Writable) value);
    }
  } catch (Exception e) {
    return HiveIOExceptionHandlerUtil.handleRecordReaderNextException(e, jobConf);
  }
}
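
Below is a minimal, hypothetical sketch (plain JDK I/O, no Hive classes) of the header/footer handling pattern used above: skip headerCount lines up front, and hold back the last footerCount lines in a small buffer so trailing footer rows are never emitted, which is the role FooterBuffer plays in the real reader. All names and the sample data are illustrative only.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.Deque;

// Hypothetical, simplified sketch (not Hive code) of header skipping and footer buffering.
public class HeaderFooterSketch {
  public static void main(String[] args) throws IOException {
    String file = "h1\nh2\nrow1\nrow2\nrow3\nf1\n";
    int headerCount = 2;
    int footerCount = 1;

    BufferedReader reader = new BufferedReader(new StringReader(file));
    // skip header lines before emitting records
    for (int i = 0; i < headerCount; i++) {
      reader.readLine();
    }
    // buffer footerCount lines; a line is emitted only once we know it is not a footer
    Deque<String> buffer = new ArrayDeque<>();
    String line;
    while ((line = reader.readLine()) != null) {
      buffer.addLast(line);
      if (buffer.size() > footerCount) {
        System.out.println("record: " + buffer.removeFirst());
      }
    }
    // whatever remains in the buffer are the footer lines, which are dropped
  }
}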
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class HiveInputFormat, method getSplits:
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
  init(job);
  Path[] dirs = getInputPaths(job);
  JobConf newjob = new JobConf(job);
  List<InputSplit> result = new ArrayList<InputSplit>();
  List<Path> currentDirs = new ArrayList<Path>();
  Class<? extends InputFormat> currentInputFormatClass = null;
  TableDesc currentTable = null;
  TableScanOperator currentTableScan = null;
  boolean pushDownProjection = false;
  // Buffers to hold filter pushdown information
  StringBuilder readColumnsBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));
  StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
  // for each dir, get the InputFormat, and do getSplits.
  for (Path dir : dirs) {
    PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
    Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
    TableDesc table = part.getTableDesc();
    TableScanOperator tableScan = null;
    List<String> aliases = mrwork.getPathToAliases().get(dir);
    // Make filter pushdown information available to getSplits.
    if ((aliases != null) && (aliases.size() == 1)) {
      Operator op = mrwork.getAliasToWork().get(aliases.get(0));
      if ((op != null) && (op instanceof TableScanOperator)) {
        tableScan = (TableScanOperator) op;
        // Reset buffers to store filter pushdown columns
        readColumnsBuffer.setLength(0);
        readColumnNamesBuffer.setLength(0);
        // push down projections
        ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer, tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
        pushDownProjection = true;
        // push down filters
        pushFilters(newjob, tableScan);
      }
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug("aliases: {} pathToAliases: {} dir: {}", aliases, mrwork.getPathToAliases(), dir);
      }
    }
    if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass) && table.equals(currentTable) && tableScan == currentTableScan) {
      currentDirs.add(dir);
      continue;
    }
    if (!currentDirs.isEmpty()) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Generating splits as currentDirs is not empty. currentDirs: {}", currentDirs);
      }
      // set columns to read in conf
      if (pushDownProjection) {
        pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
      }
      addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }
    currentDirs.clear();
    currentDirs.add(dir);
    currentTableScan = tableScan;
    currentTable = table;
    currentInputFormatClass = inputFormatClass;
  }
  // set columns to read in conf
  if (pushDownProjection) {
    pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
  }
  if (dirs.length != 0) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Generating splits for dirs: {}", dirs);
    }
    addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
  }
  Utilities.clearWorkMapForConf(job);
  if (LOG.isInfoEnabled()) {
    LOG.info("number of splits " + result.size());
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
  return result.toArray(new HiveInputSplit[result.size()]);
}
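
Below is a minimal, hypothetical sketch (plain JDK types, no Hive classes) of the grouping logic in getSplits above: consecutive directories that resolve to the same input format (and, in Hive, the same TableDesc and TableScanOperator) are batched so splits are generated once per group rather than once per directory. Directory and format names are illustrative only.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical, simplified sketch (not Hive code) of batching consecutive directories
// that share an input format before generating splits for the whole group.
public class SplitGroupingSketch {
  public static void main(String[] args) {
    // dir -> input format name, as PartitionDesc#getInputFileFormatClass provides per path
    Map<String, String> dirToFormat = new LinkedHashMap<>();
    dirToFormat.put("/t/ds=1", "OrcInputFormat");
    dirToFormat.put("/t/ds=2", "OrcInputFormat");
    dirToFormat.put("/t/ds=3", "TextInputFormat");

    List<String> currentDirs = new ArrayList<>();
    String currentFormat = null;
    for (Map.Entry<String, String> e : dirToFormat.entrySet()) {
      if (!currentDirs.isEmpty() && e.getValue().equals(currentFormat)) {
        currentDirs.add(e.getKey());
        continue;
      }
      if (!currentDirs.isEmpty()) {
        System.out.println("generate splits for " + currentDirs + " using " + currentFormat);
      }
      currentDirs.clear();
      currentDirs.add(e.getKey());
      currentFormat = e.getValue();
    }
    // flush the last group, like the dirs.length != 0 block above
    if (!currentDirs.isEmpty()) {
      System.out.println("generate splits for " + currentDirs + " using " + currentFormat);
    }
  }
}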
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class HiveInputFormat, method getRecordReader:
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  HiveInputSplit hsplit = (HiveInputSplit) split;
  InputSplit inputSplit = hsplit.getInputSplit();
  String inputFormatClassName = null;
  Class inputFormatClass = null;
  try {
    inputFormatClassName = hsplit.inputFormatClassName();
    inputFormatClass = job.getClassByName(inputFormatClassName);
  } catch (Exception e) {
    throw new IOException("cannot find class " + inputFormatClassName, e);
  }
  if (this.mrwork == null || pathToPartitionInfo == null) {
    init(job);
  }
  boolean nonNative = false;
  PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, hsplit.getPath(), null);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
  }
  if ((part != null) && (part.getTableDesc() != null)) {
    Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
    nonNative = part.getTableDesc().isNonNative();
  }
  Path splitPath = hsplit.getPath();
  pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);
  InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
  try {
    inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
  } catch (HiveException e) {
    throw new IOException(e);
  }
  RecordReader innerReader = null;
  try {
    innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
  } catch (Exception e) {
    innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
  }
  HiveRecordReader<K, V> rr = new HiveRecordReader(innerReader, job);
  rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
  return rr;
}
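
Below is a minimal, hypothetical sketch (plain JDK reflection, no Hive classes) of the class-name lookup pattern in getRecordReader above: the split carries only the name of the input format class, so the class is resolved by name at runtime before a reader can be constructed from it. The RecordSource interface and class names are invented for illustration.

// Hypothetical, simplified sketch (not Hive code): resolve a reader implementation by
// class name, then instantiate it, failing with an IOException if the class is missing.
public class FormatLookupSketch {
  interface RecordSource {
    String next();
  }

  static class TextSource implements RecordSource {
    public String next() {
      return "a line of text";
    }
  }

  public static void main(String[] args) throws Exception {
    String className = FormatLookupSketch.class.getName() + "$TextSource";
    Class<?> clazz;
    try {
      clazz = Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new java.io.IOException("cannot find class " + className, e);
    }
    RecordSource source = (RecordSource) clazz.getDeclaredConstructor().newInstance();
    System.out.println(source.next());
  }
}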
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class BucketizedHiveInputFormat, method getSplits:
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  init(job);
  Path[] dirs = getInputPaths(job);
  JobConf newjob = new JobConf(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  int numOrigSplits = 0;
  // for each dir, run getSplits on every file under it,
  // and then create a BucketizedHiveInputSplit on it
  for (Path dir : dirs) {
    PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
    // create a new InputFormat instance if this is the first time we see this class
    Class inputFormatClass = part.getInputFileFormatClass();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    newjob.setInputFormat(inputFormat.getClass());
    FileStatus[] listStatus = listStatus(newjob, dir);
    for (FileStatus status : listStatus) {
      LOG.info("block size: " + status.getBlockSize());
      LOG.info("file length: " + status.getLen());
      FileInputFormat.setInputPaths(newjob, status.getPath());
      InputSplit[] iss = inputFormat.getSplits(newjob, 0);
      if (iss != null && iss.length > 0) {
        numOrigSplits += iss.length;
        result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
      }
    }
  }
  LOG.info(result.size() + " bucketized splits generated from " + numOrigSplits + " original splits.");
  return result.toArray(new BucketizedHiveInputSplit[result.size()]);
}
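
Below is a minimal, hypothetical sketch (plain JDK types, no Hive classes) of the bucketized-split idea above: splits are computed per file, and all of a file's splits are bundled into one composite unit, analogous to BucketizedHiveInputSplit, so a single task reads the whole bucket file. File and split names are illustrative only.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical, simplified sketch (not Hive code): bundle all splits of one file into
// a single composite split and count the original splits that were wrapped.
public class BucketizedSplitSketch {
  // stand-in for BucketizedHiveInputSplit: one composite unit per file
  static class CompositeSplit {
    final String file;
    final List<String> innerSplits;
    CompositeSplit(String file, List<String> innerSplits) {
      this.file = file;
      this.innerSplits = innerSplits;
    }
  }

  public static void main(String[] args) {
    // file -> the splits an underlying input format would produce for it
    Map<String, List<String>> fileToSplits = new LinkedHashMap<>();
    fileToSplits.put("bucket_00000", Arrays.asList("s0", "s1"));
    fileToSplits.put("bucket_00001", Arrays.asList("s2"));

    List<CompositeSplit> result = new ArrayList<>();
    int numOrigSplits = 0;
    for (Map.Entry<String, List<String>> e : fileToSplits.entrySet()) {
      if (!e.getValue().isEmpty()) {
        numOrigSplits += e.getValue().size();
        result.add(new CompositeSplit(e.getKey(), e.getValue()));
      }
    }
    System.out.println(result.size() + " bucketized splits generated from " + numOrigSplits + " original splits.");
  }
}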