Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
Class CombineHiveInputFormat, method getCombineSplits:
/**
* Create Hive splits based on CombineFileSplit.
*/
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  init(job);
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
  CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
  InputSplit[] splits = null;
  if (combine == null) {
    splits = super.getSplits(job, numSplits);
    return splits;
  }
  if (combine.getInputPathsShim(job).length == 0) {
    throw new IOException("No input paths specified in job");
  }
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // combine splits only from same tables and same partitions. Do not combine splits from multiple
  // tables or multiple partitions.
  Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
  List<Path> inpDirs = new ArrayList<Path>();
  List<Path> inpFiles = new ArrayList<Path>();
  Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
  Set<Path> poolSet = new HashSet<Path>();
  LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
  for (Path path : paths) {
    if (lDrvStat != null && lDrvStat.isAborted()) {
      throw new IOException("Operation is Canceled. ");
    }
    PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
    TableDesc tableDesc = part.getTableDesc();
    if ((tableDesc != null) && tableDesc.isNonNative()) {
      return super.getSplits(job, numSplits);
    }
    // Use HiveInputFormat if any of the paths is not splittable
    Class inputFormatClass = part.getInputFileFormatClass();
    String inputFormatClassName = inputFormatClass.getName();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    String deserializerClassName = null;
    try {
      deserializerClassName = part.getDeserializer(job).getClass().getName();
    } catch (Exception e) {
      // ignore
    }
    FileSystem inpFs = path.getFileSystem(job);
    // don't combine if inputformat is a SymlinkTextInputFormat
    if (inputFormat instanceof SymlinkTextInputFormat) {
      splits = super.getSplits(job, numSplits);
      return splits;
    }
    Path filterPath = path;
    // Does a pool exist for this path already
    CombineFilter f = null;
    List<Operator<? extends OperatorDesc>> opList = null;
    if (!mrwork.isMapperCannotSpanPartns()) {
      // if the mapper can span partitions, make sure a split does not contain multiple
      // (opList + inputFormatClassName + deserializerClassName) combinations.
      // This is done using the Map of CombinePathInputFormat to PathFilter.
      opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
      CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
      f = poolMap.get(combinePathInputFormat);
      if (f == null) {
        f = new CombineFilter(filterPath);
        LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
        combine.createPool(job, f);
        poolMap.put(combinePathInputFormat, f);
      } else {
        LOG.debug("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
        f.addPath(filterPath);
      }
    } else {
      // mapper cannot span partitions: pool files by their parent directory so that a
      // split combines files from the same partition only and won't cross multiple partitions.
      if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
        // path is not directory
        filterPath = path.getParent();
        inpFiles.add(path);
        poolSet.add(filterPath);
      } else {
        inpDirs.add(path);
      }
    }
  }
  // Processing directories
  List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
  if (!mrwork.isMapperCannotSpanPartns()) {
    // mapper can span partitions
    // combine into as few as one split, subject to the PathFilters set
    // using combine.createPool.
    iss = Arrays.asList(combine.getSplits(job, 1));
  } else {
    for (Path path : inpDirs) {
      processPaths(job, combine, iss, path);
    }
    if (inpFiles.size() > 0) {
      // Processing files
      for (Path filterPath : poolSet) {
        combine.createPool(job, new CombineFilter(filterPath));
      }
      processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
    }
  }
  if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
    iss = sampleSplits(iss);
  }
  for (CombineFileSplit is : iss) {
    CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
    result.add(csplit);
  }
  LOG.info("number of splits " + result.size());
  return result.toArray(new InputSplit[result.size()]);
}
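For context, CombineHiveInputFormat is normally reached through the regular getSplits() entry point because it is Hive's default input format. A minimal sketch of selecting it explicitly on a JobConf, assuming only the standard hive.input.format property (the surrounding setup is illustrative and not taken from the snippet above):

  // Hypothetical setup, not part of the method above: force CombineHiveInputFormat
  // so that split generation goes through getCombineSplits().
  JobConf job = new JobConf();
  job.set("hive.input.format",
      "org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");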
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
Class HiveContextAwareRecordReader, method doNext:
public boolean doNext(K key, V value) throws IOException {
  if (this.isSorted) {
    if (this.getIOContext().shouldEndBinarySearch() || (!this.getIOContext().useSorted() && this.wasUsingSortedSearch)) {
      beginLinearSearch();
      this.wasUsingSortedSearch = false;
      this.getIOContext().setEndBinarySearch(false);
    }
    if (this.getIOContext().useSorted()) {
      if (this.genericUDFClassName == null && this.getIOContext().getGenericUDFClassName() != null) {
        setGenericUDFClassName(this.getIOContext().getGenericUDFClassName());
      }
      if (this.getIOContext().isBinarySearching()) {
        // Proceed with a binary search
        if (this.getIOContext().getComparison() != null) {
          switch (this.getIOContext().getComparison()) {
            case GREATER:
            case EQUAL:
              // Indexes have only one entry per value, so we could go linear from here; if we
              // want to use this for any sorted table, we'll need to continue the search.
              rangeEnd = previousPosition;
              break;
            case LESS:
              rangeStart = previousPosition;
              break;
            default:
              break;
          }
        }
        long position = (rangeStart + rangeEnd) / 2;
        sync(position);
        long newPosition = getSyncedPosition();
        // matching rows must be in the final block, so we can end the binary search.
        if (newPosition == previousPosition || newPosition >= splitEnd) {
          this.getIOContext().setBinarySearching(false);
          sync(rangeStart);
        }
        previousPosition = newPosition;
      } else if (foundAllTargets()) {
        // Found all possible rows which will not be filtered
        return false;
      }
    }
  }
  try {
    /**
     * When starting to read a new file, check for header and footer rows.
     * If the file contains a header, skip the header lines before reading the records.
     * If the file contains a footer, use a FooterBuffer to remove the footer lines
     * at the end of the table file.
     */
    if (this.ioCxtRef.getCurrentBlockStart() == 0) {
      // Check if the table file has header to skip.
      footerBuffer = null;
      Path filePath = this.ioCxtRef.getInputPath();
      PartitionDesc part = null;
      try {
        if (pathToPartitionInfo == null) {
          pathToPartitionInfo = Utilities.getMapWork(jobConf).getPathToPartitionInfo();
        }
        part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, filePath, IOPrepareCache.get().getPartitionDescMap());
      } catch (AssertionError ae) {
LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + "because " + ae.getMessage());
part = null;
} catch (Exception e) {
LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + "because " + e.getMessage());
part = null;
      }
      TableDesc table = (part == null) ? null : part.getTableDesc();
      if (table != null) {
        headerCount = Utilities.getHeaderCount(table);
        footerCount = Utilities.getFooterCount(table, jobConf);
      }
      // If input contains header, skip header.
      if (!Utilities.skipHeader(recordReader, headerCount, (WritableComparable) key, (Writable) value)) {
        return false;
      }
      if (footerCount > 0) {
        footerBuffer = new FooterBuffer();
        if (!footerBuffer.initializeBuffer(jobConf, recordReader, footerCount, (WritableComparable) key, (Writable) value)) {
          return false;
        }
      }
    }
    if (footerBuffer == null) {
      // Table files don't have footer rows.
      return recordReader.next(key, value);
    } else {
      return footerBuffer.updateBuffer(jobConf, recordReader, (WritableComparable) key, (Writable) value);
    }
  } catch (Exception e) {
    return HiveIOExceptionHandlerUtil.handleRecordReaderNextException(e, jobConf);
  }
}
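The headerCount and footerCount used above come from the table's properties through Utilities.getHeaderCount and Utilities.getFooterCount. A simplified sketch of that lookup, assuming the standard skip.header.line.count and skip.footer.line.count table properties (the real helpers add validation and configuration limits; this is only an approximation):

  // Approximation of the property lookup; not the actual Utilities implementation.
  int headerCount = Integer.parseInt(
      table.getProperties().getProperty("skip.header.line.count", "0"));
  int footerCount = Integer.parseInt(
      table.getProperties().getProperty("skip.footer.line.count", "0"));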
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
Class HiveInputFormat, method getSplits:
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
  init(job);
  Path[] dirs = getInputPaths(job);
  JobConf newjob = new JobConf(job);
  List<InputSplit> result = new ArrayList<InputSplit>();
  List<Path> currentDirs = new ArrayList<Path>();
  Class<? extends InputFormat> currentInputFormatClass = null;
  TableDesc currentTable = null;
  TableScanOperator currentTableScan = null;
  boolean pushDownProjection = false;
  // Buffers to hold column-projection pushdown information
  StringBuilder readColumnsBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));
  StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
  // for each dir, get the InputFormat, and do getSplits.
  for (Path dir : dirs) {
    PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
    Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
    TableDesc table = part.getTableDesc();
    TableScanOperator tableScan = null;
    List<String> aliases = mrwork.getPathToAliases().get(dir);
    // Make filter pushdown information available to getSplits.
    if ((aliases != null) && (aliases.size() == 1)) {
      Operator op = mrwork.getAliasToWork().get(aliases.get(0));
      if ((op != null) && (op instanceof TableScanOperator)) {
        tableScan = (TableScanOperator) op;
        // Reset buffers to store filter push down columns
        readColumnsBuffer.setLength(0);
        readColumnNamesBuffer.setLength(0);
        // push down projections.
        ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer, tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
        pushDownProjection = true;
        // push down filters
        pushFilters(newjob, tableScan, this.mrwork);
      }
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug("aliases: {} pathToAliases: {} dir: {}", aliases, mrwork.getPathToAliases(), dir);
      }
    }
    if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass) && table.equals(currentTable) && tableScan == currentTableScan) {
      currentDirs.add(dir);
      continue;
    }
    if (!currentDirs.isEmpty()) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Generating splits as currentDirs is not empty. currentDirs: {}", currentDirs);
      }
      // set columns to read in conf
      if (pushDownProjection) {
        pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
      }
      addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }
    currentDirs.clear();
    currentDirs.add(dir);
    currentTableScan = tableScan;
    currentTable = table;
    currentInputFormatClass = inputFormatClass;
  }
  // set columns to read in conf
  if (pushDownProjection) {
    pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
  }
  if (dirs.length != 0) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Generating splits for dirs: {}", dirs);
    }
    addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
  }
  Utilities.clearWorkMapForConf(job);
  if (LOG.isInfoEnabled()) {
    LOG.info("number of splits " + result.size());
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
  return result.toArray(new HiveInputSplit[result.size()]);
}
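The readColumnsBuffer and readColumnNamesBuffer accumulated above are handed to pushProjection so that record readers can prune columns. A rough sketch of what that step is assumed to write into the JobConf, using only the ColumnProjectionUtils constants already referenced in this method (the actual pushProjection may also toggle the read-all-columns flag and can differ in detail):

  // Assumed effect of pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer); illustrative only.
  newjob.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColumnsBuffer.toString());
  newjob.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColumnNamesBuffer.toString());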
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
Class TestUtilities, method testGetInputPathsWithEmptyPartitions:
/**
* Check that calling {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)}
* can process two different tables that both have empty partitions.
*/
@Test
public void testGetInputPathsWithEmptyPartitions() throws Exception {
  String alias1Name = "alias1";
  String alias2Name = "alias2";
  MapWork mapWork1 = new MapWork();
  MapWork mapWork2 = new MapWork();
  JobConf jobConf = new JobConf();
  Configuration conf = new Configuration();
  Path nonExistentPath1 = new Path(UUID.randomUUID().toString());
  Path nonExistentPath2 = new Path(UUID.randomUUID().toString());
  PartitionDesc mockPartitionDesc = mock(PartitionDesc.class);
  TableDesc mockTableDesc = mock(TableDesc.class);
  when(mockTableDesc.isNonNative()).thenReturn(false);
  when(mockTableDesc.getProperties()).thenReturn(new Properties());
  when(mockPartitionDesc.getProperties()).thenReturn(new Properties());
  when(mockPartitionDesc.getTableDesc()).thenReturn(mockTableDesc);
  doReturn(HiveSequenceFileOutputFormat.class).when(mockPartitionDesc).getOutputFileFormatClass();
  mapWork1.setPathToAliases(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath1, Lists.newArrayList(alias1Name))));
  mapWork1.setAliasToWork(new LinkedHashMap<>(ImmutableMap.of(alias1Name, (Operator<?>) mock(Operator.class))));
  mapWork1.setPathToPartitionInfo(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath1, mockPartitionDesc)));
  mapWork2.setPathToAliases(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath2, Lists.newArrayList(alias2Name))));
  mapWork2.setAliasToWork(new LinkedHashMap<>(ImmutableMap.of(alias2Name, (Operator<?>) mock(Operator.class))));
  mapWork2.setPathToPartitionInfo(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath2, mockPartitionDesc)));
  List<Path> inputPaths = new ArrayList<>();
  try {
    Path scratchDir = new Path(HiveConf.getVar(jobConf, HiveConf.ConfVars.LOCALSCRATCHDIR));
    List<Path> inputPaths1 = Utilities.getInputPaths(jobConf, mapWork1, scratchDir, mock(Context.class), false);
    inputPaths.addAll(inputPaths1);
    assertEquals(inputPaths1.size(), 1);
    assertNotEquals(inputPaths1.get(0), nonExistentPath1);
    assertTrue(inputPaths1.get(0).getFileSystem(conf).exists(inputPaths1.get(0)));
    assertFalse(nonExistentPath1.getFileSystem(conf).exists(nonExistentPath1));
    List<Path> inputPaths2 = Utilities.getInputPaths(jobConf, mapWork2, scratchDir, mock(Context.class), false);
    inputPaths.addAll(inputPaths2);
    assertEquals(inputPaths2.size(), 1);
    assertNotEquals(inputPaths2.get(0), nonExistentPath2);
    assertTrue(inputPaths2.get(0).getFileSystem(conf).exists(inputPaths2.get(0)));
    assertFalse(nonExistentPath2.getFileSystem(conf).exists(nonExistentPath2));
  } finally {
    File file;
    for (Path path : inputPaths) {
      file = new File(path.toString());
      if (file.exists()) {
        file.delete();
      }
    }
  }
}
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
Class TestOperators, method testMapOperator:
public void testMapOperator() throws Throwable {
  try {
    System.out.println("Testing Map Operator");
    // initialize configuration
    JobConf hconf = new JobConf(TestOperators.class);
    hconf.set(MRJobConfig.MAP_INPUT_FILE, "hdfs:///testDir/testFile");
    IOContextMap.get(hconf).setInputPath(new Path("hdfs:///testDir/testFile"));
    // initialize pathToAliases
    ArrayList<String> aliases = new ArrayList<String>();
    aliases.add("a");
    aliases.add("b");
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    pathToAliases.put(new Path("hdfs:///testDir"), aliases);
    // initialize pathToTableInfo
    // Default: treat the table as a single column "col"
    TableDesc td = Utilities.defaultTd;
    PartitionDesc pd = new PartitionDesc(td, null);
    LinkedHashMap<Path, org.apache.hadoop.hive.ql.plan.PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();
    pathToPartitionInfo.put(new Path("hdfs:///testDir"), pd);
    // initialize aliasToWork
    CompilationOpContext ctx = new CompilationOpContext();
    CollectDesc cd = new CollectDesc(Integer.valueOf(1));
    CollectOperator cdop1 = (CollectOperator) OperatorFactory.get(ctx, CollectDesc.class);
    cdop1.setConf(cd);
    CollectOperator cdop2 = (CollectOperator) OperatorFactory.get(ctx, CollectDesc.class);
    cdop2.setConf(cd);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    aliasToWork.put("a", cdop1);
    aliasToWork.put("b", cdop2);
    // initialize mapredWork
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToAliases(pathToAliases);
    mrwork.getMapWork().setPathToPartitionInfo(pathToPartitionInfo);
    mrwork.getMapWork().setAliasToWork(aliasToWork);
    // get map operator and initialize it
    MapOperator mo = new MapOperator(new CompilationOpContext());
    mo.initializeAsRoot(hconf, mrwork.getMapWork());
    Text tw = new Text();
    InspectableObject io1 = new InspectableObject();
    InspectableObject io2 = new InspectableObject();
    for (int i = 0; i < 5; i++) {
      String answer = "[[" + i + ", " + (i + 1) + ", " + (i + 2) + "]]";
      tw.set("" + i + "\u0001" + (i + 1) + "\u0001" + (i + 2));
      mo.process(tw);
      cdop1.retrieve(io1);
      cdop2.retrieve(io2);
      System.out.println("io1.o.toString() = " + io1.o.toString());
      System.out.println("io2.o.toString() = " + io2.o.toString());
      System.out.println("answer.toString() = " + answer.toString());
      assertEquals(answer.toString(), io1.o.toString());
      assertEquals(answer.toString(), io2.o.toString());
    }
    System.out.println("Map Operator ok");
  } catch (Throwable e) {
    e.printStackTrace();
    throw (e);
  }
}