use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
the class HiveAccumuloTableInputFormat method getSplits.
/**
 * Computes the Hive input splits for a table backed by Accumulo.
 *
 * <p>The method obtains an Accumulo {@code Connector} (from a delegation token or a
 * {@code KerberosToken} when SASL is enabled, otherwise from the connection parameters),
 * configures the wrapped Accumulo input format with the iterators and ranges produced by the
 * predicate handler, and wraps every {@code RangeInputSplit} it returns in a
 * {@code HiveAccumuloSplit} that carries the first input path of the job.
 *
 * @param jobConf the job configuration holding the connection, column-mapping and predicate
 *     information
 * @param numSplits split-count hint, forwarded unchanged to the Accumulo input format
 * @return the wrapped splits; an empty array when the predicate handler yields a non-null but
 *     empty range collection
 * @throws IOException if the column mapping is invalid, if fewer column mappings exist than
 *     columns are read, or if Accumulo/SerDe configuration fails (the original exception is
 *     attached as the cause)
 */
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
  final Instance instance = accumuloParams.getInstance();
  final ColumnMapper columnMapper;
  try {
    columnMapper = getColumnMapper(jobConf);
  } catch (TooManyAccumuloColumnsException e) {
    throw new IOException(e);
  }
  JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
  Path[] tablePaths = FileInputFormat.getInputPaths(context);
  try {
    Connector connector = null;
    // Need to get a Connector so we look up the user's authorizations if not otherwise specified
    if (accumuloParams.useSasl()) {
      log.info("Current user: " + UserGroupInformation.getCurrentUser());
      // In a YARN/Tez job, don't have the Kerberos credentials anymore, use the delegation token
      AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(AccumuloInputFormat.class, jobConf);
      if (null != token && !jobConf.getCredentials().getAllTokens().isEmpty()) {
        // Convert the stub from the configuration back into a normal Token
        log.info("Found authentication token in Configuration: " + token);
        log.info("Job credential tokens: " + jobConf.getCredentials().getAllTokens());
        AuthenticationToken unwrappedToken = ConfiguratorBase.unwrapAuthenticationToken(jobConf, token);
        log.info("Converted authentication token from Configuration into: " + unwrappedToken);
        // Identity comparison is intentional: when the job credentials lack the delegation token
        // the unwrap call will return back the original token (which we know is insufficient)
        if (unwrappedToken != token) {
          log.info("Creating Accumulo Connector with unwrapped delegation token");
          connector = instance.getConnector(accumuloParams.getAccumuloUserName(), unwrappedToken);
        } else {
          log.info("Job credentials did not contain delegation token, fetching new token");
        }
      }
      if (connector == null) {
        log.info("Obtaining Accumulo Connector using KerberosToken");
        // Construct a KerberosToken -- relies on ProxyUser configuration. Will be the client making
        // the request on top of the HS2's user. Accumulo will require proper proxy-user auth configs.
        connector = instance.getConnector(accumuloParams.getAccumuloUserName(), new KerberosToken(accumuloParams.getAccumuloUserName()));
      }
    } else {
      // Still in the local JVM, use the username+password or Kerberos credentials
      connector = accumuloParams.getConnector(instance);
    }
    final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
    final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
    final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
    // A non-null but empty range collection produces no splits at all.
    // NOTE(review): presumably handing an empty range set to Accumulo would scan everything --
    // confirm. We don't want that.
    if (null != ranges && ranges.isEmpty()) {
      return new InputSplit[0];
    }
    // Set the relevant information in the Configuration for the AccumuloInputFormat
    configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
    int numColumns = columnMappings.size();
    List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    // Sanity check
    if (numColumns < readColIds.size()) {
      throw new IOException("Number of column mappings (" + numColumns + ")" + " numbers less than the hive table columns. (" + readColIds.size() + ")");
    }
    // get splits from Accumulo
    InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
    HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
    for (int i = 0; i < splits.length; i++) {
      RangeInputSplit ris = (RangeInputSplit) splits[i];
      ris.setLogLevel(Level.DEBUG);
      hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
    }
    return hiveSplits;
  } catch (AccumuloException | AccumuloSecurityException | SerDeException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    // Keep the stringified stack trace as the message (unchanged for callers that inspect it),
    // but also chain the original exception so the cause is not lost.
    throw new IOException(StringUtils.stringifyException(e), e);
  }
}
use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
the class StreamingAssert method readRecords.
/**
 * Reads every record under the partition location through {@code OrcInputFormat} and returns
 * them, after asserting that the input format produced the expected number of splits.
 *
 * TODO: this would be more flexible doing a SQL select statement rather than using InputFormat directly
 * see {@link org.apache.hive.hcatalog.streaming.TestStreaming#checkDataWritten2(Path, long, long, int, String, String...)}
 * @param numSplitsExpected the number of splits {@code getSplits} is expected to return
 * @return the records read from all splits, in the order the splits were returned
 * @throws Exception if computing the splits or reading a record fails; an
 *     {@link AssertionError} is raised when there are no deltas or the split count differs
 */
List<Record> readRecords(int numSplitsExpected) throws Exception {
  if (currentDeltas.isEmpty()) {
    throw new AssertionError("No data");
  }
  InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", partitionLocation.toString());
  job.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(table.getSd().getNumBuckets()));
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
  AcidUtils.setAcidOperationalProperties(job, true, null);
  job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
  job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
  InputSplit[] splits = inputFormat.getSplits(job, 1);
  assertEquals(numSplitsExpected, splits.length);
  List<Record> records = new ArrayList<>();
  for (InputSplit is : splits) {
    final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat.getRecordReader(is, job, Reporter.NULL);
    // Close the reader even when reading fails, so a failing read does not leak it.
    try {
      NullWritable key = recordReader.createKey();
      OrcStruct value = recordReader.createValue();
      while (recordReader.next(key, value)) {
        RecordIdentifier recordIdentifier = recordReader.getRecordIdentifier();
        // Build a fresh RecordIdentifier from the reader's current one before storing it.
        Record record = new Record(new RecordIdentifier(recordIdentifier.getWriteId(), recordIdentifier.getBucketProperty(), recordIdentifier.getRowId()), value.toString());
        System.out.println(record);
        records.add(record);
      }
    } finally {
      recordReader.close();
    }
  }
  return records;
}
use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
the class CombineHiveInputFormat method getCombineSplits.
/**
 * Create Hive splits based on CombineFileSplit.
 *
 * <p>Falls back to {@code super.getSplits} when the shims provide no combine input format, when
 * any path belongs to a non-native table, or when any path uses
 * {@code SymlinkTextInputFormat}. Otherwise the input paths are grouped into pools and the
 * resulting {@code CombineFileSplit}s are wrapped in {@code CombineHiveInputSplit}s.
 *
 * @param job the job configuration
 * @param numSplits split-count hint, only used by the fallback to {@code super.getSplits}
 * @param pathToPartitionInfo mapping from input path to its partition descriptor
 * @return the combined splits, or the result of {@code super.getSplits} on fallback
 * @throws IOException if no input paths are configured, the operation was aborted, or a file
 *     system / split computation fails
 */
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  init(job);
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
  CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
  if (combine == null) {
    // The shims offer no combine input format: use the parent implementation.
    return super.getSplits(job, numSplits);
  }
  if (combine.getInputPathsShim(job).length == 0) {
    throw new IOException("No input paths specified in job");
  }
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // combine splits only from same tables and same partitions. Do not combine splits from multiple
  // tables or multiple partitions.
  Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
  List<Path> inpDirs = new ArrayList<Path>();
  List<Path> inpFiles = new ArrayList<Path>();
  Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
  Set<Path> poolSet = new HashSet<Path>();
  LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
  for (Path path : paths) {
    if (lDrvStat != null && lDrvStat.isAborted()) {
      throw new IOException("Operation is Canceled. ");
    }
    PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
    TableDesc tableDesc = part.getTableDesc();
    if ((tableDesc != null) && tableDesc.isNonNative()) {
      // Non-native tables are not combined: use the parent implementation for the whole job.
      return super.getSplits(job, numSplits);
    }
    Class inputFormatClass = part.getInputFileFormatClass();
    String inputFormatClassName = inputFormatClass.getName();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    String deserializerClassName = null;
    try {
      deserializerClassName = part.getDeserializer(job).getClass().getName();
    } catch (Exception e) {
      // ignore: best effort, the deserializer class name simply stays null in the pool key
    }
    FileSystem inpFs = path.getFileSystem(job);
    // don't combine if inputformat is a SymlinkTextInputFormat
    if (inputFormat instanceof SymlinkTextInputFormat) {
      return super.getSplits(job, numSplits);
    }
    Path filterPath = path;
    // Does a pool exist for this path already
    CombineFilter f = null;
    List<Operator<? extends OperatorDesc>> opList = null;
    if (!mrwork.isMapperCannotSpanPartns()) {
      // if mapper can span partitions, make sure a splits does not contain multiple
      // opList + inputFormatClassName + deserializerClassName combination
      // This is done using the Map of CombinePathInputFormat to PathFilter
      opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
      CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
      f = poolMap.get(combinePathInputFormat);
      if (f == null) {
        f = new CombineFilter(filterPath);
        LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
        combine.createPool(job, f);
        poolMap.put(combinePathInputFormat, f);
      } else {
        LOG.debug("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
        f.addPath(filterPath);
      }
    } else {
      // Mapper must not span partitions: a plain file is pooled by its parent directory, a
      // directory is remembered and processed on its own below.
      // Reuse the file system resolved above instead of looking it up a second time.
      if (!inpFs.getFileStatus(path).isDir()) {
        // path is not directory
        filterPath = path.getParent();
        inpFiles.add(path);
        poolSet.add(filterPath);
      } else {
        inpDirs.add(path);
      }
    }
  }
  // Processing directories
  List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
  if (!mrwork.isMapperCannotSpanPartns()) {
    // mapper can span partitions
    // combine into as few as one split, subject to the PathFilters set
    // using combine.createPool.
    iss = Arrays.asList(combine.getSplits(job, 1));
  } else {
    for (Path path : inpDirs) {
      processPaths(job, combine, iss, path);
    }
    if (inpFiles.size() > 0) {
      // Processing files: one pool per distinct parent directory collected above
      for (Path filterPath : poolSet) {
        combine.createPool(job, new CombineFilter(filterPath));
      }
      processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
    }
  }
  if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
    iss = sampleSplits(iss);
  }
  for (CombineFileSplit is : iss) {
    CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
    result.add(csplit);
  }
  LOG.info("number of splits " + result.size());
  return result.toArray(new InputSplit[result.size()]);
}
use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
the class HiveInputFormat method addSplitsForGroup.
/*
 * addSplitsForGroup issues one getSplits call for a whole group of directories instead of one
 * call per directory (a single setInputPaths for the group). Some InputFormats are faster that
 * way; e.g. Orc starts a thread pool per call, so repeated calls would create many needless
 * thread pools.
 *
 * Every split returned by the input format is wrapped in a HiveInputSplit and appended to
 * 'result'. Nothing is appended when processPathsForMmRead yields no directories (null).
 * An IOException is thrown when an insert-only or transactional table has no valid write-id
 * list in the configuration, or when copying the table properties fails.
 */
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf, InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits, TableDesc table, List<InputSplit> result) throws IOException {
  final ValidWriteIdList validWriteIdList = AcidUtils.getTableValidWriteIdList(conf, table.getTableName());
  final boolean insertOnly = AcidUtils.isInsertOnlyTable(table.getProperties());
  if (insertOnly && validWriteIdList == null) {
    throw new IOException("Insert-Only table: " + table.getTableName() + " is missing from the ValidWriteIdList config: " + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
  }
  // Only the insert-only (MM) case passes a write-id list to the path processing below.
  final ValidWriteIdList validMmWriteIdList = insertOnly ? validWriteIdList : null;

  try {
    Utilities.copyTablePropertiesToConf(table, conf);
    if (tableScan != null) {
      AcidUtils.setAcidOperationalProperties(
          conf,
          tableScan.getConf().isTranscationalTable(),
          tableScan.getConf().getAcidOperationalProperties());
      if (tableScan.getConf().isTranscationalTable() && (validWriteIdList == null)) {
        throw new IOException("Acid table: " + table.getTableName() + " is missing from the ValidWriteIdList config: " + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
      }
      if (validWriteIdList != null) {
        AcidUtils.setValidWriteIdList(conf, validWriteIdList);
      }
    }
  } catch (HiveException e) {
    throw new IOException(e);
  }

  if (tableScan != null) {
    pushFilters(conf, tableScan, this.mrwork);
  }

  final Path[] finalDirs = processPathsForMmRead(dirs, conf, validMmWriteIdList);
  if (finalDirs == null) {
    // No valid inputs.
    return;
  }
  FileInputFormat.setInputPaths(conf, finalDirs);
  conf.setInputFormat(inputFormat.getClass());

  if (table != null) {
    final int headerCount = Utilities.getHeaderCount(table);
    final int footerCount = Utilities.getFooterCount(table, conf);
    final boolean hasHeaderOrFooter = !(headerCount == 0 && footerCount == 0);
    if (hasHeaderOrFooter) {
      // A file with a header or footer cannot be split: raise the minimum split size to the max.
      HiveConf.setLongVar(conf, ConfVars.MAPREDMINSPLITSIZE, Long.MAX_VALUE);
    }
  }

  final InputSplit[] rawSplits = inputFormat.getSplits(conf, splits);
  for (int i = 0; i < rawSplits.length; i++) {
    result.add(new HiveInputSplit(rawSplits[i], inputFormatClass.getName()));
  }

  final boolean producedNothing = rawSplits.length == 0;
  if (producedNothing && finalDirs.length > 0 && conf.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
    // With no input at all the execution engine would skip the operator tree; when requested,
    // an opaque ZeroRows split is added so the operators still run.
    result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(finalDirs[0].toString()), ZeroRowsInputFormat.class.getName()));
  }
}
use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
the class HiveInputFormat method getSplits.
/**
 * Computes the splits for all input paths of the job.
 *
 * <p>Consecutive directories that share the same input format class, table descriptor and
 * table scan operator are collected into a group, and {@code addSplitsForGroup} is invoked once
 * per group. Projection and filter push-down information from the table scan operator is
 * written into a copy of the job configuration before each group is processed.
 *
 * @param job the job configuration; a copy of it is modified, and its work map is cleared at
 *     the end via {@code Utilities.clearWorkMapForConf}
 * @param numSplits split-count hint; each group receives
 *     {@code groupSize * (numSplits / dirs.length)} as its hint
 * @return the collected splits as an array of {@code HiveInputSplit}
 * @throws IOException if split computation for any group fails
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
  init(job);
  Path[] dirs = getInputPaths(job);
  JobConf newjob = new JobConf(job);
  List<InputSplit> result = new ArrayList<InputSplit>();
  List<Path> currentDirs = new ArrayList<Path>();
  Class<? extends InputFormat> currentInputFormatClass = null;
  TableDesc currentTable = null;
  TableScanOperator currentTableScan = null;
  boolean pushDownProjection = false;
  // Buffers to hold filter pushdown information
  StringBuilder readColumnsBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));
  StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
  // for each dir, get the InputFormat, and do getSplits.
  for (Path dir : dirs) {
    PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
    Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
    TableDesc table = part.getTableDesc();
    TableScanOperator tableScan = null;
    List<String> aliases = mrwork.getPathToAliases().get(dir);
    // Make filter pushdown information available to getSplits.
    if ((aliases != null) && (aliases.size() == 1)) {
      Operator op = mrwork.getAliasToWork().get(aliases.get(0));
      // instanceof is false for null, so no separate null check is needed
      if (op instanceof TableScanOperator) {
        tableScan = (TableScanOperator) op;
        // Reset buffers to store filter push down columns
        readColumnsBuffer.setLength(0);
        readColumnNamesBuffer.setLength(0);
        // push down projections.
        ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer, tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
        pushDownProjection = true;
        // push down filters
        pushFilters(newjob, tableScan, this.mrwork);
      }
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug("aliases: {} pathToAliases: {} dir: {}", aliases, mrwork.getPathToAliases(), dir);
      }
    }
    // Same input format, table and table scan as the running group: just extend the group.
    if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass) && table.equals(currentTable) && tableScan == currentTableScan) {
      currentDirs.add(dir);
      continue;
    }
    // Otherwise flush the running group (if any) before starting a new one with this dir.
    if (!currentDirs.isEmpty()) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Generating splits as currentDirs is not empty. currentDirs: {}", currentDirs);
      }
      // set columns to read in conf
      if (pushDownProjection) {
        pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
      }
      addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }
    currentDirs.clear();
    currentDirs.add(dir);
    currentTableScan = tableScan;
    currentTable = table;
    currentInputFormatClass = inputFormatClass;
  }
  // set columns to read in conf
  if (pushDownProjection) {
    pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
  }
  if (dirs.length != 0) {
    if (LOG.isInfoEnabled()) {
      // Render the array explicitly: passing a Path[] directly would be spread over the logger's
      // varargs parameter, so only the first directory would fill the single placeholder.
      LOG.info("Generating splits for dirs: {}", java.util.Arrays.toString(dirs));
    }
    // Flush the last group.
    addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
  }
  Utilities.clearWorkMapForConf(job);
  if (LOG.isInfoEnabled()) {
    LOG.info("number of splits " + result.size());
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
  return result.toArray(new HiveInputSplit[result.size()]);
}
Aggregations