Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
The class HiveAccumuloTableInputFormat, method getSplits.
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
  final Instance instance = accumuloParams.getInstance();
  final ColumnMapper columnMapper;
  try {
    columnMapper = getColumnMapper(jobConf);
  } catch (TooManyAccumuloColumnsException e) {
    throw new IOException(e);
  }
  JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
  Path[] tablePaths = FileInputFormat.getInputPaths(context);
  try {
    Connector connector = null;
    // Need to get a Connector so we can look up the user's authorizations if not otherwise specified
    if (accumuloParams.useSasl()) {
      log.info("Current user: " + UserGroupInformation.getCurrentUser());
      // In a YARN/Tez job the Kerberos credentials are no longer available; use the delegation token instead
      AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(AccumuloInputFormat.class, jobConf);
      if (null != token && !jobConf.getCredentials().getAllTokens().isEmpty()) {
        // Convert the stub from the configuration back into a normal Token
        log.info("Found authentication token in Configuration: " + token);
        log.info("Job credential tokens: " + jobConf.getCredentials().getAllTokens());
        AuthenticationToken unwrappedToken = ConfiguratorBase.unwrapAuthenticationToken(jobConf, token);
        log.info("Converted authentication token from Configuration into: " + unwrappedToken);
        // If unwrapping fails, unwrapAuthenticationToken returns the original token (which we know is insufficient)
        if (unwrappedToken != token) {
          log.info("Creating Accumulo Connector with unwrapped delegation token");
          connector = instance.getConnector(accumuloParams.getAccumuloUserName(), unwrappedToken);
        } else {
          log.info("Job credentials did not contain delegation token, fetching new token");
        }
      }
      if (connector == null) {
        log.info("Obtaining Accumulo Connector using KerberosToken");
        // Construct a KerberosToken -- relies on ProxyUser configuration. The request is made as the
        // client user on top of the HS2 user, so Accumulo must have the proper proxy-user auth configs.
        connector = instance.getConnector(accumuloParams.getAccumuloUserName(),
            new KerberosToken(accumuloParams.getAccumuloUserName()));
      }
    } else {
      // Still in the local JVM, use the username+password or Kerberos credentials
      connector = accumuloParams.getConnector(instance);
    }
    final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
    final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
    final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
    // A non-null but empty set of ranges means the predicate matched nothing; return no splits
    if (null != ranges && ranges.isEmpty()) {
      return new InputSplit[0];
    }
    // Set the relevant information in the Configuration for the AccumuloInputFormat
    configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
    int numColumns = columnMappings.size();
    List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    // Sanity check
    if (numColumns < readColIds.size()) {
      throw new IOException("Number of column mappings (" + numColumns + ") is less than"
          + " the number of Hive table columns to read (" + readColIds.size() + ")");
    }
    // Get splits from Accumulo
    InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
    HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
    for (int i = 0; i < splits.length; i++) {
      RangeInputSplit ris = (RangeInputSplit) splits[i];
      ris.setLogLevel(Level.DEBUG);
      hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
    }
    return hiveSplits;
  } catch (AccumuloException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  } catch (AccumuloSecurityException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  } catch (SerDeException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  }
}
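The JobContext usage above boils down to wrapping an old-API JobConf in a new-API JobContext so the mapreduce FileInputFormat helper can resolve the input paths. Below is a minimal sketch of just that bridge, assuming the Hive shim classes used in the snippet (ShimLoader, HadoopShims.newJobContext); the class and method names of the helper itself are illustrative, not part of Hive.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Illustrative helper, not part of Hive.
public class JobContextBridgeExample {
  public static Path[] inputPathsFromJobConf(JobConf jobConf) throws IOException {
    // Wrap the mapred-era JobConf in a mapreduce Job, then obtain a JobContext via the shim layer.
    JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
    // Reuse the new-API helper to resolve the configured input paths from the context.
    return FileInputFormat.getInputPaths(context);
  }
}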
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
The class HiveHBaseTableOutputFormat, method checkOutputSpecs.
@Override
public void checkOutputSpecs(FileSystem fs, JobConf jc) throws IOException {
  // Obtain delegation tokens for the job
  if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
    TableMapReduceUtil.initCredentials(jc);
  }
  String hbaseTableName = jc.get(HBaseSerDe.HBASE_TABLE_NAME);
  jc.set(TableOutputFormat.OUTPUT_TABLE, hbaseTableName);
  Job job = new Job(jc);
  JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
  try {
    checkOutputSpecs(jobContext);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
}
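The same JobConf-to-JobContext bridge works in the output direction: build a JobContext from the old-API configuration, delegate to the new-API checkOutputSpecs, and rewrap the checked InterruptedException that old-API callers cannot propagate. A sketch of that pattern for any new-API OutputFormat follows; the helper class and method names are hypothetical.

import java.io.IOException;

import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;

// Illustrative helper, not part of Hive.
public class OutputSpecCheckExample {
  public static void checkSpecs(OutputFormat<?, ?> newApiFormat, JobConf jc) throws IOException {
    // Build a new-API JobContext around the old-API JobConf via the shim layer.
    Job job = Job.getInstance(jc);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    try {
      newApiFormat.checkOutputSpecs(jobContext);
    } catch (InterruptedException e) {
      // Old-API callers only expect IOException, so rewrap.
      throw new IOException(e);
    }
  }
}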
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
The class TestRCFileMapReduceInputFormat, method writeThenReadByRecordReader.
private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber,
    long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
  Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
  Path testFile = new Path(testDir, "test_rcfile");
  fs.delete(testFile, true);
  Configuration cloneConf = new Configuration(conf);
  RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
  cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
  RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
  for (int i = 0; i < bytesArray.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
    bytes.set(i, cu);
  }
  for (int i = 0; i < writeCount; i++) {
    writer.append(bytes);
  }
  writer.close();
  RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat =
      new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
  Configuration jobConf = new Configuration(cloneConf);
  jobConf.set("mapred.input.dir", testDir.toString());
  JobContext context = new Job(jobConf);
  HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
  List<InputSplit> splits = inputFormat.getSplits(context);
  assertEquals("splits length should be " + splitNumber, splitNumber, splits.size());
  int readCount = 0;
  for (int i = 0; i < splits.size(); i++) {
    TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim()
        .createTaskAttemptContext(jobConf, new TaskAttemptID());
    RecordReader<LongWritable, BytesRefArrayWritable> rr = inputFormat.createRecordReader(splits.get(i), tac);
    rr.initialize(splits.get(i), tac);
    while (rr.nextKeyValue()) {
      readCount++;
    }
  }
  assertEquals("readCount should be equal to writeCount", readCount, writeCount);
}
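The test can pass a Job directly where a JobContext is expected because, in the new API, a Job is itself a JobContext. A self-contained sketch of that pattern, computing splits outside of any running MapReduce job with a plain TextInputFormat (the class name and input path are illustrative only):

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Illustrative only: compute splits for a directory without running a MapReduce job.
public class SplitsWithoutAJobExample {
  public static List<InputSplit> splitsFor(String inputDir) throws Exception {
    Job job = Job.getInstance(new Configuration());
    FileInputFormat.addInputPath(job, new Path(inputDir));
    JobContext context = job; // a Job is usable as a JobContext
    return new TextInputFormat().getSplits(context);
  }
}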
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
The class HCatBaseInputFormat, method createRecordReader.
/**
 * Create the RecordReader for the given InputSplit. Returns the underlying
 * RecordReader if the required operations are supported and the schema matches
 * the HCatTable schema. Returns an HCatRecordReader if operations need to
 * be implemented in HCat.
 * @param split the split
 * @param taskContext the task attempt context
 * @return the record reader instance, either an HCatRecordReader (later) or
 *         the underlying storage handler's RecordReader
 * @throws IOException
 * @throws InterruptedException
 */
@Override
public RecordReader<WritableComparable, HCatRecord> createRecordReader(InputSplit split,
    TaskAttemptContext taskContext) throws IOException, InterruptedException {
  HCatSplit hcatSplit = InternalUtil.castToHCatSplit(split);
  PartInfo partitionInfo = hcatSplit.getPartitionInfo();
  // Ensure PartInfo's TableInfo is initialized.
  if (partitionInfo.getTableInfo() == null) {
    partitionInfo.setTableInfo(
        HCatUtil.getLastInputJobInfosFromConf(taskContext.getConfiguration()).getTableInfo());
  }
  // A TaskAttemptContext is also a JobContext, so it can be used wherever a JobContext is expected.
  JobContext jobContext = taskContext;
  Configuration conf = jobContext.getConfiguration();
  HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, partitionInfo);
  JobConf jobConf = HCatUtil.getJobConfFromContext(jobContext);
  Map<String, String> jobProperties = partitionInfo.getJobProperties();
  HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);
  Map<String, Object> valuesNotInDataCols = getColValsNotInDataColumns(getOutputSchema(conf), partitionInfo);
  return new HCatRecordReader(storageHandler, valuesNotInDataCols);
}
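Whichever RecordReader comes back, the caller drives it the same way: initialize it once with the split and task context, then iterate nextKeyValue until it returns false. A generic sketch of that read loop, not specific to HCatalog (the class name is illustrative):

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Illustrative only: the standard new-API read loop for any RecordReader.
public class RecordReaderDriverExample {
  public static <K, V> long countRecords(RecordReader<K, V> reader, InputSplit split,
      TaskAttemptContext taskContext) throws Exception {
    reader.initialize(split, taskContext); // must be called before the first nextKeyValue()
    long count = 0;
    try {
      while (reader.nextKeyValue()) {
        // reader.getCurrentKey() / reader.getCurrentValue() would be consumed here
        count++;
      }
    } finally {
      reader.close();
    }
    return count;
  }
}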
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
The class FileOutputCommitterContainer, method discoverPartitions.
/**
 * Run to discover the dynamic partitions that are available.
 */
private void discoverPartitions(JobContext context) throws IOException {
  if (!partitionsDiscovered) {
    // LOG.info("discover ptns called");
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());
    harProcessor.setEnabled(jobInfo.getHarRequested());
    List<Integer> dynamicPartCols = jobInfo.getPosOfDynPartCols();
    int maxDynamicPartitions = jobInfo.getMaxDynamicPartitions();
    Path loadPath = new Path(jobInfo.getLocation());
    FileSystem fs = loadPath.getFileSystem(context.getConfiguration());
    // Construct a path pattern (e.g., /*/*) to find all dynamically generated paths
    String dynPathSpec = loadPath.toUri().getPath();
    dynPathSpec = dynPathSpec.replace("__HIVE_DEFAULT_PARTITION__", "*");
    // LOG.info("Searching for " + dynPathSpec);
    Path pathPattern = new Path(dynPathSpec);
    FileStatus[] status = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);
    partitionsDiscoveredByPath = new LinkedHashMap<String, Map<String, String>>();
    contextDiscoveredByPath = new LinkedHashMap<String, JobContext>();
    if (status.length == 0) {
      // LOG.warn("No partition found generated by dynamic partitioning in ["
      //     + loadPath + "] with depth[" + jobInfo.getTable().getPartitionKeysSize()
      //     + "], dynSpec[" + dynPathSpec + "]");
    } else {
      if ((maxDynamicPartitions != -1) && (status.length > maxDynamicPartitions)) {
        this.partitionsDiscovered = true;
        throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
            "Number of dynamic partitions being created exceeds the configured maximum allowable partitions ["
                + maxDynamicPartitions + "]; increase parameter ["
                + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + "] if needed.");
      }
      for (FileStatus st : status) {
        LinkedHashMap<String, String> fullPartSpec = new LinkedHashMap<String, String>();
        if (!customDynamicLocationUsed) {
          Warehouse.makeSpecFromName(fullPartSpec, st.getPath(), null);
        } else {
          HCatFileUtil.getPartKeyValuesForCustomLocation(fullPartSpec, jobInfo, st.getPath().toString());
        }
        partitionsDiscoveredByPath.put(st.getPath().toString(), fullPartSpec);
        JobConf jobConf = (JobConf) context.getConfiguration();
        JobContext currContext = HCatMapRedUtil.createJobContext(jobConf, context.getJobID(),
            InternalUtil.createReporter(HCatMapRedUtil.createTaskAttemptContext(jobConf,
                ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID())));
        HCatOutputFormat.configureOutputStorageHandler(currContext, jobInfo, fullPartSpec);
        contextDiscoveredByPath.put(st.getPath().toString(), currContext);
      }
    }
    // for (Entry<String, Map<String, String>> spec : partitionsDiscoveredByPath.entrySet()) {
    //   LOG.info("Partition " + spec.getKey());
    //   for (Entry<String, String> e : spec.getValue().entrySet()) {
    //     LOG.info(e.getKey() + " => " + e.getValue());
    //   }
    // }
    this.partitionsDiscovered = true;
  }
}
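The discovery step boils down to turning the job's output location into a glob pattern and listing the matching directories on the FileSystem taken from the JobContext's configuration. A standalone sketch of just that piece follows; the class name is illustrative, and the local PathFilter merely stands in for Hive's FileUtils.HIDDEN_FILES_PATH_FILTER.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative only: glob the dynamically generated partition directories under a load path.
public class PartitionGlobExample {
  // Skip files and directories commonly treated as hidden (leading '_' or '.').
  private static final PathFilter SKIP_HIDDEN =
      p -> !p.getName().startsWith("_") && !p.getName().startsWith(".");

  public static FileStatus[] discover(Configuration conf, String location) throws Exception {
    Path loadPath = new Path(location);
    FileSystem fs = loadPath.getFileSystem(conf);
    // Replace the default-partition placeholder with '*' so the path becomes a glob pattern.
    String dynPathSpec = loadPath.toUri().getPath().replace("__HIVE_DEFAULT_PARTITION__", "*");
    return fs.globStatus(new Path(dynPathSpec), SKIP_HIDDEN);
  }
}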