Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
Class HiveHBaseTableInputFormat, method getSplitsInternal.
private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
  // obtain delegation tokens for the job
  if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
    TableMapReduceUtil.initCredentials(jobConf);
  }
  String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
  Connection conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
  TableName tableName = TableName.valueOf(hbaseTableName);
  initializeTable(conn, tableName);
  String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
  boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
  try {
    if (hbaseColumnsMapping == null) {
      throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
    }
    ColumnMappings columnMappings = null;
    try {
      columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
    } catch (SerDeException e) {
      throw new IOException(e);
    }
    int iKey = columnMappings.getKeyIndex();
    int iTimestamp = columnMappings.getTimestampIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();
    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    Scan scan = createFilterScan(jobConf, iKey, iTimestamp,
        HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec,
            jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));
    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();
    // same as in getRecordReader?
    for (ColumnMapping colMap : columnMappings) {
      if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
        continue;
      }
      if (colMap.qualifierName == null) {
        scan.addFamily(colMap.familyNameBytes);
        addedFamilies.add(colMap.familyName);
      } else {
        if (!addedFamilies.contains(colMap.familyName)) {
          // add the column only if the family has not already been added
          scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
        }
      }
    }
    setScan(scan);
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
      results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
  } finally {
    closeTable();
    conn.close();
  }
}
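The notable move here is the bridge from the old mapred API (JobConf, InputSplit[]) to the new mapreduce API: a Job is built from the JobConf, a JobContext is derived from it through the Hive shim, and the new-API splits returned by the superclass are wrapped back into old-API splits anchored on the table path. Below is a minimal, self-contained sketch of just that bridging step, not Hive's code; it assumes Hadoop 2.x, where org.apache.hadoop.mapreduce.task.JobContextImpl is available, and the class name and input path are made up for illustration.

// A minimal sketch (not Hive's implementation) of bridging a mapred JobConf to a
// mapreduce JobContext the way getSplitsInternal does, then reading the configured
// input paths from it. Class name and path are hypothetical.
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;

public class JobContextBridgeSketch {

  public static void main(String[] args) throws IOException {
    JobConf jobConf = new JobConf();
    // Copy the old-API configuration into a new-API Job.
    Job job = Job.getInstance(jobConf);
    FileInputFormat.setInputPaths(job, new Path("/tmp/example-input")); // hypothetical path
    // Derive a JobContext; the new-API static helpers accept a JobContext,
    // which is why the bridge is needed.
    JobContext jobContext = new JobContextImpl(job.getConfiguration(), job.getJobID());
    for (Path p : FileInputFormat.getInputPaths(jobContext)) {
      System.out.println("input path: " + p);
    }
  }
}

Hive routes this through ShimLoader so the same code compiles against different Hadoop versions; the sketch hard-codes the Hadoop 2 JobContextImpl constructor instead.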
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
Class DruidQueryBasedInputFormat, method getInputSplits.
protected HiveDruidSplit[] getInputSplits(Configuration conf) throws IOException {
  String address = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
  String queryId = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYID);
  if (StringUtils.isEmpty(address)) {
    throw new IOException("Druid broker address not specified in configuration");
  }
  String druidQuery = StringEscapeUtils.unescapeJava(conf.get(Constants.DRUID_QUERY_JSON));
  String druidQueryType;
  if (StringUtils.isEmpty(druidQuery)) {
    // Empty, maybe because CBO did not run; we fall back to full Select query
    LOG.warn("Druid query is empty; creating Select query");
    String dataSource = conf.get(Constants.DRUID_DATA_SOURCE);
    if (dataSource == null || dataSource.isEmpty()) {
      throw new IOException("Druid data source cannot be empty or null");
    }
    druidQuery = DruidStorageHandlerUtils.createScanAllQuery(dataSource, Utilities.getColumnNames(conf));
    druidQueryType = Query.SCAN;
    conf.set(Constants.DRUID_QUERY_TYPE, druidQueryType);
  } else {
    druidQueryType = conf.get(Constants.DRUID_QUERY_TYPE);
    if (druidQueryType == null) {
      throw new IOException("Druid query type not recognized");
    }
  }
  // Add Hive Query ID to Druid Query
  if (queryId != null) {
    druidQuery = withQueryId(druidQuery, queryId);
  }
  // hive depends on FileSplits
  Job job = Job.getInstance(conf);
  JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
  Path[] paths = FileInputFormat.getInputPaths(jobContext);
  // Then, create splits with the Druid queries.
  switch (druidQueryType) {
    case Query.TIMESERIES:
    case Query.TOPN:
    case Query.GROUP_BY:
      return new HiveDruidSplit[] { new HiveDruidSplit(druidQuery, paths[0], new String[] { address }) };
    case Query.SCAN:
      ScanQuery scanQuery = DruidStorageHandlerUtils.JSON_MAPPER.readValue(druidQuery, ScanQuery.class);
      return distributeScanQuery(address, scanQuery, paths[0]);
    default:
      throw new IOException("Druid query type not recognized");
  }
}
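The "hive depends on FileSplits" comment is the key constraint: the Druid splits must look like file splits to the rest of Hive, which is why the input paths are pulled from the JobContext and passed into each HiveDruidSplit. A rough sketch of that pattern, not Hive's HiveDruidSplit, is shown below: a FileSplit subclass that carries an opaque query string next to a dummy path. The class and field names are hypothetical.

// A minimal sketch of a FileSplit that smuggles a query string alongside a dummy
// path and host list, so a file-split-oriented framework can still schedule it.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;

public class QueryCarryingSplit extends FileSplit {

  private String query = "";

  // Required for Writable deserialization.
  public QueryCarryingSplit() {
    super((Path) null, 0, 0, (String[]) null);
  }

  public QueryCarryingSplit(String query, Path dummyPath, String[] hosts) {
    // Zero-length "file" split; only the path and the preferred hosts matter here.
    super(dummyPath, 0, 0, hosts);
    this.query = query;
  }

  public String getQuery() {
    return query;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    Text.writeString(out, query); // append our payload after the FileSplit fields
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    query = Text.readString(in);
  }
}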
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
Class FileOutputCommitterContainer, method abortJob.
@Override
public void abortJob(JobContext jobContext, State state) throws IOException {
  try {
    if (dynamicPartitioningUsed) {
      discoverPartitions(jobContext);
    }
    org.apache.hadoop.mapred.JobContext mapRedJobContext = HCatMapRedUtil.createJobContext(jobContext);
    if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) {
      getBaseOutputCommitter().abortJob(mapRedJobContext, state);
    } else if (dynamicPartitioningUsed) {
      for (JobContext currContext : contextDiscoveredByPath.values()) {
        try {
          new JobConf(currContext.getConfiguration()).getOutputCommitter().abortJob(currContext, state);
        } catch (Exception e) {
          throw new IOException(e);
        }
      }
    }
    Path src;
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration());
    Path tblPath = new Path(jobInfo.getTableInfo().getTableLocation());
    if (dynamicPartitioningUsed) {
      if (!customDynamicLocationUsed) {
        src = new Path(getPartitionRootLocation(jobInfo.getLocation(), jobInfo.getTableInfo().getTable().getPartitionKeysSize()));
      } else {
        src = new Path(getCustomPartitionRootLocation(jobInfo, jobContext.getConfiguration()));
      }
    } else {
      src = new Path(jobInfo.getLocation());
    }
    FileSystem fs = src.getFileSystem(jobContext.getConfiguration());
    // Note: fs.delete will fail on Windows because, in the OutputCommitter, Hadoop is
    // still writing to _logs/history. On Linux the OS does not care that the file is
    // still open and removes the directory anyway, but on Windows the OS refuses to
    // remove a directory containing open files. So on Windows the output directory is
    // left behind when the job fails, and the user needs to remove it manually.
    LOG.info("Job failed. Try cleaning up temporary directory [{}].", src);
    if (!src.equals(tblPath)) {
      fs.delete(src, true);
    }
  } finally {
    cancelDelegationTokens(jobContext);
  }
}
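Stripped of the HCatalog specifics, abortJob follows a common committer pattern: delegate the abort to the underlying committer(s), then best-effort delete the job's temporary output while refusing to touch the permanent table location, and always cancel credentials or other job-scoped state in a finally block. A minimal sketch of that pattern as a wrapping OutputCommitter is below; it is not HCatalog's FileOutputCommitterContainer, and the class and field names are hypothetical.

// A minimal wrapping committer: on abort, delegate to the wrapped committer and
// then delete the job's scratch directory, guarding against the table root.
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class CleanupOnAbortCommitter extends OutputCommitter {

  private final OutputCommitter base; // the real committer being wrapped
  private final Path scratchDir;      // job-scoped temporary output
  private final Path tableDir;        // permanent data location, never deleted here

  public CleanupOnAbortCommitter(OutputCommitter base, Path scratchDir, Path tableDir) {
    this.base = base;
    this.scratchDir = scratchDir;
    this.tableDir = tableDir;
  }

  @Override
  public void abortJob(JobContext context, JobStatus.State state) throws IOException {
    try {
      base.abortJob(context, state);
    } finally {
      FileSystem fs = scratchDir.getFileSystem(context.getConfiguration());
      // Same guard as above: only remove the scratch dir if it is not the table root.
      if (!scratchDir.equals(tableDir)) {
        fs.delete(scratchDir, true);
      }
    }
  }

  // The remaining lifecycle methods simply delegate to the wrapped committer.
  @Override public void setupJob(JobContext context) throws IOException { base.setupJob(context); }
  @Override public void commitJob(JobContext context) throws IOException { base.commitJob(context); }
  @Override public void setupTask(TaskAttemptContext context) throws IOException { base.setupTask(context); }
  @Override public boolean needsTaskCommit(TaskAttemptContext context) throws IOException { return base.needsTaskCommit(context); }
  @Override public void commitTask(TaskAttemptContext context) throws IOException { base.commitTask(context); }
  @Override public void abortTask(TaskAttemptContext context) throws IOException { base.abortTask(context); }
}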
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
Class FileOutputCommitterContainer, method commitJob.
@Override
public void commitJob(JobContext jobContext) throws IOException {
  if (dynamicPartitioningUsed) {
    discoverPartitions(jobContext);
    // Commit each discovered partition's output using its own committer
    for (JobContext context : contextDiscoveredByPath.values()) {
      new JobConf(context.getConfiguration()).getOutputCommitter().commitJob(context);
    }
  }
  if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) {
    getBaseOutputCommitter().commitJob(HCatMapRedUtil.createJobContext(jobContext));
  }
  registerPartitions(jobContext);
  // Create _SUCCESS file if so requested.
  OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration());
  if (getOutputDirMarking(jobContext.getConfiguration())) {
    Path outputPath = new Path(jobInfo.getLocation());
    FileSystem fileSys = outputPath.getFileSystem(jobContext.getConfiguration());
    // create a file in the folder to mark it
    if (fileSys.exists(outputPath)) {
      Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
      if (!fileSys.exists(filePath)) {
        // may have been created by baseCommitter.commitJob()
        fileSys.create(filePath).close();
      }
    }
  }
  // Commit has succeeded (since no exceptions have been thrown),
  // so it is safe to cancel delegation tokens now.
  cancelDelegationTokens(jobContext);
}
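The marker step mirrors what FileOutputCommitter does for plain MapReduce jobs: drop an empty _SUCCESS file into the output directory once the commit has gone through, but only if the directory exists and the marker is not already there. A small stand-alone sketch of just that step is below; the helper class and the example path are hypothetical, while the file name follows the usual convention.

// A minimal helper that writes the conventional _SUCCESS marker into an output
// directory, skipping it when the directory is missing or the marker already exists.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class SuccessMarker {

  // Same name FileOutputCommitter uses for its job-success marker.
  public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";

  private SuccessMarker() {
  }

  public static void markOutputDir(Configuration conf, Path outputPath) throws IOException {
    FileSystem fs = outputPath.getFileSystem(conf);
    if (!fs.exists(outputPath)) {
      return;
    }
    Path marker = new Path(outputPath, SUCCEEDED_FILE_NAME);
    // May already have been created by a base committer's commitJob().
    if (!fs.exists(marker)) {
      fs.create(marker).close();
    }
  }

  public static void main(String[] args) throws IOException {
    // Usage sketch against the default (local) file system; the path is hypothetical.
    markOutputDir(new Configuration(), new Path("/tmp/example-output"));
  }
}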
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.
Class Hadoop23Shims, method getCombineFileInputFormat.
@Override
public HadoopShims.CombineFileInputFormatShim getCombineFileInputFormat() {
  return new CombineFileInputFormatShim() {

    @Override
    public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
      throw new IOException("CombineFileInputFormat.getRecordReader not needed.");
    }

    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
      List<FileStatus> result = super.listStatus(job);
      Iterator<FileStatus> it = result.iterator();
      while (it.hasNext()) {
        FileStatus stat = it.next();
        if (!stat.isFile() || (stat.getLen() == 0 && !stat.getPath().toUri().getScheme().equals("nullscan"))) {
          it.remove();
        }
      }
      return result;
    }
  };
}
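The shim's listStatus override filters the file listing before splits are computed, dropping directories and zero-length files (except the special "nullscan" scheme Hive uses for metadata-only scans). The same idea in a plain new-API input format, without the Hive shim or the nullscan special case, might look like the sketch below; the class name is hypothetical.

// A minimal new-API input format that keeps only regular, non-empty files when
// listing the input directories, so no splits are generated for empty files.
import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class NonEmptyFileTextInputFormat extends TextInputFormat {

  @Override
  protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = super.listStatus(job);
    Iterator<FileStatus> it = result.iterator();
    while (it.hasNext()) {
      FileStatus stat = it.next();
      // Drop directories and zero-length files before split computation.
      if (!stat.isFile() || stat.getLen() == 0) {
        it.remove();
      }
    }
    return result;
  }
}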