Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project hive by apache.
The class HiveHBaseTableInputFormat, method getRecordReader:
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf jobConf,
    final Reporter reporter) throws IOException {
  HBaseSplit hbaseSplit = (HBaseSplit) split;
  TableSplit tableSplit = hbaseSplit.getTableSplit();
  final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader;
  Job job = new Job(jobConf);
  TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), reporter);
  final Configuration hbaseConf = HBaseConfiguration.create(jobConf);
  final Scan scan = HiveHBaseInputFormatUtil.getScan(jobConf);
  LOG.debug("HBase configurations: {}", hbaseConf);
  LOG.info("Using global scan configuration (ignore per-split scan configs): {}", scan);
  final Connection conn;
  synchronized (HBASE_TABLE_MONITOR) {
    conn = ConnectionFactory.createConnection(hbaseConf);
    initializeTable(conn, tableSplit.getTable());
    setScan(scan);
    recordReader = createRecordReader(tableSplit, tac);
    try {
      recordReader.initialize(tableSplit, tac);
    } catch (InterruptedException e) {
      // Free up the HTable connections
      closeTable();
      conn.close();
      throw new IOException("Failed to initialize RecordReader", e);
    }
  }
  return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

    @Override
    public void close() throws IOException {
      synchronized (HBASE_TABLE_MONITOR) {
        recordReader.close();
        closeTable();
        conn.close();
      }
    }

    @Override
    public ImmutableBytesWritable createKey() {
      return new ImmutableBytesWritable();
    }

    @Override
    public ResultWritable createValue() {
      return new ResultWritable(new Result());
    }

    @Override
    public long getPos() throws IOException {
      return 0;
    }

    @Override
    public float getProgress() throws IOException {
      float progress = 0.0F;
      try {
        progress = recordReader.getProgress();
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
      return progress;
    }

    @Override
    public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
      boolean next = false;
      try {
        next = recordReader.nextKeyValue();
        if (next) {
          rowKey.set(recordReader.getCurrentValue().getRow());
          value.setResult(recordReader.getCurrentValue());
        }
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
      return next;
    }
  };
}
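The mapred-style reader returned above is driven through createKey/createValue/next. A minimal sketch of that driver loop, assuming a hypothetical inputFormat and a split, jobConf and reporter already prepared by the caller (in practice the MapReduce framework performs this iteration, not user code):

// Hypothetical driver; `inputFormat`, `split`, `jobConf` and `reporter` are assumed to exist.
RecordReader<ImmutableBytesWritable, ResultWritable> reader =
    inputFormat.getRecordReader(split, jobConf, reporter);
try {
  ImmutableBytesWritable key = reader.createKey();
  ResultWritable value = reader.createValue();
  while (reader.next(key, value)) {
    byte[] rowKey = key.get();          // HBase row key of the current row
    Result row = value.getResult();     // full Result wrapped by ResultWritable
    // ... process rowKey / row ...
  }
} finally {
  reader.close();                       // also releases the Table and Connection held under HBASE_TABLE_MONITOR
}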
Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project akela by mozilla-metrics.
The class MultiScanTableInputFormat, method createRecordReader:
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  if (scans == null) {
    throw new IOException("No scans were provided");
  }
  if (table == null) {
    throw new IOException("No table was provided.");
  }
  if (trr == null) {
    trr = new TableRecordReader();
  }
  TableSplit tSplit = (TableSplit) split;
  Scan scan = new Scan(scans[0]);
  scan.setStartRow(tSplit.getStartRow());
  scan.setStopRow(tSplit.getEndRow());
  trr.setScan(scan);
  trr.setHTable(table);
  trr.init();
  return trr;
}
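Unlike the Hive example above, this reader uses the mapreduce API, so it is iterated with nextKeyValue/getCurrentKey/getCurrentValue. A minimal sketch of that framework-side loop, assuming a hypothetical inputFormat and an already-prepared split and context:

// Illustrative only; a real MapReduce job performs this iteration itself.
org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> reader =
    inputFormat.createRecordReader(split, context);
reader.initialize(split, context);
try {
  while (reader.nextKeyValue()) {
    ImmutableBytesWritable key = reader.getCurrentKey();
    Result row = reader.getCurrentValue();
    // ... hand key/row to the mapper ...
  }
} finally {
  reader.close();
}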
Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project akela by mozilla-metrics.
The class MultiScanTableInputFormat, method getSplits:
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  if (table == null) {
    throw new IOException("No table was provided.");
  }
  Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
  if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
    throw new IOException("Expecting at least one region.");
  }
  Set<InputSplit> splits = new HashSet<InputSplit>();
  for (int i = 0; i < keys.getFirst().length; i++) {
    String regionLocation = table.getRegionLocation(keys.getFirst()[i]).getServerAddress().getHostname();
    for (Scan s : scans) {
      byte[] startRow = s.getStartRow();
      byte[] stopRow = s.getStopRow();
      // determine if the given start and stop keys fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
          && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
        byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
            ? keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
            && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
        InputSplit split = new TableSplit(table.getTableName(), splitStart, splitStop, regionLocation);
        splits.add(split);
      }
    }
  }
  return new ArrayList<InputSplit>(splits);
}
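The if-condition above is an interval-overlap test between each Scan's [startRow, stopRow) range and each region's [start, end) range, with an empty byte array meaning "unbounded". A hypothetical standalone helper (not part of akela) that mirrors the same test, plus a small worked example:

// Mirrors the pruning condition in getSplits; empty arrays mean "open-ended".
static boolean scanOverlapsRegion(byte[] startRow, byte[] stopRow, byte[] regionStart, byte[] regionEnd) {
  boolean startsBeforeRegionEnd =
      startRow.length == 0 || regionEnd.length == 0 || Bytes.compareTo(startRow, regionEnd) < 0;
  boolean stopsAfterRegionStart =
      stopRow.length == 0 || Bytes.compareTo(stopRow, regionStart) > 0;
  return startsBeforeRegionEnd && stopsAfterRegionStart;
}

// A scan over ["b", "d") overlaps a region ["a", "c") but not a region ["d", "f").
boolean hit  = scanOverlapsRegion(Bytes.toBytes("b"), Bytes.toBytes("d"), Bytes.toBytes("a"), Bytes.toBytes("c")); // true
boolean miss = scanOverlapsRegion(Bytes.toBytes("b"), Bytes.toBytes("d"), Bytes.toBytes("d"), Bytes.toBytes("f")); // false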
Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project hive by apache.
The class HiveHBaseTableInputFormat, method getSplitsInternal:
private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
  // obtain delegation tokens for the job
  if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
    TableMapReduceUtil.initCredentials(jobConf);
  }
  String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
  Connection conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
  TableName tableName = TableName.valueOf(hbaseTableName);
  initializeTable(conn, tableName);
  String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
  boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
  try {
    if (hbaseColumnsMapping == null) {
      throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
    }
    ColumnMappings columnMappings = null;
    try {
      columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
    } catch (SerDeException e) {
      throw new IOException(e);
    }
    int iKey = columnMappings.getKeyIndex();
    int iTimestamp = columnMappings.getTimestampIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();
    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    Scan scan = createFilterScan(jobConf, iKey, iTimestamp,
        HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec,
            jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));
    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();
    // same as in getRecordReader?
    for (ColumnMapping colMap : columnMappings) {
      if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
        continue;
      }
      if (colMap.qualifierName == null) {
        scan.addFamily(colMap.familyNameBytes);
        addedFamilies.add(colMap.familyName);
      } else {
        if (!addedFamilies.contains(colMap.familyName)) {
          // add the column only if the family has not already been added
          scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
        }
      }
    }
    setScan(scan);
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
      results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
  } finally {
    closeTable();
    conn.close();
  }
}
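getSplitsInternal expects the HBase table name and column mapping to already be present in the JobConf; normally Hive's HBase storage handler copies them there from the table's SERDEPROPERTIES/TBLPROPERTIES. A minimal sketch of the relevant properties, using hypothetical table and column names:

// Hypothetical configuration; in a real job the storage handler sets these from the Hive table definition.
JobConf jobConf = new JobConf();
jobConf.set(HBaseSerDe.HBASE_TABLE_NAME, "my_hbase_table");             // backing HBase table
jobConf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,cf:col1,cf:col2");  // one entry per Hive column
jobConf.setBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);      // allow regex qualifiers such as cf:prefix.*
// The input paths read back via FileInputFormat.getInputPaths(jobContext) must also be set on the job.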