
Example 1 with TableSplit

Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project hive by apache.

The class HiveHBaseTableInputFormat, method getRecordReader:

@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException {
    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getTableSplit();
    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader;
    Job job = new Job(jobConf);
    TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), reporter);
    final Configuration hbaseConf = HBaseConfiguration.create(jobConf);
    final Scan scan = HiveHBaseInputFormatUtil.getScan(jobConf);
    LOG.debug("HBase configurations: {}", hbaseConf);
    LOG.info("Using global scan configuration (ignore per-split scan configs): {}", scan);
    final Connection conn;
    synchronized (HBASE_TABLE_MONITOR) {
        conn = ConnectionFactory.createConnection(hbaseConf);
        initializeTable(conn, tableSplit.getTable());
        setScan(scan);
        recordReader = createRecordReader(tableSplit, tac);
        try {
            recordReader.initialize(tableSplit, tac);
        } catch (InterruptedException e) {
            // Free up the HTable connections
            closeTable();
            conn.close();
            throw new IOException("Failed to initialize RecordReader", e);
        }
    }
    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

        @Override
        public void close() throws IOException {
            synchronized (HBASE_TABLE_MONITOR) {
                recordReader.close();
                closeTable();
                conn.close();
            }
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(new Result());
        }

        @Override
        public long getPos() throws IOException {
            return 0;
        }

        @Override
        public float getProgress() throws IOException {
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        @Override
        public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    value.setResult(recordReader.getCurrentValue());
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}
Also used: ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), Configuration (org.apache.hadoop.conf.Configuration), HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration), RecordReader (org.apache.hadoop.mapred.RecordReader), Connection (org.apache.hadoop.hbase.client.Connection), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), IOException (java.io.IOException), Result (org.apache.hadoop.hbase.client.Result), TableSplit (org.apache.hadoop.hbase.mapreduce.TableSplit), Scan (org.apache.hadoop.hbase.client.Scan), Job (org.apache.hadoop.mapreduce.Job)
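The returned reader adapts the new-API (org.apache.hadoop.mapreduce) record reader to the old mapred RecordReader contract. As a rough consumption sketch, assuming the framework supplies inputFormat, split, jobConf, and reporter, that ResultWritable exposes the wrapped Result via getResult(), and with processRow standing in for application logic:

// Minimal read loop against the reader returned above (old mapred API).
// inputFormat, split, jobConf, reporter are provided by the surrounding framework;
// processRow is a placeholder for application logic.
RecordReader<ImmutableBytesWritable, ResultWritable> reader =
        inputFormat.getRecordReader(split, jobConf, reporter);
ImmutableBytesWritable key = reader.createKey();
ResultWritable value = reader.createValue();
try {
    while (reader.next(key, value)) {
        byte[] rowKey = key.get();          // HBase row key of the current row
        Result result = value.getResult();  // full Result wrapped by ResultWritable
        processRow(rowKey, result);
    }
} finally {
    // close() also releases the Table and Connection held under HBASE_TABLE_MONITOR
    reader.close();
}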

Example 2 with TableSplit

Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project akela by mozilla-metrics.

The class MultiScanTableInputFormat, method createRecordReader:

/* (non-Javadoc)
	 * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
	 */
@Override
public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    if (scans == null) {
        throw new IOException("No scans were provided");
    }
    if (table == null) {
        throw new IOException("No table was provided.");
    }
    if (trr == null) {
        trr = new TableRecordReader();
    }
    TableSplit tSplit = (TableSplit) split;
    Scan scan = new Scan(scans[0]);
    scan.setStartRow(tSplit.getStartRow());
    scan.setStopRow(tSplit.getEndRow());
    trr.setScan(scan);
    trr.setHTable(table);
    trr.init();
    return trr;
}
Also used: TableSplit (org.apache.hadoop.hbase.mapreduce.TableSplit), Scan (org.apache.hadoop.hbase.client.Scan), IOException (java.io.IOException)
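In effect, the method clones the first configured Scan and narrows it to the split's row range before handing it to the TableRecordReader. The same narrowing, pulled out into a standalone helper purely for illustration (the helper name is hypothetical):

// Clone a base Scan and bound it to a TableSplit's row range (hypothetical helper,
// mirroring the logic above). The copy constructor preserves families, filters, caching, etc.
private static Scan boundScanToSplit(Scan base, TableSplit split) throws IOException {
    Scan bounded = new Scan(base);
    bounded.setStartRow(split.getStartRow()); // split start row is inclusive
    bounded.setStopRow(split.getEndRow());    // split end row is exclusive
    return bounded;
}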

Example 3 with TableSplit

Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project akela by mozilla-metrics.

The class MultiScanTableInputFormat, method getSplits:

/* (non-Javadoc)
	 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
	 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    if (table == null) {
        throw new IOException("No table was provided.");
    }
    Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
    if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
        throw new IOException("Expecting at least one region.");
    }
    Set<InputSplit> splits = new HashSet<InputSplit>();
    for (int i = 0; i < keys.getFirst().length; i++) {
        String regionLocation = table.getRegionLocation(keys.getFirst()[i]).getServerAddress().getHostname();
        for (Scan s : scans) {
            byte[] startRow = s.getStartRow();
            byte[] stopRow = s.getStopRow();
            // determine if the given start and stop keys fall into the region
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys.getFirst()[i] : startRow;
                byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                InputSplit split = new TableSplit(table.getTableName(), splitStart, splitStop, regionLocation);
                splits.add(split);
            }
        }
    }
    return new ArrayList<InputSplit>(splits);
}
Also used: TableSplit (org.apache.hadoop.hbase.mapreduce.TableSplit), ArrayList (java.util.ArrayList), Scan (org.apache.hadoop.hbase.client.Scan), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit), HashSet (java.util.HashSet)
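The dense condition above is an interval-overlap test between the scan's [startRow, stopRow) range and the region's [start key, end key) range, where an empty byte array means unbounded on that side. Extracted into a hypothetical helper for readability:

// Does the scan range [scanStart, scanStop) overlap the region range [regionStart, regionEnd)?
// Empty byte arrays mean "unbounded", which is why the length checks short-circuit the comparisons.
// Hypothetical helper; Bytes is org.apache.hadoop.hbase.util.Bytes.
private static boolean scanOverlapsRegion(byte[] scanStart, byte[] scanStop,
                                          byte[] regionStart, byte[] regionEnd) {
    boolean startsBeforeRegionEnd =
            scanStart.length == 0 || regionEnd.length == 0 || Bytes.compareTo(scanStart, regionEnd) < 0;
    boolean stopsAfterRegionStart =
            scanStop.length == 0 || Bytes.compareTo(scanStop, regionStart) > 0;
    return startsBeforeRegionEnd && stopsAfterRegionStart;
}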

Example 4 with TableSplit

Use of org.apache.hadoop.hbase.mapreduce.TableSplit in project hive by apache.

The class HiveHBaseTableInputFormat, method getSplitsInternal:

private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
    // obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jobConf);
    }
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    Connection conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
    TableName tableName = TableName.valueOf(hbaseTableName);
    initializeTable(conn, tableName);
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
    try {
        if (hbaseColumnsMapping == null) {
            throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
        }
        ColumnMappings columnMappings = null;
        try {
            columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
        } catch (SerDeException e) {
            throw new IOException(e);
        }
        int iKey = columnMappings.getKeyIndex();
        int iTimestamp = columnMappings.getTimestampIndex();
        ColumnMapping keyMapping = columnMappings.getKeyMapping();
        // Take filter pushdown into account while calculating splits; this
        // allows us to prune off regions immediately.  Note that although
        // the Javadoc for the superclass getSplits says that it returns one
        // split per region, the implementation actually takes the scan
        // definition into account and excludes regions which don't satisfy
        // the start/stop row conditions (HBASE-1829).
        Scan scan = createFilterScan(jobConf, iKey, iTimestamp, HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec, jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));
        // The list of families that have been added to the scan
        List<String> addedFamilies = new ArrayList<String>();
        // populate the scan's families/columns the same way getRecordReader does
        for (ColumnMapping colMap : columnMappings) {
            if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scan.addFamily(colMap.familyNameBytes);
                addedFamilies.add(colMap.familyName);
            } else {
                if (!addedFamilies.contains(colMap.familyName)) {
                    // add the column only if the family has not already been added
                    scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
                }
            }
        }
        setScan(scan);
        Job job = new Job(jobConf);
        JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
        Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
        List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
        InputSplit[] results = new InputSplit[splits.size()];
        for (int i = 0; i < splits.size(); i++) {
            results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
        }
        return results;
    } finally {
        closeTable();
        conn.close();
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Connection (org.apache.hadoop.hbase.client.Connection), ArrayList (java.util.ArrayList), IOException (java.io.IOException), TableName (org.apache.hadoop.hbase.TableName), TableSplit (org.apache.hadoop.hbase.mapreduce.TableSplit), Scan (org.apache.hadoop.hbase.client.Scan), JobContext (org.apache.hadoop.mapreduce.JobContext), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapred.InputSplit), SerDeException (org.apache.hadoop.hive.serde2.SerDeException), ColumnMapping (org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping)
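For context, the method reads its inputs from standard HBaseSerDe properties on the JobConf. A rough configuration sketch follows; the table name, column mapping, and input path are placeholders, and it assumes the public mapred getSplits(JobConf, int) entry point delegates to getSplitsInternal:

// Placeholder JobConf setup for split calculation (all values are illustrative only).
JobConf jobConf = new JobConf(HBaseConfiguration.create());
jobConf.set(HBaseSerDe.HBASE_TABLE_NAME, "default:example_table");        // placeholder HBase table
jobConf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,cf:col1,cf:col2");    // row key plus two columns
jobConf.setBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
// getSplitsInternal resolves the Hive table path via FileInputFormat, so an input path must be set.
org.apache.hadoop.mapred.FileInputFormat.setInputPaths(jobConf, new Path("/warehouse/example_table"));
InputSplit[] splits = new HiveHBaseTableInputFormat().getSplits(jobConf, 1);
// Each returned split is an HBaseSplit pairing the underlying TableSplit with the table path.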

Aggregations

IOException (java.io.IOException): 4
Scan (org.apache.hadoop.hbase.client.Scan): 4
TableSplit (org.apache.hadoop.hbase.mapreduce.TableSplit): 4
ArrayList (java.util.ArrayList): 2
Connection (org.apache.hadoop.hbase.client.Connection): 2
Job (org.apache.hadoop.mapreduce.Job): 2
HashSet (java.util.HashSet): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
Path (org.apache.hadoop.fs.Path): 1
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 1
TableName (org.apache.hadoop.hbase.TableName): 1
Result (org.apache.hadoop.hbase.client.Result): 1
ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable): 1
ColumnMapping (org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping): 1
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 1
InputSplit (org.apache.hadoop.mapred.InputSplit): 1
RecordReader (org.apache.hadoop.mapred.RecordReader): 1
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 1
JobContext (org.apache.hadoop.mapreduce.JobContext): 1
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 1