Use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.
The class HBaseSerDe, method parseColumnsMapping.
/**
 * Parses the HBase columns mapping specifier to identify the column families and qualifiers,
 * and caches the byte arrays corresponding to them. One of the Hive table
 * columns maps to the HBase row key, by default the first column.
 *
 * @param columnsMappingSpec string hbase.columns.mapping specified when creating the table
 * @param doColumnRegexMatching whether to do regex matching on the column qualifiers
 * @param hideColumnPrefix whether to hide the matched qualifier prefix in the keys of a map-typed column (only effective when doColumnRegexMatching is true)
 * @return ColumnMappings containing the column mapping information by position
 * @throws org.apache.hadoop.hive.serde2.SerDeException
 */
public static ColumnMappings parseColumnsMapping(String columnsMappingSpec, boolean doColumnRegexMatching, boolean hideColumnPrefix) throws SerDeException {
  if (columnsMappingSpec == null) {
    throw new SerDeException("Error: hbase.columns.mapping missing for this HBase table.");
  }
  if (columnsMappingSpec.isEmpty() || columnsMappingSpec.equals(HBASE_KEY_COL)) {
    throw new SerDeException("Error: hbase.columns.mapping specifies only the HBase table" + " row key. A valid Hive-HBase table must specify at least one additional column.");
  }
  int rowKeyIndex = -1;
  int timestampIndex = -1;
  List<ColumnMapping> columnsMapping = new ArrayList<ColumnMapping>();
  String[] columnSpecs = columnsMappingSpec.split(",");
  for (int i = 0; i < columnSpecs.length; i++) {
    String mappingSpec = columnSpecs[i].trim();
    String[] mapInfo = mappingSpec.split("#");
    String colInfo = mapInfo[0];
    int idxFirst = colInfo.indexOf(":");
    int idxLast = colInfo.lastIndexOf(":");
    if (idxFirst < 0 || !(idxFirst == idxLast)) {
      throw new SerDeException("Error: the HBase columns mapping contains a badly formed " + "column family, column qualifier specification.");
    }
    ColumnMapping columnMapping = new ColumnMapping();
    if (colInfo.equals(HBASE_KEY_COL)) {
      rowKeyIndex = i;
      columnMapping.familyName = colInfo;
      columnMapping.familyNameBytes = Bytes.toBytes(colInfo);
      columnMapping.qualifierName = null;
      columnMapping.qualifierNameBytes = null;
      columnMapping.hbaseRowKey = true;
    } else if (colInfo.equals(HBASE_TIMESTAMP_COL)) {
      timestampIndex = i;
      columnMapping.familyName = colInfo;
      columnMapping.familyNameBytes = Bytes.toBytes(colInfo);
      columnMapping.qualifierName = null;
      columnMapping.qualifierNameBytes = null;
      columnMapping.hbaseTimestamp = true;
    } else {
      String[] parts = colInfo.split(":");
      assert (parts.length > 0 && parts.length <= 2);
      columnMapping.familyName = parts[0];
      columnMapping.familyNameBytes = Bytes.toBytes(parts[0]);
      columnMapping.hbaseRowKey = false;
      columnMapping.hbaseTimestamp = false;
      if (parts.length == 2) {
        if (doColumnRegexMatching && parts[1].endsWith(".*")) {
          // we have a prefix with a wildcard
          columnMapping.qualifierPrefix = parts[1].substring(0, parts[1].length() - 2);
          columnMapping.qualifierPrefixBytes = Bytes.toBytes(columnMapping.qualifierPrefix);
          // pass a flag to hide prefixes
          columnMapping.doPrefixCut = hideColumnPrefix;
          // we weren't provided any actual qualifier name; set these to null
          columnMapping.qualifierName = null;
          columnMapping.qualifierNameBytes = null;
        } else {
          // set the regular provided qualifier names
          columnMapping.qualifierName = parts[1];
          columnMapping.qualifierNameBytes = Bytes.toBytes(parts[1]);
          // if there is no prefix then we don't cut anything
          columnMapping.doPrefixCut = false;
        }
      } else {
        columnMapping.qualifierName = null;
        columnMapping.qualifierNameBytes = null;
      }
    }
    columnMapping.mappingSpec = mappingSpec;
    columnsMapping.add(columnMapping);
  }
  if (rowKeyIndex == -1) {
    rowKeyIndex = 0;
    ColumnMapping columnMapping = new ColumnMapping();
    columnMapping.familyName = HBaseSerDe.HBASE_KEY_COL;
    columnMapping.familyNameBytes = Bytes.toBytes(HBaseSerDe.HBASE_KEY_COL);
    columnMapping.qualifierName = null;
    columnMapping.qualifierNameBytes = null;
    columnMapping.hbaseRowKey = true;
    columnMapping.mappingSpec = HBaseSerDe.HBASE_KEY_COL;
    columnsMapping.add(0, columnMapping);
  }
  return new ColumnMappings(columnsMapping, rowKeyIndex, timestampIndex);
}
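As a quick illustration of what the parser produces, here is a minimal sketch; the mapping string and the print statement are illustrative only, it assumes the hive-hbase-handler classes are on the classpath, and it is placed in the org.apache.hadoop.hive.hbase package so the package-level ColumnMapping fields used above remain accessible.

package org.apache.hadoop.hive.hbase;

import org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping;

public class ParseMappingSketch {
  public static void main(String[] args) throws Exception {
    // ":key" maps to the HBase row key, "cf:a" to a single qualifier,
    // and "cf:tag_.*" to every qualifier in family "cf" starting with "tag_".
    ColumnMappings mappings =
        HBaseSerDe.parseColumnsMapping(":key,cf:a,cf:tag_.*", true, true);
    for (ColumnMapping m : mappings) {
      System.out.printf("family=%s qualifier=%s prefix=%s rowKey=%b%n",
          m.familyName, m.qualifierName, m.qualifierPrefix, m.hbaseRowKey);
    }
  }
}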
Use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.
The class HiveHBaseInputFormatUtil, method getScan.
/**
* Parse {@code jobConf} to create a {@link Scan} instance.
*/
public static Scan getScan(JobConf jobConf) throws IOException {
  String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
  boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
  List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
  ColumnMappings columnMappings;
  try {
    columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
  } catch (SerDeException e) {
    throw new IOException(e);
  }
  if (columnMappings.size() < readColIDs.size()) {
    throw new IOException("Cannot read more columns than the given table contains.");
  }
  boolean readAllColumns = ColumnProjectionUtils.isReadAllColumns(jobConf);
  Scan scan = new Scan();
  boolean empty = true;
  // The list of families that have been added to the scan
  List<String> addedFamilies = new ArrayList<String>();
  if (!readAllColumns) {
    ColumnMapping[] columnsMapping = columnMappings.getColumnsMapping();
    for (int i : readColIDs) {
      ColumnMapping colMap = columnsMapping[i];
      if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
        continue;
      }
      if (colMap.qualifierName == null) {
        scan.addFamily(colMap.familyNameBytes);
        addedFamilies.add(colMap.familyName);
      } else {
        if (!addedFamilies.contains(colMap.familyName)) {
          // add only if the corresponding family has not already been added
          scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
        }
      }
      empty = false;
    }
  }
  // count only on the keys
  if (empty) {
    if (readAllColumns) {
      for (ColumnMapping colMap : columnMappings) {
        if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
          continue;
        }
        if (colMap.qualifierName == null) {
          scan.addFamily(colMap.familyNameBytes);
        } else {
          scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
        }
      }
    } else {
      // Add a filter to just do a scan on the keys so that we pick up everything
      scan.setFilter(new FilterList(new FirstKeyOnlyFilter(), new KeyOnlyFilter()));
    }
  }
  String scanCache = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHE);
  if (scanCache != null) {
    scan.setCaching(Integer.parseInt(scanCache));
  }
  String scanCacheBlocks = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHEBLOCKS);
  if (scanCacheBlocks != null) {
    scan.setCacheBlocks(Boolean.parseBoolean(scanCacheBlocks));
  }
  String scanBatch = jobConf.get(HBaseSerDe.HBASE_SCAN_BATCH);
  if (scanBatch != null) {
    scan.setBatch(Integer.parseInt(scanBatch));
  }
  String filterObjectSerialized = jobConf.get(TableScanDesc.FILTER_OBJECT_CONF_STR);
  if (filterObjectSerialized != null) {
    setupScanRange(scan, filterObjectSerialized, jobConf, true);
  }
  return scan;
}
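A rough sketch of how these scan-tuning properties feed into getScan follows. The property values are arbitrary examples, the explicit read-all-columns call is an assumption about how the projection would be configured in a standalone context, and the class is placed in the same package as the handler classes.

package org.apache.hadoop.hive.hbase;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

public class GetScanSketch {
  public static Scan buildScan() throws IOException {
    JobConf jobConf = new JobConf();
    jobConf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,cf:a,cf:b");
    jobConf.set(HBaseSerDe.HBASE_SCAN_CACHE, "500");         // rows fetched per RPC round trip
    jobConf.set(HBaseSerDe.HBASE_SCAN_CACHEBLOCKS, "false"); // skip the block cache for large scans
    jobConf.set(HBaseSerDe.HBASE_SCAN_BATCH, "100");         // max cells returned per call
    ColumnProjectionUtils.setReadAllColumns(jobConf);        // read every mapped column (assumed setup)
    return HiveHBaseInputFormatUtil.getScan(jobConf);
  }
}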
Use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.
The class HiveHBaseTableInputFormat, method getSplitsInternal.
private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
  // obtain delegation tokens for the job
  if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
    TableMapReduceUtil.initCredentials(jobConf);
  }
  String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
  if (conn == null) {
    conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
  }
  TableName tableName = TableName.valueOf(hbaseTableName);
  initializeTable(conn, tableName);
  String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
  boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
  try {
    if (hbaseColumnsMapping == null) {
      throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
    }
    ColumnMappings columnMappings = null;
    try {
      columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
    } catch (SerDeException e) {
      throw new IOException(e);
    }
    int iKey = columnMappings.getKeyIndex();
    int iTimestamp = columnMappings.getTimestampIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();
    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    Scan scan = createFilterScan(jobConf, iKey, iTimestamp, HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec, jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));
    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();
    // same as in getRecordReader?
    for (ColumnMapping colMap : columnMappings) {
      if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
        continue;
      }
      if (colMap.qualifierName == null) {
        scan.addFamily(colMap.familyNameBytes);
        addedFamilies.add(colMap.familyName);
      } else {
        if (!addedFamilies.contains(colMap.familyName)) {
          // add the column only if the family has not already been added
          scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
        }
      }
    }
    setScan(scan);
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
      results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
  } finally {
    closeTable();
    if (conn != null) {
      conn.close();
      conn = null;
    }
  }
}
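The region pruning described in the comment above depends on the Scan carrying start/stop row bounds. A minimal, hypothetical example of such a bounded scan, assuming an HBase 2.x client and made-up row keys, is below.

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class BoundedScanSketch {
  // Builds a scan restricted to a row-key range, the shape a pushed-down
  // key predicate gives createFilterScan; getSplits can then skip regions
  // that fall entirely outside [user_1000, user_2000).
  public static Scan rowKeyRangeScan() {
    Scan scan = new Scan();
    scan.withStartRow(Bytes.toBytes("user_1000"), true);   // inclusive lower bound (hypothetical key)
    scan.withStopRow(Bytes.toBytes("user_2000"), false);   // exclusive upper bound (hypothetical key)
    return scan;
  }
}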
Use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.
The class LazyHBaseRow, method uncheckedGetField.
/**
* Get the field out of the row without checking whether parsing is needed.
* This is called by both getField and getFieldsAsList.
* @param fieldID The id of the field starting from 0.
* @return The value of the field
*/
private Object uncheckedGetField(int fieldID) {
  LazyObjectBase[] fields = getFields();
  boolean[] fieldsInited = getFieldInited();
  if (!fieldsInited[fieldID]) {
    fieldsInited[fieldID] = true;
    ColumnMapping colMap = columnsMapping[fieldID];
    if (!colMap.hbaseRowKey && !colMap.hbaseTimestamp && colMap.qualifierName == null) {
      // it is a column family
      // primitive type for Map<Key, Value> can be stored in binary format. Pass in the
      // qualifier prefix to cherry pick the qualifiers that match the prefix instead of picking
      // up everything
      ((LazyHBaseCellMap) fields[fieldID]).init(result, colMap.familyNameBytes, colMap.binaryStorage, colMap.qualifierPrefixBytes, colMap.isDoPrefixCut());
      return fields[fieldID].getObject();
    }
    if (colMap.hbaseTimestamp) {
      // Get the latest timestamp of all the cells as the row timestamp
      // from hbase-0.96.0
      long timestamp = result.rawCells()[0].getTimestamp();
      for (int i = 1; i < result.rawCells().length; i++) {
        timestamp = Math.max(timestamp, result.rawCells()[i].getTimestamp());
      }
      LazyObjectBase lz = fields[fieldID];
      if (lz instanceof LazyTimestamp) {
        ((LazyTimestamp) lz).getWritableObject().setTime(timestamp);
      } else {
        ((LazyLong) lz).getWritableObject().set(timestamp);
      }
      return lz.getObject();
    }
    byte[] bytes;
    if (colMap.hbaseRowKey) {
      bytes = result.getRow();
    } else {
      // it is a column i.e. a column-family with column-qualifier
      bytes = result.getValue(colMap.familyNameBytes, colMap.qualifierNameBytes);
    }
    if (bytes == null || isNull(oi.getNullSequence(), bytes, 0, bytes.length)) {
      fields[fieldID].setNull();
    } else {
      ByteArrayRef ref = new ByteArrayRef();
      ref.setData(bytes);
      fields[fieldID].init(ref, 0, bytes.length);
    }
  }
  return fields[fieldID].getObject();
}
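The guard on fieldsInited at the top of this method is a lazy, parse-at-most-once pattern. A stripped-down sketch of the same idea, using only hypothetical names and plain strings instead of the Hive lazy objects, looks like this:

// Hypothetical, self-contained illustration of the lazy-field pattern used above:
// each field is deserialized at most once, on first access, then served from cache.
public class LazyFields {
  private final byte[][] rawValues;   // raw bytes per field, e.g. fetched from HBase
  private final Object[] parsed;      // parsed values, filled on demand
  private final boolean[] inited;     // guards so each field is parsed only once

  public LazyFields(byte[][] rawValues) {
    this.rawValues = rawValues;
    this.parsed = new Object[rawValues.length];
    this.inited = new boolean[rawValues.length];
  }

  public Object getField(int fieldID) {
    if (!inited[fieldID]) {
      inited[fieldID] = true;
      byte[] bytes = rawValues[fieldID];
      // a null byte array stands in for "cell absent in this row"
      parsed[fieldID] = (bytes == null) ? null : new String(bytes, java.nio.charset.StandardCharsets.UTF_8);
    }
    return parsed[fieldID];
  }
}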
Use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.
The class TestLazyHBaseObject, method testLazyHBaseRow2.
/**
* Test the LazyHBaseRow class with a mapping from a Hive field to
* an HBase column family.
* @throws SerDeException
*/
public void testLazyHBaseRow2() throws SerDeException {
  // column family is mapped to Map<string,string>
  List<TypeInfo> fieldTypeInfos = TypeInfoUtils.getTypeInfosFromTypeString("string,int,array<string>,map<string,string>,string");
  List<String> fieldNames = Arrays.asList(new String[] { "key", "a", "b", "c", "d" });
  Text nullSequence = new Text("\\N");
  String hbaseColsMapping = ":key,cfa:a,cfa:b,cfb:,cfc:d";
  ColumnMappings columnMappings = null;
  try {
    columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColsMapping);
  } catch (SerDeException e) {
    fail(e.toString());
  }
  for (ColumnMapping colMap : columnMappings) {
    if (!colMap.hbaseRowKey && colMap.qualifierName == null) {
      colMap.binaryStorage.add(false);
      colMap.binaryStorage.add(false);
    } else {
      colMap.binaryStorage.add(false);
    }
  }
  ObjectInspector oi = LazyFactory.createLazyStructInspector(fieldNames, fieldTypeInfos, new byte[] { ' ', ':', '=' }, nullSequence, false, false, (byte) 0);
  LazyHBaseRow o = new LazyHBaseRow((LazySimpleStructObjectInspector) oi, columnMappings);
  List<Cell> kvs = new ArrayList<Cell>();
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("a"), Bytes.toBytes("123")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes("a:b:c")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("d"), Bytes.toBytes("e")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("f"), Bytes.toBytes("g")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("hi")));
  Result r = Result.create(kvs);
  o.init(r);
  assertEquals(("{'key':'test-row','a':123,'b':['a','b','c']," + "'c':{'d':'e','f':'g'},'d':'hi'}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
  kvs.clear();
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("a"), Bytes.toBytes("123")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("d"), Bytes.toBytes("e")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("f"), Bytes.toBytes("g")));
  r = Result.create(kvs);
  o.init(r);
  assertEquals(("{'key':'test-row','a':123,'b':null," + "'c':{'d':'e','f':'g'},'d':null}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
  kvs.clear();
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes("a")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("f"), Bytes.toBytes("g")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("no")));
  r = Result.create(kvs);
  o.init(r);
  assertEquals(("{'key':'test-row','a':null,'b':['a']," + "'c':{'f':'g'},'d':'no'}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
  kvs.clear();
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes(":a::")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("no")));
  r = Result.create(kvs);
  o.init(r);
  assertEquals(("{'key':'test-row','a':null,'b':['','a','','']," + "'c':{},'d':'no'}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
  kvs.clear();
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("a"), Bytes.toBytes("123")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes("")));
  kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("")));
  r = Result.create(kvs);
  o.init(r);
  assertEquals("{'key':'test-row','a':123,'b':[],'c':{},'d':''}".replace("'", "\""), SerDeUtils.getJSONString(o, oi));
}
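The mapping ":key,cfa:a,cfa:b,cfb:,cfc:d" pairs up with the Hive schema (key string, a int, b array<string>, c map<string,string>, d string), where the bare "cfb:" maps the whole family to the map column. One assertion above that is easy to misread is the cell value ":a::" becoming ['','a','','']; a small, hypothetical stand-alone illustration of that splitting behavior, mirroring what the lazy array does with ':' as its element separator, is:

import java.util.Arrays;

public class SeparatorSketch {
  public static void main(String[] args) {
    // ':' is the collection-element separator configured in the inspector above;
    // a limit of -1 keeps the empty strings between and after separators.
    String cellValue = ":a::";
    String[] elements = cellValue.split(":", -1);
    System.out.println(Arrays.toString(elements));  // prints [, a, , ]
  }
}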