Example 6 with ColumnMapping

use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.

the class HBaseSerDe method parseColumnsMapping.

/**
 * Parses the HBase columns mapping specifier to identify the column families, qualifiers
 * and also caches the byte arrays corresponding to them. One of the Hive table
 * columns maps to the HBase row key, by default the first column.
 *
 * @param columnsMappingSpec string hbase.columns.mapping specified when creating table
 * @param doColumnRegexMatching whether to do a regex matching on the columns or not
 * @param hideColumnPrefix whether to strip the matched qualifier prefix from the map key names (only applies when doColumnRegexMatching is true)
 * @return List<ColumnMapping> which contains the column mapping information by position
 * @throws org.apache.hadoop.hive.serde2.SerDeException
 */
public static ColumnMappings parseColumnsMapping(String columnsMappingSpec, boolean doColumnRegexMatching, boolean hideColumnPrefix) throws SerDeException {
    if (columnsMappingSpec == null) {
        throw new SerDeException("Error: hbase.columns.mapping missing for this HBase table.");
    }
    if (columnsMappingSpec.isEmpty() || columnsMappingSpec.equals(HBASE_KEY_COL)) {
        throw new SerDeException("Error: hbase.columns.mapping specifies only the HBase table" + " row key. A valid Hive-HBase table must specify at least one additional column.");
    }
    int rowKeyIndex = -1;
    int timestampIndex = -1;
    List<ColumnMapping> columnsMapping = new ArrayList<ColumnMapping>();
    String[] columnSpecs = columnsMappingSpec.split(",");
    for (int i = 0; i < columnSpecs.length; i++) {
        String mappingSpec = columnSpecs[i].trim();
        String[] mapInfo = mappingSpec.split("#");
        String colInfo = mapInfo[0];
        int idxFirst = colInfo.indexOf(":");
        int idxLast = colInfo.lastIndexOf(":");
        if (idxFirst < 0 || !(idxFirst == idxLast)) {
            throw new SerDeException("Error: the HBase columns mapping contains a badly formed " + "column family, column qualifier specification.");
        }
        ColumnMapping columnMapping = new ColumnMapping();
        if (colInfo.equals(HBASE_KEY_COL)) {
            rowKeyIndex = i;
            columnMapping.familyName = colInfo;
            columnMapping.familyNameBytes = Bytes.toBytes(colInfo);
            columnMapping.qualifierName = null;
            columnMapping.qualifierNameBytes = null;
            columnMapping.hbaseRowKey = true;
        } else if (colInfo.equals(HBASE_TIMESTAMP_COL)) {
            timestampIndex = i;
            columnMapping.familyName = colInfo;
            columnMapping.familyNameBytes = Bytes.toBytes(colInfo);
            columnMapping.qualifierName = null;
            columnMapping.qualifierNameBytes = null;
            columnMapping.hbaseTimestamp = true;
        } else {
            String[] parts = colInfo.split(":");
            assert (parts.length > 0 && parts.length <= 2);
            columnMapping.familyName = parts[0];
            columnMapping.familyNameBytes = Bytes.toBytes(parts[0]);
            columnMapping.hbaseRowKey = false;
            columnMapping.hbaseTimestamp = false;
            if (parts.length == 2) {
                if (doColumnRegexMatching && parts[1].endsWith(".*")) {
                    // we have a prefix with a wildcard
                    columnMapping.qualifierPrefix = parts[1].substring(0, parts[1].length() - 2);
                    columnMapping.qualifierPrefixBytes = Bytes.toBytes(columnMapping.qualifierPrefix);
                    // pass a flag to hide prefixes
                    columnMapping.doPrefixCut = hideColumnPrefix;
                    // we weren't provided any actual qualifier name. Set these to
                    // null.
                    columnMapping.qualifierName = null;
                    columnMapping.qualifierNameBytes = null;
                } else {
                    // set the regular provided qualifier names
                    columnMapping.qualifierName = parts[1];
                    columnMapping.qualifierNameBytes = Bytes.toBytes(parts[1]);
                    // if there is no prefix then we don't cut anything
                    columnMapping.doPrefixCut = false;
                }
            } else {
                columnMapping.qualifierName = null;
                columnMapping.qualifierNameBytes = null;
            }
        }
        columnMapping.mappingSpec = mappingSpec;
        columnsMapping.add(columnMapping);
    }
    if (rowKeyIndex == -1) {
        rowKeyIndex = 0;
        ColumnMapping columnMapping = new ColumnMapping();
        columnMapping.familyName = HBaseSerDe.HBASE_KEY_COL;
        columnMapping.familyNameBytes = Bytes.toBytes(HBaseSerDe.HBASE_KEY_COL);
        columnMapping.qualifierName = null;
        columnMapping.qualifierNameBytes = null;
        columnMapping.hbaseRowKey = true;
        columnMapping.mappingSpec = HBaseSerDe.HBASE_KEY_COL;
        columnsMapping.add(0, columnMapping);
    }
    return new ColumnMappings(columnsMapping, rowKeyIndex, timestampIndex);
}
Also used : ArrayList(java.util.ArrayList) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ColumnMapping(org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping)
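
Below is a minimal, hedged sketch (not taken from the Hive sources) of how this parser can be driven. The mapping string, the class name ColumnMappingDemo, and the printed messages are invented for illustration; the parseColumnsMapping overload and the public ColumnMapping fields are the ones shown in the method above.

import org.apache.hadoop.hive.hbase.ColumnMappings;
import org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;

public class ColumnMappingDemo {
    public static void main(String[] args) throws SerDeException {
        // Hypothetical mapping: the row key, a fully qualified column, and a wildcard prefix.
        ColumnMappings mappings =
            HBaseSerDe.parseColumnsMapping(":key,cf:name,cf:tag_.*", true, false);
        for (ColumnMapping cm : mappings) {
            if (cm.hbaseRowKey) {
                System.out.println("row key column");
            } else if (cm.qualifierName == null) {
                // Either a whole column family or a regex prefix; both back a Hive map column.
                System.out.println("family/prefix mapping on " + cm.familyName);
            } else {
                System.out.println("column " + cm.familyName + ":" + cm.qualifierName);
            }
        }
    }
}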

Example 7 with ColumnMapping

use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.

the class HiveHBaseInputFormatUtil method getScan.

/**
 * Parse {@code jobConf} to create a {@link Scan} instance.
 */
public static Scan getScan(JobConf jobConf) throws IOException {
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    ColumnMappings columnMappings;
    try {
        columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
    } catch (SerDeException e) {
        throw new IOException(e);
    }
    if (columnMappings.size() < readColIDs.size()) {
        throw new IOException("Cannot read more columns than the given table contains.");
    }
    boolean readAllColumns = ColumnProjectionUtils.isReadAllColumns(jobConf);
    Scan scan = new Scan();
    boolean empty = true;
    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();
    if (!readAllColumns) {
        ColumnMapping[] columnsMapping = columnMappings.getColumnsMapping();
        for (int i : readColIDs) {
            ColumnMapping colMap = columnsMapping[i];
            if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scan.addFamily(colMap.familyNameBytes);
                addedFamilies.add(colMap.familyName);
            } else {
                if (!addedFamilies.contains(colMap.familyName)) {
                    // add only if the corresponding family has not already been added
                    scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
                }
            }
            empty = false;
        }
    }
    // Nothing was projected into the scan (e.g. count(*) or a query over the row key only)
    if (empty) {
        if (readAllColumns) {
            for (ColumnMapping colMap : columnMappings) {
                if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
                    continue;
                }
                if (colMap.qualifierName == null) {
                    scan.addFamily(colMap.familyNameBytes);
                } else {
                    scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
                }
            }
        } else {
            // Scan only the row keys so every row is still returned without fetching any column values
            scan.setFilter(new FilterList(new FirstKeyOnlyFilter(), new KeyOnlyFilter()));
        }
    }
    String scanCache = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHE);
    if (scanCache != null) {
        scan.setCaching(Integer.parseInt(scanCache));
    }
    String scanCacheBlocks = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHEBLOCKS);
    if (scanCacheBlocks != null) {
        scan.setCacheBlocks(Boolean.parseBoolean(scanCacheBlocks));
    }
    String scanBatch = jobConf.get(HBaseSerDe.HBASE_SCAN_BATCH);
    if (scanBatch != null) {
        scan.setBatch(Integer.parseInt(scanBatch));
    }
    String filterObjectSerialized = jobConf.get(TableScanDesc.FILTER_OBJECT_CONF_STR);
    if (filterObjectSerialized != null) {
        setupScanRange(scan, filterObjectSerialized, jobConf, true);
    }
    return scan;
}
Also used : FirstKeyOnlyFilter(org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter) KeyOnlyFilter(org.apache.hadoop.hbase.filter.KeyOnlyFilter) FirstKeyOnlyFilter(org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter) ArrayList(java.util.ArrayList) FilterList(org.apache.hadoop.hbase.filter.FilterList) IOException(java.io.IOException) Scan(org.apache.hadoop.hbase.client.Scan) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ColumnMapping(org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping)
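
A rough usage sketch follows; it is an assumption-laden illustration rather than real Hive driver code. The mapping string and scan tuning values are made up, and in practice Hive's planner populates the JobConf itself. With no column projection recorded, getScan falls through to the read-all-columns branch and adds every mapped family and column.

import java.io.IOException;

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.hive.hbase.HiveHBaseInputFormatUtil;
import org.apache.hadoop.mapred.JobConf;

public class GetScanDemo {
    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf();
        // Hypothetical table mapping and scan tuning values.
        jobConf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,cf:a,cf:b");
        jobConf.set(HBaseSerDe.HBASE_SCAN_CACHE, "500");
        jobConf.set(HBaseSerDe.HBASE_SCAN_CACHEBLOCKS, "false");

        Scan scan = HiveHBaseInputFormatUtil.getScan(jobConf);
        System.out.println("caching=" + scan.getCaching() + ", cacheBlocks=" + scan.getCacheBlocks());
    }
}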

Example 8 with ColumnMapping

use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.

the class HiveHBaseTableInputFormat method getSplitsInternal.

private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
    // obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jobConf);
    }
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    if (conn == null) {
        conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
    }
    TableName tableName = TableName.valueOf(hbaseTableName);
    initializeTable(conn, tableName);
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
    try {
        if (hbaseColumnsMapping == null) {
            throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
        }
        ColumnMappings columnMappings = null;
        try {
            columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
        } catch (SerDeException e) {
            throw new IOException(e);
        }
        int iKey = columnMappings.getKeyIndex();
        int iTimestamp = columnMappings.getTimestampIndex();
        ColumnMapping keyMapping = columnMappings.getKeyMapping();
        // Take filter pushdown into account while calculating splits; this
        // allows us to prune off regions immediately.  Note that although
        // the Javadoc for the superclass getSplits says that it returns one
        // split per region, the implementation actually takes the scan
        // definition into account and excludes regions which don't satisfy
        // the start/stop row conditions (HBASE-1829).
        Scan scan = createFilterScan(jobConf, iKey, iTimestamp, HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec, jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));
        // The list of families that have been added to the scan
        List<String> addedFamilies = new ArrayList<String>();
        // same as in getRecordReader?
        for (ColumnMapping colMap : columnMappings) {
            if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scan.addFamily(colMap.familyNameBytes);
                addedFamilies.add(colMap.familyName);
            } else {
                if (!addedFamilies.contains(colMap.familyName)) {
                    // add the column only if the family has not already been added
                    scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
                }
            }
        }
        setScan(scan);
        Job job = new Job(jobConf);
        JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
        Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
        List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
        InputSplit[] results = new InputSplit[splits.size()];
        for (int i = 0; i < splits.size(); i++) {
            results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
        }
        return results;
    } finally {
        closeTable();
        if (conn != null) {
            conn.close();
            conn = null;
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) IOException(java.io.IOException) TableName(org.apache.hadoop.hbase.TableName) TableSplit(org.apache.hadoop.hbase.mapreduce.TableSplit) Scan(org.apache.hadoop.hbase.client.Scan) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapred.InputSplit) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ColumnMapping(org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping)
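
A hedged driver sketch showing how this method is reached through the standard mapred API. The table name, column mapping, and input path below are hypothetical, these properties are normally set by Hive itself, and actually running the snippet requires a reachable HBase cluster.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.hive.hbase.HiveHBaseTableInputFormat;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public class SplitsDemo {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        jobConf.set(HBaseSerDe.HBASE_TABLE_NAME, "demo_table");          // hypothetical
        jobConf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,cf:a,cf:b"); // hypothetical
        FileInputFormat.setInputPaths(jobConf, new Path("/tmp/demo"));   // hypothetical

        // getSplits delegates to getSplitsInternal, which prunes regions using any
        // pushed-down filter before wrapping each TableSplit in an HBaseSplit.
        InputSplit[] splits = new HiveHBaseTableInputFormat().getSplits(jobConf, 1);
        System.out.println("split count: " + splits.length);
    }
}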

Example 9 with ColumnMapping

use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.

the class LazyHBaseRow method uncheckedGetField.

/**
 * Get the field out of the row without checking whether parsing is needed.
 * This is called by both getField and getFieldsAsList.
 * @param fieldID  The id of the field starting from 0.
 * @return  The value of the field
 */
private Object uncheckedGetField(int fieldID) {
    LazyObjectBase[] fields = getFields();
    boolean[] fieldsInited = getFieldInited();
    if (!fieldsInited[fieldID]) {
        fieldsInited[fieldID] = true;
        ColumnMapping colMap = columnsMapping[fieldID];
        if (!colMap.hbaseRowKey && !colMap.hbaseTimestamp && colMap.qualifierName == null) {
            // This field maps to an entire column family, which Hive exposes as a map.
            // The map's primitive values may be stored in binary format; pass the qualifier
            // prefix so that only qualifiers matching the prefix are picked up instead of everything.
            ((LazyHBaseCellMap) fields[fieldID]).init(result, colMap.familyNameBytes, colMap.binaryStorage, colMap.qualifierPrefixBytes, colMap.isDoPrefixCut());
            return fields[fieldID].getObject();
        }
        if (colMap.hbaseTimestamp) {
            // Get the latest timestamp of all the cells as the row timestamp
            // from hbase-0.96.0
            long timestamp = result.rawCells()[0].getTimestamp();
            for (int i = 1; i < result.rawCells().length; i++) {
                timestamp = Math.max(timestamp, result.rawCells()[i].getTimestamp());
            }
            LazyObjectBase lz = fields[fieldID];
            if (lz instanceof LazyTimestamp) {
                ((LazyTimestamp) lz).getWritableObject().setTime(timestamp);
            } else {
                ((LazyLong) lz).getWritableObject().set(timestamp);
            }
            return lz.getObject();
        }
        byte[] bytes;
        if (colMap.hbaseRowKey) {
            bytes = result.getRow();
        } else {
            // it is a column i.e. a column-family with column-qualifier
            bytes = result.getValue(colMap.familyNameBytes, colMap.qualifierNameBytes);
        }
        if (bytes == null || isNull(oi.getNullSequence(), bytes, 0, bytes.length)) {
            fields[fieldID].setNull();
        } else {
            ByteArrayRef ref = new ByteArrayRef();
            ref.setData(bytes);
            fields[fieldID].init(ref, 0, bytes.length);
        }
    }
    return fields[fieldID].getObject();
}
Also used : ByteArrayRef(org.apache.hadoop.hive.serde2.lazy.ByteArrayRef) LazyTimestamp(org.apache.hadoop.hive.serde2.lazy.LazyTimestamp) LazyObjectBase(org.apache.hadoop.hive.serde2.lazy.LazyObjectBase) ColumnMapping(org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping)

Example 10 with ColumnMapping

use of org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping in project hive by apache.

the class TestLazyHBaseObject method testLazyHBaseRow2.

/**
 * Test the LazyHBaseRow class with a mapping from a Hive field to
 * an HBase column family.
 * @throws SerDeException
 */
public void testLazyHBaseRow2() throws SerDeException {
    // column family is mapped to Map<string,string>
    List<TypeInfo> fieldTypeInfos = TypeInfoUtils.getTypeInfosFromTypeString("string,int,array<string>,map<string,string>,string");
    List<String> fieldNames = Arrays.asList(new String[] { "key", "a", "b", "c", "d" });
    Text nullSequence = new Text("\\N");
    String hbaseColsMapping = ":key,cfa:a,cfa:b,cfb:,cfc:d";
    ColumnMappings columnMappings = null;
    try {
        columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColsMapping);
    } catch (SerDeException e) {
        fail(e.toString());
    }
    for (ColumnMapping colMap : columnMappings) {
        if (!colMap.hbaseRowKey && colMap.qualifierName == null) {
            colMap.binaryStorage.add(false);
            colMap.binaryStorage.add(false);
        } else {
            colMap.binaryStorage.add(false);
        }
    }
    ObjectInspector oi = LazyFactory.createLazyStructInspector(fieldNames, fieldTypeInfos, new byte[] { ' ', ':', '=' }, nullSequence, false, false, (byte) 0);
    LazyHBaseRow o = new LazyHBaseRow((LazySimpleStructObjectInspector) oi, columnMappings);
    List<Cell> kvs = new ArrayList<Cell>();
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("a"), Bytes.toBytes("123")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes("a:b:c")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("d"), Bytes.toBytes("e")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("f"), Bytes.toBytes("g")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("hi")));
    Result r = Result.create(kvs);
    o.init(r);
    assertEquals(("{'key':'test-row','a':123,'b':['a','b','c']," + "'c':{'d':'e','f':'g'},'d':'hi'}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
    kvs.clear();
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("a"), Bytes.toBytes("123")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("d"), Bytes.toBytes("e")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("f"), Bytes.toBytes("g")));
    r = Result.create(kvs);
    o.init(r);
    assertEquals(("{'key':'test-row','a':123,'b':null," + "'c':{'d':'e','f':'g'},'d':null}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
    kvs.clear();
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes("a")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfb"), Bytes.toBytes("f"), Bytes.toBytes("g")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("no")));
    r = Result.create(kvs);
    o.init(r);
    assertEquals(("{'key':'test-row','a':null,'b':['a']," + "'c':{'f':'g'},'d':'no'}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
    kvs.clear();
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes(":a::")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("no")));
    r = Result.create(kvs);
    o.init(r);
    assertEquals(("{'key':'test-row','a':null,'b':['','a','','']," + "'c':{},'d':'no'}").replace("'", "\""), SerDeUtils.getJSONString(o, oi));
    kvs.clear();
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("a"), Bytes.toBytes("123")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfa"), Bytes.toBytes("b"), Bytes.toBytes("")));
    kvs.add(new KeyValue(Bytes.toBytes("test-row"), Bytes.toBytes("cfc"), Bytes.toBytes("d"), Bytes.toBytes("")));
    r = Result.create(kvs);
    o.init(r);
    assertEquals("{'key':'test-row','a':123,'b':[],'c':{},'d':''}".replace("'", "\""), SerDeUtils.getJSONString(o, oi));
}
Also used : LazySimpleStructObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) LazyMapObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyMapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) KeyValue(org.apache.hadoop.hbase.KeyValue) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) LazyString(org.apache.hadoop.hive.serde2.lazy.LazyString) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) Result(org.apache.hadoop.hbase.client.Result) Cell(org.apache.hadoop.hbase.Cell) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ColumnMapping(org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping)
