Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapper in project hive by apache.
The class HiveAccumuloTableInputFormat, method getSplits:
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
  final Instance instance = accumuloParams.getInstance();
  final ColumnMapper columnMapper;
  try {
    columnMapper = getColumnMapper(jobConf);
  } catch (TooManyAccumuloColumnsException e) {
    throw new IOException(e);
  }
  JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
  Path[] tablePaths = FileInputFormat.getInputPaths(context);
  try {
    Connector connector = null;
    // Need to get a Connector so we can look up the user's authorizations if not otherwise specified
    if (accumuloParams.useSasl()) {
      log.info("Current user: " + UserGroupInformation.getCurrentUser());
      // In a YARN/Tez job, we no longer have the Kerberos credentials; use the delegation token
      AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(AccumuloInputFormat.class, jobConf);
      if (null != token && !jobConf.getCredentials().getAllTokens().isEmpty()) {
        // Convert the stub from the configuration back into a normal Token
        log.info("Found authentication token in Configuration: " + token);
        log.info("Job credential tokens: " + jobConf.getCredentials().getAllTokens());
        AuthenticationToken unwrappedToken = ConfiguratorBase.unwrapAuthenticationToken(jobConf, token);
        log.info("Converted authentication token from Configuration into: " + unwrappedToken);
        // If the job did not actually run with the delegation token, unwrapping
        // will return back the original token (which we know is insufficient)
        if (unwrappedToken != token) {
          log.info("Creating Accumulo Connector with unwrapped delegation token");
          connector = instance.getConnector(accumuloParams.getAccumuloUserName(), unwrappedToken);
        } else {
          log.info("Job credentials did not contain delegation token, fetching new token");
        }
      }
      if (connector == null) {
        log.info("Obtaining Accumulo Connector using KerberosToken");
        // Construct a KerberosToken -- relies on ProxyUser configuration. Will be the client making
        // the request on top of the HS2 user. Accumulo will require proper proxy-user auth configs.
        connector = instance.getConnector(accumuloParams.getAccumuloUserName(), new KerberosToken(accumuloParams.getAccumuloUserName()));
      }
    } else {
      // Still in the local JVM, use the username+password or Kerberos credentials
      connector = accumuloParams.getConnector(instance);
    }
    final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
    final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
    final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
    // An empty collection of Ranges would (unexpectedly) scan all data.
    // We don't want that.
    if (null != ranges && ranges.isEmpty()) {
      return new InputSplit[0];
    }
    // Set the relevant information in the Configuration for the AccumuloInputFormat
    configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
    int numColumns = columnMappings.size();
    List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    // Sanity check
    if (numColumns < readColIds.size())
      throw new IOException("Number of column mappings (" + numColumns + ") is less than the number of Hive table columns to read (" + readColIds.size() + ")");
    // Get splits from Accumulo
    InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
    HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
    for (int i = 0; i < splits.length; i++) {
      RangeInputSplit ris = (RangeInputSplit) splits[i];
      ris.setLogLevel(Level.DEBUG);
      hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
    }
    return hiveSplits;
  } catch (AccumuloException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  } catch (AccumuloSecurityException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  } catch (SerDeException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  }
}
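
For context, a ColumnMapper can also be built directly from a column-mapping string and the Hive schema, via the four-argument constructor the tests below exercise. A minimal hedged sketch (the schema and mapping are hypothetical, the null default storage type mirrors an unset configuration value, and the same imports as the snippets on this page are assumed):

// Hedged sketch: constructing a ColumnMapper by hand. ":rowID" binds the first
// Hive column to the Accumulo row id; "person:name" binds the second column to
// family "person", qualifier "name". The constructor may throw
// TooManyAccumuloColumnsException if mappings outnumber Hive columns.
List<String> hiveColumnNames = Arrays.asList("row", "name");
List<TypeInfo> hiveColumnTypes = Arrays.<TypeInfo>asList(
    TypeInfoFactory.stringTypeInfo, TypeInfoFactory.stringTypeInfo);
ColumnMapper mapper = new ColumnMapper(":rowID,person:name",
    null /* default storage type, hypothetical unset value */,
    hiveColumnNames, hiveColumnTypes);
for (ColumnMapping mapping : mapper.getColumnMappings()) {
  System.out.println(mapping);
}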
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapper in project hive by apache.
The class TestDefaultAccumuloRowIdFactory, method testCorrectComplexInspectors:
@Test
public void testCorrectComplexInspectors() throws SerDeException {
  AccumuloSerDe accumuloSerDe = new AccumuloSerDe();
  Properties properties = new Properties();
  Configuration conf = new Configuration();
  properties.setProperty(AccumuloSerDeParameters.COLUMN_MAPPINGS, ":rowID,cf:cq");
  properties.setProperty(serdeConstants.LIST_COLUMNS, "row,col");
  properties.setProperty(serdeConstants.LIST_COLUMN_TYPES, "struct<col1:int,col2:int>,map<string,string>");
  accumuloSerDe.initialize(conf, properties, null);
  AccumuloRowIdFactory factory = accumuloSerDe.getParams().getRowIdFactory();
  List<TypeInfo> columnTypes = accumuloSerDe.getParams().getHiveColumnTypes();
  ColumnMapper mapper = accumuloSerDe.getParams().getColumnMapper();
  LazySerDeParameters serDeParams = accumuloSerDe.getParams().getSerDeParameters();
  List<ObjectInspector> OIs = accumuloSerDe.getColumnObjectInspectors(columnTypes, serDeParams, mapper.getColumnMappings(), factory);
  // Expect the correct OIs
  Assert.assertEquals(2, OIs.size());
  Assert.assertEquals(LazySimpleStructObjectInspector.class, OIs.get(0).getClass());
  Assert.assertEquals(LazyMapObjectInspector.class, OIs.get(1).getClass());
  LazySimpleStructObjectInspector structOI = (LazySimpleStructObjectInspector) OIs.get(0);
  Assert.assertEquals(2, (int) structOI.getSeparator());
  LazyMapObjectInspector mapOI = (LazyMapObjectInspector) OIs.get(1);
  Assert.assertEquals(2, (int) mapOI.getItemSeparator());
  Assert.assertEquals(3, (int) mapOI.getKeyValueSeparator());
}
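
The expected bytes 2 and 3 are LazySimpleSerDe's default separators: \001 sits between top-level fields, \002 at the first nesting level (struct members, map entries), and \003 one level deeper (map key from value). A small sketch with made-up data, assuming those defaults:

// Hedged sketch: where the separators the test asserts on would appear in raw
// cell values, assuming LazySimpleSerDe's default separator hierarchy.
String structCell = "1" + '\002' + "2"; // struct<col1:int,col2:int> => col1=1, col2=2
String mapCell = "k1" + '\003' + "v1" + '\002' + "k2" + '\003' + "v2"; // {k1=v1, k2=v2}
System.out.println(structCell.length() + " " + mapCell.length());
System.out.println((int) '\002'); // 2: structOI.getSeparator() and mapOI.getItemSeparator()
System.out.println((int) '\003'); // 3: mapOI.getKeyValueSeparator()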
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapper in project hive by apache.
The class TestDefaultAccumuloRowIdFactory, method testCorrectPrimitiveInspectors:
@Test
public void testCorrectPrimitiveInspectors() throws SerDeException {
  AccumuloSerDe accumuloSerDe = new AccumuloSerDe();
  Properties properties = new Properties();
  Configuration conf = new Configuration();
  properties.setProperty(AccumuloSerDeParameters.COLUMN_MAPPINGS, ":rowID,cf:cq");
  properties.setProperty(serdeConstants.LIST_COLUMNS, "row,col");
  properties.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string,int");
  accumuloSerDe.initialize(conf, properties, null);
  AccumuloRowIdFactory factory = accumuloSerDe.getParams().getRowIdFactory();
  List<TypeInfo> columnTypes = accumuloSerDe.getParams().getHiveColumnTypes();
  ColumnMapper mapper = accumuloSerDe.getParams().getColumnMapper();
  LazySerDeParameters serDeParams = accumuloSerDe.getParams().getSerDeParameters();
  List<ObjectInspector> OIs = accumuloSerDe.getColumnObjectInspectors(columnTypes, serDeParams, mapper.getColumnMappings(), factory);
  Assert.assertEquals(2, OIs.size());
  Assert.assertEquals(LazyStringObjectInspector.class, OIs.get(0).getClass());
  Assert.assertEquals(LazyIntObjectInspector.class, OIs.get(1).getClass());
}
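
The setup pattern generalizes to wider schemas: COLUMN_MAPPINGS, LIST_COLUMNS, and LIST_COLUMN_TYPES stay parallel, one entry per Hive column, with ":rowID" marking the row-id column. A hedged sketch with a hypothetical three-column schema, reusing the calls shown above:

// Hedged sketch: the three properties must stay in sync, one mapping per Hive
// column. The schema below is hypothetical.
Properties props = new Properties();
props.setProperty(AccumuloSerDeParameters.COLUMN_MAPPINGS, ":rowID,person:name,person:age");
props.setProperty(serdeConstants.LIST_COLUMNS, "row,name,age");
props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string,string,int");
AccumuloSerDe serde = new AccumuloSerDe();
serde.initialize(new Configuration(), props, null);
ColumnMapper mapper = serde.getParams().getColumnMapper(); // yields three mappings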
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapper in project hive by apache.
The class TestHiveAccumuloTableInputFormat, method testMapColumnPairs:
@Test
public void testMapColumnPairs() throws TooManyAccumuloColumnsException {
  ColumnMapper columnMapper = new ColumnMapper(":rowID,cf:*",
      conf.get(AccumuloSerDeParameters.DEFAULT_STORAGE_TYPE),
      Arrays.asList("row", "col"),
      Arrays.<TypeInfo>asList(TypeInfoFactory.stringTypeInfo,
          TypeInfoFactory.getMapTypeInfo(TypeInfoFactory.stringTypeInfo, TypeInfoFactory.stringTypeInfo)));
  Set<Pair<Text, Text>> pairs = inputformat.getPairCollection(columnMapper.getColumnMappings());
  Assert.assertEquals(1, pairs.size());
  Pair<Text, Text> cfCq = pairs.iterator().next();
  Assert.assertEquals("cf", cfCq.getFirst().toString());
  Assert.assertNull(cfCq.getSecond());
}
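
For contrast, mixing an explicit "cf:cq" pair with a wildcard "cf2:*" should yield one Pair per mapped column, where a null qualifier means "fetch the whole family". A hedged sketch reusing the test's conf and inputformat fields (the schema is hypothetical):

// Hedged sketch: expect two pairs, (cf, cq) and (cf2, null).
ColumnMapper mixed = new ColumnMapper(":rowID,cf:cq,cf2:*",
    conf.get(AccumuloSerDeParameters.DEFAULT_STORAGE_TYPE),
    Arrays.asList("row", "col1", "col2"),
    Arrays.<TypeInfo>asList(TypeInfoFactory.stringTypeInfo,
        TypeInfoFactory.stringTypeInfo,
        TypeInfoFactory.getMapTypeInfo(TypeInfoFactory.stringTypeInfo,
            TypeInfoFactory.stringTypeInfo)));
Set<Pair<Text, Text>> mixedPairs = inputformat.getPairCollection(mixed.getColumnMappings());
Assert.assertEquals(2, mixedPairs.size());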
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapper in project hive by apache.
The class TestHiveAccumuloTableInputFormat, method testConfigureAccumuloInputFormatWithIterators:
@Test
public void testConfigureAccumuloInputFormatWithIterators() throws Exception {
  AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(conf);
  ColumnMapper columnMapper = new ColumnMapper(conf.get(AccumuloSerDeParameters.COLUMN_MAPPINGS),
      conf.get(AccumuloSerDeParameters.DEFAULT_STORAGE_TYPE), columnNames, columnTypes);
  Set<Pair<Text, Text>> cfCqPairs = inputformat.getPairCollection(columnMapper.getColumnMappings());
  List<IteratorSetting> iterators = new ArrayList<IteratorSetting>();
  Set<Range> ranges = Collections.singleton(new Range());
  String instanceName = "realInstance";
  String zookeepers = "host1:2181,host2:2181,host3:2181";
  IteratorSetting cfg = new IteratorSetting(50, PrimitiveComparisonFilter.class);
  cfg.addOption(PrimitiveComparisonFilter.P_COMPARE_CLASS, StringCompare.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.COMPARE_OPT_CLASS, Equal.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.CONST_VAL, "dave");
  cfg.addOption(PrimitiveComparisonFilter.COLUMN, "person:name");
  iterators.add(cfg);
  cfg = new IteratorSetting(50, PrimitiveComparisonFilter.class);
  cfg.addOption(PrimitiveComparisonFilter.P_COMPARE_CLASS, IntCompare.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.COMPARE_OPT_CLASS, Equal.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.CONST_VAL, "50");
  cfg.addOption(PrimitiveComparisonFilter.COLUMN, "person:age");
  iterators.add(cfg);
  ZooKeeperInstance zkInstance = Mockito.mock(ZooKeeperInstance.class);
  HiveAccumuloTableInputFormat mockInputFormat = Mockito.mock(HiveAccumuloTableInputFormat.class);
  HiveAccumuloHelper helper = Mockito.mock(HiveAccumuloHelper.class);
  // Stub out the ZKI mock
  Mockito.when(zkInstance.getInstanceName()).thenReturn(instanceName);
  Mockito.when(zkInstance.getZooKeepers()).thenReturn(zookeepers);
  // Stub out a mocked Helper instance
  Mockito.when(mockInputFormat.getHelper()).thenReturn(helper);
  // Call out to the real configure method
  Mockito.doCallRealMethod().when(mockInputFormat).configure(conf, zkInstance, con, accumuloParams, columnMapper, iterators, ranges);
  // Also compute the correct cf:cq pairs so we can assert the right argument was passed
  Mockito.doCallRealMethod().when(mockInputFormat).getPairCollection(columnMapper.getColumnMappings());
  mockInputFormat.configure(conf, zkInstance, con, accumuloParams, columnMapper, iterators, ranges);
  // Verify that the correct methods are invoked on AccumuloInputFormat
  Mockito.verify(helper).setInputFormatZooKeeperInstance(conf, instanceName, zookeepers, false);
  Mockito.verify(helper).setInputFormatConnectorInfo(conf, USER, new PasswordToken(PASS));
  Mockito.verify(mockInputFormat).setInputTableName(conf, TEST_TABLE);
  Mockito.verify(mockInputFormat).setScanAuthorizations(conf, con.securityOperations().getUserAuthorizations(USER));
  Mockito.verify(mockInputFormat).addIterators(conf, iterators);
  Mockito.verify(mockInputFormat).setRanges(conf, ranges);
  Mockito.verify(mockInputFormat).fetchColumns(conf, cfCqPairs);
}
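
The two iterator settings above encode an equality pushdown, roughly WHERE name = 'dave' AND age = 50 over the person family. Other comparisons follow the same four-option contract; a hedged sketch, assuming a GreaterThanOrEqual operator class exists alongside Equal in the same compare package:

// Hedged sketch: a hypothetical "age >= 21" pushdown filter. A distinct
// priority (51) avoids clashing with the settings built above.
IteratorSetting ageAtLeast21 = new IteratorSetting(51, PrimitiveComparisonFilter.class);
ageAtLeast21.addOption(PrimitiveComparisonFilter.P_COMPARE_CLASS, IntCompare.class.getName());
ageAtLeast21.addOption(PrimitiveComparisonFilter.COMPARE_OPT_CLASS, GreaterThanOrEqual.class.getName());
ageAtLeast21.addOption(PrimitiveComparisonFilter.CONST_VAL, "21");
ageAtLeast21.addOption(PrimitiveComparisonFilter.COLUMN, "person:age");
iterators.add(ageAtLeast21);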