
Example 1 with ColumnMapping

use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.

the class HiveAccumuloTableInputFormat method getPairCollection.

/**
 * Create col fam/qual pairs from pipe separated values, usually from config object. Ignores
 * rowID.
 *
 * @param columnMappings
 *          The list of ColumnMappings for the given query
 * @return a Set of Pairs of colfams and colquals
 */
protected HashSet<Pair<Text, Text>> getPairCollection(List<ColumnMapping> columnMappings) {
    final HashSet<Pair<Text, Text>> pairs = new HashSet<Pair<Text, Text>>();
    for (ColumnMapping columnMapping : columnMappings) {
        if (columnMapping instanceof HiveAccumuloColumnMapping) {
            HiveAccumuloColumnMapping accumuloColumnMapping = (HiveAccumuloColumnMapping) columnMapping;
            Text cf = new Text(accumuloColumnMapping.getColumnFamily());
            Text cq = null;
            // A null cq implies an empty column qualifier
            if (null != accumuloColumnMapping.getColumnQualifier()) {
                cq = new Text(accumuloColumnMapping.getColumnQualifier());
            }
            pairs.add(new Pair<Text, Text>(cf, cq));
        } else if (columnMapping instanceof HiveAccumuloMapColumnMapping) {
            HiveAccumuloMapColumnMapping mapMapping = (HiveAccumuloMapColumnMapping) columnMapping;
            // Can't fetch prefix on colqual, must pull the entire qualifier
            // TODO use an iterator to do the filter, server-side.
            pairs.add(new Pair<Text, Text>(new Text(mapMapping.getColumnFamily()), null));
        }
    }
    log.info("Computed columns to fetch (" + pairs + ") from " + columnMappings);
    return pairs;
}
Also used : HiveAccumuloMapColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping) Text(org.apache.hadoop.io.Text) HiveAccumuloColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping) ColumnMapping(org.apache.hadoop.hive.accumulo.columns.ColumnMapping) Pair(org.apache.accumulo.core.util.Pair) HashSet(java.util.HashSet)
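
A note on how this result is consumed: Accumulo's mapred input format accepts exactly this collection of Pairs for column projection, and a Pair with a null qualifier fetches the entire column family. A minimal sketch of the wiring, assuming a ColumnMapper and a JobConf already configured with connection details (as in getSplits below):

// Sketch: hand the computed column pairs to Accumulo's input format so the
// underlying scan returns only the projected columns. A null qualifier in a
// Pair requests the whole column family.
HashSet<Pair<Text, Text>> pairs = getPairCollection(columnMapper.getColumnMappings());
AccumuloInputFormat.fetchColumns(jobConf, pairs);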

Example 2 with ColumnMapping

use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.

the class HiveAccumuloTableInputFormat method getSplits.

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
    final Instance instance = accumuloParams.getInstance();
    final ColumnMapper columnMapper;
    try {
        columnMapper = getColumnMapper(jobConf);
    } catch (TooManyAccumuloColumnsException e) {
        throw new IOException(e);
    }
    JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
    Path[] tablePaths = FileInputFormat.getInputPaths(context);
    try {
        Connector connector = null;
        // Need to get a Connector so we look up the user's authorizations if not otherwise specified
        if (accumuloParams.useSasl()) {
            log.info("Current user: " + UserGroupInformation.getCurrentUser());
            // In a YARN/Tez job, we no longer have the Kerberos credentials, so use the delegation token instead
            AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(AccumuloInputFormat.class, jobConf);
            if (null != token && !jobConf.getCredentials().getAllTokens().isEmpty()) {
                // Convert the stub from the configuration back into a normal Token
                log.info("Found authentication token in Configuration: " + token);
                log.info("Job credential tokens: " + jobConf.getCredentials().getAllTokens());
                AuthenticationToken unwrappedToken = ConfiguratorBase.unwrapAuthenticationToken(jobConf, token);
                log.info("Converted authentication token from Configuration into: " + unwrappedToken);
                // If the job credentials held no matching delegation token, unwrapAuthenticationToken
                // will return back the original token (which we know is insufficient)
                if (unwrappedToken != token) {
                    log.info("Creating Accumulo Connector with unwrapped delegation token");
                    connector = instance.getConnector(accumuloParams.getAccumuloUserName(), unwrappedToken);
                } else {
                    log.info("Job credentials did not contain delegation token, fetching new token");
                }
            }
            if (connector == null) {
                log.info("Obtaining Accumulo Connector using KerberosToken");
                // Construct a KerberosToken -- relies on ProxyUser configuration. Will be the client making
                // the request on top of the HS2's user. Accumulo will require proper proxy-user auth configs.
                connector = instance.getConnector(accumuloParams.getAccumuloUserName(), new KerberosToken(accumuloParams.getAccumuloUserName()));
            }
        } else {
            // Still in the local JVM, use the username+password or Kerberos credentials
            connector = accumuloParams.getConnector(instance);
        }
        final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
        final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
        final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
        // Predicate pushdown can compute an empty (non-null) collection of Ranges, meaning no rows
        // can match. Configuring Accumulo with no ranges would scan the entire table. We don't want that.
        if (null != ranges && ranges.isEmpty()) {
            return new InputSplit[0];
        }
        // Set the relevant information in the Configuration for the AccumuloInputFormat
        configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
        int numColumns = columnMappings.size();
        List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
        // Sanity check
        if (numColumns < readColIds.size()) {
            throw new IOException("Number of column mappings (" + numColumns + ") is less than the number of Hive table columns to read (" + readColIds.size() + ")");
        }
        // get splits from Accumulo
        InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
        HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
        for (int i = 0; i < splits.length; i++) {
            RangeInputSplit ris = (RangeInputSplit) splits[i];
            ris.setLogLevel(Level.DEBUG);
            hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
        }
        return hiveSplits;
    } catch (AccumuloException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    } catch (AccumuloSecurityException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    } catch (SerDeException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    }
}
Also used : Connector(org.apache.accumulo.core.client.Connector) AuthenticationToken(org.apache.accumulo.core.client.security.tokens.AuthenticationToken) MockInstance(org.apache.accumulo.core.client.mock.MockInstance) Instance(org.apache.accumulo.core.client.Instance) RangeInputSplit(org.apache.accumulo.core.client.mapred.RangeInputSplit) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) JobContext(org.apache.hadoop.mapreduce.JobContext) InputSplit(org.apache.hadoop.mapred.InputSplit) HiveAccumuloMapColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping) ColumnMapping(org.apache.hadoop.hive.accumulo.columns.ColumnMapping) HiveAccumuloColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) Path(org.apache.hadoop.fs.Path) AccumuloException(org.apache.accumulo.core.client.AccumuloException) KerberosToken(org.apache.accumulo.core.client.security.tokens.KerberosToken) IOException(java.io.IOException) TooManyAccumuloColumnsException(org.apache.hadoop.hive.accumulo.serde.TooManyAccumuloColumnsException) Range(org.apache.accumulo.core.data.Range) IteratorSetting(org.apache.accumulo.core.client.IteratorSetting) AccumuloConnectionParameters(org.apache.hadoop.hive.accumulo.AccumuloConnectionParameters) ColumnMapper(org.apache.hadoop.hive.accumulo.columns.ColumnMapper)
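
In the non-SASL branch, AccumuloConnectionParameters.getConnector(instance) hides the actual login. A rough sketch of what it amounts to for username/password authentication, using org.apache.accumulo.core.client.security.tokens.PasswordToken; getAccumuloPassword() is an assumed accessor name:

// Hypothetical expansion of the non-SASL path: a plain username/password
// login against the Instance. getAccumuloPassword() is an assumed accessor.
String user = accumuloParams.getAccumuloUserName();
String pass = accumuloParams.getAccumuloPassword();
Connector connector = instance.getConnector(user, new PasswordToken(pass));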

Example 3 with ColumnMapping

use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.

the class AccumuloRowSerializer method serialize.

public Mutation serialize(Object obj, ObjectInspector objInspector) throws SerDeException, IOException {
    if (objInspector.getCategory() != ObjectInspector.Category.STRUCT) {
        throw new SerDeException(getClass().toString() + " can only serialize struct types, but we got: " + objInspector.getTypeName());
    }
    // Prepare the field ObjectInspectors
    StructObjectInspector soi = (StructObjectInspector) objInspector;
    List<? extends StructField> fields = soi.getAllStructFieldRefs();
    List<Object> columnValues = soi.getStructFieldsDataAsList(obj);
    // Fail if we try to access an offset out of bounds
    if (rowIdOffset >= fields.size()) {
        throw new IllegalStateException("Attempted to access field outside of definition for struct. Have " + fields.size() + " fields and tried to access offset " + rowIdOffset);
    }
    StructField field = fields.get(rowIdOffset);
    Object value = columnValues.get(rowIdOffset);
    // The ObjectInspector for the row ID
    ObjectInspector fieldObjectInspector = field.getFieldObjectInspector();
    // Serialize the row component using the RowIdFactory. In the normal case, this will just
    // delegate back to the "local" serializeRowId method
    byte[] data = rowIdFactory.serializeRowId(value, field, output);
    // Set that as the row id in the mutation
    Mutation mutation = new Mutation(data);
    // Each column in the row
    for (int i = 0; i < fields.size(); i++) {
        if (rowIdOffset == i) {
            continue;
        }
        // Get the relevant information for this column
        field = fields.get(i);
        value = columnValues.get(i);
        // Despite having a fixed schema from Hive, we have sparse columns in Accumulo
        if (null == value) {
            continue;
        }
        // The ObjectInspector for the current column
        fieldObjectInspector = field.getFieldObjectInspector();
        // Make sure we got the right implementation of a ColumnMapping
        ColumnMapping mapping = mappings.get(i);
        if (mapping instanceof HiveAccumuloColumnMapping) {
            serializeColumnMapping((HiveAccumuloColumnMapping) mapping, fieldObjectInspector, value, mutation);
        } else if (mapping instanceof HiveAccumuloMapColumnMapping) {
            serializeColumnMapping((HiveAccumuloMapColumnMapping) mapping, fieldObjectInspector, value, mutation);
        } else {
            throw new IllegalArgumentException("Mapping for " + field.getFieldName() + " was not a HiveColumnMapping, but was " + mapping.getClass());
        }
    }
    return mutation;
}
Also used : ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) HiveAccumuloMapColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) Mutation(org.apache.accumulo.core.data.Mutation) HiveAccumuloColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ColumnMapping(org.apache.hadoop.hive.accumulo.columns.ColumnMapping)
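
The serializeColumnMapping overloads invoked above are not shown here; for the simple HiveAccumuloColumnMapping case the essential step is encoding the value and putting it into the Mutation under the mapped family and qualifier. A simplified sketch, assuming the enclosing class's output buffer, visibility field, and getSerializedValue helper, and byte accessors like getColumnFamilyBytes() on the mapping:

// Simplified sketch (not the verbatim Hive implementation): encode the value,
// then write it into the Mutation under the mapped column family/qualifier.
protected void serializeColumnMapping(HiveAccumuloColumnMapping columnMapping,
        ObjectInspector fieldObjectInspector, Object value, Mutation mutation) throws IOException {
    byte[] serializedValue = getSerializedValue(fieldObjectInspector, value, output, columnMapping);
    mutation.put(columnMapping.getColumnFamilyBytes(), columnMapping.getColumnQualifierBytes(),
            visibility, serializedValue);
}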

Example 4 with ColumnMapping

use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.

the class TestAccumuloRowSerializer method testInvalidRowIdOffset.

@Test(expected = IllegalArgumentException.class)
public void testInvalidRowIdOffset() throws SerDeException {
    ArrayList<ColumnMapping> mappings = new ArrayList<ColumnMapping>();
    // Should fail because of the -1
    new AccumuloRowSerializer(-1, null, mappings, new ColumnVisibility(), null);
}
Also used : ArrayList(java.util.ArrayList) ColumnVisibility(org.apache.accumulo.core.security.ColumnVisibility) ColumnMapping(org.apache.hadoop.hive.accumulo.columns.ColumnMapping) Test(org.junit.Test)
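
For contrast, a non-negative rowIdOffset (the index of the Hive column that backs the Accumulo row ID) should presumably construct cleanly; a minimal sketch with the same constructor shape as the test above, where the nulls again stand in for the SerDe parameters and row-id factory:

// Sketch (assumed valid): the same constructor with row-ID offset 0 instead of -1.
ArrayList<ColumnMapping> mappings = new ArrayList<ColumnMapping>();
AccumuloRowSerializer serializer =
        new AccumuloRowSerializer(0, null, mappings, new ColumnVisibility(), null);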

Example 5 with ColumnMapping

use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.

the class TestAccumuloRowSerializer method testBufferResetBeforeUse.

@Test
public void testBufferResetBeforeUse() throws IOException {
    ByteStream.Output output = new ByteStream.Output();
    PrimitiveObjectInspector fieldObjectInspector = Mockito.mock(StringObjectInspector.class);
    ColumnMapping mapping = Mockito.mock(ColumnMapping.class);
    // Write some garbage to the buffer that should be erased
    output.write("foobar".getBytes());
    // Stub out the serializer
    AccumuloRowSerializer serializer = Mockito.mock(AccumuloRowSerializer.class);
    String object = "hello";
    Mockito.when(serializer.getSerializedValue(Mockito.any(ObjectInspector.class), Mockito.any(), Mockito.any(ByteStream.Output.class), Mockito.any(ColumnMapping.class))).thenCallRealMethod();
    Mockito.when(fieldObjectInspector.getCategory()).thenReturn(ObjectInspector.Category.PRIMITIVE);
    Mockito.when(fieldObjectInspector.getPrimitiveCategory()).thenReturn(PrimitiveCategory.STRING);
    Mockito.when(fieldObjectInspector.getPrimitiveWritableObject(Mockito.any(Object.class))).thenReturn(new Text(object));
    Mockito.when(mapping.getEncoding()).thenReturn(ColumnEncoding.STRING);
    // Invoke the method
    serializer.getSerializedValue(fieldObjectInspector, object, output, mapping);
    // Verify the buffer was reset (real output doesn't happen because it was mocked)
    Assert.assertEquals(0, output.size());
}
Also used : LazySimpleStructObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) LazyStringObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) LazyMapObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyMapObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ByteStream(org.apache.hadoop.hive.serde2.ByteStream) Text(org.apache.hadoop.io.Text) ColumnMapping(org.apache.hadoop.hive.accumulo.columns.ColumnMapping) Test(org.junit.Test)
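
The assertion depends on ByteStream.Output behaving like a reusable byte-array output stream: reset() rewinds the buffer in place rather than allocating a new one. A tiny standalone illustration of that contract:

// Illustration: reset() discards earlier contents, so stale bytes from a
// previous column never leak into the next serialized value.
ByteStream.Output buf = new ByteStream.Output();
buf.write("foobar".getBytes());
buf.reset();
buf.write("hello".getBytes());
Assert.assertEquals(5, buf.size()); // only the bytes written after reset()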

Aggregations

ColumnMapping (org.apache.hadoop.hive.accumulo.columns.ColumnMapping): 9 uses
HiveAccumuloColumnMapping (org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping): 5 uses
HiveAccumuloMapColumnMapping (org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping): 4 uses
HiveAccumuloRowIdColumnMapping (org.apache.hadoop.hive.accumulo.columns.HiveAccumuloRowIdColumnMapping): 4 uses
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 4 uses
Text (org.apache.hadoop.io.Text): 4 uses
ArrayList (java.util.ArrayList): 3 uses
LazySimpleStructObjectInspector (org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector): 3 uses
Test (org.junit.Test): 3 uses
HashSet (java.util.HashSet): 2 uses
Pair (org.apache.accumulo.core.util.Pair): 2 uses
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 2 uses
PrimitiveObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector): 2 uses
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 2 uses
IOException (java.io.IOException): 1 use
AccumuloException (org.apache.accumulo.core.client.AccumuloException): 1 use
AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException): 1 use
Connector (org.apache.accumulo.core.client.Connector): 1 use
Instance (org.apache.accumulo.core.client.Instance): 1 use
IteratorSetting (org.apache.accumulo.core.client.IteratorSetting): 1 use