Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.
The class HiveAccumuloTableInputFormat, method getPairCollection.
/**
 * Create col fam/qual pairs from pipe separated values, usually from the config object. Ignores
 * rowID.
 *
 * @param columnMappings
 *          The list of ColumnMappings for the given query
 * @return a Set of Pairs of colfams and colquals
 */
protected HashSet<Pair<Text, Text>> getPairCollection(List<ColumnMapping> columnMappings) {
  final HashSet<Pair<Text, Text>> pairs = new HashSet<Pair<Text, Text>>();
  for (ColumnMapping columnMapping : columnMappings) {
    if (columnMapping instanceof HiveAccumuloColumnMapping) {
      HiveAccumuloColumnMapping accumuloColumnMapping = (HiveAccumuloColumnMapping) columnMapping;
      Text cf = new Text(accumuloColumnMapping.getColumnFamily());
      Text cq = null;
      // A null cq implies an empty column qualifier
      if (null != accumuloColumnMapping.getColumnQualifier()) {
        cq = new Text(accumuloColumnMapping.getColumnQualifier());
      }
      pairs.add(new Pair<Text, Text>(cf, cq));
    } else if (columnMapping instanceof HiveAccumuloMapColumnMapping) {
      HiveAccumuloMapColumnMapping mapMapping = (HiveAccumuloMapColumnMapping) columnMapping;
      // Can't fetch prefix on colqual, must pull the entire qualifier
      // TODO use an iterator to do the filter, server-side.
      pairs.add(new Pair<Text, Text>(new Text(mapMapping.getColumnFamily()), null));
    }
  }
  log.info("Computed columns to fetch (" + pairs + ") from " + columnMappings);
  return pairs;
}
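For orientation, here is a minimal standalone sketch of the kind of set this method produces, built by hand with Accumulo's Pair and Hadoop's Text. The column names are invented for the example; a null qualifier stands for "fetch the whole column family", mirroring the HiveAccumuloMapColumnMapping branch above.

import java.util.HashSet;
import java.util.Set;

import org.apache.accumulo.core.util.Pair;
import org.apache.hadoop.io.Text;

public class PairCollectionSketch {
  public static void main(String[] args) {
    // Invented column names; in the real flow these come from the ColumnMappings.
    Set<Pair<Text, Text>> pairs = new HashSet<Pair<Text, Text>>();

    // A fully mapped column: family "f", qualifier "name".
    pairs.add(new Pair<Text, Text>(new Text("f"), new Text("name")));

    // A Hive map column mapped to an entire family: the null qualifier means
    // "fetch every qualifier in family m", as in the HiveAccumuloMapColumnMapping branch.
    pairs.add(new Pair<Text, Text>(new Text("m"), null));

    System.out.println("Columns to fetch: " + pairs);
  }
}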
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.
The class HiveAccumuloTableInputFormat, method getSplits.
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
  final Instance instance = accumuloParams.getInstance();
  final ColumnMapper columnMapper;
  try {
    columnMapper = getColumnMapper(jobConf);
  } catch (TooManyAccumuloColumnsException e) {
    throw new IOException(e);
  }
  JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
  Path[] tablePaths = FileInputFormat.getInputPaths(context);
  try {
    Connector connector = null;
    // Need to get a Connector so we can look up the user's authorizations if not otherwise specified
    if (accumuloParams.useSasl()) {
      log.info("Current user: " + UserGroupInformation.getCurrentUser());
      // In a YARN/Tez job we no longer have the Kerberos credentials, so use the delegation token
      AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(AccumuloInputFormat.class, jobConf);
      if (null != token && !jobConf.getCredentials().getAllTokens().isEmpty()) {
        // Convert the stub from the configuration back into a normal Token
        log.info("Found authentication token in Configuration: " + token);
        log.info("Job credential tokens: " + jobConf.getCredentials().getAllTokens());
        AuthenticationToken unwrappedToken = ConfiguratorBase.unwrapAuthenticationToken(jobConf, token);
        log.info("Converted authentication token from Configuration into: " + unwrappedToken);
        // If nothing was unwrapped, the call returns the original token (which we know is insufficient)
        if (unwrappedToken != token) {
          log.info("Creating Accumulo Connector with unwrapped delegation token");
          connector = instance.getConnector(accumuloParams.getAccumuloUserName(), unwrappedToken);
        } else {
          log.info("Job credentials did not contain delegation token, fetching new token");
        }
      }
      if (connector == null) {
        log.info("Obtaining Accumulo Connector using KerberosToken");
        // Construct a KerberosToken -- relies on ProxyUser configuration. The client makes the
        // request on top of HS2's user, so Accumulo will require proper proxy-user auth configs.
        connector = instance.getConnector(accumuloParams.getAccumuloUserName(), new KerberosToken(accumuloParams.getAccumuloUserName()));
      }
    } else {
      // Still in the local JVM, use the username+password or Kerberos credentials
      connector = accumuloParams.getConnector(instance);
    }
    final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
    final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
    final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
    // An empty (but non-null) set of ranges means the predicate matched nothing;
    // return no splits rather than scanning the whole table.
    if (null != ranges && ranges.isEmpty()) {
      return new InputSplit[0];
    }
    // Set the relevant information in the Configuration for the AccumuloInputFormat
    configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
    int numColumns = columnMappings.size();
    List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    // Sanity check
    if (numColumns < readColIds.size()) {
      throw new IOException("Number of column mappings (" + numColumns + ")"
          + " numbers less than the hive table columns. (" + readColIds.size() + ")");
    }
    // Get splits from Accumulo
    InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
    HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
    for (int i = 0; i < splits.length; i++) {
      RangeInputSplit ris = (RangeInputSplit) splits[i];
      ris.setLogLevel(Level.DEBUG);
      hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
    }
    return hiveSplits;
  } catch (AccumuloException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  } catch (AccumuloSecurityException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  } catch (SerDeException e) {
    log.error("Could not configure AccumuloInputFormat", e);
    throw new IOException(StringUtils.stringifyException(e));
  }
}
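The sanity check near the end compares the number of column mappings against the column ids Hive projects into the JobConf. Below is a minimal sketch of that interaction, assuming Hive's ColumnProjectionUtils helpers (appendReadColumns/getReadColumnIDs) and an invented mapping count; it is an illustration, not the code path Hive itself runs.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

public class ReadColumnIdsSketch {
  public static void main(String[] args) {
    JobConf jobConf = new JobConf();

    // Hive records the projected column ids itself during query planning;
    // here we append two ids by hand to mimic that.
    ColumnProjectionUtils.appendReadColumns(jobConf, Arrays.asList(0, 2));

    // The same lookup getSplits() performs for its sanity check.
    List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    System.out.println("Projected column ids: " + readColIds);

    // getSplits() throws when more columns are projected than are mapped.
    int numColumnMappings = 3; // invented value for the sketch
    if (numColumnMappings < readColIds.size()) {
      throw new IllegalStateException("More projected columns than column mappings");
    }
  }
}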
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.
The class AccumuloRowSerializer, method serialize.
public Mutation serialize(Object obj, ObjectInspector objInspector) throws SerDeException, IOException {
  if (objInspector.getCategory() != ObjectInspector.Category.STRUCT) {
    throw new SerDeException(getClass().toString() + " can only serialize struct types, but we got: " + objInspector.getTypeName());
  }
  // Prepare the field ObjectInspectors
  StructObjectInspector soi = (StructObjectInspector) objInspector;
  List<? extends StructField> fields = soi.getAllStructFieldRefs();
  List<Object> columnValues = soi.getStructFieldsDataAsList(obj);
  // Fail if we try to access an offset out of bounds
  if (rowIdOffset >= fields.size()) {
    throw new IllegalStateException("Attempted to access field outside of definition for struct. Have " + fields.size() + " fields and tried to access offset " + rowIdOffset);
  }
  StructField field = fields.get(rowIdOffset);
  Object value = columnValues.get(rowIdOffset);
  // The ObjectInspector for the row ID
  ObjectInspector fieldObjectInspector = field.getFieldObjectInspector();
  // Serialize the row component using the RowIdFactory. In the normal case, this will just
  // delegate back to the "local" serializeRowId method
  byte[] data = rowIdFactory.serializeRowId(value, field, output);
  // Set that as the row id in the mutation
  Mutation mutation = new Mutation(data);
  // Each column in the row
  for (int i = 0; i < fields.size(); i++) {
    if (rowIdOffset == i) {
      continue;
    }
    // Get the relevant information for this column
    field = fields.get(i);
    value = columnValues.get(i);
    // Despite having a fixed schema from Hive, we have sparse columns in Accumulo
    if (null == value) {
      continue;
    }
    // The ObjectInspector for the current column
    fieldObjectInspector = field.getFieldObjectInspector();
    // Make sure we got the right implementation of a ColumnMapping
    ColumnMapping mapping = mappings.get(i);
    if (mapping instanceof HiveAccumuloColumnMapping) {
      serializeColumnMapping((HiveAccumuloColumnMapping) mapping, fieldObjectInspector, value, mutation);
    } else if (mapping instanceof HiveAccumuloMapColumnMapping) {
      serializeColumnMapping((HiveAccumuloMapColumnMapping) mapping, fieldObjectInspector, value, mutation);
    } else {
      throw new IllegalArgumentException("Mapping for " + field.getFieldName() + " was not a HiveColumnMapping, but was " + mapping.getClass());
    }
  }
  return mutation;
}
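For context, here is a hedged sketch of the Mutation this method ends up building: the serialized row id keys the Mutation, and each non-null Hive column contributes one column update. The family, qualifier, and value bytes below are placeholders; the real ones come from the ColumnMappings and the field ObjectInspectors.

import java.nio.charset.StandardCharsets;

import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.io.Text;

public class MutationSketch {
  public static void main(String[] args) {
    // The row id bytes would normally come from rowIdFactory.serializeRowId(...).
    byte[] rowId = "row1".getBytes(StandardCharsets.UTF_8);
    Mutation mutation = new Mutation(rowId);

    // One column update per non-null Hive column; all names and values are placeholders.
    mutation.put(new Text("f"), new Text("name"), new Value("alice".getBytes(StandardCharsets.UTF_8)));
    mutation.put(new Text("f"), new Text("age"), new Value("30".getBytes(StandardCharsets.UTF_8)));

    System.out.println("Mutation contains " + mutation.size() + " column updates");
  }
}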
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.
The class TestAccumuloRowSerializer, method testInvalidRowIdOffset.
@Test(expected = IllegalArgumentException.class)
public void testInvalidRowIdOffset() throws SerDeException {
  ArrayList<ColumnMapping> mappings = new ArrayList<ColumnMapping>();
  // Should fail because of the -1
  new AccumuloRowSerializer(-1, null, mappings, new ColumnVisibility(), null);
}
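The test expects the constructor to reject a negative row-id offset up front. Below is a minimal sketch of that style of guard; the actual check and argument names inside AccumuloRowSerializer's constructor may differ.

public class RowIdOffsetGuardSketch {
  private final int rowIdOffset;

  public RowIdOffsetGuardSketch(int rowIdOffset) {
    // Guard the test relies on: a negative offset can never address a struct field.
    if (rowIdOffset < 0) {
      throw new IllegalArgumentException("Expected a non-negative row id offset, got " + rowIdOffset);
    }
    this.rowIdOffset = rowIdOffset;
  }

  public static void main(String[] args) {
    new RowIdOffsetGuardSketch(0);  // accepted
    new RowIdOffsetGuardSketch(-1); // throws IllegalArgumentException, as the test expects
  }
}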
Use of org.apache.hadoop.hive.accumulo.columns.ColumnMapping in project hive by apache.
The class TestAccumuloRowSerializer, method testBufferResetBeforeUse.
@Test
public void testBufferResetBeforeUse() throws IOException {
  ByteStream.Output output = new ByteStream.Output();
  PrimitiveObjectInspector fieldObjectInspector = Mockito.mock(StringObjectInspector.class);
  ColumnMapping mapping = Mockito.mock(ColumnMapping.class);
  // Write some garbage to the buffer that should be erased
  output.write("foobar".getBytes());
  // Stub out the serializer
  AccumuloRowSerializer serializer = Mockito.mock(AccumuloRowSerializer.class);
  String object = "hello";
  Mockito.when(
      serializer.getSerializedValue(Mockito.any(ObjectInspector.class), Mockito.any(),
          Mockito.any(ByteStream.Output.class), Mockito.any(ColumnMapping.class)))
      .thenCallRealMethod();
  Mockito.when(fieldObjectInspector.getCategory()).thenReturn(ObjectInspector.Category.PRIMITIVE);
  Mockito.when(fieldObjectInspector.getPrimitiveCategory()).thenReturn(PrimitiveCategory.STRING);
  Mockito.when(fieldObjectInspector.getPrimitiveWritableObject(Mockito.any(Object.class))).thenReturn(new Text(object));
  Mockito.when(mapping.getEncoding()).thenReturn(ColumnEncoding.STRING);
  // Invoke the method
  serializer.getSerializedValue(fieldObjectInspector, object, output, mapping);
  // Verify the buffer was reset (real output doesn't happen because it was mocked)
  Assert.assertEquals(0, output.size());
}
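The assertion only holds if getSerializedValue clears the shared buffer before writing, and nothing lands in it afterwards because the value-writing calls are mocked. Here is a small sketch of that reset-before-reuse pattern, assuming ByteStream.Output exposes the usual ByteArrayOutputStream behaviour (write(byte[]), size(), reset()) that the test above already relies on.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.serde2.ByteStream;

public class BufferResetSketch {
  public static void main(String[] args) throws IOException {
    ByteStream.Output output = new ByteStream.Output();

    // Leftover bytes from a previous column.
    output.write("foobar".getBytes(StandardCharsets.UTF_8));

    // Reset before reuse, as getSerializedValue() is expected to do.
    output.reset();
    System.out.println("Size after reset: " + output.size()); // 0

    output.write("hello".getBytes(StandardCharsets.UTF_8));
    System.out.println("Size after write: " + output.size()); // 5
  }
}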