Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.
The class AccumuloRecordReader, method initialize.
/**
* Initialize a scanner over the given input split using this task attempt configuration.
*/
public void initialize(InputSplit inSplit, JobConf job) throws IOException {
    baseSplit = (org.apache.accumulo.hadoopImpl.mapreduce.RangeInputSplit) inSplit;
    log.debug("Initializing input split: " + baseSplit);
    client = createClient(job, CLASS);
    ClientContext context = (ClientContext) client;
    Authorizations authorizations = InputConfigurator.getScanAuthorizations(CLASS, job);
    String classLoaderContext = InputConfigurator.getClassLoaderContext(CLASS, job);
    String table = baseSplit.getTableName();
    // in case the table name changed, we can still use the previous name for terms of
    // configuration, but the scanner will use the table id resolved at job setup time
    InputTableConfig tableConfig = InputConfigurator.getInputTableConfig(CLASS, job, baseSplit.getTableName());
    log.debug("Created client with user: " + context.whoami());
    log.debug("Creating scanner for table: " + table);
    log.debug("Authorizations are: " + authorizations);
    if (baseSplit instanceof BatchInputSplit) {
        BatchScanner scanner;
        BatchInputSplit multiRangeSplit = (BatchInputSplit) baseSplit;
        try {
            // Note: BatchScanner will use at most one thread per tablet, currently BatchInputSplit
            // will not span tablets
            int scanThreads = 1;
            scanner = context.createBatchScanner(baseSplit.getTableName(), authorizations, scanThreads);
            setupIterators(job, scanner, baseSplit);
            if (classLoaderContext != null) {
                scanner.setClassLoaderContext(classLoaderContext);
            }
        } catch (TableNotFoundException e) {
            throw new IOException(e);
        }
        scanner.setRanges(multiRangeSplit.getRanges());
        scannerBase = scanner;
    } else if (baseSplit instanceof RangeInputSplit) {
        split = (RangeInputSplit) baseSplit;
        Boolean isOffline = baseSplit.isOffline();
        if (isOffline == null) {
            isOffline = tableConfig.isOfflineScan();
        }
        Boolean isIsolated = baseSplit.isIsolatedScan();
        if (isIsolated == null) {
            isIsolated = tableConfig.shouldUseIsolatedScanners();
        }
        Boolean usesLocalIterators = baseSplit.usesLocalIterators();
        if (usesLocalIterators == null) {
            usesLocalIterators = tableConfig.shouldUseLocalIterators();
        }
        Scanner scanner;
        try {
            if (isOffline) {
                scanner = new OfflineScanner(context, TableId.of(baseSplit.getTableId()), authorizations);
            } else {
                scanner = new ScannerImpl(context, TableId.of(baseSplit.getTableId()), authorizations);
            }
            if (isIsolated) {
                log.info("Creating isolated scanner");
                scanner = new IsolatedScanner(scanner);
            }
            if (usesLocalIterators) {
                log.info("Using local iterators");
                scanner = new ClientSideIteratorScanner(scanner);
            }
            setupIterators(job, scanner, baseSplit);
        } catch (RuntimeException e) {
            throw new IOException(e);
        }
        scanner.setRange(baseSplit.getRange());
        scannerBase = scanner;
    } else {
        throw new IllegalArgumentException("Can not initialize from " + baseSplit.getClass());
    }
    Collection<IteratorSetting.Column> columns = baseSplit.getFetchedColumns();
    if (columns == null) {
        columns = tableConfig.getFetchedColumns();
    }
    // setup a scanner within the bounds of this split
    for (Pair<Text, Text> c : columns) {
        if (c.getSecond() != null) {
            log.debug("Fetching column " + c.getFirst() + ":" + c.getSecond());
            scannerBase.fetchColumn(c.getFirst(), c.getSecond());
        } else {
            log.debug("Fetching column family " + c.getFirst());
            scannerBase.fetchColumnFamily(c.getFirst());
        }
    }
    SamplerConfiguration samplerConfig = baseSplit.getSamplerConfiguration();
    if (samplerConfig == null) {
        samplerConfig = tableConfig.getSamplerConfiguration();
    }
    if (samplerConfig != null) {
        scannerBase.setSamplerConfiguration(samplerConfig);
    }
    Map<String, String> executionHints = baseSplit.getExecutionHints();
    if (executionHints == null || executionHints.isEmpty()) {
        executionHints = tableConfig.getExecutionHints();
    }
    if (executionHints != null) {
        scannerBase.setExecutionHints(executionHints);
    }
    scannerIterator = scannerBase.iterator();
    numKeysRead = 0;
}
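Every fallback above (offline, isolated, local iterators, fetched columns, sampler, execution hints) comes from configuration written into the job at setup time. Below is a minimal driver sketch, not taken from the project, showing how such a job could be configured with the public builder shown in the tests further down; the table name, iterator, and client.properties path are placeholder assumptions, and the org.apache.accumulo.hadoop.mapred builder is assumed because this reader works with JobConf.

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Properties;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.hadoop.mapred.AccumuloInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class ScanJobSetup {
    public static void main(String[] args) throws Exception {
        // connection properties; the file path is a placeholder
        Properties clientProps = new Properties();
        try (var in = Files.newInputStream(Path.of("client.properties"))) {
            clientProps.load(in);
        }
        JobConf job = new JobConf();
        // store one table's scan settings; initialize() falls back to these values
        // whenever the individual split does not carry an override
        AccumuloInputFormat.configure()
            .clientProperties(clientProps)
            .table("myTable")                              // placeholder table name
            .auths(Authorizations.EMPTY)
            .ranges(List.of(new Range("a", "z")))
            .addIterator(new IteratorSetting(50, "vers",
                "org.apache.accumulo.core.iterators.user.VersioningIterator"))
            .store(job);
    }
}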
Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.
The class MultiTableInputFormatTest, method testStoreTables.
/**
* Verify {@link InputTableConfig} objects get correctly serialized in the JobContext.
*/
@Test
public void testStoreTables() throws Exception {
    String table1Name = testName.getMethodName() + "1";
    String table2Name = testName.getMethodName() + "2";
    JobConf job = new JobConf();
    Properties clientProps = org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormatTest.setupClientProperties();
    List<Range> ranges = singletonList(new Range("a", "b"));
    Set<IteratorSetting.Column> cols = singleton(new IteratorSetting.Column(new Text("CF1"), new Text("CQ1")));
    IteratorSetting iter1 = new IteratorSetting(50, "iter1", "iterclass1");
    IteratorSetting iter2 = new IteratorSetting(60, "iter2", "iterclass2");
    List<IteratorSetting> allIters = new ArrayList<>();
    allIters.add(iter1);
    allIters.add(iter2);
    // if auths are not set, the client will try to get them from the server; we don't want that here
    Authorizations auths = Authorizations.EMPTY;
    // @formatter:off
    AccumuloInputFormat.configure().clientProperties(clientProps)
        .table(table1Name).auths(auths).ranges(ranges).fetchColumns(cols)
        .addIterator(iter1).addIterator(iter2).localIterators(true).offlineScan(true) // end table 1
        .table(table2Name).auths(auths).ranges(ranges).fetchColumns(cols).addIterator(iter2) // end
        .store(job);
    // @formatter:on
    InputTableConfig table1 = new InputTableConfig();
    table1.setScanAuths(auths).setRanges(ranges).fetchColumns(cols).setUseLocalIterators(true).setOfflineScan(true);
    allIters.forEach(table1::addIterator);
    InputTableConfig table2 = new InputTableConfig();
    table2.setScanAuths(auths).setRanges(ranges).fetchColumns(cols).addIterator(iter2);
    assertEquals(table1, InputConfigurator.getInputTableConfig(CLASS, job, table1Name));
    assertEquals(table2, InputConfigurator.getInputTableConfig(CLASS, job, table2Name));
}
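As a hedged follow-up sketch (not part of the test), the same stored state can also be read back in bulk as a map keyed by table name, which is the call getSplits() makes further below; CLASS is the same key-prefix class used in the assertions above.

// Follow-up sketch: fetch both stored configs in one call instead of per table.
Map<String, InputTableConfig> configs = InputConfigurator.getInputTableConfigs(CLASS, job);
assertEquals(2, configs.size());
assertEquals(table1, configs.get(table1Name));
assertEquals(table2, configs.get(table2Name));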
Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.
The class InputConfigurator, method getInputTableConfigs.
/**
 * Returns all {@link InputTableConfig} objects associated with this job.
 *
 * @param implementingClass
 *          the class whose name will be used as a prefix for the property configuration key
 * @param conf
 *          the Hadoop configuration object to read from
 * @param tableName
 *          the table name for which to retrieve the configuration
 * @return all of the table query configs for the job
 * @since 1.6.0
 */
private static Map<String, InputTableConfig> getInputTableConfigs(Class<?> implementingClass, Configuration conf, String tableName) {
    Map<String, InputTableConfig> configs = new HashMap<>();
    Map.Entry<String, InputTableConfig> defaultConfig = getDefaultInputTableConfig(implementingClass, conf, tableName);
    if (defaultConfig != null)
        configs.put(defaultConfig.getKey(), defaultConfig.getValue());
    String configString = conf.get(enumToConfKey(implementingClass, ScanOpts.TABLE_CONFIGS));
    MapWritable mapWritable = new MapWritable();
    if (configString != null) {
        try {
            byte[] bytes = Base64.getDecoder().decode(configString);
            ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
            mapWritable.readFields(new DataInputStream(bais));
            bais.close();
        } catch (IOException e) {
            throw new IllegalStateException("The table query configurations could not be deserialized from the given configuration");
        }
    }
    for (Map.Entry<Writable, Writable> entry : mapWritable.entrySet())
        configs.put(entry.getKey().toString(), (InputTableConfig) entry.getValue());
    return configs;
}
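The decode path above implies its counterpart at store time: the per-table configs are packed into a Hadoop MapWritable, serialized, and Base64-encoded into a single value under the TABLE_CONFIGS key. The following is a hedged sketch of that write side, reconstructed from the read side rather than copied from InputConfigurator, so the details may differ from the project's actual setter.

// Sketch (reconstructed, not the project's code): serialize the configs map into the
// same Base64-encoded MapWritable format that the method above decodes.
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(baos);
MapWritable mapWritable = new MapWritable();
for (Map.Entry<String, InputTableConfig> entry : configs.entrySet())
    mapWritable.put(new Text(entry.getKey()), entry.getValue()); // InputTableConfig is a Writable
mapWritable.write(dos);
dos.close();
String configString = Base64.getEncoder().encodeToString(baos.toByteArray());
conf.set(enumToConfKey(implementingClass, ScanOpts.TABLE_CONFIGS), configString);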
Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.
The class AccumuloRecordReader, method getSplits.
/**
* Gets the splits of the tables that have been set on the job by reading the metadata table for
* the specified ranges.
*/
public static InputSplit[] getSplits(JobConf job, Class<?> callingClass) throws IOException {
    validateOptions(job, callingClass);
    LinkedList<InputSplit> splits = new LinkedList<>();
    Map<String, InputTableConfig> tableConfigs = InputConfigurator.getInputTableConfigs(callingClass, job);
    try (AccumuloClient client = createClient(job, callingClass);
         var context = ((ClientContext) client)) {
        for (Map.Entry<String, InputTableConfig> tableConfigEntry : tableConfigs.entrySet()) {
            String tableName = tableConfigEntry.getKey();
            InputTableConfig tableConfig = tableConfigEntry.getValue();
            TableId tableId;
            // resolve table name to id once, and use id from this point forward
            try {
                tableId = context.getTableId(tableName);
            } catch (TableNotFoundException e) {
                throw new IOException(e);
            }
            boolean batchScan = InputConfigurator.isBatchScan(callingClass, job);
            boolean supportBatchScan = !(tableConfig.isOfflineScan() || tableConfig.shouldUseIsolatedScanners() || tableConfig.shouldUseLocalIterators());
            if (batchScan && !supportBatchScan)
                throw new IllegalArgumentException("BatchScanner optimization not available for offline scan, isolated, or local iterators");
            boolean autoAdjust = tableConfig.shouldAutoAdjustRanges();
            if (batchScan && !autoAdjust)
                throw new IllegalArgumentException("AutoAdjustRanges must be enabled when using BatchScanner optimization");
            List<Range> ranges = autoAdjust ? Range.mergeOverlapping(tableConfig.getRanges()) : tableConfig.getRanges();
            if (ranges.isEmpty()) {
                ranges = new ArrayList<>(1);
                ranges.add(new Range());
            }
            // get the metadata information for these ranges
            Map<String, Map<KeyExtent, List<Range>>> binnedRanges = new HashMap<>();
            TabletLocator tl;
            try {
                if (tableConfig.isOfflineScan()) {
                    binnedRanges = binOfflineTable(job, tableId, ranges, callingClass);
                    while (binnedRanges == null) {
                        // Some tablets were still online, try again
                        // sleep randomly between 100 and 200 ms
                        sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
                        binnedRanges = binOfflineTable(job, tableId, ranges, callingClass);
                    }
                } else {
                    tl = InputConfigurator.getTabletLocator(callingClass, job, tableId);
                    // it's possible that the cache could contain complete, but old, information
                    // about a table's tablets... so clear it
                    tl.invalidateCache();
                    while (!tl.binRanges(context, ranges, binnedRanges).isEmpty()) {
                        context.requireNotDeleted(tableId);
                        context.requireNotOffline(tableId, tableName);
                        binnedRanges.clear();
                        log.warn("Unable to locate bins for specified ranges. Retrying.");
                        // sleep randomly between 100 and 200 ms
                        sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
                        tl.invalidateCache();
                    }
                }
            } catch (TableOfflineException | TableNotFoundException | AccumuloException | AccumuloSecurityException e) {
                throw new IOException(e);
            }
            HashMap<Range, ArrayList<String>> splitsToAdd = null;
            if (!autoAdjust)
                splitsToAdd = new HashMap<>();
            HashMap<String, String> hostNameCache = new HashMap<>();
            for (Map.Entry<String, Map<KeyExtent, List<Range>>> tserverBin : binnedRanges.entrySet()) {
                String ip = tserverBin.getKey().split(":", 2)[0];
                String location = hostNameCache.get(ip);
                if (location == null) {
                    InetAddress inetAddress = InetAddress.getByName(ip);
                    location = inetAddress.getCanonicalHostName();
                    hostNameCache.put(ip, location);
                }
                for (Map.Entry<KeyExtent, List<Range>> extentRanges : tserverBin.getValue().entrySet()) {
                    Range ke = extentRanges.getKey().toDataRange();
                    if (batchScan) {
                        // group ranges by tablet to be read by a BatchScanner
                        ArrayList<Range> clippedRanges = new ArrayList<>();
                        for (Range r : extentRanges.getValue())
                            clippedRanges.add(ke.clip(r));
                        BatchInputSplit split = new BatchInputSplit(tableName, tableId, clippedRanges, new String[] {location});
                        SplitUtils.updateSplit(split, tableConfig);
                        splits.add(split);
                    } else {
                        // not grouping by tablet
                        for (Range r : extentRanges.getValue()) {
                            if (autoAdjust) {
                                // divide ranges into smaller ranges, based on the tablets
                                RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonical(), ke.clip(r), new String[] {location});
                                SplitUtils.updateSplit(split, tableConfig);
                                split.setOffline(tableConfig.isOfflineScan());
                                split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
                                split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
                                splits.add(split);
                            } else {
                                // don't divide ranges
                                ArrayList<String> locations = splitsToAdd.get(r);
                                if (locations == null)
                                    locations = new ArrayList<>(1);
                                locations.add(location);
                                splitsToAdd.put(r, locations);
                            }
                        }
                    }
                }
            }
            if (!autoAdjust)
                for (Map.Entry<Range, ArrayList<String>> entry : splitsToAdd.entrySet()) {
                    RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonical(), entry.getKey(), entry.getValue().toArray(new String[0]));
                    SplitUtils.updateSplit(split, tableConfig);
                    split.setOffline(tableConfig.isOfflineScan());
                    split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
                    split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
                    splits.add(split);
                }
        }
    }
    return splits.toArray(new InputSplit[splits.size()]);
}
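Note that when offlineScan is enabled, the loop above keeps retrying binOfflineTable until no tablets remain online, so the table must already be offline when splits are computed. Below is a hedged sketch of bracketing a job that way; the table name is a placeholder, the job-submission step is elided, and exception handling is omitted.

// Sketch (not from the project): take the table offline before submitting an
// offlineScan(true) job, then bring it back online afterwards.
try (AccumuloClient client = Accumulo.newClient().from(clientProps).build()) {
    client.tableOperations().offline("myTable", true);   // wait until all tablets are offline
    try {
        // ... configure AccumuloInputFormat with offlineScan(true) and run the MapReduce job ...
    } finally {
        client.tableOperations().online("myTable", true); // restore availability
    }
}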
Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.
The class MultiTableInputFormatTest, method testManyTables.
@Test
public void testManyTables() throws Exception {
    Job job = Job.getInstance();
    Properties clientProps = org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormatTest.setupClientProperties();
    // if auths are not set, the client will try to get them from the server; we don't want that here
    Authorizations auths = Authorizations.EMPTY;
    // set the client properties once, then loop over tables
    InputFormatBuilder.TableParams<Job> opts = AccumuloInputFormat.configure().clientProperties(clientProps);
    for (int i = 0; i < 10_000; i++) {
        List<Range> ranges = singletonList(new Range("a" + i, "b" + i));
        Set<Column> cols = singleton(new Column(new Text("CF" + i), new Text("CQ" + i)));
        IteratorSetting iter = new IteratorSetting(50, "iter" + i, "iterclass" + i);
        opts.table("table" + i).auths(auths).ranges(ranges).fetchColumns(cols).addIterator(iter);
    }
    opts.store(job);
    // verify
    Map<String, InputTableConfig> configs = InputConfigurator.getInputTableConfigs(CLASS, job.getConfiguration());
    assertEquals(10_000, configs.size());
    // create objects to test against
    for (int i = 0; i < 10_000; i++) {
        InputTableConfig t = new InputTableConfig();
        List<Range> ranges = singletonList(new Range("a" + i, "b" + i));
        Set<Column> cols = singleton(new Column(new Text("CF" + i), new Text("CQ" + i)));
        IteratorSetting iter = new IteratorSetting(50, "iter" + i, "iterclass" + i);
        t.setScanAuths(auths).setRanges(ranges).fetchColumns(cols).addIterator(iter);
        assertEquals(t, configs.get("table" + i));
    }
}