use of org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit in project accumulo by apache.
the class AbstractInputFormat method getSplits.
/**
 * Gets the splits of the tables that have been set on the job by reading the metadata table for the specified ranges.
 *
 * <p>
 * For every configured input table the requested ranges are binned by tablet server and tablet, and one of three kinds of split is produced:
 * <ul>
 * <li>BatchScanner optimization enabled: one {@code BatchInputSplit} per tablet, holding all ranges clipped to that tablet;</li>
 * <li>auto-adjust enabled (no batch scan): one {@code RangeInputSplit} per (tablet, range) pair, with the range clipped to the tablet;</li>
 * <li>auto-adjust disabled: one {@code RangeInputSplit} per requested range, listing every location that range was binned to.</li>
 * </ul>
 *
 * @param context
 *          the job context the input tables, ranges and connection details are read from
 * @return the splits from the tables based on the ranges.
 * @throws java.io.IOException
 *           if a table set on the job doesn't exist or an error occurs initializing the tablet locator
 * @throws IllegalArgumentException
 *           if BatchScanner optimization is enabled together with offline scan, isolated scanners or local iterators, or while auto-adjust of ranges is
 *           disabled
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  Level logLevel = getLogLevel(context);
  log.setLevel(logLevel);
  validateOptions(context);

  Random random = new Random();
  // The result is only ever appended to and then handed back, so an array-backed list is sufficient.
  List<InputSplit> splits = new ArrayList<>();
  // Canonical host name per tablet server address. Kept outside the table loop so that a multi-table
  // job does not repeat the getCanonicalHostName() lookup for a server it has already resolved.
  Map<String, String> hostNameCache = new HashMap<>();

  Map<String, InputTableConfig> tableConfigs = getInputTableConfigs(context);
  for (Map.Entry<String, InputTableConfig> tableConfigEntry : tableConfigs.entrySet()) {
    String tableName = tableConfigEntry.getKey();
    InputTableConfig tableConfig = tableConfigEntry.getValue();

    Instance instance = getInstance(context);
    Table.ID tableId;
    // resolve table name to id once, and use id from this point forward
    if (DeprecationUtil.isMockInstance(instance)) {
      tableId = Table.ID.of("");
    } else {
      try {
        tableId = Tables.getTableId(instance, tableName);
      } catch (TableNotFoundException e) {
        throw new IOException(e);
      }
    }

    Authorizations auths = getScanAuthorizations(context);
    String principal = getPrincipal(context);
    AuthenticationToken token = getAuthenticationToken(context);

    boolean batchScan = InputConfigurator.isBatchScan(CLASS, context.getConfiguration());
    boolean supportBatchScan = !(tableConfig.isOfflineScan() || tableConfig.shouldUseIsolatedScanners() || tableConfig.shouldUseLocalIterators());
    if (batchScan && !supportBatchScan) {
      throw new IllegalArgumentException("BatchScanner optimization not available for offline scan, isolated, or local iterators");
    }

    boolean autoAdjust = tableConfig.shouldAutoAdjustRanges();
    if (batchScan && !autoAdjust) {
      throw new IllegalArgumentException("AutoAdjustRanges must be enabled when using BatchScanner optimization");
    }

    List<Range> ranges = autoAdjust ? Range.mergeOverlapping(tableConfig.getRanges()) : tableConfig.getRanges();
    if (ranges.isEmpty()) {
      // no ranges configured: fall back to a single default-constructed Range
      ranges = new ArrayList<>(1);
      ranges.add(new Range());
    }

    // get the metadata information for these ranges
    Map<String, Map<KeyExtent, List<Range>>> binnedRanges = new HashMap<>();
    try {
      if (tableConfig.isOfflineScan()) {
        binnedRanges = binOfflineTable(context, tableId, ranges);
        while (binnedRanges == null) {
          // Some tablets were still online, try again
          // sleep randomly between 100 and 200 ms
          sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
          binnedRanges = binOfflineTable(context, tableId, ranges);
        }
      } else {
        TabletLocator tl = InputConfigurator.getTabletLocator(CLASS, context.getConfiguration(), tableId);
        // its possible that the cache could contain complete, but old information about a tables tablets... so clear it
        tl.invalidateCache();

        // Reuse the instance, principal and token resolved above rather than reading them from the
        // job context a second time; the same values are also stamped onto every split below.
        ClientContext clientContext = new ClientContext(instance, new Credentials(principal, token), getClientConfiguration(context));
        while (!tl.binRanges(clientContext, ranges, binnedRanges).isEmpty()) {
          if (!DeprecationUtil.isMockInstance(instance)) {
            String tableIdStr = tableId.canonicalID();
            if (!Tables.exists(instance, tableId)) {
              throw new TableDeletedException(tableIdStr);
            }
            if (Tables.getTableState(instance, tableId) == TableState.OFFLINE) {
              throw new TableOfflineException(instance, tableIdStr);
            }
          }
          binnedRanges.clear();
          log.warn("Unable to locate bins for specified ranges. Retrying.");
          // sleep randomly between 100 and 200 ms
          sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
          tl.invalidateCache();
        }
      }
    } catch (Exception e) {
      // every failure while binning (including the table deleted/offline checks above) surfaces as an IOException
      throw new IOException(e);
    }

    // all of this code will add either range per each locations or split ranges and add range-location split
    // Map from Range to list of locations; only populated when we don't split ranges along tablet boundaries
    Map<Range, List<String>> splitsToAdd = new HashMap<>();

    for (Map.Entry<String, Map<KeyExtent, List<Range>>> tserverBin : binnedRanges.entrySet()) {
      // the bin key is "address:port"; only the address part is used for the host lookup
      String ip = tserverBin.getKey().split(":", 2)[0];
      String location = hostNameCache.get(ip);
      if (location == null) {
        InetAddress inetAddress = InetAddress.getByName(ip);
        location = inetAddress.getCanonicalHostName();
        hostNameCache.put(ip, location);
      }

      for (Map.Entry<KeyExtent, List<Range>> extentRanges : tserverBin.getValue().entrySet()) {
        Range ke = extentRanges.getKey().toDataRange();
        if (batchScan) {
          // group ranges by tablet to be read by a BatchScanner
          ArrayList<Range> clippedRanges = new ArrayList<>();
          for (Range r : extentRanges.getValue()) {
            clippedRanges.add(ke.clip(r));
          }
          BatchInputSplit split = new BatchInputSplit(tableName, tableId, clippedRanges, new String[] {location});
          SplitUtils.updateSplit(split, instance, tableConfig, principal, token, auths, logLevel);
          splits.add(split);
        } else {
          // not grouping by tablet
          for (Range r : extentRanges.getValue()) {
            if (autoAdjust) {
              // divide ranges into smaller ranges, based on the tablets
              RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonicalID(), ke.clip(r), new String[] {location});
              SplitUtils.updateSplit(split, instance, tableConfig, principal, token, auths, logLevel);
              split.setOffline(tableConfig.isOfflineScan());
              split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
              split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
              splits.add(split);
            } else {
              // don't divide ranges; just remember every location this range was binned to
              List<String> locations = splitsToAdd.get(r);
              if (locations == null) {
                locations = new ArrayList<>(1);
                splitsToAdd.put(r, locations);
              }
              locations.add(location);
            }
          }
        }
      }
    }

    if (!autoAdjust) {
      // emit one split per original (undivided) range, carrying all of its locations
      for (Map.Entry<Range, List<String>> entry : splitsToAdd.entrySet()) {
        RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonicalID(), entry.getKey(), entry.getValue().toArray(new String[0]));
        SplitUtils.updateSplit(split, instance, tableConfig, principal, token, auths, logLevel);
        split.setOffline(tableConfig.isOfflineScan());
        split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
        split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
        splits.add(split);
      }
    }
  }
  return splits;
}
use of org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit in project accumulo by apache.
the class AccumuloInputFormatIT method testGetSplits.
/**
 * Tests several different paths through the getSplits() method by setting different properties and verifying the results.
 *
 * <p>
 * Covered in order: no ranges set, explicit ranges, offline scan (first against a still-online table, then against the offlined table), auto-adjust on and off
 * with overlapping ranges, and the BatchScanner optimization being rejected for offline, isolated and local-iterator scans before finally producing
 * {@code BatchInputSplit}s.
 */
@Test
public void testGetSplits() throws Exception {
  Connector conn = getConnector();
  String table = getUniqueNames(1)[0];
  conn.tableOperations().create(table);
  insertData(table, currentTimeMillis());

  ClientConfiguration clientConf = cluster.getClientConfig();
  AccumuloConfiguration clusterClientConf = new ConfigurationCopy(DefaultConfiguration.getInstance());

  // Pass SSL and CredentialProvider options into the ClientConfiguration given to AccumuloInputFormat
  boolean sslEnabled = Boolean.parseBoolean(clusterClientConf.get(Property.INSTANCE_RPC_SSL_ENABLED));
  if (sslEnabled) {
    ClientProperty[] sslProperties = new ClientProperty[] {ClientProperty.INSTANCE_RPC_SSL_ENABLED, ClientProperty.INSTANCE_RPC_SSL_CLIENT_AUTH,
        ClientProperty.RPC_SSL_KEYSTORE_PATH, ClientProperty.RPC_SSL_KEYSTORE_TYPE, ClientProperty.RPC_SSL_KEYSTORE_PASSWORD,
        ClientProperty.RPC_SSL_TRUSTSTORE_PATH, ClientProperty.RPC_SSL_TRUSTSTORE_TYPE, ClientProperty.RPC_SSL_TRUSTSTORE_PASSWORD,
        ClientProperty.RPC_USE_JSSE, ClientProperty.GENERAL_SECURITY_CREDENTIAL_PROVIDER_PATHS};

    for (ClientProperty prop : sslProperties) {
      // The default property is returned if it's not in the ClientConfiguration so we don't have to check if the value is actually defined
      clientConf.setProperty(prop, clusterClientConf.get(prop.getKey()));
    }
  }

  Job job = Job.getInstance();
  AccumuloInputFormat.setInputTableName(job, table);
  AccumuloInputFormat.setZooKeeperInstance(job, clientConf);
  AccumuloInputFormat.setConnectorInfo(job, getAdminPrincipal(), getAdminToken());

  // split table
  TreeSet<Text> splitsToAdd = new TreeSet<>();
  for (int i = 0; i < 10000; i += 1000) {
    splitsToAdd.add(new Text(String.format("%09d", i)));
  }
  conn.tableOperations().addSplits(table, splitsToAdd);
  // wait for splits to be propagated
  sleepUninterruptibly(500, TimeUnit.MILLISECONDS);

  // get splits without setting any range
  Collection<Text> actualSplits = conn.tableOperations().listSplits(table);
  List<InputSplit> splits = inputFormat.getSplits(job);
  // No ranges set on the job so it'll start with -inf
  assertEquals(actualSplits.size() + 1, splits.size());

  // set ranges and get splits
  List<Range> ranges = new ArrayList<>();
  for (Text text : actualSplits) {
    ranges.add(new Range(text));
  }
  AccumuloInputFormat.setRanges(job, ranges);
  splits = inputFormat.getSplits(job);
  assertEquals(actualSplits.size(), splits.size());

  // offline mode
  AccumuloInputFormat.setOfflineTableScan(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IOException expected) {
    // expected: the job asks for an offline scan but the table has not been taken offline yet
  }

  conn.tableOperations().offline(table, true);
  splits = inputFormat.getSplits(job);
  assertEquals(actualSplits.size(), splits.size());

  // auto adjust ranges
  ranges = new ArrayList<>();
  for (int i = 0; i < 5; i++) {
    // overlapping ranges
    ranges.add(new Range(String.format("%09d", i), String.format("%09d", i + 2)));
  }
  AccumuloInputFormat.setRanges(job, ranges);
  splits = inputFormat.getSplits(job);
  assertEquals(2, splits.size());

  AccumuloInputFormat.setAutoAdjustRanges(job, false);
  splits = inputFormat.getSplits(job);
  assertEquals(ranges.size(), splits.size());

  // BatchScan not available for offline scans
  AccumuloInputFormat.setBatchScan(job, true);
  // Reset auto-adjust ranges too
  AccumuloInputFormat.setAutoAdjustRanges(job, true);

  AccumuloInputFormat.setOfflineTableScan(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IllegalArgumentException expected) {
    // expected: batch scan cannot be combined with an offline scan
  }

  conn.tableOperations().online(table, true);
  AccumuloInputFormat.setOfflineTableScan(job, false);

  // test for resumption of success
  splits = inputFormat.getSplits(job);
  assertEquals(2, splits.size());

  // BatchScan not available with isolated iterators
  AccumuloInputFormat.setScanIsolation(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IllegalArgumentException expected) {
    // expected: batch scan cannot be combined with isolated scanners
  }
  AccumuloInputFormat.setScanIsolation(job, false);

  // test for resumption of success
  splits = inputFormat.getSplits(job);
  assertEquals(2, splits.size());

  // BatchScan not available with local iterators
  AccumuloInputFormat.setLocalIterators(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IllegalArgumentException expected) {
    // expected: batch scan cannot be combined with local iterators
  }
  AccumuloInputFormat.setLocalIterators(job, false);

  // Check we are getting back correct type of split
  conn.tableOperations().online(table);
  splits = inputFormat.getSplits(job);
  for (InputSplit split : splits) {
    // An explicit failure is used instead of the Java `assert` keyword, which is skipped
    // entirely when the JVM runs without -ea and would let a wrong split type pass unnoticed.
    if (!(split instanceof BatchInputSplit)) {
      fail("Expected a BatchInputSplit but got " + (split == null ? "null" : split.getClass().getName()));
    }
  }

  // We should divide along the tablet lines similar to when using `setAutoAdjustRanges(job, true)`
  assertEquals(2, splits.size());
}
Aggregations