Use of com.datastax.driver.core.TokenRange in project cassandra by apache.
The class CqlInputFormat, method getSplits.
public List<org.apache.hadoop.mapreduce.InputSplit> getSplits(JobContext context) throws IOException {
    Configuration conf = HadoopCompat.getConfiguration(context);
    validateConfiguration(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    logger.trace("partitioner is {}", partitioner);
    // canonical ranges, split into pieces, fetching the splits in parallel
    ExecutorService executor = new ThreadPoolExecutor(0, 128, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
    List<org.apache.hadoop.mapreduce.InputSplit> splits = new ArrayList<>();
    try (Cluster cluster = CqlConfigHelper.getInputCluster(ConfigHelper.getInputInitialAddress(conf).split(","), conf);
         Session session = cluster.connect()) {
        List<Future<List<org.apache.hadoop.mapreduce.InputSplit>>> splitfutures = new ArrayList<>();
        // if the job restricts the key range, convert it to a token range once up front
        Pair<String, String> jobKeyRange = ConfigHelper.getInputKeyRange(conf);
        Range<Token> jobRange = null;
        if (jobKeyRange != null) {
            jobRange = new Range<>(partitioner.getTokenFactory().fromString(jobKeyRange.left),
                                   partitioner.getTokenFactory().fromString(jobKeyRange.right));
        }
        Metadata metadata = cluster.getMetadata();
        // canonical ranges and nodes holding replicas
        Map<TokenRange, Set<Host>> masterRangeNodes = getRangeMap(keyspace, metadata);
        for (TokenRange range : masterRangeNodes.keySet()) {
            if (jobRange == null) {
                // for each tokenRange, pick a live owner and ask it to compute bite-sized splits
                splitfutures.add(executor.submit(new SplitCallable(range, masterRangeNodes.get(range), conf, session)));
            } else {
                TokenRange jobTokenRange = rangeToTokenRange(metadata, jobRange);
                if (range.intersects(jobTokenRange)) {
                    for (TokenRange intersection : range.intersectWith(jobTokenRange)) {
                        // for each tokenRange, pick a live owner and ask it to compute bite-sized splits
                        splitfutures.add(executor.submit(new SplitCallable(intersection, masterRangeNodes.get(range), conf, session)));
                    }
                }
            }
        }
        // wait until we have all the results back
        for (Future<List<org.apache.hadoop.mapreduce.InputSplit>> futureInputSplits : splitfutures) {
            try {
                splits.addAll(futureInputSplits.get());
            } catch (Exception e) {
                throw new IOException("Could not get input splits", e);
            }
        }
    } finally {
        executor.shutdownNow();
    }
    assert splits.size() > 0;
    // shuffle so concurrent mappers don't all hit the same replicas at once
    Collections.shuffle(splits, new Random(System.nanoTime()));
    return splits;
}
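For context, getSplits reads its connection and table settings back out of the Hadoop Configuration, so a driving job has to populate them first. A minimal sketch of that setup, assuming the setter side of ConfigHelper from org.apache.cassandra.hadoop and a placeholder keyspace/table "ks"/"words":

import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class CqlInputFormatSetup {
    public static Job newJob() throws Exception {
        Job job = Job.getInstance(new Configuration(), "token-range-splits");
        Configuration conf = job.getConfiguration();
        // contact point and partitioner; getSplits() reads these back via ConfigHelper
        ConfigHelper.setInputInitialAddress(conf, "127.0.0.1");
        ConfigHelper.setInputPartitioner(conf, "Murmur3Partitioner");
        // keyspace and table whose token ranges become input splits ("ks"/"words" are placeholders)
        ConfigHelper.setInputColumnFamily(conf, "ks", "words");
        job.setInputFormatClass(CqlInputFormat.class);
        return job;
    }
}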
Use of com.datastax.driver.core.TokenRange in project cassandra by apache.
The class CqlInputFormat, method describeSplits.
private Map<TokenRange, Long> describeSplits(String keyspace, String table, TokenRange tokenRange, int splitSize, int splitSizeMb, Session session) {
    String query = String.format("SELECT mean_partition_size, partitions_count " +
                                 "FROM %s.%s " +
                                 "WHERE keyspace_name = ? AND table_name = ? AND range_start = ? AND range_end = ?",
                                 SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.SIZE_ESTIMATES);
    ResultSet resultSet = session.execute(query, keyspace, table, tokenRange.getStart().toString(), tokenRange.getEnd().toString());
    Row row = resultSet.one();
    long meanPartitionSize = 0;
    long partitionCount = 0;
    int splitCount = 0;
    if (row != null) {
        meanPartitionSize = row.getLong("mean_partition_size");
        partitionCount = row.getLong("partitions_count");
        // size-based split count when splitSizeMb is set, otherwise partition-count-based
        splitCount = splitSizeMb > 0
                     ? (int) (meanPartitionSize * partitionCount / splitSizeMb / 1024 / 1024)
                     : (int) (partitionCount / splitSize);
    }
    // Assume smallest granularity of partition count available from CASSANDRA-7688
    if (splitCount == 0) {
        Map<TokenRange, Long> wrappedTokenRange = new HashMap<>();
        wrappedTokenRange.put(tokenRange, (long) 128);
        return wrappedTokenRange;
    }
    List<TokenRange> splitRanges = tokenRange.splitEvenly(splitCount);
    Map<TokenRange, Long> rangesWithLength = new HashMap<>();
    for (TokenRange range : splitRanges)
        rangesWithLength.put(range, partitionCount / splitCount);
    return rangesWithLength;
}
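In the size-based branch, the successive integer divisions are left-associative, so the expression amounts to dividing the table's estimated byte size (mean partition size times partition count) by splitSizeMb * 1024 * 1024. A self-contained sketch of that arithmetic with made-up estimates:

public class SplitCountExample {
    public static void main(String[] args) {
        // hypothetical system.size_estimates numbers for one token range
        long meanPartitionSize = 4096;    // bytes per partition
        long partitionCount = 1_000_000;  // partitions in the range
        int splitSizeMb = 64;             // requested split size in megabytes

        // same arithmetic as describeSplits: total bytes / split size in bytes
        int splitCount = (int) (meanPartitionSize * partitionCount / splitSizeMb / 1024 / 1024);
        // 4096 * 1_000_000 bytes is roughly 3.8 GiB; divided by 64 MiB that yields 61 splits
        System.out.println(splitCount);
    }
}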
Use of com.datastax.driver.core.TokenRange in project presto by prestodb.
The class CassandraTokenSplitManager, method getSplits.
public List<TokenSplit> getSplits(String keyspace, String table) {
    Set<TokenRange> tokenRanges = getTokenRanges();
    if (tokenRanges.isEmpty()) {
        throw new PrestoException(CASSANDRA_METADATA_ERROR, "The cluster metadata is not available. " +
                "Please make sure that the Cassandra cluster is up and running, " +
                "and that the contact points are specified correctly.");
    }
    // normalize ranges that wrap around the end of the token ring into plain ranges
    if (tokenRanges.stream().anyMatch(TokenRange::isWrappedAround)) {
        tokenRanges = unwrap(tokenRanges);
    }
    Optional<TokenRing> tokenRing = createForPartitioner(getPartitioner());
    long totalPartitionsCount = getTotalPartitionsCount(keyspace, table);
    List<TokenSplit> splits = new ArrayList<>();
    for (TokenRange tokenRange : tokenRanges) {
        if (tokenRange.isEmpty()) {
            continue;
        }
        checkState(!tokenRange.isWrappedAround(), "all token ranges must be unwrapped at this step");
        List<String> endpoints = getEndpoints(keyspace, tokenRange);
        checkState(!endpoints.isEmpty(), "endpoints is empty for token range: %s", tokenRange);
        // without a token ring we cannot estimate sizes, so emit one split per range
        if (!tokenRing.isPresent()) {
            splits.add(createSplit(tokenRange, endpoints));
            continue;
        }
        // estimate the range's share of the ring, then subdivide so each split
        // covers roughly splitSize partitions
        double tokenRangeRingFraction = tokenRing.get().getRingFraction(tokenRange.getStart().toString(), tokenRange.getEnd().toString());
        long partitionsCountEstimate = round(totalPartitionsCount * tokenRangeRingFraction);
        checkState(partitionsCountEstimate >= 0, "unexpected partitions count estimate: %s", partitionsCountEstimate);
        int subSplitCount = max(toIntExact(partitionsCountEstimate / splitSize), 1);
        List<TokenRange> subRanges = tokenRange.splitEvenly(subSplitCount);
        for (TokenRange subRange : subRanges) {
            if (subRange.isEmpty()) {
                continue;
            }
            checkState(!subRange.isWrappedAround(), "all token ranges must be unwrapped at this step");
            splits.add(createSplit(subRange, endpoints));
        }
    }
    shuffle(splits, ThreadLocalRandom.current());
    return unmodifiableList(splits);
}
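The unwrap helper used above does not appear on this page. A minimal sketch of one plausible implementation, assuming the DataStax driver's TokenRange.unwrap(), which splits a range crossing the minimum token into non-wrapping pieces and returns the range unchanged otherwise:

import com.datastax.driver.core.TokenRange;
import java.util.HashSet;
import java.util.Set;

final class TokenRangeUnwrapper {
    // replace every wrapped-around range with the non-wrapping ranges it covers
    static Set<TokenRange> unwrap(Set<TokenRange> tokenRanges) {
        Set<TokenRange> result = new HashSet<>();
        for (TokenRange range : tokenRanges) {
            result.addAll(range.unwrap());
        }
        return result;
    }
}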