Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.
The class BulkImport, method load().
@Override
public void load()
    throws TableNotFoundException, IOException, AccumuloException, AccumuloSecurityException {
  TableId tableId = context.getTableId(tableName);

  FileSystem fs = VolumeConfiguration.fileSystemForPath(dir, context.getHadoopConf());
  Path srcPath = checkPath(fs, dir);

  SortedMap<KeyExtent,Bulk.Files> mappings;
  TableOperationsImpl tableOps = new TableOperationsImpl(context);

  // look up the per-table cap on how many tablets a single file may map to
  int maxTablets = 0;
  for (var prop : tableOps.getProperties(tableName)) {
    if (prop.getKey().equals(Property.TABLE_BULK_MAX_TABLETS.getKey())) {
      maxTablets = Integer.parseInt(prop.getValue());
      break;
    }
  }

  Retry retry = Retry.builder().infiniteRetries().retryAfter(100, MILLISECONDS)
      .incrementBy(100, MILLISECONDS).maxWait(2, MINUTES).backOffFactor(1.5)
      .logInterval(3, MINUTES).createRetry();

  // retry if a merge occurs
  boolean shouldRetry = true;
  while (shouldRetry) {
    // compute the file-to-tablet mapping, either by inspecting the files
    // or from a caller-supplied load plan
    if (plan == null) {
      mappings = computeMappingFromFiles(fs, tableId, srcPath, maxTablets);
    } else {
      mappings = computeMappingFromPlan(fs, tableId, srcPath, maxTablets);
    }

    if (mappings.isEmpty()) {
      if (ignoreEmptyDir) {
        log.info("Attempted to import files from empty directory - {}. Zero files imported",
            srcPath);
        return;
      } else {
        throw new IllegalArgumentException("Attempted to import zero files from " + srcPath);
      }
    }

    BulkSerialize.writeLoadMapping(mappings, srcPath.toString(), fs::create);

    // arguments for the bulk FATE operation: table id, source directory, and the setTime flag
    List<ByteBuffer> args = Arrays.asList(ByteBuffer.wrap(tableId.canonical().getBytes(UTF_8)),
        ByteBuffer.wrap(srcPath.toString().getBytes(UTF_8)),
        ByteBuffer.wrap((setTime + "").getBytes(UTF_8)));

    try {
      tableOps.doBulkFateOperation(args, tableName);
      shouldRetry = false;
    } catch (AccumuloBulkMergeException ae) {
      if (plan != null) {
        checkPlanForSplits(ae);
      }
      try {
        retry.waitForNextAttempt();
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
      log.info("{}. Retrying bulk import to {}", ae.getMessage(), tableName);
    }
  }
}
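For orientation, this internal load() runs at the end of the public bulk-import builder chain. A minimal sketch of that entry point, assuming a 2.1+ client (the ignoreEmptyDir option matches the field used above); the properties path, directory, and table name are placeholders:

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;

public class BulkLoadExample {
  public static void main(String[] args) throws Exception {
    // Placeholder connection config; point at a real client.properties.
    try (AccumuloClient client =
        Accumulo.newClient().from("/path/to/client.properties").build()) {
      // The directory must contain sorted RFiles. This call is what ultimately
      // drives BulkImport.load() shown above.
      client.tableOperations().importDirectory("/tmp/bulk").to("mytable")
          .tableTime(true).ignoreEmptyDir(true).load();
    }
  }
}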
Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.
The class BulkImport, method computeFileToTabletMappings().
public SortedMap<KeyExtent,Bulk.Files> computeFileToTabletMappings(FileSystem fs,
    TableId tableId, Path dirPath, Executor executor, ClientContext context, int maxTablets)
    throws IOException {

  KeyExtentCache extentCache = new ConcurrentKeyExtentCache(tableId, context);

  List<FileStatus> files = filterInvalid(
      fs.listStatus(dirPath, p -> !p.getName().equals(Constants.BULK_LOAD_MAPPING)));

  // we know all of the file lens, so construct a cache and populate it in order to avoid later
  // trips to the namenode
  Cache<String,Long> fileLensCache = getPopulatedFileLenCache(dirPath, files);

  List<CompletableFuture<Map<KeyExtent,Bulk.FileInfo>>> futures = new ArrayList<>();

  CryptoService cs = CryptoServiceFactory.newDefaultInstance();

  for (FileStatus fileStatus : files) {
    Path filePath = fileStatus.getPath();
    CompletableFuture<Map<KeyExtent,Bulk.FileInfo>> future = CompletableFuture.supplyAsync(() -> {
      try {
        long t1 = System.currentTimeMillis();
        List<KeyExtent> extents =
            findOverlappingTablets(context, extentCache, filePath, fs, fileLensCache, cs);
        // make sure file isn't going to too many tablets
        checkTabletCount(maxTablets, extents.size(), filePath.toString());
        Map<KeyExtent,Long> estSizes = estimateSizes(context.getConfiguration(), filePath,
            fileStatus.getLen(), extents, fs, fileLensCache, cs);
        Map<KeyExtent,Bulk.FileInfo> pathLocations = new HashMap<>();
        for (KeyExtent ke : extents) {
          pathLocations.put(ke, new Bulk.FileInfo(filePath, estSizes.getOrDefault(ke, 0L)));
        }
        long t2 = System.currentTimeMillis();
        log.debug("Mapped {} to {} tablets in {}ms", filePath, pathLocations.size(), t2 - t1);
        return pathLocations;
      } catch (Exception e) {
        throw new CompletionException(e);
      }
    }, executor);

    futures.add(future);
  }

  SortedMap<KeyExtent,Bulk.Files> mappings = new TreeMap<>();

  for (CompletableFuture<Map<KeyExtent,Bulk.FileInfo>> future : futures) {
    try {
      Map<KeyExtent,Bulk.FileInfo> pathMapping = future.get();
      pathMapping.forEach((ext, fi) -> mappings.computeIfAbsent(ext, k -> new Files()).add(fi));
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    } catch (ExecutionException e) {
      throw new RuntimeException(e);
    }
  }

  return mergeOverlapping(mappings);
}
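The executor parallelizes the per-file tablet lookups. A rough sketch of calling this method directly, assuming access to a BulkImport instance and its ClientContext; the pool size, path, and maxTablets value are illustrative (inside the client the pool is sized from a client property):

import java.util.SortedMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.hadoop.fs.Path;

ExecutorService executor = Executors.newFixedThreadPool(8); // arbitrary pool size
try {
  // 0 for maxTablets mirrors the unconfigured default in load() above
  SortedMap<KeyExtent,Bulk.Files> mappings = bulkImport.computeFileToTabletMappings(fs, tableId,
      new Path("/tmp/bulk"), executor, context, 0);
  mappings.forEach((extent, files) -> System.out.println(extent + " -> " + files));
} finally {
  executor.shutdown();
}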
Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.
The class BulkImportTest, method addMapping().
private void addMapping(SortedMap<KeyExtent,Files> mappings, String prevRow, String endRow,
    String... fileNames) {
  KeyExtent ke = new KeyExtent(TableId.of("42"), endRow == null ? null : new Text(endRow),
      prevRow == null ? null : new Text(prevRow));
  Files files = new Files();

  for (String name : fileNames) {
    files.add(new FileInfo(name, 2, 2));
  }

  mappings.put(ke, files);
}
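A short sketch of how a test can use this helper to assemble a contiguous three-tablet mapping; the rows and file names are invented for illustration:

SortedMap<KeyExtent,Files> mappings = new TreeMap<>();
addMapping(mappings, null, "c", "f1.rf");         // tablet (-inf, c]
addMapping(mappings, "c", "m", "f1.rf", "f2.rf"); // tablet (c, m]
addMapping(mappings, "m", null, "f2.rf");         // tablet (m, +inf)
assertEquals(3, mappings.size());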
Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.
The class BulkSerializeTest, method testRemap().
@Test
public void testRemap() throws Exception {
  TableId tableId = TableId.of("3");
  SortedMap<KeyExtent,Bulk.Files> mapping = generateMapping(tableId);

  // build the expected result: the same mapping with every file name prefixed with "N",
  // recording old name -> new name pairs in the rename map along the way
  SortedMap<KeyExtent,Bulk.Files> newNameMapping = new TreeMap<>();
  Map<String,String> nameMap = new HashMap<>();
  mapping.forEach((extent, files) -> {
    Files newFiles = new Files();
    files.forEach(fi -> {
      newFiles.add(new FileInfo("N" + fi.name, fi.estSize, fi.estEntries));
      nameMap.put(fi.name, "N" + fi.name);
    });
    newNameMapping.put(extent, newFiles);
  });

  ByteArrayOutputStream mappingBaos = new ByteArrayOutputStream();
  ByteArrayOutputStream nameBaos = new ByteArrayOutputStream();

  BulkSerialize.writeRenameMap(nameMap, "/some/dir", p -> nameBaos);
  BulkSerialize.writeLoadMapping(mapping, "/some/dir", p -> mappingBaos);

  // serve the two serialized files back from memory instead of a filesystem
  Input input = p -> {
    if (p.getName().equals(Constants.BULK_LOAD_MAPPING)) {
      return new ByteArrayInputStream(mappingBaos.toByteArray());
    } else if (p.getName().equals(Constants.BULK_RENAME_FILE)) {
      return new ByteArrayInputStream(nameBaos.toByteArray());
    } else {
      throw new IllegalArgumentException("bad path " + p);
    }
  };

  // the updated load mapping should come back with the renames applied
  try (LoadMappingIterator lmi = BulkSerialize.getUpdatedLoadMapping("/some/dir", tableId, input)) {
    SortedMap<KeyExtent,Bulk.Files> actual = new TreeMap<>();
    lmi.forEachRemaining(e -> actual.put(e.getKey(), e.getValue()));
    assertEquals(newNameMapping, actual);
  }
}
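The generateMapping helper is not shown in this snippet. A plausible minimal stand-in, purely hypothetical and built from the same Bulk.FileInfo constructor used elsewhere on this page, could look like:

// Hypothetical reconstruction; the real test helper may generate more extents and files.
private SortedMap<KeyExtent,Bulk.Files> generateMapping(TableId tableId) {
  SortedMap<KeyExtent,Bulk.Files> mapping = new TreeMap<>();
  Bulk.Files files = new Bulk.Files();
  files.add(new Bulk.FileInfo("f1.rf", 10, 100)); // name, est. size, est. entries
  mapping.put(new KeyExtent(tableId, new Text("m"), null), files); // tablet (-inf, m]
  return mapping;
}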
Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.
The class BulkImport, method computeMappingFromPlan().
private SortedMap<KeyExtent,Files> computeMappingFromPlan(FileSystem fs, TableId tableId,
    Path srcPath, int maxTablets)
    throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {

  Map<String,List<Destination>> fileDestinations =
      plan.getDestinations().stream().collect(groupingBy(Destination::getFileName));

  List<FileStatus> statuses = filterInvalid(
      fs.listStatus(srcPath, p -> !p.getName().equals(Constants.BULK_LOAD_MAPPING)));

  Map<String,Long> fileLens = getFileLenMap(statuses);

  if (!fileDestinations.keySet().equals(fileLens.keySet())) {
    throw new IllegalArgumentException(
        "Load plan files differ from directory files, symmetric difference : "
            + Sets.symmetricDifference(fileDestinations.keySet(), fileLens.keySet()));
  }

  KeyExtentCache extentCache = new ConcurrentKeyExtentCache(tableId, context);

  // Pre-populate cache by looking up all end rows in sorted order. Doing this in sorted order
  // leverages read ahead.
  fileDestinations.values().stream().flatMap(List::stream)
      .filter(dest -> dest.getRangeType() == RangeType.FILE)
      .flatMap(dest -> Stream.of(dest.getStartRow(), dest.getEndRow())).filter(Objects::nonNull)
      .map(Text::new).sorted().distinct().forEach(row -> {
        try {
          extentCache.lookup(row);
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      });

  SortedMap<KeyExtent,Files> mapping = new TreeMap<>();

  for (Entry<String,List<Destination>> entry : fileDestinations.entrySet()) {
    String fileName = entry.getKey();
    List<Destination> destinations = entry.getValue();
    Set<KeyExtent> extents = mapDestinationsToExtents(tableId, extentCache, destinations);
    log.debug("The file {} mapped to {} tablets.", fileName, extents.size());
    checkTabletCount(maxTablets, extents.size(), fileName);

    long estSize = (long) (fileLens.get(fileName) / (double) extents.size());

    for (KeyExtent keyExtent : extents) {
      mapping.computeIfAbsent(keyExtent, k -> new Files()).add(new FileInfo(fileName, estSize, 0));
    }
  }

  return mergeOverlapping(mapping);
}
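This path is only taken when the caller supplied a LoadPlan, which pins each file to a row range up front instead of inspecting file contents. A minimal sketch of building one with the public API; the file names and rows are placeholders:

import org.apache.accumulo.core.data.LoadPlan;
import org.apache.accumulo.core.data.LoadPlan.RangeType;

// RangeType.TABLE rows must be existing tablet boundaries; RangeType.FILE rows
// are the file's own first and last rows, which is why computeMappingFromPlan
// only pre-populates the extent cache for FILE ranges.
LoadPlan plan = LoadPlan.builder()
    .loadFileTo("f1.rf", RangeType.TABLE, "b", "c")
    .loadFileTo("f2.rf", RangeType.FILE, "c001", "c999")
    .build();
// Then: client.tableOperations().importDirectory("/tmp/bulk").to("mytable").plan(plan).load();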