Example 1 with Files

Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.

From the class BulkImport, method load:

@Override
public void load() throws TableNotFoundException, IOException, AccumuloException, AccumuloSecurityException {
    TableId tableId = context.getTableId(tableName);
    FileSystem fs = VolumeConfiguration.fileSystemForPath(dir, context.getHadoopConf());
    Path srcPath = checkPath(fs, dir);
    SortedMap<KeyExtent, Bulk.Files> mappings;
    TableOperationsImpl tableOps = new TableOperationsImpl(context);
    int maxTablets = 0;
    for (var prop : tableOps.getProperties(tableName)) {
        if (prop.getKey().equals(Property.TABLE_BULK_MAX_TABLETS.getKey())) {
            maxTablets = Integer.parseInt(prop.getValue());
            break;
        }
    }
    Retry retry = Retry.builder().infiniteRetries().retryAfter(100, MILLISECONDS)
        .incrementBy(100, MILLISECONDS).maxWait(2, MINUTES).backOffFactor(1.5)
        .logInterval(3, MINUTES).createRetry();
    // retry if a merge occurs
    boolean shouldRetry = true;
    while (shouldRetry) {
        if (plan == null) {
            mappings = computeMappingFromFiles(fs, tableId, srcPath, maxTablets);
        } else {
            mappings = computeMappingFromPlan(fs, tableId, srcPath, maxTablets);
        }
        if (mappings.isEmpty()) {
            if (ignoreEmptyDir) {
                log.info("Attempted to import files from empty directory - {}. Zero files imported", srcPath);
                return;
            } else {
                throw new IllegalArgumentException("Attempted to import zero files from " + srcPath);
            }
        }
        BulkSerialize.writeLoadMapping(mappings, srcPath.toString(), fs::create);
        List<ByteBuffer> args = Arrays.asList(
            ByteBuffer.wrap(tableId.canonical().getBytes(UTF_8)),
            ByteBuffer.wrap(srcPath.toString().getBytes(UTF_8)),
            ByteBuffer.wrap((setTime + "").getBytes(UTF_8)));
        try {
            tableOps.doBulkFateOperation(args, tableName);
            shouldRetry = false;
        } catch (AccumuloBulkMergeException ae) {
            if (plan != null) {
                checkPlanForSplits(ae);
            }
            try {
                retry.waitForNextAttempt();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
            log.info("{}. Retrying bulk import to {}", ae.getMessage(), tableName);
        }
    }
}
Also used : TableId(org.apache.accumulo.core.data.TableId) Path(org.apache.hadoop.fs.Path) AccumuloBulkMergeException(org.apache.accumulo.core.clientImpl.AccumuloBulkMergeException) TableOperationsImpl(org.apache.accumulo.core.clientImpl.TableOperationsImpl) KeyExtent(org.apache.accumulo.core.dataImpl.KeyExtent) ByteBuffer(java.nio.ByteBuffer) FileSystem(org.apache.hadoop.fs.FileSystem) Retry(org.apache.accumulo.fate.util.Retry) Files(org.apache.accumulo.core.clientImpl.bulk.Bulk.Files)
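
For context, load() is the internal engine behind Accumulo's public fluent bulk-import API. A minimal caller sketch, assuming an Accumulo 2.1-era client (matching the snippet's ignoreEmptyDir field); the instance name, ZooKeeper hosts, credentials, directory, and table name are all hypothetical placeholders:

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;

public class BulkImportExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical connection details; replace with real values.
        try (AccumuloClient client = Accumulo.newClient()
                .to("myInstance", "zk1:2181").as("user", "password").build()) {
            // This fluent chain bottoms out in the BulkImport.load() shown above.
            // ignoreEmptyDir(true) selects the branch that logs and returns
            // instead of throwing when the source directory contains no files.
            client.tableOperations().importDirectory("hdfs://namenode:8020/tmp/bulk")
                .to("mytable").tableTime(true).ignoreEmptyDir(true).load();
        }
    }
}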

Example 2 with Files

Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.

From the class BulkImport, method computeFileToTabletMappings:

public SortedMap<KeyExtent, Bulk.Files> computeFileToTabletMappings(FileSystem fs,
        TableId tableId, Path dirPath, Executor executor, ClientContext context,
        int maxTablets) throws IOException {
    KeyExtentCache extentCache = new ConcurrentKeyExtentCache(tableId, context);
    List<FileStatus> files = filterInvalid(fs.listStatus(dirPath, p -> !p.getName().equals(Constants.BULK_LOAD_MAPPING)));
    // we know all of the file lens, so construct a cache and populate it in order to avoid later
    // trips to the namenode
    Cache<String, Long> fileLensCache = getPopulatedFileLenCache(dirPath, files);
    List<CompletableFuture<Map<KeyExtent, Bulk.FileInfo>>> futures = new ArrayList<>();
    CryptoService cs = CryptoServiceFactory.newDefaultInstance();
    for (FileStatus fileStatus : files) {
        Path filePath = fileStatus.getPath();
        CompletableFuture<Map<KeyExtent, Bulk.FileInfo>> future = CompletableFuture.supplyAsync(() -> {
            try {
                long t1 = System.currentTimeMillis();
                List<KeyExtent> extents = findOverlappingTablets(context, extentCache, filePath, fs, fileLensCache, cs);
                // make sure file isn't going to too many tablets
                checkTabletCount(maxTablets, extents.size(), filePath.toString());
                Map<KeyExtent, Long> estSizes = estimateSizes(context.getConfiguration(),
                    filePath, fileStatus.getLen(), extents, fs, fileLensCache, cs);
                Map<KeyExtent, Bulk.FileInfo> pathLocations = new HashMap<>();
                for (KeyExtent ke : extents) {
                    pathLocations.put(ke, new Bulk.FileInfo(filePath, estSizes.getOrDefault(ke, 0L)));
                }
                long t2 = System.currentTimeMillis();
                log.debug("Mapped {} to {} tablets in {}ms", filePath, pathLocations.size(), t2 - t1);
                return pathLocations;
            } catch (Exception e) {
                throw new CompletionException(e);
            }
        }, executor);
        futures.add(future);
    }
    SortedMap<KeyExtent, Bulk.Files> mappings = new TreeMap<>();
    for (CompletableFuture<Map<KeyExtent, Bulk.FileInfo>> future : futures) {
        try {
            Map<KeyExtent, Bulk.FileInfo> pathMapping = future.get();
            pathMapping.forEach((ext, fi) -> mappings.computeIfAbsent(ext, k -> new Files()).add(fi));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        }
    }
    return mergeOverlapping(mappings);
}
Also used : TableId(org.apache.accumulo.core.data.TableId) ByteSequence(org.apache.accumulo.core.data.ByteSequence) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) LoggerFactory(org.slf4j.LoggerFactory) Text(org.apache.hadoop.io.Text) FileStatus(org.apache.hadoop.fs.FileStatus) ByteBuffer(java.nio.ByteBuffer) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) CachableBlockFile.pathToCacheId(org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile.pathToCacheId) ConfigurationTypeHelper(org.apache.accumulo.core.conf.ConfigurationTypeHelper) FileOperations(org.apache.accumulo.core.file.FileOperations) AccumuloBulkMergeException(org.apache.accumulo.core.clientImpl.AccumuloBulkMergeException) TableOperationsImpl(org.apache.accumulo.core.clientImpl.TableOperationsImpl) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) FileInfo(org.apache.accumulo.core.clientImpl.bulk.Bulk.FileInfo) Property(org.apache.accumulo.core.conf.Property) LoadPlan(org.apache.accumulo.core.data.LoadPlan) ClientContext(org.apache.accumulo.core.clientImpl.ClientContext) Collection(java.util.Collection) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) ThreadPools(org.apache.accumulo.core.util.threads.ThreadPools) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) RangeType(org.apache.accumulo.core.data.LoadPlan.RangeType) FileNotFoundException(java.io.FileNotFoundException) Sets(com.google.common.collect.Sets) VolumeConfiguration(org.apache.accumulo.core.volume.VolumeConfiguration) Objects(java.util.Objects) List(java.util.List) Stream(java.util.stream.Stream) Entry(java.util.Map.Entry) Files(org.apache.accumulo.core.clientImpl.bulk.Bulk.Files) ImportDestinationArguments(org.apache.accumulo.core.client.admin.TableOperations.ImportDestinationArguments) ImportMappingOptions(org.apache.accumulo.core.client.admin.TableOperations.ImportMappingOptions) CacheBuilder(com.google.common.cache.CacheBuilder) SortedMap(java.util.SortedMap) FilenameUtils(org.apache.commons.io.FilenameUtils) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) Destination(org.apache.accumulo.core.data.LoadPlan.Destination) MINUTES(java.util.concurrent.TimeUnit.MINUTES) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) Key(org.apache.accumulo.core.data.Key) ExecutorService(java.util.concurrent.ExecutorService) EXISTING_TABLE_NAME(org.apache.accumulo.core.util.Validators.EXISTING_TABLE_NAME) Retry(org.apache.accumulo.fate.util.Retry) Logger(org.slf4j.Logger) CryptoService(org.apache.accumulo.core.spi.crypto.CryptoService) Executor(java.util.concurrent.Executor) UTF_8(java.nio.charset.StandardCharsets.UTF_8) KeyExtent(org.apache.accumulo.core.dataImpl.KeyExtent) IOException(java.io.IOException) Constants(org.apache.accumulo.core.Constants) CryptoServiceFactory(org.apache.accumulo.core.crypto.CryptoServiceFactory) AccumuloException(org.apache.accumulo.core.client.AccumuloException) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Range(org.apache.accumulo.core.data.Range) ExecutionException(java.util.concurrent.ExecutionException) TreeMap(java.util.TreeMap) Preconditions(com.google.common.base.Preconditions) Cache(com.google.common.cache.Cache) Collections(java.util.Collections) 
ClientProperty(org.apache.accumulo.core.conf.ClientProperty)
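
The method fans work out with CompletableFuture.supplyAsync, then collects the results in submission order and merges them into a sorted map. The same shape in a self-contained sketch, using only JDK types (fakeMapFile and its inputs are invented for illustration):

import java.util.*;
import java.util.concurrent.*;

public class FanOutMerge {
    // Hypothetical stand-in for mapping one file to per-tablet size estimates.
    static Map<String, Long> fakeMapFile(String file) {
        return Map.of(file + "-tabletA", 10L, file + "-tabletB", 20L);
    }

    public static void main(String[] args) throws Exception {
        ExecutorService executor = Executors.newFixedThreadPool(4);
        List<CompletableFuture<Map<String, Long>>> futures = new ArrayList<>();
        for (String file : List.of("f1.rf", "f2.rf")) {
            // Fan out: one future per file, as computeFileToTabletMappings
            // submits one per FileStatus.
            futures.add(CompletableFuture.supplyAsync(() -> fakeMapFile(file), executor));
        }
        SortedMap<String, Long> merged = new TreeMap<>();
        for (CompletableFuture<Map<String, Long>> f : futures) {
            // Collect and merge, mirroring the computeIfAbsent(...).add(...) step.
            f.get().forEach((k, v) -> merged.merge(k, v, Long::sum));
        }
        executor.shutdown();
        System.out.println(merged);
    }
}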

Example 3 with Files

Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.

From the class BulkImportTest, method addMapping:

private void addMapping(SortedMap<KeyExtent, Files> mappings, String prevRow, String endRow, String... fileNames) {
    KeyExtent ke = new KeyExtent(TableId.of("42"), endRow == null ? null : new Text(endRow),
        prevRow == null ? null : new Text(prevRow));
    Files files = new Files();
    for (String name : fileNames) {
        files.add(new FileInfo(name, 2, 2));
    }
    mappings.put(ke, files);
}
Also used : FileInfo(org.apache.accumulo.core.clientImpl.bulk.Bulk.FileInfo) Text(org.apache.hadoop.io.Text) Files(org.apache.accumulo.core.clientImpl.bulk.Bulk.Files) KeyExtent(org.apache.accumulo.core.dataImpl.KeyExtent)
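
A hypothetical test body exercising the helper (the table id "42" comes from the helper itself; rows and file names are illustrative). Passing null for prevRow or endRow marks an unbounded side of the tablet, so the two calls below cover the whole row space:

import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.accumulo.core.clientImpl.bulk.Bulk.Files;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;

// Assumes this method sits in BulkImportTest next to the addMapping helper above.
@Test
public void testAddMapping() {
    SortedMap<KeyExtent, Files> mappings = new TreeMap<>();
    // Tablet (-inf, "m"] receives two files; tablet ("m", +inf) receives one.
    addMapping(mappings, null, "m", "f1", "f2");
    addMapping(mappings, "m", null, "f3");
    assertEquals(2, mappings.size());
}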

Example 4 with Files

Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.

From the class BulkSerializeTest, method testRemap:

@Test
public void testRemap() throws Exception {
    TableId tableId = TableId.of("3");
    SortedMap<KeyExtent, Bulk.Files> mapping = generateMapping(tableId);
    SortedMap<KeyExtent, Bulk.Files> newNameMapping = new TreeMap<>();
    Map<String, String> nameMap = new HashMap<>();
    mapping.forEach((extent, files) -> {
        Files newFiles = new Files();
        files.forEach(fi -> {
            newFiles.add(new FileInfo("N" + fi.name, fi.estSize, fi.estEntries));
            nameMap.put(fi.name, "N" + fi.name);
        });
        newNameMapping.put(extent, newFiles);
    });
    ByteArrayOutputStream mappingBaos = new ByteArrayOutputStream();
    ByteArrayOutputStream nameBaos = new ByteArrayOutputStream();
    BulkSerialize.writeRenameMap(nameMap, "/some/dir", p -> nameBaos);
    BulkSerialize.writeLoadMapping(mapping, "/some/dir", p -> mappingBaos);
    Input input = p -> {
        if (p.getName().equals(Constants.BULK_LOAD_MAPPING)) {
            return new ByteArrayInputStream(mappingBaos.toByteArray());
        } else if (p.getName().equals(Constants.BULK_RENAME_FILE)) {
            return new ByteArrayInputStream(nameBaos.toByteArray());
        } else {
            throw new IllegalArgumentException("bad path " + p);
        }
    };
    try (LoadMappingIterator lmi = BulkSerialize.getUpdatedLoadMapping("/some/dir", tableId, input)) {
        SortedMap<KeyExtent, Bulk.Files> actual = new TreeMap<>();
        lmi.forEachRemaining(e -> actual.put(e.getKey(), e.getValue()));
        assertEquals(newNameMapping, actual);
    }
}
Also used : TableId(org.apache.accumulo.core.data.TableId) ByteArrayOutputStream(java.io.ByteArrayOutputStream) KeyExtent(org.apache.accumulo.core.dataImpl.KeyExtent) Text(org.apache.hadoop.io.Text) HashMap(java.util.HashMap) Constants(org.apache.accumulo.core.Constants) Test(org.junit.jupiter.api.Test) ByteArrayInputStream(java.io.ByteArrayInputStream) TreeMap(java.util.TreeMap) Map(java.util.Map) Input(org.apache.accumulo.core.clientImpl.bulk.BulkSerialize.Input) FileInfo(org.apache.accumulo.core.clientImpl.bulk.Bulk.FileInfo) Files(org.apache.accumulo.core.clientImpl.bulk.Bulk.Files) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) SortedMap(java.util.SortedMap)
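
The Input lambda in testRemap is just a name-to-stream router over in-memory buffers. The same trick in miniature with plain JDK types (file names and contents are made up):

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.function.Function;

public class StreamRouter {
    public static void main(String[] args) throws Exception {
        // Pre-serialized blobs standing in for the load-mapping and rename files.
        Map<String, byte[]> blobs = Map.of(
            "loadmap.json", "{\"mapping\":1}".getBytes(StandardCharsets.UTF_8),
            "renames.json", "{\"renames\":2}".getBytes(StandardCharsets.UTF_8));
        // Route a requested name to its in-memory stream, as the Input lambda does.
        Function<String, InputStream> input = name -> {
            byte[] b = blobs.get(name);
            if (b == null)
                throw new IllegalArgumentException("bad path " + name);
            return new ByteArrayInputStream(b);
        };
        try (InputStream in = input.apply("loadmap.json")) {
            System.out.println(new String(in.readAllBytes(), StandardCharsets.UTF_8));
        }
    }
}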

Example 5 with Files

Use of org.apache.accumulo.core.clientImpl.bulk.Bulk.Files in project accumulo by apache.

From the class BulkImport, method computeMappingFromPlan:

private SortedMap<KeyExtent, Files> computeMappingFromPlan(FileSystem fs, TableId tableId,
        Path srcPath, int maxTablets)
        throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {
    Map<String, List<Destination>> fileDestinations =
        plan.getDestinations().stream().collect(groupingBy(Destination::getFileName));
    List<FileStatus> statuses = filterInvalid(fs.listStatus(srcPath, p -> !p.getName().equals(Constants.BULK_LOAD_MAPPING)));
    Map<String, Long> fileLens = getFileLenMap(statuses);
    if (!fileDestinations.keySet().equals(fileLens.keySet())) {
        throw new IllegalArgumentException("Load plan files differ from directory files, symmetric difference : " + Sets.symmetricDifference(fileDestinations.keySet(), fileLens.keySet()));
    }
    KeyExtentCache extentCache = new ConcurrentKeyExtentCache(tableId, context);
    // Pre-populate cache by looking up all end rows in sorted order. Doing this in sorted order
    // leverages read ahead.
    fileDestinations.values().stream().flatMap(List::stream)
        .filter(dest -> dest.getRangeType() == RangeType.FILE)
        .flatMap(dest -> Stream.of(dest.getStartRow(), dest.getEndRow()))
        .filter(Objects::nonNull).map(Text::new).sorted().distinct().forEach(row -> {
        try {
            extentCache.lookup(row);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    });
    SortedMap<KeyExtent, Files> mapping = new TreeMap<>();
    for (Entry<String, List<Destination>> entry : fileDestinations.entrySet()) {
        String fileName = entry.getKey();
        List<Destination> destinations = entry.getValue();
        Set<KeyExtent> extents = mapDestinationsToExtents(tableId, extentCache, destinations);
        log.debug("The file {} mapped to {} tablets.", fileName, extents.size());
        checkTabletCount(maxTablets, extents.size(), fileName);
        long estSize = (long) (fileLens.get(fileName) / (double) extents.size());
        for (KeyExtent keyExtent : extents) {
            mapping.computeIfAbsent(keyExtent, k -> new Files()).add(new FileInfo(fileName, estSize, 0));
        }
    }
    return mergeOverlapping(mapping);
}
Also used : TableId(org.apache.accumulo.core.data.TableId) ByteSequence(org.apache.accumulo.core.data.ByteSequence) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) LoggerFactory(org.slf4j.LoggerFactory) Text(org.apache.hadoop.io.Text) FileStatus(org.apache.hadoop.fs.FileStatus) ByteBuffer(java.nio.ByteBuffer) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) CachableBlockFile.pathToCacheId(org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile.pathToCacheId) ConfigurationTypeHelper(org.apache.accumulo.core.conf.ConfigurationTypeHelper) FileOperations(org.apache.accumulo.core.file.FileOperations) AccumuloBulkMergeException(org.apache.accumulo.core.clientImpl.AccumuloBulkMergeException) TableOperationsImpl(org.apache.accumulo.core.clientImpl.TableOperationsImpl) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) FileInfo(org.apache.accumulo.core.clientImpl.bulk.Bulk.FileInfo) Property(org.apache.accumulo.core.conf.Property) LoadPlan(org.apache.accumulo.core.data.LoadPlan) ClientContext(org.apache.accumulo.core.clientImpl.ClientContext) Collection(java.util.Collection) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) ThreadPools(org.apache.accumulo.core.util.threads.ThreadPools) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) RangeType(org.apache.accumulo.core.data.LoadPlan.RangeType) FileNotFoundException(java.io.FileNotFoundException) Sets(com.google.common.collect.Sets) VolumeConfiguration(org.apache.accumulo.core.volume.VolumeConfiguration) Objects(java.util.Objects) List(java.util.List) Stream(java.util.stream.Stream) Entry(java.util.Map.Entry) Files(org.apache.accumulo.core.clientImpl.bulk.Bulk.Files) ImportDestinationArguments(org.apache.accumulo.core.client.admin.TableOperations.ImportDestinationArguments) ImportMappingOptions(org.apache.accumulo.core.client.admin.TableOperations.ImportMappingOptions) CacheBuilder(com.google.common.cache.CacheBuilder) SortedMap(java.util.SortedMap) FilenameUtils(org.apache.commons.io.FilenameUtils) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) Destination(org.apache.accumulo.core.data.LoadPlan.Destination) MINUTES(java.util.concurrent.TimeUnit.MINUTES) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) Key(org.apache.accumulo.core.data.Key) ExecutorService(java.util.concurrent.ExecutorService) EXISTING_TABLE_NAME(org.apache.accumulo.core.util.Validators.EXISTING_TABLE_NAME) Retry(org.apache.accumulo.fate.util.Retry) Logger(org.slf4j.Logger) CryptoService(org.apache.accumulo.core.spi.crypto.CryptoService) Executor(java.util.concurrent.Executor) UTF_8(java.nio.charset.StandardCharsets.UTF_8) KeyExtent(org.apache.accumulo.core.dataImpl.KeyExtent) IOException(java.io.IOException) Constants(org.apache.accumulo.core.Constants) CryptoServiceFactory(org.apache.accumulo.core.crypto.CryptoServiceFactory) AccumuloException(org.apache.accumulo.core.client.AccumuloException) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Range(org.apache.accumulo.core.data.Range) ExecutionException(java.util.concurrent.ExecutionException) TreeMap(java.util.TreeMap) Preconditions(com.google.common.base.Preconditions) Cache(com.google.common.cache.Cache) Collections(java.util.Collections) 
ClientProperty(org.apache.accumulo.core.conf.ClientProperty)
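
Two small pieces of computeMappingFromPlan are worth isolating: the Collectors.groupingBy step that indexes destinations by file name, and the size estimate that spreads a file's length evenly across its tablets. A standalone sketch, with a hypothetical record standing in for LoadPlan.Destination (requires Java 16+ for records):

import java.util.List;
import java.util.Map;
import static java.util.stream.Collectors.groupingBy;

public class PlanGrouping {
    // Hypothetical stand-in for LoadPlan.Destination.
    record Destination(String fileName, String tablet) {}

    public static void main(String[] args) {
        List<Destination> dests = List.of(
            new Destination("f1.rf", "tabletA"),
            new Destination("f1.rf", "tabletB"),
            new Destination("f2.rf", "tabletA"));
        // Group destinations by file, as computeMappingFromPlan does.
        Map<String, List<Destination>> byFile =
            dests.stream().collect(groupingBy(Destination::fileName));
        // Spread each file's length evenly over its tablets, like estSize above.
        long fileLen = 1000;
        byFile.forEach((file, list) -> {
            long estSize = (long) (fileLen / (double) list.size());
            System.out.println(file + " -> " + list.size() + " tablets, est " + estSize + " bytes");
        });
    }
}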

Aggregations

Files (org.apache.accumulo.core.clientImpl.bulk.Bulk.Files): 7 usages
KeyExtent (org.apache.accumulo.core.dataImpl.KeyExtent): 7 usages
Text (org.apache.hadoop.io.Text): 5 usages
HashMap (java.util.HashMap): 4 usages
Map (java.util.Map): 4 usages
FileInfo (org.apache.accumulo.core.clientImpl.bulk.Bulk.FileInfo): 4 usages
TableId (org.apache.accumulo.core.data.TableId): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
ByteBuffer (java.nio.ByteBuffer): 3 usages
Entry (java.util.Map.Entry): 3 usages
SortedMap (java.util.SortedMap): 3 usages
TreeMap (java.util.TreeMap): 3 usages
Preconditions (com.google.common.base.Preconditions): 2 usages
Cache (com.google.common.cache.Cache): 2 usages
CacheBuilder (com.google.common.cache.CacheBuilder): 2 usages
Sets (com.google.common.collect.Sets): 2 usages
FileNotFoundException (java.io.FileNotFoundException): 2 usages
IOException (java.io.IOException): 2 usages
UTF_8 (java.nio.charset.StandardCharsets.UTF_8): 2 usages
ArrayList (java.util.ArrayList): 2 usages