Search in sources :

Example 1 with DictionaryMapping

use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.

In class ImportJob, the method sendBuckets:

/**
 * Selects the per-worker slice of each bucket, then sends it to the responsible worker.
 *
 * @return for every worker, the set of bucket ids that were assigned to it.
 */
private Map<WorkerId, Set<BucketId>> sendBuckets(Map<Integer, Integer> starts, Map<Integer, Integer> lengths, DictionaryMapping primaryMapping, Import imp, Map<Integer, List<Integer>> buckets2LocalEntities, ColumnStore[] storesSorted) throws JsonProcessingException {
    final Map<WorkerId, Set<BucketId>> assignments = new HashMap<>();
    final ProgressReporter subJob = getProgressReporter().subJob(buckets2LocalEntities.size());
    for (Map.Entry<Integer, List<Integer>> entry : buckets2LocalEntities.entrySet()) {
        final int bucketId = entry.getKey();
        // Fail fast if no worker has been made responsible for this bucket.
        final WorkerInformation worker =
                Objects.requireNonNull(namespace.getResponsibleWorkerForBucket(bucketId), () -> "No responsible worker for Bucket#" + bucketId);
        // Back-pressure: block until the worker's job queue has capacity before producing more work.
        awaitFreeJobQueue(worker);
        final Bucket bucket = selectBucket(starts, lengths, storesSorted, primaryMapping, imp, bucketId, entry.getValue());
        assignments.computeIfAbsent(worker.getId(), id -> new HashSet<>()).add(bucket.getId());
        log.trace("Sending Bucket[{}] to {}", bucket.getId(), worker.getId());
        worker.send(ImportBucket.forBucket(bucket));
        subJob.report(1);
    }
    subJob.done();
    return assignments;
}
Also used : Dictionary(com.bakdata.conquery.models.dictionary.Dictionary) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) java.util(java.util) ConqueryConfig(com.bakdata.conquery.models.config.ConqueryConfig) Getter(lombok.Getter) PreprocessedHeader(com.bakdata.conquery.models.preproc.PreprocessedHeader) RequiredArgsConstructor(lombok.RequiredArgsConstructor) PreprocessedData(com.bakdata.conquery.models.preproc.PreprocessedData) com.bakdata.conquery.models.identifiable.ids.specific(com.bakdata.conquery.models.identifiable.ids.specific) NamespaceStorage(com.bakdata.conquery.io.storage.NamespaceStorage) IdMutex(com.bakdata.conquery.models.identifiable.IdMutex) com.bakdata.conquery.models.datasets(com.bakdata.conquery.models.datasets) IntegerStore(com.bakdata.conquery.models.events.stores.root.IntegerStore) WorkerInformation(com.bakdata.conquery.models.worker.WorkerInformation) BadRequestException(javax.ws.rs.BadRequestException) JSONException(com.bakdata.conquery.models.exceptions.JSONException) DictionaryMapping(com.bakdata.conquery.models.dictionary.DictionaryMapping) IntegerParser(com.bakdata.conquery.models.preproc.parser.specific.IntegerParser) PreprocessedReader(com.bakdata.conquery.models.preproc.PreprocessedReader) ResourceUtil(com.bakdata.conquery.util.ResourceUtil) Bucket(com.bakdata.conquery.models.events.Bucket) MajorTypeId(com.bakdata.conquery.models.events.MajorTypeId) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) StringStore(com.bakdata.conquery.models.events.stores.root.StringStore) com.bakdata.conquery.models.messages.namespaces.specific(com.bakdata.conquery.models.messages.namespaces.specific) Collectors(java.util.stream.Collectors) Entity(com.bakdata.conquery.models.query.entity.Entity) ProgressReporter(com.bakdata.conquery.util.progressreporter.ProgressReporter) Slf4j(lombok.extern.slf4j.Slf4j) IntList(it.unimi.dsi.fastutil.ints.IntList) 
Response(javax.ws.rs.core.Response) IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) WebApplicationException(javax.ws.rs.WebApplicationException) IntSet(it.unimi.dsi.fastutil.ints.IntSet) PreprocessedDictionaries(com.bakdata.conquery.models.preproc.PreprocessedDictionaries) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) Namespace(com.bakdata.conquery.models.worker.Namespace) InputStream(java.io.InputStream) WorkerInformation(com.bakdata.conquery.models.worker.WorkerInformation) IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) IntSet(it.unimi.dsi.fastutil.ints.IntSet) ProgressReporter(com.bakdata.conquery.util.progressreporter.ProgressReporter) Bucket(com.bakdata.conquery.models.events.Bucket) IntList(it.unimi.dsi.fastutil.ints.IntList) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet)

Example 2 with DictionaryMapping

use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.

In class ImportJob, the method execute:

@Override
public void execute() throws JSONException, InterruptedException, IOException {
    // Runs the whole import as a fixed sequence of steps, reporting progress after each.
    getProgressReporter().setMax(NUMBER_OF_STEPS);
    log.trace("Updating primary dictionary");
    // Update primary dictionary: load new data, and create mapping.
    final DictionaryMapping primaryMapping = importPrimaryDictionary(dictionaries.getPrimaryDictionary());
    getProgressReporter().report(1);
    // Distribute the new IDs among workers
    distributeWorkerResponsibilities(primaryMapping);
    getProgressReporter().report(1);
    log.info("Importing Dictionaries");
    // Merge/distribute the column dictionaries; mappings are only produced for shared dictionaries.
    Map<String, DictionaryMapping> sharedDictionaryMappings = importDictionaries(namespace, dictionaries.getDictionaries(), table.getColumns(), header.getName());
    log.info("Remapping Dictionaries {}", sharedDictionaryMappings.values());
    // Rewrite the incoming string stores to the merged shared-dictionary positions.
    applyDictionaryMappings(sharedDictionaryMappings, container.getStores());
    // Register the Import before buckets are sent, so workers can resolve it.
    Import imp = createImport(header, container.getStores(), table.getColumns(), container.size());
    namespace.getStorage().updateImport(imp);
    // Group the local entities into buckets of globally remapped ids.
    Map<Integer, List<Integer>> buckets2LocalEntities = groupEntitiesByBucket(container.entities(), primaryMapping, bucketSize);
    // Align the stores with the table's column order; every column must have a store.
    final ColumnStore[] storesSorted = Arrays.stream(table.getColumns()).map(Column::getName).map(container.getStores()::get).map(Objects::requireNonNull).toArray(ColumnStore[]::new);
    log.info("Start sending {} Buckets", buckets2LocalEntities.size());
    // we use this to track assignment to workers.
    final Map<WorkerId, Set<BucketId>> workerAssignments = sendBuckets(container.getStarts(), container.getLengths(), primaryMapping, imp, buckets2LocalEntities, storesSorted);
    workerAssignments.forEach(namespace::addBucketsToWorker);
    getProgressReporter().done();
}
Also used : IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) IntSet(it.unimi.dsi.fastutil.ints.IntSet) DictionaryMapping(com.bakdata.conquery.models.dictionary.DictionaryMapping) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) IntList(it.unimi.dsi.fastutil.ints.IntList) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList)

Example 3 with DictionaryMapping

use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.

In class ImportJob, the method applyDictionaryMappings:

/**
 * Rewrites the incoming string stores so their indices point at the new positions
 * in the merged shared dictionaries.
 */
private void applyDictionaryMappings(Map<String, DictionaryMapping> mappings, Map<String, ColumnStore> values) {
    final ProgressReporter reporter = getProgressReporter().subJob(mappings.size());
    for (Map.Entry<String, DictionaryMapping> mappingEntry : mappings.entrySet()) {
        final String column = mappingEntry.getKey();
        final DictionaryMapping remapping = mappingEntry.getValue();
        final StringStore store = (StringStore) values.get(column);
        log.debug("Remapping Column[{}] = {} with {}", column, store, remapping);
        // The remapped ids can cover a different value range than the originals,
        // so re-derive the most compact integer representation for the index column.
        final IntegerParser parser = new IntegerParser(config);
        final IntSummaryStatistics targetStats = remapping.target().intStream().summaryStatistics();
        parser.setLines(store.getLines());
        parser.setMinValue(targetStats.getMin());
        parser.setMaxValue(targetStats.getMax());
        final IntegerStore reencoded = parser.findBestType();
        log.trace("Decided for {}", reencoded);
        // Apply the remapping into the freshly sized store, then swap it in.
        remapping.applyToStore(store, reencoded);
        store.setIndexStore(reencoded);
        reporter.report(1);
    }
}
Also used : IntegerStore(com.bakdata.conquery.models.events.stores.root.IntegerStore) ProgressReporter(com.bakdata.conquery.util.progressreporter.ProgressReporter) StringStore(com.bakdata.conquery.models.events.stores.root.StringStore) DictionaryMapping(com.bakdata.conquery.models.dictionary.DictionaryMapping) IntegerParser(com.bakdata.conquery.models.preproc.parser.specific.IntegerParser)

Example 4 with DictionaryMapping

use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.

In class ImportJob, the method importDictionaries:

/**
 * Import all dictionaries. Shared dictionaries are merged into existing ones. All are distributed to the corresponding workers.
 * Creates mappings for shared dictionaries.
 * This is not synchronized because the method is called within the job execution.
 *
 * @return for every column backed by a shared dictionary, the mapping from import-local to shared positions.
 */
private static Map<String, DictionaryMapping> importDictionaries(Namespace namespace, Map<String, Dictionary> dicts, Column[] columns, String importName) {
    // Empty Maps are Coalesced to null by Jackson
    if (dicts == null) {
        return Collections.emptyMap();
    }
    final Map<String, DictionaryMapping> result = new HashMap<>();
    log.trace("Importing Dictionaries");
    for (Column column : columns) {
        // Only STRING columns are dictionary-encoded.
        if (column.getType() != MajorTypeId.STRING) {
            continue;
        }
        // Might not have an underlying Dictionary (eg Singleton, direct-Number)
        // but could also be an error :/ Most likely the former
        final Dictionary incoming = dicts.get(column.getName());
        if (incoming == null) {
            log.trace("No Dictionary for {}", column);
            continue;
        }
        final String sharedName = column.getSharedDictionary();
        if (sharedName == null) {
            // Normal Dictionary -> no merge necessary, just distribute
            distributeDictionary(namespace, incoming);
            continue;
        }
        // Shared dictionary: merge the incoming entries into the pre-existing one.
        log.trace("Column[{}.{}] part of shared Dictionary[{}]", importName, column.getName(), sharedName);
        final DictionaryId sharedId = new DictionaryId(namespace.getDataset().getId(), sharedName);
        final Dictionary shared = namespace.getStorage().getDictionary(sharedId);
        // This should never fail, because the dictionary is pre-created in the replacement generation step.
        ResourceUtil.throwNotFoundIfNull(sharedId, shared);
        log.trace("Merging into shared Dictionary[{}]", shared);
        final DictionaryMapping mapping = DictionaryMapping.createAndImport(incoming, shared);
        // Only redistribute the shared dictionary if the merge actually added new ids.
        if (mapping.getNumberOfNewIds() != 0) {
            distributeDictionary(namespace, mapping.getTargetDictionary());
        }
        result.put(column.getName(), mapping);
    }
    return result;
}
Also used : Dictionary(com.bakdata.conquery.models.dictionary.Dictionary) DictionaryMapping(com.bakdata.conquery.models.dictionary.DictionaryMapping)

Example 5 with DictionaryMapping

use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.

In class ImportJob, the method selectBucket:

/**
 * - remap Entity-Ids to global
 * - calculate per-Entity regions of Bucklet (start/end)
 * - split stores
 *
 * NOTE(review): entityStarts/entityEnds are indexed by position within the bucket
 * (globalId - root); -1 marks positions with no entity in this import.
 */
private Bucket selectBucket(Map<Integer, Integer> localStarts, Map<Integer, Integer> localLengths, ColumnStore[] stores, DictionaryMapping primaryMapping, Import imp, int bucketId, List<Integer> localEntities) {
    // First global entity id covered by this bucket; the bucket spans [root, root + bucketSize).
    final int root = bucketSize * bucketId;
    // Per-entity regions to copy out of the incoming (import-local) stores.
    IntList selectionStart = new IntArrayList();
    IntList selectionLength = new IntArrayList();
    IntSet entities = new IntOpenHashSet();
    // First entity of Bucket starts at 0, the following are appended.
    int[] entityStarts = new int[bucketSize];
    int[] entityEnds = new int[bucketSize];
    // -1 = sentinel for "no entity at this position".
    Arrays.fill(entityEnds, -1);
    Arrays.fill(entityStarts, -1);
    int currentStart = 0;
    for (int position = 0; position < bucketSize; position++) {
        int globalId = root + position;
        // Map the global id back to this import's local id; -1 means the entity
        // is not part of this import at all.
        int localId = primaryMapping.target2Source(globalId);
        if (localId == -1) {
            continue;
        }
        // Known entity, but carries no events in this import -> skip.
        if (!localStarts.containsKey(localId)) {
            continue;
        }
        entities.add(globalId);
        final int length = localLengths.get(localId);
        // Record where this entity's events sit in the incoming stores...
        selectionStart.add(localStarts.get(localId));
        selectionLength.add(length);
        // ...and where they will sit in the (densely packed) new bucket.
        entityStarts[position] = currentStart;
        entityEnds[position] = currentStart + length;
        currentStart += length;
    }
    // copy only the parts of the bucket we need
    final ColumnStore[] bucketStores = Arrays.stream(stores).map(store -> store.select(selectionStart.toIntArray(), selectionLength.toIntArray())).toArray(ColumnStore[]::new);
    return new Bucket(bucketId, root, selectionLength.intStream().sum(), bucketStores, entities, entityStarts, entityEnds, imp);
}
Also used : IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) Dictionary(com.bakdata.conquery.models.dictionary.Dictionary) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) java.util(java.util) ConqueryConfig(com.bakdata.conquery.models.config.ConqueryConfig) Getter(lombok.Getter) PreprocessedHeader(com.bakdata.conquery.models.preproc.PreprocessedHeader) RequiredArgsConstructor(lombok.RequiredArgsConstructor) PreprocessedData(com.bakdata.conquery.models.preproc.PreprocessedData) com.bakdata.conquery.models.identifiable.ids.specific(com.bakdata.conquery.models.identifiable.ids.specific) NamespaceStorage(com.bakdata.conquery.io.storage.NamespaceStorage) IdMutex(com.bakdata.conquery.models.identifiable.IdMutex) com.bakdata.conquery.models.datasets(com.bakdata.conquery.models.datasets) IntegerStore(com.bakdata.conquery.models.events.stores.root.IntegerStore) WorkerInformation(com.bakdata.conquery.models.worker.WorkerInformation) BadRequestException(javax.ws.rs.BadRequestException) JSONException(com.bakdata.conquery.models.exceptions.JSONException) DictionaryMapping(com.bakdata.conquery.models.dictionary.DictionaryMapping) IntegerParser(com.bakdata.conquery.models.preproc.parser.specific.IntegerParser) PreprocessedReader(com.bakdata.conquery.models.preproc.PreprocessedReader) ResourceUtil(com.bakdata.conquery.util.ResourceUtil) Bucket(com.bakdata.conquery.models.events.Bucket) MajorTypeId(com.bakdata.conquery.models.events.MajorTypeId) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) StringStore(com.bakdata.conquery.models.events.stores.root.StringStore) com.bakdata.conquery.models.messages.namespaces.specific(com.bakdata.conquery.models.messages.namespaces.specific) Collectors(java.util.stream.Collectors) Entity(com.bakdata.conquery.models.query.entity.Entity) ProgressReporter(com.bakdata.conquery.util.progressreporter.ProgressReporter) Slf4j(lombok.extern.slf4j.Slf4j) 
IntList(it.unimi.dsi.fastutil.ints.IntList) Response(javax.ws.rs.core.Response) IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) WebApplicationException(javax.ws.rs.WebApplicationException) IntSet(it.unimi.dsi.fastutil.ints.IntSet) PreprocessedDictionaries(com.bakdata.conquery.models.preproc.PreprocessedDictionaries) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) Namespace(com.bakdata.conquery.models.worker.Namespace) InputStream(java.io.InputStream) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) Bucket(com.bakdata.conquery.models.events.Bucket) IntSet(it.unimi.dsi.fastutil.ints.IntSet) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) IntList(it.unimi.dsi.fastutil.ints.IntList)

Aggregations

DictionaryMapping (com.bakdata.conquery.models.dictionary.DictionaryMapping)6 Dictionary (com.bakdata.conquery.models.dictionary.Dictionary)4 ColumnStore (com.bakdata.conquery.models.events.stores.root.ColumnStore)3 IntegerStore (com.bakdata.conquery.models.events.stores.root.IntegerStore)3 StringStore (com.bakdata.conquery.models.events.stores.root.StringStore)3 IntegerParser (com.bakdata.conquery.models.preproc.parser.specific.IntegerParser)3 ProgressReporter (com.bakdata.conquery.util.progressreporter.ProgressReporter)3 IntArrayList (it.unimi.dsi.fastutil.ints.IntArrayList)3 IntList (it.unimi.dsi.fastutil.ints.IntList)3 IntOpenHashSet (it.unimi.dsi.fastutil.ints.IntOpenHashSet)3 IntSet (it.unimi.dsi.fastutil.ints.IntSet)3 NamespaceStorage (com.bakdata.conquery.io.storage.NamespaceStorage)2 ConqueryConfig (com.bakdata.conquery.models.config.ConqueryConfig)2 com.bakdata.conquery.models.datasets (com.bakdata.conquery.models.datasets)2 Bucket (com.bakdata.conquery.models.events.Bucket)2 MajorTypeId (com.bakdata.conquery.models.events.MajorTypeId)2 JSONException (com.bakdata.conquery.models.exceptions.JSONException)2 IdMutex (com.bakdata.conquery.models.identifiable.IdMutex)2 com.bakdata.conquery.models.identifiable.ids.specific (com.bakdata.conquery.models.identifiable.ids.specific)2 com.bakdata.conquery.models.messages.namespaces.specific (com.bakdata.conquery.models.messages.namespaces.specific)2