use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.
the class ImportJob method sendBuckets.
/**
 * Select the data of each bucket, then send it to the responsible worker.
 */
private Map<WorkerId, Set<BucketId>> sendBuckets(Map<Integer, Integer> starts, Map<Integer, Integer> lengths, DictionaryMapping primaryMapping, Import imp, Map<Integer, List<Integer>> buckets2LocalEntities, ColumnStore[] storesSorted) throws JsonProcessingException {
    Map<WorkerId, Set<BucketId>> newWorkerAssignments = new HashMap<>();

    final ProgressReporter subJob = getProgressReporter().subJob(buckets2LocalEntities.size());

    for (Map.Entry<Integer, List<Integer>> bucket2entities : buckets2LocalEntities.entrySet()) {
        WorkerInformation responsibleWorker = Objects.requireNonNull(namespace.getResponsibleWorkerForBucket(bucket2entities.getKey()), () -> "No responsible worker for Bucket#" + bucket2entities.getKey());

        awaitFreeJobQueue(responsibleWorker);

        final Bucket bucket = selectBucket(starts, lengths, storesSorted, primaryMapping, imp, bucket2entities.getKey(), bucket2entities.getValue());

        newWorkerAssignments.computeIfAbsent(responsibleWorker.getId(), (ignored) -> new HashSet<>()).add(bucket.getId());

        log.trace("Sending Bucket[{}] to {}", bucket.getId(), responsibleWorker.getId());
        responsibleWorker.send(ImportBucket.forBucket(bucket));

        subJob.report(1);
    }

    subJob.done();

    return newWorkerAssignments;
}
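The per-worker bookkeeping in sendBuckets relies on Map.computeIfAbsent to grow a set of bucket ids per worker in a single pass over all buckets. A minimal, self-contained sketch of that grouping pattern with plain JDK types; the workerForBucket function merely stands in for namespace.getResponsibleWorkerForBucket and is an assumption for illustration, not conquery's API:

import java.util.*;
import java.util.function.IntFunction;

public class AssignmentSketch {

    /** Groups bucket ids by the worker that is responsible for them. */
    static Map<String, Set<Integer>> groupByWorker(Collection<Integer> bucketIds, IntFunction<String> workerForBucket) {
        Map<String, Set<Integer>> assignments = new HashMap<>();
        for (int bucketId : bucketIds) {
            // Fail loudly if no worker is responsible, mirroring the requireNonNull above.
            String worker = Objects.requireNonNull(workerForBucket.apply(bucketId), "No responsible worker for Bucket#" + bucketId);
            assignments.computeIfAbsent(worker, ignored -> new HashSet<>()).add(bucketId);
        }
        return assignments;
    }

    public static void main(String[] args) {
        // Hypothetical round-robin responsibility over two workers.
        Map<String, Set<Integer>> result = groupByWorker(List.of(0, 1, 2, 3), bucket -> "worker-" + (bucket % 2));
        System.out.println(result); // e.g. {worker-0=[0, 2], worker-1=[1, 3]}
    }
}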
use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.
the class ImportJob method execute.
@Override
public void execute() throws JSONException, InterruptedException, IOException {
    getProgressReporter().setMax(NUMBER_OF_STEPS);

    log.trace("Updating primary dictionary");

    // Update primary dictionary: load new data, and create mapping.
    final DictionaryMapping primaryMapping = importPrimaryDictionary(dictionaries.getPrimaryDictionary());
    getProgressReporter().report(1);

    // Distribute the new IDs among workers
    distributeWorkerResponsibilities(primaryMapping);
    getProgressReporter().report(1);

    log.info("Importing Dictionaries");
    Map<String, DictionaryMapping> sharedDictionaryMappings = importDictionaries(namespace, dictionaries.getDictionaries(), table.getColumns(), header.getName());

    log.info("Remapping Dictionaries {}", sharedDictionaryMappings.values());
    applyDictionaryMappings(sharedDictionaryMappings, container.getStores());

    Import imp = createImport(header, container.getStores(), table.getColumns(), container.size());
    namespace.getStorage().updateImport(imp);

    Map<Integer, List<Integer>> buckets2LocalEntities = groupEntitiesByBucket(container.entities(), primaryMapping, bucketSize);

    final ColumnStore[] storesSorted = Arrays.stream(table.getColumns())
                                             .map(Column::getName)
                                             .map(container.getStores()::get)
                                             .map(Objects::requireNonNull)
                                             .toArray(ColumnStore[]::new);

    log.info("Start sending {} Buckets", buckets2LocalEntities.size());

    // We use this to track the assignment of buckets to workers.
    final Map<WorkerId, Set<BucketId>> workerAssignments = sendBuckets(container.getStarts(), container.getLengths(), primaryMapping, imp, buckets2LocalEntities, storesSorted);

    workerAssignments.forEach(namespace::addBucketsToWorker);

    getProgressReporter().done();
}
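groupEntitiesByBucket itself is not listed on this page. A plausible, self-contained sketch of what it has to do, assuming the bucket of a global id is simply globalId / bucketSize (which matches root = bucketSize * bucketId in selectBucket below); localToGlobal stands in for the primary mapping's local-to-global lookup and is an assumption:

import java.util.*;
import java.util.function.IntUnaryOperator;

public class GroupByBucketSketch {

    /** Groups local entity ids by the bucket their global id falls into. */
    static Map<Integer, List<Integer>> groupEntitiesByBucket(Collection<Integer> localEntities, IntUnaryOperator localToGlobal, int bucketSize) {
        Map<Integer, List<Integer>> out = new HashMap<>();
        for (int localId : localEntities) {
            int globalId = localToGlobal.applyAsInt(localId);
            int bucketId = globalId / bucketSize; // assumption: buckets are contiguous ranges of global ids
            out.computeIfAbsent(bucketId, ignored -> new ArrayList<>()).add(localId);
        }
        return out;
    }

    public static void main(String[] args) {
        // Hypothetical mapping: local id i -> global id i + 10, bucket size 8.
        Map<Integer, List<Integer>> buckets = groupEntitiesByBucket(List.of(0, 1, 2, 3), local -> local + 10, 8);
        System.out.println(buckets); // {1=[0, 1, 2, 3]} -> all four entities land in bucket 1
    }
}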
use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.
the class ImportJob method applyDictionaryMappings.
/**
 * Apply the new positions of the shared dictionaries to the incoming string stores.
 */
private void applyDictionaryMappings(Map<String, DictionaryMapping> mappings, Map<String, ColumnStore> values) {
    final ProgressReporter subJob = getProgressReporter().subJob(mappings.size());

    for (Map.Entry<String, DictionaryMapping> entry : mappings.entrySet()) {
        final String columnName = entry.getKey();
        final DictionaryMapping mapping = entry.getValue();

        final StringStore stringStore = (StringStore) values.get(columnName);
        log.debug("Remapping Column[{}] = {} with {}", columnName, stringStore, mapping);

        // We need to find a new type for the index column, as it is going to be remapped and might change in size.
        final IntegerParser indexParser = new IntegerParser(config);
        final IntSummaryStatistics statistics = mapping.target().intStream().summaryStatistics();

        indexParser.setLines(stringStore.getLines());
        indexParser.setMinValue(statistics.getMin());
        indexParser.setMaxValue(statistics.getMax());

        final IntegerStore newType = indexParser.findBestType();
        log.trace("Decided for {}", newType);

        mapping.applyToStore(stringStore, newType);
        stringStore.setIndexStore(newType);

        subJob.report(1);
    }
}
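The essence of the remapping step, stripped of conquery's store abstractions: every row of a string column holds an index into the old dictionary, so applying the mapping is one lookup per row, and the min/max of the remapped values decides how wide the new index store has to be. A self-contained sketch; the range computation only illustrates what feeding min/max into IntegerParser before findBestType achieves and is not conquery's implementation:

import java.util.IntSummaryStatistics;
import java.util.stream.IntStream;

public class RemapSketch {

    /** Remaps every old dictionary index to its position in the merged dictionary. */
    static int[] remap(int[] oldIndices, int[] sourceToTarget) {
        int[] remapped = new int[oldIndices.length];
        for (int i = 0; i < oldIndices.length; i++) {
            remapped[i] = sourceToTarget[oldIndices[i]];
        }
        return remapped;
    }

    public static void main(String[] args) {
        int[] column = {0, 2, 1, 2};      // row values = indices into the old dictionary
        int[] sourceToTarget = {5, 7, 6}; // old index -> position in the shared dictionary
        int[] remapped = remap(column, sourceToTarget);

        // The value range of the remapped indices determines the width of the new index store.
        IntSummaryStatistics stats = IntStream.of(remapped).summaryStatistics();
        System.out.printf("remapped range [%d, %d]%n", stats.getMin(), stats.getMax()); // [5, 7]
    }
}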
use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.
the class ImportJob method importDictionaries.
/**
 * Import all dictionaries. Shared dictionaries are merged into existing ones; all dictionaries are distributed to the corresponding workers.
 * Mappings are created for the shared dictionaries.
 * This is not synchronized because the method is called within the job execution.
 */
private static Map<String, DictionaryMapping> importDictionaries(Namespace namespace, Map<String, Dictionary> dicts, Column[] columns, String importName) {
    // Empty maps are coalesced to null by Jackson.
    if (dicts == null) {
        return Collections.emptyMap();
    }

    final Map<String, DictionaryMapping> out = new HashMap<>();

    log.trace("Importing Dictionaries");

    for (Column column : columns) {
        if (column.getType() != MajorTypeId.STRING) {
            continue;
        }

        // The column might not have an underlying Dictionary (e.g. Singleton, direct-Number),
        // but this could also be an error :/ Most likely the former.
        final Dictionary importDictionary = dicts.get(column.getName());

        if (importDictionary == null) {
            log.trace("No Dictionary for {}", column);
            continue;
        }

        if (column.getSharedDictionary() == null) {
            // Normal dictionary -> no merge necessary, just distribute.
            distributeDictionary(namespace, importDictionary);
        }
        else {
            // It's a shared dictionary.
            final String sharedDictionaryName = column.getSharedDictionary();
            log.trace("Column[{}.{}] part of shared Dictionary[{}]", importName, column.getName(), sharedDictionaryName);

            final DictionaryId dictionaryId = new DictionaryId(namespace.getDataset().getId(), sharedDictionaryName);
            final Dictionary sharedDictionary = namespace.getStorage().getDictionary(dictionaryId);

            // This should never fail, because the dictionary is pre-created in the replacement-generation step.
            ResourceUtil.throwNotFoundIfNull(dictionaryId, sharedDictionary);

            log.trace("Merging into shared Dictionary[{}]", sharedDictionary);

            DictionaryMapping mapping = DictionaryMapping.createAndImport(importDictionary, sharedDictionary);

            if (mapping.getNumberOfNewIds() != 0) {
                distributeDictionary(namespace, mapping.getTargetDictionary());
            }

            out.put(column.getName(), mapping);
        }
    }
    return out;
}
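What DictionaryMapping.createAndImport conceptually does for a shared dictionary can be illustrated without conquery's types: append every value of the import dictionary that the shared dictionary does not yet contain, and record, per source index, the position the value ends up with in the shared dictionary. A hedged sketch with plain JDK collections; all names here are illustrative and not conquery's API:

import java.util.*;

public class SharedDictionarySketch {

    /** source index -> target index, plus the count of values that were new to the shared dictionary. */
    record Mapping(int[] sourceToTarget, int numberOfNewIds) {}

    static Mapping mergeInto(List<String> importDict, List<String> sharedDict) {
        Map<String, Integer> positions = new HashMap<>();
        for (int i = 0; i < sharedDict.size(); i++) {
            positions.put(sharedDict.get(i), i);
        }

        int[] sourceToTarget = new int[importDict.size()];
        int newIds = 0;
        for (int source = 0; source < importDict.size(); source++) {
            String value = importDict.get(source);
            Integer target = positions.get(value);
            if (target == null) {
                // Unknown value: append it to the shared dictionary.
                target = sharedDict.size();
                sharedDict.add(value);
                positions.put(value, target);
                newIds++;
            }
            sourceToTarget[source] = target;
        }
        return new Mapping(sourceToTarget, newIds);
    }

    public static void main(String[] args) {
        List<String> shared = new ArrayList<>(List.of("a", "b"));
        Mapping mapping = mergeInto(List.of("b", "c"), shared);
        System.out.println(Arrays.toString(mapping.sourceToTarget())); // [1, 2]
        System.out.println(mapping.numberOfNewIds());                  // 1 -> the grown shared dictionary must be redistributed
    }
}

This mirrors the check in importDictionaries: only if getNumberOfNewIds() is non-zero did the shared dictionary actually grow and need to be sent to the workers again.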
use of com.bakdata.conquery.models.dictionary.DictionaryMapping in project conquery by bakdata.
the class ImportJob method selectBucket.
/**
 * - remap entity ids to global ids
 * - calculate per-entity regions of the Bucket (start/end)
 * - split stores
 */
private Bucket selectBucket(Map<Integer, Integer> localStarts, Map<Integer, Integer> localLengths, ColumnStore[] stores, DictionaryMapping primaryMapping, Import imp, int bucketId, List<Integer> localEntities) {
    final int root = bucketSize * bucketId;

    IntList selectionStart = new IntArrayList();
    IntList selectionLength = new IntArrayList();
    IntSet entities = new IntOpenHashSet();

    // The first entity of the bucket starts at 0, the following are appended.
    int[] entityStarts = new int[bucketSize];
    int[] entityEnds = new int[bucketSize];

    Arrays.fill(entityEnds, -1);
    Arrays.fill(entityStarts, -1);

    int currentStart = 0;

    for (int position = 0; position < bucketSize; position++) {
        int globalId = root + position;
        int localId = primaryMapping.target2Source(globalId);

        if (localId == -1) {
            continue;
        }

        if (!localStarts.containsKey(localId)) {
            continue;
        }

        entities.add(globalId);

        final int length = localLengths.get(localId);

        selectionStart.add(localStarts.get(localId));
        selectionLength.add(length);

        entityStarts[position] = currentStart;
        entityEnds[position] = currentStart + length;

        currentStart += length;
    }

    // Copy only the parts of the bucket we need.
    final ColumnStore[] bucketStores = Arrays.stream(stores)
                                             .map(store -> store.select(selectionStart.toIntArray(), selectionLength.toIntArray()))
                                             .toArray(ColumnStore[]::new);

    return new Bucket(bucketId, root, selectionLength.intStream().sum(), bucketStores, entities, entityStarts, entityEnds, imp);
}
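The start/end bookkeeping is easiest to see with concrete numbers: if three entities with lengths 4, 2 and 3 occur in a bucket, their selected rows are concatenated, so the starts become 0, 4, 6 and the ends 4, 6, 9 (positions without an entity keep -1 in the real method). A minimal, self-contained sketch of just that offset computation, lengths only, no stores:

import java.util.Arrays;

public class OffsetSketch {

    public static void main(String[] args) {
        // Lengths of the entities that actually occur in this bucket, in bucket-position order.
        int[] lengths = {4, 2, 3};

        int[] entityStarts = new int[lengths.length];
        int[] entityEnds = new int[lengths.length];

        int currentStart = 0;
        for (int position = 0; position < lengths.length; position++) {
            entityStarts[position] = currentStart;
            entityEnds[position] = currentStart + lengths[position];
            currentStart += lengths[position];
        }

        System.out.println(Arrays.toString(entityStarts)); // [0, 4, 6]
        System.out.println(Arrays.toString(entityEnds));   // [4, 6, 9]
    }
}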