Search in sources :

Example 1 with ColumnStore

use of com.bakdata.conquery.models.events.stores.root.ColumnStore in project conquery by bakdata.

the class AbstractSelectFilter method addImport.

/**
 * Adds every element produced by the iterator of this filter's column store for the
 * given import to {@code values}.
 * <p>
 * {@code values} is created on first use. The store returned by
 * {@code getColumn().getTypeFor(imp)} is cast to {@link StringStore}, so a store of any
 * other type results in a {@link ClassCastException}.
 *
 * @param imp the import whose column store is read
 */
@Override
public void addImport(Import imp) {
    if (values == null) {
        values = new HashSet<>();
    }
    final StringStore stringStore = (StringStore) getColumn().getTypeFor(imp);
    values.addAll(Sets.newHashSet(stringStore.iterator()));
}
Also used : ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) StringStore(com.bakdata.conquery.models.events.stores.root.StringStore)

Example 2 with ColumnStore

use of com.bakdata.conquery.models.events.stores.root.ColumnStore in project conquery by bakdata.

the class Preprocessed method combineStores.

/**
 * Builds one store per column and copies the raw values into it, entity by entity.
 * <p>
 * A store is created for every column via {@code PPColumn::findBestType} and keyed by
 * the column name. The row positions found in {@code rowEntities} are grouped by entity;
 * then, for every column that has raw values, the rows of each entity are written to
 * consecutive output indices beginning at the index {@code entityStart} holds for that
 * entity. Null values are written with {@code setNull}, all others through the column's
 * parser.
 *
 * @param entityStart for every entity, the output index at which its first event is written
 * @return the stores keyed by column name, including stores of columns without raw values
 */
@SuppressWarnings("rawtypes")
private Map<String, ColumnStore> combineStores(Int2IntMap entityStart) {
    final Map<String, ColumnStore> storesByName = Arrays.stream(columns)
            .parallel()
            .collect(Collectors.toMap(PPColumn::getName, PPColumn::findBestType));

    // Row positions grouped by entity. This object can be huge!
    final Int2ObjectMap<IntList> rowsByEntity = new Int2ObjectOpenHashMap<>(entityStart.size());
    final int rowCount = rowEntities.size();
    for (int row = 0; row < rowCount; row++) {
        final int entity = rowEntities.getInt(row);
        rowsByEntity.computeIfAbsent(entity, (ignored) -> new IntArrayList()).add(row);
    }

    for (int colIdx = 0; colIdx < columns.length; colIdx++) {
        final PPColumn column = columns[colIdx];
        final ColumnValues rawValues = values[colIdx];
        if (rawValues == null) {
            // Nothing to copy for this column (likely backed by a compound ColumnStore).
            continue;
        }
        final ColumnStore target = storesByName.get(column.getName());

        for (Int2IntMap.Entry start : entityStart.int2IntEntrySet()) {
            final int entity = start.getIntKey();
            final int firstOutIndex = start.getIntValue();
            final IntList rows = rowsByEntity.getOrDefault(entity, IntLists.emptyList());

            // The n-th row of an entity lands at firstOutIndex + n.
            for (int offset = 0; offset < rows.size(); offset++) {
                final int inIndex = rows.getInt(offset);
                final int outIndex = firstOutIndex + offset;
                if (rawValues.isNull(inIndex)) {
                    target.setNull(outIndex);
                    continue;
                }
                final Object raw = rawValues.get(inIndex);
                column.getParser().setValue(target, outIndex, raw);
            }
        }
    }
    return storesByName;
}
Also used : Arrays(java.util.Arrays) Dictionary(com.bakdata.conquery.models.dictionary.Dictionary) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) ConqueryConfig(com.bakdata.conquery.models.config.ConqueryConfig) Int2IntMap(it.unimi.dsi.fastutil.ints.Int2IntMap) JsonGenerator(com.fasterxml.jackson.core.JsonGenerator) HashMap(java.util.HashMap) Int2IntAVLTreeMap(it.unimi.dsi.fastutil.ints.Int2IntAVLTreeMap) MapTypeGuesser(com.bakdata.conquery.models.preproc.parser.specific.string.MapTypeGuesser) Map(java.util.Map) Parser(com.bakdata.conquery.models.preproc.parser.Parser) IntLists(it.unimi.dsi.fastutil.ints.IntLists) OutputStream(java.io.OutputStream) StringTypeEncoded(com.bakdata.conquery.models.events.stores.specific.string.StringTypeEncoded) Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) MajorTypeId(com.bakdata.conquery.models.events.MajorTypeId) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) StringParser(com.bakdata.conquery.models.preproc.parser.specific.StringParser) StringStore(com.bakdata.conquery.models.events.stores.root.StringStore) Collectors(java.util.stream.Collectors) File(java.io.File) Slf4j(lombok.extern.slf4j.Slf4j) IntList(it.unimi.dsi.fastutil.ints.IntList) ColumnValues(com.bakdata.conquery.models.preproc.parser.ColumnValues) Int2ObjectMap(it.unimi.dsi.fastutil.ints.Int2ObjectMap) Data(lombok.Data) IntSummaryStatistics(java.util.IntSummaryStatistics) GZIPOutputStream(java.util.zip.GZIPOutputStream) Jackson(com.bakdata.conquery.io.jackson.Jackson) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) ColumnValues(com.bakdata.conquery.models.preproc.parser.ColumnValues) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) IntList(it.unimi.dsi.fastutil.ints.IntList)

Example 3 with ColumnStore

use of com.bakdata.conquery.models.events.stores.root.ColumnStore in project conquery by bakdata.

the class ImportJob method sendBuckets.

/**
 * Selects the bucket for every entry of {@code buckets2LocalEntities} and sends it to the
 * worker responsible for that bucket number.
 * <p>
 * For each entry the responsible worker is looked up, {@code awaitFreeJobQueue} is called
 * for it (presumably to wait until the worker can accept more work), the bucket is built
 * with {@code selectBucket} and sent as an {@code ImportBucket} message. Progress is
 * reported on a sub-job sized to the number of buckets.
 *
 * @param starts                passed through to {@code selectBucket}
 * @param lengths               passed through to {@code selectBucket}
 * @param primaryMapping        passed through to {@code selectBucket}
 * @param imp                   the import the buckets belong to
 * @param buckets2LocalEntities bucket number to the entities contained in that bucket
 * @param storesSorted          the column stores, passed through to {@code selectBucket}
 * @return for every worker that received buckets, the ids of those buckets
 * @throws NullPointerException if no worker is responsible for a bucket number
 * @throws JsonProcessingException as declared by the calls made while building or sending a bucket
 */
private Map<WorkerId, Set<BucketId>> sendBuckets(Map<Integer, Integer> starts, Map<Integer, Integer> lengths, DictionaryMapping primaryMapping, Import imp, Map<Integer, List<Integer>> buckets2LocalEntities, ColumnStore[] storesSorted) throws JsonProcessingException {
    final Map<WorkerId, Set<BucketId>> assignments = new HashMap<>();
    final ProgressReporter subJob = getProgressReporter().subJob(buckets2LocalEntities.size());

    for (Map.Entry<Integer, List<Integer>> bucket2entities : buckets2LocalEntities.entrySet()) {
        final Integer bucketNumber = bucket2entities.getKey();

        final WorkerInformation worker = Objects.requireNonNull(
                namespace.getResponsibleWorkerForBucket(bucketNumber),
                () -> "No responsible worker for Bucket#" + bucketNumber
        );
        awaitFreeJobQueue(worker);

        final Bucket bucket = selectBucket(starts, lengths, storesSorted, primaryMapping, imp, bucketNumber, bucket2entities.getValue());

        // Remember which worker now holds this bucket.
        final Set<BucketId> bucketsOfWorker = assignments.computeIfAbsent(worker.getId(), (ignored) -> new HashSet<>());
        bucketsOfWorker.add(bucket.getId());

        log.trace("Sending Bucket[{}] to {}", bucket.getId(), worker.getId());
        worker.send(ImportBucket.forBucket(bucket));
        subJob.report(1);
    }

    subJob.done();
    return assignments;
}
Also used : Dictionary(com.bakdata.conquery.models.dictionary.Dictionary) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) java.util(java.util) ConqueryConfig(com.bakdata.conquery.models.config.ConqueryConfig) Getter(lombok.Getter) PreprocessedHeader(com.bakdata.conquery.models.preproc.PreprocessedHeader) RequiredArgsConstructor(lombok.RequiredArgsConstructor) PreprocessedData(com.bakdata.conquery.models.preproc.PreprocessedData) com.bakdata.conquery.models.identifiable.ids.specific(com.bakdata.conquery.models.identifiable.ids.specific) NamespaceStorage(com.bakdata.conquery.io.storage.NamespaceStorage) IdMutex(com.bakdata.conquery.models.identifiable.IdMutex) com.bakdata.conquery.models.datasets(com.bakdata.conquery.models.datasets) IntegerStore(com.bakdata.conquery.models.events.stores.root.IntegerStore) WorkerInformation(com.bakdata.conquery.models.worker.WorkerInformation) BadRequestException(javax.ws.rs.BadRequestException) JSONException(com.bakdata.conquery.models.exceptions.JSONException) DictionaryMapping(com.bakdata.conquery.models.dictionary.DictionaryMapping) IntegerParser(com.bakdata.conquery.models.preproc.parser.specific.IntegerParser) PreprocessedReader(com.bakdata.conquery.models.preproc.PreprocessedReader) ResourceUtil(com.bakdata.conquery.util.ResourceUtil) Bucket(com.bakdata.conquery.models.events.Bucket) MajorTypeId(com.bakdata.conquery.models.events.MajorTypeId) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) StringStore(com.bakdata.conquery.models.events.stores.root.StringStore) com.bakdata.conquery.models.messages.namespaces.specific(com.bakdata.conquery.models.messages.namespaces.specific) Collectors(java.util.stream.Collectors) Entity(com.bakdata.conquery.models.query.entity.Entity) ProgressReporter(com.bakdata.conquery.util.progressreporter.ProgressReporter) Slf4j(lombok.extern.slf4j.Slf4j) IntList(it.unimi.dsi.fastutil.ints.IntList) 
Response(javax.ws.rs.core.Response) IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) WebApplicationException(javax.ws.rs.WebApplicationException) IntSet(it.unimi.dsi.fastutil.ints.IntSet) PreprocessedDictionaries(com.bakdata.conquery.models.preproc.PreprocessedDictionaries) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) Namespace(com.bakdata.conquery.models.worker.Namespace) InputStream(java.io.InputStream) WorkerInformation(com.bakdata.conquery.models.worker.WorkerInformation) IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) IntSet(it.unimi.dsi.fastutil.ints.IntSet) ProgressReporter(com.bakdata.conquery.util.progressreporter.ProgressReporter) Bucket(com.bakdata.conquery.models.events.Bucket) IntList(it.unimi.dsi.fastutil.ints.IntList) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet)

Example 4 with ColumnStore

use of com.bakdata.conquery.models.events.stores.root.ColumnStore in project conquery by bakdata.

the class ImportJob method execute.

/**
 * Runs the import.
 * <p>
 * In order: imports the primary dictionary and obtains its mapping, distributes worker
 * responsibilities based on that mapping, imports the remaining dictionaries and applies
 * the resulting mappings to the container's stores, creates and stores the
 * {@code Import}, groups the entities by bucket, sends the buckets to the workers and
 * finally registers the sent buckets with the namespace.
 *
 * @throws JSONException        as declared by the invoked import steps
 * @throws InterruptedException as declared by the invoked import steps
 * @throws IOException          as declared by the invoked import steps
 */
@Override
public void execute() throws JSONException, InterruptedException, IOException {
    getProgressReporter().setMax(NUMBER_OF_STEPS);

    // Step 1: load the new primary dictionary data and create the mapping for it.
    log.trace("Updating primary dictionary");
    final DictionaryMapping primaryMapping = importPrimaryDictionary(dictionaries.getPrimaryDictionary());
    getProgressReporter().report(1);

    // Step 2: distribute the new ids among the workers.
    distributeWorkerResponsibilities(primaryMapping);
    getProgressReporter().report(1);

    log.info("Importing Dictionaries");
    final Map<String, DictionaryMapping> sharedMappings =
            importDictionaries(namespace, dictionaries.getDictionaries(), table.getColumns(), header.getName());

    log.info("Remapping Dictionaries {}", sharedMappings.values());
    applyDictionaryMappings(sharedMappings, container.getStores());

    final Import createdImport = createImport(header, container.getStores(), table.getColumns(), container.size());
    namespace.getStorage().updateImport(createdImport);

    final Map<Integer, List<Integer>> entitiesByBucket = groupEntitiesByBucket(container.entities(), primaryMapping, bucketSize);

    // Stores in the order of the table's columns; a column without a store fails here.
    final ColumnStore[] storesInColumnOrder = Arrays.stream(table.getColumns())
            .map(Column::getName)
            .map(container.getStores()::get)
            .map(Objects::requireNonNull)
            .toArray(ColumnStore[]::new);

    log.info("Start sending {} Buckets", entitiesByBucket.size());

    // Tracks which buckets went to which worker.
    final Map<WorkerId, Set<BucketId>> sentBuckets =
            sendBuckets(container.getStarts(), container.getLengths(), primaryMapping, createdImport, entitiesByBucket, storesInColumnOrder);
    sentBuckets.forEach(namespace::addBucketsToWorker);

    getProgressReporter().done();
}
Also used : IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) IntSet(it.unimi.dsi.fastutil.ints.IntSet) DictionaryMapping(com.bakdata.conquery.models.dictionary.DictionaryMapping) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) IntList(it.unimi.dsi.fastutil.ints.IntList) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList)

Example 5 with ColumnStore

use of com.bakdata.conquery.models.events.stores.root.ColumnStore in project conquery by bakdata.

the class Preprocessed method collectDictionaries.

/**
 * Collects the underlying dictionaries of all {@link StringStore}s among the given stores.
 * <p>
 * Stores that are not a {@link StringStore}, and string stores whose underlying
 * dictionary is {@code null}, are left out of the result.
 *
 * @param columnStores the stores to inspect, keyed by name
 * @return a new map from store name to that store's underlying dictionary
 */
private static Map<String, Dictionary> collectDictionaries(Map<String, ColumnStore> columnStores) {
    final Map<String, Dictionary> dictionariesByName = new HashMap<>();
    for (Map.Entry<String, ColumnStore> entry : columnStores.entrySet()) {
        final ColumnStore candidate = entry.getValue();
        if (candidate instanceof StringStore) {
            final Dictionary dictionary = ((StringStore) candidate).getUnderlyingDictionary();
            if (dictionary != null) {
                dictionariesByName.put(entry.getKey(), dictionary);
            }
        }
    }
    return dictionariesByName;
}
Also used : Dictionary(com.bakdata.conquery.models.dictionary.Dictionary) ColumnStore(com.bakdata.conquery.models.events.stores.root.ColumnStore) HashMap(java.util.HashMap) Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) StringStore(com.bakdata.conquery.models.events.stores.root.StringStore) Int2IntMap(it.unimi.dsi.fastutil.ints.Int2IntMap) HashMap(java.util.HashMap) Int2IntAVLTreeMap(it.unimi.dsi.fastutil.ints.Int2IntAVLTreeMap) Map(java.util.Map) Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) Int2ObjectMap(it.unimi.dsi.fastutil.ints.Int2ObjectMap)

Aggregations

ColumnStore (com.bakdata.conquery.models.events.stores.root.ColumnStore)12 StringStore (com.bakdata.conquery.models.events.stores.root.StringStore)6 Dictionary (com.bakdata.conquery.models.dictionary.Dictionary)5 ConqueryConfig (com.bakdata.conquery.models.config.ConqueryConfig)4 IntOpenHashSet (it.unimi.dsi.fastutil.ints.IntOpenHashSet)4 DictionaryMapping (com.bakdata.conquery.models.dictionary.DictionaryMapping)3 Bucket (com.bakdata.conquery.models.events.Bucket)3 MajorTypeId (com.bakdata.conquery.models.events.MajorTypeId)3 Int2IntAVLTreeMap (it.unimi.dsi.fastutil.ints.Int2IntAVLTreeMap)3 IntArrayList (it.unimi.dsi.fastutil.ints.IntArrayList)3 IntList (it.unimi.dsi.fastutil.ints.IntList)3 NamespaceStorage (com.bakdata.conquery.io.storage.NamespaceStorage)2 com.bakdata.conquery.models.datasets (com.bakdata.conquery.models.datasets)2 IntegerDateStore (com.bakdata.conquery.models.events.stores.primitive.IntegerDateStore)2 IntegerStore (com.bakdata.conquery.models.events.stores.root.IntegerStore)2 JSONException (com.bakdata.conquery.models.exceptions.JSONException)2 IdMutex (com.bakdata.conquery.models.identifiable.IdMutex)2 com.bakdata.conquery.models.identifiable.ids.specific (com.bakdata.conquery.models.identifiable.ids.specific)2 com.bakdata.conquery.models.messages.namespaces.specific (com.bakdata.conquery.models.messages.namespaces.specific)2 PreprocessedData (com.bakdata.conquery.models.preproc.PreprocessedData)2