Search in sources :

Example 1 with DetailedFacetItem

use of edu.stanford.muse.util.DetailedFacetItem in project epadd by ePADD.

The method initialize of the class CrossCollectionSearch.

/**
 * Initializes the lookup structures for cross collection search (the per-archive entity infos,
 * allCEntities and cTokenToInfos) by reading every archive found directly under the base dir.
 * Synchronized so there's no chance of doing it multiple times at the same time.
 *
 * Entries of baseDir that are not directories, or that do not contain the default session file,
 * are skipped. A failure while loading one archive is logged and does not stop the remaining
 * archives from being processed. If baseDir cannot be listed, a warning is logged and the
 * method returns with an empty cTokenToInfos.
 *
 * @param baseDir directory whose immediate sub-directories are the archives to index
 */
private static synchronized void initialize(String baseDir) {
    // NOTE(review): the token map is recreated unconditionally on every call, so any
    // "create only once per run" guarantee has to come from the caller -- confirm.
    // allCEntities is only added to here, never cleared.
    // in the future, this may be read from a serialized file, etc.
    cTokenToInfos = LinkedHashMultimap.create();
    File[] files = new File(baseDir).listFiles();
    if (files == null) {
        log.warn("Trying to initialize cross collection search from an invalid directory: " + baseDir);
        return;
    }
    // only advanced for directories that reach the end of the loop body (i.e. were loaded or threw);
    // directories skipped via continue do not consume a number.
    int archiveNum = 0;
    for (File f : files) {
        if (!f.isDirectory())
            continue;
        try {
            String archiveFile = f.getAbsolutePath() + File.separator + Archive.BAG_DATA_FOLDER + File.separator + Archive.SESSIONS_SUBDIR + File.separator + "default" + SimpleSessions.getSessionSuffix();
            if (!new File(archiveFile).exists()) {
                log.warn("Unable to find archive file " + archiveFile + ".. Serious error");
                continue;
            }
            // Assumption is that this feature is present only in discovery mode. In future when we want to add it to processing, we need proper care.
            Archive archive = ArchiveReaderWriter.readArchiveIfPresent(f.getAbsolutePath(), ModeConfig.Mode.DISCOVERY);
            if (archive == null) {
                log.warn("failed to read archive from " + f.getAbsolutePath());
                continue;
            }
            log.info("Loaded archive from " + f.getAbsolutePath());
            log.info("Loaded archive metadata from " + f.getAbsolutePath());
            // process all docs in this archive to set up centityToInfo map
            String archiveID = ArchiveReaderWriter.getArchiveIDForArchive(archive);
            Map<String, EntityInfo> centityToInfo = new LinkedHashMap<>();

            // get all contacts from the addressbook: (canonical name or email) -> ((first date, last date), #messages)
            Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> correspondentEntities = new LinkedHashSet<>();
            Map<Contact, DetailedFacetItem> res = IndexUtils.partitionDocsByPerson(archive.getAllDocs(), archive.getAddressBook());
            for (Map.Entry<Contact, DetailedFacetItem> s : res.entrySet()) {
                Contact c = s.getKey();
                // get duration (first and last doc where this contact was used);
                // fall back to the collection-wide dates for whatever part is missing
                Set<EmailDocument> edocs = s.getValue().docs.stream().map(t -> (EmailDocument) t).collect(Collectors.toSet());
                Pair<Date, Date> duration = EmailUtils.getFirstLast(edocs);
                if (duration == null) {
                    duration = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
                }
                if (duration.first == null)
                    duration.first = archive.collectionMetadata.firstDate;
                if (duration.second == null)
                    duration.second = archive.collectionMetadata.lastDate;
                // get number of messages where this was used.
                final Integer count = s.getValue().docs.size();
                final Pair<Date, Date> span = duration;
                // every non-empty name and email address of the contact becomes a correspondent entity
                if (c.getNames() != null) {
                    c.getNames().forEach(w -> {
                        if (!Util.nullOrEmpty(w))
                            correspondentEntities.add(new Pair<>(canonicalize(w), new Pair<>(span, count)));
                    });
                }
                if (c.getEmails() != null) {
                    c.getEmails().forEach(w -> {
                        if (!Util.nullOrEmpty(w))
                            correspondentEntities.add(new Pair<>(canonicalize(w), new Pair<>(span, count)));
                    });
                }
            }

            // get all entities from entitybookmanager
            // filter out any null or empty strings (just in case)
            // don't canonicalize right away because we need to keep the original form of the name
            Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> entitiessummary = archive.getEntityBookManager().getAllEntitiesSummary();
            entitiessummary = entitiessummary.stream().filter(s -> !Util.nullOrEmpty(s.first)).collect(Collectors.toSet());

            // if an entity is present as a person entity as well as in correspondent then consider the count of the
            // person entity as the final count. Therefore start with processing of correspondent entities...
            for (Pair<String, Pair<Pair<Date, Date>, Integer>> entity : correspondentEntities)
                recordEntityInfo(centityToInfo, archiveID, entity, true);
            // ...and let the entity book entries overwrite dates and count afterwards.
            for (Pair<String, Pair<Pair<Date, Date>, Integer>> entity : entitiessummary)
                recordEntityInfo(centityToInfo, archiveID, entity, false);

            log.info("Archive # " + archiveNum + " read " + centityToInfo.size() + " entities");
            // now set up this map as a token map
            for (Map.Entry<String, EntityInfo> entry : centityToInfo.entrySet()) {
                // the key is already the canonical form of the entry's display name
                String centity = entry.getKey();
                allCEntities.add(centity);
                // consider a set of tokens because we don't want repeats
                Set<String> ctokens = new LinkedHashSet<>(Util.tokenize(centity));
                for (String ctoken : ctokens)
                    cTokenToInfos.put(ctoken, entry.getValue());
            }
        } catch (Exception e) {
            Util.print_exception("Error loading archive in directory " + f.getAbsolutePath(), e, log);
        }
        archiveNum++;
    }
}

/**
 * Creates or updates the EntityInfo stored under the canonical form of entity.first.
 * A new info takes entity.first as its display name; an existing one keeps its display name.
 * Dates and count are always overwritten with the values carried by entity.
 *
 * @param centityToInfo     canonical entity name -> info, updated in place
 * @param archiveID         id recorded on newly created infos
 * @param entity            (name, ((first date, last date), count))
 * @param fromCorrespondents when true the info is flagged as a correspondent; when false the flag is left untouched
 */
private static void recordEntityInfo(Map<String, EntityInfo> centityToInfo, String archiveID, Pair<String, Pair<Pair<Date, Date>, Integer>> entity, boolean fromCorrespondents) {
    String centity = canonicalize(entity.first);
    EntityInfo ei = centityToInfo.get(centity);
    if (ei == null) {
        ei = new EntityInfo();
        ei.archiveID = archiveID;
        ei.displayName = entity.first;
        centityToInfo.put(centity, ei);
    }
    // never reset the flag: an entity seen first as a correspondent stays one
    if (fromCorrespondents)
        ei.isCorrespondent = true;
    ei.firstDate = entity.second.first.first;
    ei.lastDate = entity.second.first.second;
    ei.count = entity.second.second;
}
Also used : Config(edu.stanford.muse.Config) java.util(java.util) edu.stanford.muse.index(edu.stanford.muse.index) AddressBook(edu.stanford.muse.AddressBookManager.AddressBook) Util(edu.stanford.muse.util.Util) Multimap(com.google.common.collect.Multimap) Collectors(java.util.stream.Collectors) File(java.io.File) MappedEntity(edu.stanford.muse.ie.variants.MappedEntity) DetailedFacetItem(edu.stanford.muse.util.DetailedFacetItem) Contact(edu.stanford.muse.AddressBookManager.Contact) Pair(edu.stanford.muse.util.Pair) Logger(org.apache.logging.log4j.Logger) EntityBook(edu.stanford.muse.ie.variants.EntityBook) EmailUtils(edu.stanford.muse.util.EmailUtils) SimpleSessions(edu.stanford.muse.webapp.SimpleSessions) ModeConfig(edu.stanford.muse.webapp.ModeConfig) LogManager(org.apache.logging.log4j.LogManager) LinkedHashMultimap(com.google.common.collect.LinkedHashMultimap) Contact(edu.stanford.muse.AddressBookManager.Contact) DetailedFacetItem(edu.stanford.muse.util.DetailedFacetItem) File(java.io.File) Pair(edu.stanford.muse.util.Pair)

Aggregations

LinkedHashMultimap (com.google.common.collect.LinkedHashMultimap)1 Multimap (com.google.common.collect.Multimap)1 AddressBook (edu.stanford.muse.AddressBookManager.AddressBook)1 Contact (edu.stanford.muse.AddressBookManager.Contact)1 Config (edu.stanford.muse.Config)1 EntityBook (edu.stanford.muse.ie.variants.EntityBook)1 MappedEntity (edu.stanford.muse.ie.variants.MappedEntity)1 edu.stanford.muse.index (edu.stanford.muse.index)1 DetailedFacetItem (edu.stanford.muse.util.DetailedFacetItem)1 EmailUtils (edu.stanford.muse.util.EmailUtils)1 Pair (edu.stanford.muse.util.Pair)1 Util (edu.stanford.muse.util.Util)1 ModeConfig (edu.stanford.muse.webapp.ModeConfig)1 SimpleSessions (edu.stanford.muse.webapp.SimpleSessions)1 File (java.io.File)1 java.util (java.util)1 Collectors (java.util.stream.Collectors)1 LogManager (org.apache.logging.log4j.LogManager)1 Logger (org.apache.logging.log4j.Logger)1