Search in sources :

Example 1 with Archive

use of edu.stanford.muse.index.Archive in project epadd by ePADD.

In class EntityFeature, method checkIndex:

/**
 * Checks for a feature index and creates one if required.
 *
 * @param archive the archive whose documents are analysed and indexed
 * @param force   force creation of the index irrespective of previous
 *                existence of the index
 * @return true if the index already exists or was built successfully;
 *         false if the operation was cancelled
 */
public boolean checkIndex(Archive archive, boolean force) {
    boolean exists = indexExists(archive);
    // c* = raw extracted counts, g* = canonicalized ("good") counts, f* = newly created feature counts
    int c1 = 0, c2 = 0, c3 = 0;
    int g1 = 0, g2 = 0, g3 = 0;
    int f1 = 0, f2 = 0, f3 = 0;
    boolean istatus = true;
    if (force || !exists) {
        Map<String, EntityFeature> features = new HashMap<>();
        Collection<EmailDocument> docs = (Collection) archive.getAllDocs();
        int totalEntities = 0;
        log.info("No feature index found..., starting to process and index. This can take a while.");
        int di = 0;
        for (EmailDocument ed : docs) {
            if (cancel) {
                // user aborted: remove any partially built index state
                clean(archive);
                return false;
            }
            if (di % 1000 == 0) {
                JSPHelper.log.info("Done analysing documents: " + di + " of: " + docs.size());
                status = "Analyzed " + di + "/" + docs.size() + " email documents";
                // analysis phase accounts for the first 50% of overall progress
                pctComplete = ((double) di * 50) / (double) docs.size();
            }
            di++;
            List<Span> names;
            try {
                names = Arrays.asList(archive.getAllNamesInDoc(ed, true));
            } catch (IOException ioe) {
                log.error("Problem accessing entities in " + ed.getUniqueId(), ioe);
                continue;
            }
            List<String> entities = names.stream().filter(n -> n.type == NEType.Type.PERSON.getCode()).map(n -> n.text).collect(Collectors.toList());
            List<String> places = names.stream().filter(n -> n.type == NEType.Type.PLACE.getCode()).map(n -> n.text).collect(Collectors.toList());
            List<String> orgs = names.stream().filter(n -> n.type == NEType.Type.ORGANISATION.getCode()).map(n -> n.text).collect(Collectors.toList());
            // Stream.collect never returns null, so the former null guards were dead code
            c1 += entities.size();
            c2 += orgs.size();
            c3 += places.size();
            // canonical name -> coarse type ("person" | "org" | "places")
            Map<String, String> goodNames = new HashMap<>();
            List<String> correspondents = ed.getAllNames();
            List<String> addresses = ed.getAllAddrs();
            if (correspondents != null)
                for (String c : correspondents) {
                    // only multi-word correspondent names are treated as reliable person names
                    if (c != null && c.contains(" ")) {
                        String n = IndexUtils.canonicalizeEntity(c);
                        // guard against a null canonical key, consistent with the loops below
                        // (the old code could insert a null key here)
                        if (n == null)
                            continue;
                        goodNames.put(n, "person");
                    }
                }
            for (String e : entities) {
                if (e != null && e.contains(" ")) {
                    String canonicalEntity = IndexUtils.canonicalizeEntity(e);
                    if (canonicalEntity == null)
                        continue;
                    goodNames.put(canonicalEntity, "person");
                    g1++;
                }
            }
            for (String o : orgs) {
                String canonicalEntity = IndexUtils.canonicalizeEntity(o);
                if (canonicalEntity == null)
                    continue;
                goodNames.put(canonicalEntity, "org");
                g2++;
            }
            for (String p : places) {
                String canonicalEntity = IndexUtils.canonicalizeEntity(p);
                if (canonicalEntity == null)
                    continue;
                goodNames.put(canonicalEntity, "places");
                g3++;
            }
            // O(goodNames.size()); single map lookup per name instead of repeated features.get(gn)
            for (Map.Entry<String, String> entry : goodNames.entrySet()) {
                String gn = entry.getKey();
                EntityFeature feature = features.get(gn);
                if (feature == null) {
                    switch (entry.getValue()) {
                        case "person":
                            feature = new EntityFeature(gn, EntityFeature.PERSON);
                            f1++;
                            break;
                        case "org":
                            feature = new EntityFeature(gn, EntityFeature.ORG);
                            f2++;
                            break;
                        case "places":
                            feature = new EntityFeature(gn, EntityFeature.PLACE);
                            f3++;
                            break;
                    }
                    features.put(gn, feature);
                }
                feature.accountForThis();
                // record co-occurring entities of this document and the sender/recipient addresses
                feature.addAllCE(goodNames.keySet());
                if (addresses != null)
                    feature.addAllEA(addresses);
                // accumulate raw occurrence count; normalised to a probability below
                feature.priorProbablity = feature.priorProbablity + 1.0;
                totalEntities++;
            }
        }
        log.info("Found: " + c1 + " entities, " + c2 + " orgs and " + c3 + " places");
        log.info("Gn: " + g1 + " entities, " + g2 + " orgs and " + g3 + " places");
        log.info("Found goodfeatures: " + f1 + " entities, " + f2 + " orgs and " + f3 + " places");
        // normalise accumulated counts into prior probabilities
        for (EntityFeature ef : features.values())
            ef.priorProbablity = ef.priorProbablity / (double) totalEntities;
        log.info("Done analysing docs. Starting to index.");
        istatus = index(features, archive);
    }
    return istatus;
}
Also used : java.util(java.util) Config(edu.stanford.muse.Config) TypeToken(com.google.gson.reflect.TypeToken) CharArraySet(org.apache.lucene.analysis.CharArraySet) StringField(org.apache.lucene.document.StringField) JSPHelper(edu.stanford.muse.webapp.JSPHelper) StatusProvider(edu.stanford.muse.email.StatusProvider) OpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode) JSONUtils(edu.stanford.muse.util.JSONUtils) Document(org.apache.lucene.document.Document) Gson(com.google.gson.Gson) org.apache.lucene.search(org.apache.lucene.search) Directory(org.apache.lucene.store.Directory) SimpleSessions(edu.stanford.muse.webapp.SimpleSessions) FSDirectory(org.apache.lucene.store.FSDirectory) Span(edu.stanford.muse.util.Span) FeatureUtils(edu.stanford.muse.ner.featuregen.FeatureUtils) IOException(java.io.IOException) IndexUtils(edu.stanford.muse.index.IndexUtils) Collectors(java.util.stream.Collectors) Version(org.apache.lucene.util.Version) File(java.io.File) Serializable(java.io.Serializable) org.apache.lucene.index(org.apache.lucene.index) Pair(edu.stanford.muse.util.Pair) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Archive(edu.stanford.muse.index.Archive) Field(org.apache.lucene.document.Field) NEType(edu.stanford.muse.ner.model.NEType) Log(org.apache.commons.logging.Log) TextField(org.apache.lucene.document.TextField) EmailDocument(edu.stanford.muse.index.EmailDocument) LogFactory(org.apache.commons.logging.LogFactory) StopAnalyzer(org.apache.lucene.analysis.core.StopAnalyzer) StringEscapeUtils(org.apache.commons.lang.StringEscapeUtils) EmailDocument(edu.stanford.muse.index.EmailDocument) IOException(java.io.IOException) Span(edu.stanford.muse.util.Span)

Example 2 with Archive

use of edu.stanford.muse.index.Archive in project epadd by ePADD.

In class EntityFeature, method main:

/**
 * Ad-hoc manual test driver: loads the default appraisal archive, runs an
 * entity-feature query for "florida", then dumps the "name" field of every
 * document in the feature index.
 */
public static void main(String[] args) {
    long start_time = System.currentTimeMillis();
    Archive archive = null;
    try {
        String aFile = System.getProperty("user.home") + File.separator + "epadd-appraisal" + File.separator + "user";
        archive = SimpleSessions.readArchiveIfPresent(aFile);
    } catch (Exception e) {
        e.printStackTrace();
    }
    // getAbbreviations("HPC", archive);
    Set<EntityFeature> efts = EntityFeature.getMatches("florida", archive);
    long end_time = System.currentTimeMillis();
    System.err.println("Query completed in: " + (end_time - start_time));
    for (EntityFeature eft : efts) System.err.println(eft.cooccuringEntities.keySet());
    try {
        // NOTE(review): "reader" below is a field declared elsewhere in this class;
        // presumably getMatches(...) initialises it — confirm before relying on it here.
        for (LeafReaderContext ctx : reader.leaves()) {
            // renamed from "reader" to stop shadowing the field being iterated above
            LeafReader leafReader = ctx.reader();
            for (int i = 0; i < leafReader.maxDoc(); i++) {
                Document doc = leafReader.document(i);
                // for (IndexableField f: doc.getFields())
                System.err.println(doc.get("name"));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Also used : Archive(edu.stanford.muse.index.Archive) Document(org.apache.lucene.document.Document) EmailDocument(edu.stanford.muse.index.EmailDocument) IOException(java.io.IOException)

Example 3 with Archive

use of edu.stanford.muse.index.Archive in project epadd by ePADD.

In class SessionListener, method sessionDestroyed:

/**
 * Servlet-container callback fired when an HTTP session is destroyed.
 * Saves the session's archive (unless in discovery mode) so annotations,
 * flags, etc. persist, then decrements the live-session counter.
 */
public void sessionDestroyed(HttpSessionEvent event) {
    log.info("Destroying session: " + event.getSession().getId() + " at " + new Date());
    HttpSession session = event.getSession();
    if (ModeConfig.isDiscoveryMode())
        log.info("Not saving archive on session destroy because we're in discovery mode");
    else {
        // save the archive before quitting the session, so the annotations, flags, etc. can be saved
        Archive archive = (Archive) session.getAttribute("archive");
        if (archive != null)
            try {
                SimpleSessions.saveArchive(archive);
            } catch (Exception e) {
                // log and fall through: the session is destroyed regardless, so the
                // counter below must still be decremented (the former early return
                // here left sessionCount permanently inflated after a failed save)
                Util.print_exception(e, log);
            }
    }
    synchronized (this) {
        sessionCount--;
        log.info("Current number of sessions: " + sessionCount);
    }
}
Also used : Archive(edu.stanford.muse.index.Archive) HttpSession(javax.servlet.http.HttpSession) Date(java.util.Date)

Example 4 with Archive

use of edu.stanford.muse.index.Archive in project epadd by ePADD.

In class SimpleSessions, method loadSessionAsMap:

/**
 * Loads a session from the given filename and returns the map of loaded
 * attributes.
 * If readOnly is false, the caller MUST make sure to call packIndex.
 * baseDir is Indexer's baseDir (path before "indexes/").
 *
 * @throws IOException
 * @throws LockObtainFailedException
 * @throws CorruptIndexException
 * Change as on Nov 2017 -
 * Earlier the whole archive was serialized and deserialized as one big entity. Now it is broken into
 * four main parts: Addressbook, entitybook, correspondentAuthorityMapper and the rest of the object.
 * We save all four components separately in saveArchive; therefore while reading, we need to read
 * each of them separately from the appropriate files.
 */
public static Map<String, Object> loadSessionAsMap(String filename, String baseDir, boolean readOnly) throws IOException {
    log.info("Loading session from file " + filename + " size: " + Util.commatize(new File(filename).length() / 1024) + " KB");
    Map<String, Object> result = new LinkedHashMap<>();
    // keep reading (key, value) pairs till EOF; try-with-resources guarantees the
    // stream is closed on every path (the old code leaked it if close itself threw)
    try (ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename)))) {
        while (true) {
            String key = (String) ois.readObject();
            log.info("loading key: " + key);
            try {
                Object value = ois.readObject();
                if (value == null)
                    break;
                result.put(key, value);
            } catch (InvalidClassException ice) {
                log.error("Bad version for value of key " + key + ": " + ice + "\nContinuing but this key is not set...");
            } catch (ClassNotFoundException cnfe) {
                log.error("Class not found for value of key " + key + ": " + cnfe + "\nContinuing but this key is not set...");
            }
        }
    } catch (EOFException eof) {
        // normal termination: the writer does not emit an explicit end marker
        log.info("end of session file reached");
    } catch (Exception e) {
        // any other failure invalidates everything read so far
        log.warn("Warning unable to load session: " + Util.stackTrace(e));
        result.clear();
    }
    // need to set up sentiments explicitly -- now no need since lexicon is part of the session
    log.info("Memory status: " + Util.getMemoryStats());
    Archive archive = (Archive) result.get("archive");
    // no groups in public mode
    if (archive != null) {
        // Read the other modules of the Archive object which were marked transient and hence did not serialize.
        // File path names of addressbook, entitybook and correspondentAuthorityMapper data.
        String dir = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR;
        String addressBookPath = dir + File.separatorChar + Archive.ADDRESSBOOK_SUFFIX;
        String entityBookPath = dir + File.separatorChar + Archive.ENTITYBOOK_SUFFIX;
        String cAuthorityPath = dir + File.separatorChar + Archive.CAUTHORITYMAPPER_SUFFIX;
        String labMapDirPath = dir + File.separatorChar + Archive.LABELMAPDIR;
        String annotationMapPath = dir + File.separatorChar + Archive.ANNOTATION_SUFFIX;
        // if the three mandatory component files are not present, start afresh with
        // importing the email archive again in processing mode
        if (!(new File(addressBookPath).exists()) || !(new File(entityBookPath).exists()) || !(new File(cAuthorityPath).exists())) {
            result.put("archive", null);
            return result;
        }
        // ///////////////AddressBook////////////////////////////////////////////
        // NOTE(review): FileReader uses the platform default charset — confirm the
        // component files are always written with the same charset before changing this.
        try (BufferedReader br = new BufferedReader(new FileReader(addressBookPath))) {
            archive.addressBook = AddressBook.readObjectFromStream(br);
        }
        // //////////////EntityBook/////////////////////////////////////
        try (BufferedReader br = new BufferedReader(new FileReader(entityBookPath))) {
            archive.setEntityBook(EntityBook.readObjectFromStream(br));
        }
        // /////////////CorrespondentAuthorityMapper/////////////////////////////
        archive.correspondentAuthorityMapper = CorrespondentAuthorityMapper.readObjectFromStream(cAuthorityPath);
        // ///////////////Label Mapper/////////////////////////////////////////////////////
        LabelManager labelManager;
        try {
            labelManager = LabelManager.readObjectFromStream(labMapDirPath);
        } catch (Exception e) {
            Util.print_exception("Exception in reading label manager from archive, assigning a new label manager", e, log);
            labelManager = new LabelManager();
        }
        archive.setLabelManager(labelManager);
        // /////////////Annotation Manager///////////////////////////////////////////////////////
        archive.setAnnotationManager(AnnotationManager.readObjectFromStream(annotationMapPath));
        // this is useful when we import a legacy archive into processing, where we've
        // updated the pm file directly, without updating the archive
        try {
            archive.collectionMetadata = readCollectionMetadata(baseDir);
        } catch (Exception e) {
            Util.print_exception("Error trying to read processing metadata file", e, log);
        }
        // ///////////////////////////Done reading//////////////////////////////////////////////////////
        // most of this code should probably move inside Archive, maybe a function called "postDeserialized()"
        archive.postDeserialized(baseDir, readOnly);
        result.put("emailDocs", archive.getAllDocs());
    }
    return result;
}
Also used : AnnotationManager(edu.stanford.muse.AnnotationManager.AnnotationManager) Archive(edu.stanford.muse.index.Archive) EntityBook(edu.stanford.muse.ie.variants.EntityBook) ParseException(org.apache.lucene.queryparser.classic.ParseException) LockObtainFailedException(org.apache.lucene.store.LockObtainFailedException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) GZIPInputStream(java.util.zip.GZIPInputStream) AddressBook(edu.stanford.muse.AddressBookManager.AddressBook) CorrespondentAuthorityMapper(edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper) LabelManager(edu.stanford.muse.LabelManager.LabelManager)

Example 5 with Archive

use of edu.stanford.muse.index.Archive in project epadd by ePADD.

In class SimpleSessions, method prepareAndLoadDefaultArchive:

/**
 * Reads the archive from the default dir (usually ~/.muse/user) and sets up
 * cachedir / archive vars.
 *
 * @return the archive read from the cache dir, or null if none was present
 */
public static Archive prepareAndLoadDefaultArchive(HttpServletRequest request) throws IOException {
    // request.getSession() is kept for its side effect of ensuring a session exists
    HttpSession session = request.getSession();
    // a "cacheDir" request parameter overrides the default location
    String dir = request.getParameter("cacheDir");
    if (Util.nullOrEmpty(dir))
        dir = SimpleSessions.CACHE_DIR;
    JSPHelper.log.info("Trying to read archive from " + dir);
    Archive archive = SimpleSessions.readArchiveIfPresent(dir);
    if (archive == null)
        return null;
    JSPHelper.log.info("Good, archive read from " + dir);
    // prepare default lexicon files etc. (is this really needed?)
    Archive.prepareBaseDir(dir);
    return archive;
}
Also used : Archive(edu.stanford.muse.index.Archive) HttpSession(javax.servlet.http.HttpSession)

Aggregations

Archive (edu.stanford.muse.index.Archive)10 EmailDocument (edu.stanford.muse.index.EmailDocument)3 HttpSession (javax.servlet.http.HttpSession)3 AddressBook (edu.stanford.muse.AddressBookManager.AddressBook)2 Pair (edu.stanford.muse.util.Pair)2 File (java.io.File)2 IOException (java.io.IOException)2 Document (org.apache.lucene.document.Document)2 Gson (com.google.gson.Gson)1 TypeToken (com.google.gson.reflect.TypeToken)1 Contact (edu.stanford.muse.AddressBookManager.Contact)1 CorrespondentAuthorityMapper (edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper)1 AnnotationManager (edu.stanford.muse.AnnotationManager.AnnotationManager)1 AuthorityMapper (edu.stanford.muse.AuthorityMapper.AuthorityMapper)1 Config (edu.stanford.muse.Config)1 LabelManager (edu.stanford.muse.LabelManager.LabelManager)1 StatusProvider (edu.stanford.muse.email.StatusProvider)1 EntityBook (edu.stanford.muse.ie.variants.EntityBook)1 Document (edu.stanford.muse.index.Document)1 IndexUtils (edu.stanford.muse.index.IndexUtils)1