Use of edu.stanford.muse.index.Archive in project epadd by ePADD: the class EntityFeature, method checkIndex.
/**
 * Checks for the feature index and creates it if required.
 *
 * @param force create the index even if one already exists
 * @return true if successful
 */
public boolean checkIndex(Archive archive, boolean force) {
    boolean exists = indexExists(archive);
    int c1 = 0, c2 = 0, c3 = 0; // raw person/org/place mentions
    int g1 = 0, g2 = 0, g3 = 0; // mentions that survive canonicalization
    int f1 = 0, f2 = 0, f3 = 0; // distinct features created
    boolean istatus = true;
    if (force || !exists) {
        Map<String, EntityFeature> features = new HashMap<>();
        Collection<EmailDocument> docs = (Collection) archive.getAllDocs();
        int totalEntities = 0;
        log.info("No feature index found..., starting to process and index. This can take a while.");
        int di = 0;
        for (EmailDocument ed : docs) {
            if (cancel) {
                clean(archive);
                return false;
            }
            if (di % 1000 == 0) {
                JSPHelper.log.info("Done analysing documents: " + di + " of: " + docs.size());
                status = "Analyzed " + di + "/" + docs.size() + " email documents";
                pctComplete = ((double) di * 50) / (double) docs.size();
            }
            di++;
            List<Span> names;
            try {
                names = Arrays.asList(archive.getAllNamesInDoc(ed, true));
            } catch (IOException ioe) {
                log.error("Problem accessing entities in " + ed.getUniqueId(), ioe);
                continue;
            }
            List<String> entities = names.stream().filter(n -> n.type == NEType.Type.PERSON.getCode()).map(n -> n.text).collect(Collectors.toList());
            List<String> places = names.stream().filter(n -> n.type == NEType.Type.PLACE.getCode()).map(n -> n.text).collect(Collectors.toList());
            List<String> orgs = names.stream().filter(n -> n.type == NEType.Type.ORGANISATION.getCode()).map(n -> n.text).collect(Collectors.toList());
            // collect never returns null, so the sizes can be added directly
            c1 += entities.size();
            c2 += orgs.size();
            c3 += places.size();
            Map<String, String> goodNames = new HashMap<>();
            List<String> correspondents = ed.getAllNames();
            List<String> addresses = ed.getAllAddrs();
            if (correspondents != null)
                for (String c : correspondents) {
                    if (c != null && c.contains(" ")) {
                        // EmailUtils.normalizePersonNameForLookup(c);
                        String n = IndexUtils.canonicalizeEntity(c);
                        if (n == null) // guard against null, as in the loops below
                            continue;
                        goodNames.put(n, "person");
                    }
                }
            for (String e : entities) {
                if (e != null && e.contains(" ")) {
                    String canonicalEntity = IndexUtils.canonicalizeEntity(e);
                    if (canonicalEntity == null)
                        continue;
                    goodNames.put(canonicalEntity, "person");
                    g1++;
                }
            }
            for (String o : orgs) {
                String canonicalEntity = IndexUtils.canonicalizeEntity(o);
                if (canonicalEntity == null)
                    continue;
                goodNames.put(canonicalEntity, "org");
                g2++;
            }
            for (String p : places) {
                String canonicalEntity = IndexUtils.canonicalizeEntity(p);
                if (canonicalEntity == null)
                    continue;
                goodNames.put(canonicalEntity, "places");
                g3++;
            }
            // O(goodNames.size())
            for (String gn : goodNames.keySet()) {
                if (features.get(gn) == null) {
                    if (goodNames.get(gn).equals("person")) {
                        features.put(gn, new EntityFeature(gn, EntityFeature.PERSON));
                        f1++;
                    } else if (goodNames.get(gn).equals("org")) {
                        features.put(gn, new EntityFeature(gn, EntityFeature.ORG));
                        f2++;
                    } else if (goodNames.get(gn).equals("places")) {
                        features.put(gn, new EntityFeature(gn, EntityFeature.PLACE));
                        f3++;
                    }
                }
                features.get(gn).accountForThis();
                features.get(gn).addAllCE(goodNames.keySet());
                if (addresses != null)
                    features.get(gn).addAllEA(addresses);
                // accumulate raw counts here; normalized to a probability after the loop
                features.get(gn).priorProbablity = features.get(gn).priorProbablity + 1.0;
                totalEntities++;
            }
        }
        log.info("Found: " + c1 + " entities, " + c2 + " orgs and " + c3 + " places");
        log.info("Gn: " + g1 + " entities, " + g2 + " orgs and " + g3 + " places");
        log.info("Found goodfeatures: " + f1 + " entities, " + f2 + " orgs and " + f3 + " places");
        for (String key : features.keySet())
            features.get(key).priorProbablity = features.get(key).priorProbablity / (double) totalEntities;
        log.info("Done analysing docs. Starting to index.");
        istatus = index(features, archive);
    }
    return istatus;
}
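For context, a minimal usage sketch follows. It assumes EntityFeature can be created with a no-argument constructor and that archive is an already-loaded Archive; neither is shown in this snippet.

// Build the entity-feature index if it does not exist yet; pass true to force a rebuild.
EntityFeature ef = new EntityFeature(); // assumption: a no-arg constructor is available
boolean ok = ef.checkIndex(archive, false);
if (!ok)
    System.err.println("Feature indexing failed or was cancelled");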
Use of edu.stanford.muse.index.Archive in project epadd by ePADD: the class EntityFeature, method main.
public static void main(String[] args) {
    long start_time = System.currentTimeMillis();
    Archive archive = null;
    try {
        String aFile = System.getProperty("user.home") + File.separator + "epadd-appraisal" + File.separator + "user";
        archive = SimpleSessions.readArchiveIfPresent(aFile);
    } catch (Exception e) {
        e.printStackTrace();
    }
    // getAbbreviations("HPC", archive);
    Set<EntityFeature> efts = EntityFeature.getMatches("florida", archive);
    long end_time = System.currentTimeMillis();
    System.err.println("Query completed in: " + (end_time - start_time) + "ms");
    for (EntityFeature eft : efts)
        System.err.println(eft.cooccuringEntities.keySet());
    try {
        // 'reader' is the class-level IndexReader over the feature index (not shown in this snippet)
        for (LeafReaderContext ctx : reader.leaves()) {
            LeafReader leafReader = ctx.reader(); // renamed from 'reader' to avoid shadowing the field above
            for (int i = 0; i < leafReader.maxDoc(); i++) {
                Document doc = leafReader.document(i);
                // for (IndexableField f: doc.getFields())
                System.err.println(doc.get("name"));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Use of edu.stanford.muse.index.Archive in project epadd by ePADD: the class SessionListener, method sessionDestroyed.
public void sessionDestroyed(HttpSessionEvent event) {
    log.info("Destroying session: " + event.getSession().getId() + " at " + new Date());
    HttpSession session = event.getSession();
    if (ModeConfig.isDiscoveryMode())
        log.info("Not saving archive on session destroy because we're in discovery mode");
    else {
        // save the archive before quitting the session, so the annotations, flags, etc. can be saved
        Archive archive = (Archive) session.getAttribute("archive");
        if (archive != null)
            try {
                SimpleSessions.saveArchive(archive);
            } catch (Exception e) {
                Util.print_exception(e, log);
                // note: returning here skips the session-count decrement below
                return;
            }
    }
    synchronized (this) {
        sessionCount--;
        log.info("Current number of sessions: " + sessionCount);
    }
}
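ePADD wires this listener up through its web application configuration. Purely as an illustration, here is a hedged sketch of the equivalent annotation-based registration available in Servlet 3.0+ containers; the class and field names below are illustrative and not ePADD's.

import javax.servlet.annotation.WebListener;
import javax.servlet.http.HttpSessionEvent;
import javax.servlet.http.HttpSessionListener;

// Minimal session-count bookkeeping, mirroring the synchronized counter above.
@WebListener
public class SessionCounter implements HttpSessionListener {
    private int sessionCount = 0;

    @Override
    public synchronized void sessionCreated(HttpSessionEvent event) {
        sessionCount++;
    }

    @Override
    public synchronized void sessionDestroyed(HttpSessionEvent event) {
        sessionCount--;
    }
}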
Use of edu.stanford.muse.index.Archive in project epadd by ePADD: the class SimpleSessions, method loadSessionAsMap.
/**
 * Loads a session from the given filename and returns the map of loaded
 * attributes.
 * If readOnly is false, the caller MUST make sure to call packIndex.
 * baseDir is the Indexer's baseDir (the path before "indexes/").
 *
 * Change as of Nov 2017:
 * Earlier, the whole archive was serialized and deserialized as one big entity. Now it is broken into
 * four main parts: the AddressBook, the EntityBook, the CorrespondentAuthorityMapper, and the rest of the object.
 * All four components are saved separately in saveArchive, so when reading we need to read
 * each of them from its own file.
 *
 * @throws IOException
 * @throws LockObtainFailedException
 * @throws CorruptIndexException
 */
public static Map<String, Object> loadSessionAsMap(String filename, String baseDir, boolean readOnly) throws IOException {
    log.info("Loading session from file " + filename + " size: " + Util.commatize(new File(filename).length() / 1024) + " KB");
    ObjectInputStream ois = null;
    // keep reading till EOF exception
    Map<String, Object> result = new LinkedHashMap<>();
    try {
        ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename)));
        while (true) {
            String key = (String) ois.readObject();
            log.info("loading key: " + key);
            try {
                Object value = ois.readObject();
                if (value == null)
                    break;
                result.put(key, value);
            } catch (InvalidClassException ice) {
                log.error("Bad version for value of key " + key + ": " + ice + "\nContinuing but this key is not set...");
            } catch (ClassNotFoundException cnfe) {
                log.error("Class not found for value of key " + key + ": " + cnfe + "\nContinuing but this key is not set...");
            }
        }
    } catch (EOFException eof) {
        log.info("end of session file reached");
    } catch (Exception e) {
        log.warn("Warning: unable to load session: " + Util.stackTrace(e));
        result.clear();
    }
    if (ois != null)
        try {
            ois.close();
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
    // need to set up sentiments explicitly -- now no need since lexicon is part of the session
    log.info("Memory status: " + Util.getMemoryStats());
    Archive archive = (Archive) result.get("archive");
    // no groups in public mode
    if (archive != null) {
        /*
         * Read the other three modules of the Archive object, which were marked transient and hence not serialized.
         */
        // file paths of the addressbook, entitybook, authority mapper, label map and annotation data
        String dir = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR;
        String addressBookPath = dir + File.separatorChar + Archive.ADDRESSBOOK_SUFFIX;
        String entityBookPath = dir + File.separatorChar + Archive.ENTITYBOOK_SUFFIX;
        String cAuthorityPath = dir + File.separatorChar + Archive.CAUTHORITYMAPPER_SUFFIX;
        String labMapDirPath = dir + File.separatorChar + Archive.LABELMAPDIR;
        String annotationMapPath = dir + File.separatorChar + Archive.ANNOTATION_SUFFIX;
        // if any of the first three files is not present, start afresh by importing the email archive again in processing mode
        if (!(new File(addressBookPath).exists()) || !(new File(entityBookPath).exists()) || !(new File(cAuthorityPath).exists())) {
            result.put("archive", null);
            return result;
        }
        // ///////////////AddressBook////////////////////////////////////////////
        BufferedReader br = new BufferedReader(new FileReader(addressBookPath));
        AddressBook ab = AddressBook.readObjectFromStream(br);
        archive.addressBook = ab;
        br.close();
        // //////////////EntityBook/////////////////////////////////////
        br = new BufferedReader(new FileReader(entityBookPath));
        EntityBook eb = EntityBook.readObjectFromStream(br);
        archive.setEntityBook(eb);
        br.close();
        // /////////////CorrespondentAuthorityMapper/////////////////////////////
        CorrespondentAuthorityMapper cmapper = CorrespondentAuthorityMapper.readObjectFromStream(cAuthorityPath);
        archive.correspondentAuthorityMapper = cmapper;
        // ///////////////Label Mapper/////////////////////////////////////////////////////
        LabelManager labelManager;
        try {
            labelManager = LabelManager.readObjectFromStream(labMapDirPath);
        } catch (Exception e) {
            Util.print_exception("Exception in reading label manager from archive, assigning a new label manager", e, log);
            labelManager = new LabelManager();
        }
        archive.setLabelManager(labelManager);
        // /////////////Annotation Manager///////////////////////////////////////////////////////
        AnnotationManager annotationManager = AnnotationManager.readObjectFromStream(annotationMapPath);
        archive.setAnnotationManager(annotationManager);
        // this is useful when we import a legacy archive into processing, where we've updated the pm file directly, without updating the archive
        try {
            archive.collectionMetadata = readCollectionMetadata(baseDir);
        } catch (Exception e) {
            Util.print_exception("Error trying to read processing metadata file", e, log);
        }
        // ///////////////////////////Done reading//////////////////////////////////////////////////////
        // most of this code should probably move inside Archive, maybe a function called "postDeserialized()"
        archive.postDeserialized(baseDir, readOnly);
        result.put("emailDocs", archive.getAllDocs());
    }
    return result;
}
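A hedged usage sketch follows; the session file name is a placeholder (actual ePADD session file names are not shown in this snippet), and the imports are those already used by the class above.

// Load a saved session read-only and pull the archive out of the returned map.
String baseDir = System.getProperty("user.home") + File.separator + "epadd-appraisal" + File.separator + "user";
String sessionFile = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR + File.separatorChar + "default.session"; // hypothetical file name
Map<String, Object> session = SimpleSessions.loadSessionAsMap(sessionFile, baseDir, /* readOnly */ true);
Archive archive = (Archive) session.get("archive");
if (archive == null)
    System.err.println("Session read, but a required component file was missing; re-import the archive in processing mode");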
Use of edu.stanford.muse.index.Archive in project epadd by ePADD: the class SimpleSessions, method prepareAndLoadDefaultArchive.
/**
 * Reads from the default dir (usually ~/.muse/user) and sets up the cacheDir
 * and archive vars.
 */
public static Archive prepareAndLoadDefaultArchive(HttpServletRequest request) throws IOException {
    HttpSession session = request.getSession(); // note: only used by the commented-out attribute-setting code below
    // allow the cacheDir parameter to override the default location
    String dir = request.getParameter("cacheDir");
    if (Util.nullOrEmpty(dir))
        dir = SimpleSessions.CACHE_DIR;
    JSPHelper.log.info("Trying to read archive from " + dir);
    Archive archive = SimpleSessions.readArchiveIfPresent(dir);
    if (archive != null) {
        JSPHelper.log.info("Good, archive read from " + dir);
        /* // always set these three together
        session.setAttribute("userKey", "user");
        session.setAttribute("cacheDir", dir);
        session.setAttribute("archive", archive);
        */
        // is this really needed?
        // prepare default lexicon files etc.
        Archive.prepareBaseDir(dir);
        /*
        Lexicon lex = archive.getLexicon("general");
        if (lex != null)
            session.setAttribute("lexicon", lex); // set up default general lexicon, so something is in the session as default lexicon (so facets can show it)
        */
    }
    return archive;
}
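A hedged sketch of how a servlet might call this method; the servlet class itself is illustrative and not part of ePADD.

import java.io.IOException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import edu.stanford.muse.index.Archive;

public class LoadArchiveServlet extends HttpServlet {
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
        // honors an optional ?cacheDir=... override, else falls back to SimpleSessions.CACHE_DIR
        Archive archive = SimpleSessions.prepareAndLoadDefaultArchive(request);
        if (archive == null) {
            response.sendError(HttpServletResponse.SC_NOT_FOUND, "No archive found in the default location");
            return;
        }
        request.getSession().setAttribute("archive", archive);
    }
}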