Use of edu.stanford.muse.ner.NER in project epadd by ePADD.
Class JSPHelper, method fetchAndIndexEmails.
// /* this version of fetchemails must have folders defined in request since there is no primary email address */
// public static Triple<Collection<EmailDocument>, AddressBook, BlobStore> fetchEmails(HttpServletRequest request, HttpSession session, boolean download) throws Exception
// {
// return fetchEmails (request, session, download, /* downloadattachments = */ false, false);
// }
//
// /** fetches messages without downloading or attachments.
// * support default folder for primary email address */
// public static Triple<Collection<EmailDocument>, AddressBook, BlobStore> fetchEmails(HttpServletRequest request, HttpSession session, String primaryEmailAddress) throws Exception
// {
// return fetchEmails (request, session, false, false, false);
// }
//
// public static boolean fetchEmailsDefaultFolders(HttpServletRequest request, HttpSession session, boolean downloadMessageText, boolean downloadAttachments) throws Exception
// {
// try {
// fetchEmails(request, session, downloadMessageText, downloadAttachments, true);
// } catch (Exception e) {
// return false;
// }
// return true;
// }
//
// public static Triple<Collection<EmailDocument>, AddressBook, BlobStore> fetchEmails(HttpServletRequest request, HttpSession session, boolean downloadMessageText, boolean downloadAttachments, boolean useDefaultFolders)
// throws UnsupportedEncodingException, MessagingException, InterruptedException, IOException, JSONException, NoDefaultFolderException, CancelledException
// {
// return fetchEmails(request, session, downloadMessageText, downloadAttachments, useDefaultFolders, null);
// }
/**
 * Fetches and indexes email into the given archive, then (when message text
 * is available) runs named-entity recognition over the archive.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>installs a static "Starting up..." status provider in the session under
 * the {@code "statusProvider"} attribute;</li>
 * <li>lets the request parameters {@code downloadMessageText} and
 * {@code downloadAttachments} (value {@code "true"}, case-insensitive) switch
 * on the corresponding flag when the caller passed {@code false};</li>
 * <li>reads the {@code folder} request parameter values and the filter from
 * the request, sets the archive base dir and delegates to
 * {@link MuseEmailFetcher#fetchAndIndexEmails};</li>
 * <li>closes the archive and re-opens it for reading;</li>
 * <li>if message text was downloaded, loads an NER model, runs
 * {@code NER.recognizeArchive()} and stores the thresholded entity counts in
 * {@code archive.collectionMetadata.entityCounts}.</li>
 * </ol>
 *
 * @param archive archive to index into
 * @param m fetcher that performs the actual fetch and index
 * @param request current request; supplies folders, filter and advanced options
 * @param session current session; receives the {@code "statusProvider"} attribute
 * @param downloadMessageText whether to download message text; may be turned on by the request
 * @param downloadAttachments whether to download attachments; may be turned on by the request,
 *        which then also turns on {@code downloadMessageText}
 * @param useDefaultFolders use the default folder for a fetcher if there are
 *        no explicit folders in that fetcher
 * @throws MessagingException propagated from the underlying fetch/index calls
 * @throws InterruptedException propagated from the underlying fetch/index calls
 * @throws IOException propagated from the underlying fetch/index or NER calls
 * @throws JSONException propagated from the underlying calls
 * @throws NoDefaultFolderException propagated from the underlying fetch/index calls
 * @throws CancelledException propagated from the underlying calls
 * @throws OutOfMemoryError if the operation runs out of memory
 */
public static void fetchAndIndexEmails(Archive archive, MuseEmailFetcher m, HttpServletRequest request, HttpSession session, boolean downloadMessageText, boolean downloadAttachments, boolean useDefaultFolders) throws MessagingException, InterruptedException, IOException, JSONException, NoDefaultFolderException, CancelledException, OutOfMemoryError {
    // first thing, set up a static status so the user doesn't see a stale status message
    session.setAttribute("statusProvider", new StaticStatusProvider("Starting up..."));
    checkContainer(request);
    log.info("request parameter encoding is " + request.getCharacterEncoding());

    // advanced options in the request can only switch a flag on, never off
    if (!downloadMessageText && "true".equalsIgnoreCase(request.getParameter("downloadMessageText"))) {
        downloadMessageText = true;
        log.info("Downloading message text because advanced option was set");
    }
    if (!downloadAttachments && "true".equalsIgnoreCase(request.getParameter("downloadAttachments"))) {
        downloadAttachments = true;
        // text is needed for the attachment wall -- otherwise we can't break out from
        // piclens to browsing messages associated with a particular thumbnail
        downloadMessageText = true;
        log.info("Downloading attachments because advanced option was set");
    }

    String[] allFolders = readRequestedFolders(request);

    Multimap<String, String> requestMap = convertRequestToMap(request);
    // if required, forceEncoding can go into fetch config
    FetchConfig fc = new FetchConfig();
    fc.downloadMessages = downloadMessageText;
    fc.downloadAttachments = downloadAttachments;
    fc.filter = Filter.parseFilter(requestMap);

    archive.setBaseDir(getBaseDir(m, request));
    m.fetchAndIndexEmails(archive, allFolders, useDefaultFolders, fc, session);

    // make sure the archive is dumped at this point
    archive.close();
    archive.openForRead();

    // entity extraction only makes sense when the message text is available
    if (!downloadMessageText) {
        return;
    }

    String modelFile = SequenceModel.MODEL_FILENAME;
    session.setAttribute("statusProvider", new StaticStatusProvider("Loading NER sequence model from resource: " + modelFile + "..."));
    NERModel nerModel = loadNerModelOrNull(modelFile);
    if (nerModel == null) {
        log.error("Could not load NER model from: " + modelFile);
    } else {
        NER ner = new NER(archive, nerModel);
        // the NER object doubles as the status provider while it runs
        session.setAttribute("statusProvider", ner);
        ner.recognizeArchive();
        // Instead of the count of all entities (ner.stats.counts), store the count of only
        // those entities that pass the threshold. This fixes a mismatch where the number of
        // person entities shown on browse-top.jsp differed from the number actually listed.
        // @TODO make it more modular
        double theta = 0.001;
        archive.collectionMetadata.entityCounts = Archive.getEntitiesCountMapModuloThersold(archive, theta);
        log.info(ner.stats);
    }

    // NOTE(review): the assignment of numPotentiallySensitiveMessages was disabled in this
    // method, so this logs whatever value the archive metadata already holds.
    log.info("Number of potentially sensitive messages " + archive.collectionMetadata.numPotentiallySensitiveMessages);

    // The internal authority assignment step that used to follow (guarded by the "muse.log"
    // system property) was fully commented out; the dead guard has been removed.
}

/**
 * Reads the {@code folder} request parameter values and converts them via
 * {@code convertRequestParamsToUTF8}. Tries the strict conversion first; on an
 * encoding failure retries with the second argument set to {@code false}; if
 * that fails as well, returns the raw parameter values.
 *
 * @return the folder strings, or {@code null} if the request has no {@code folder} parameter
 */
private static String[] readRequestedFolders(HttpServletRequest request) {
    String[] folders = request.getParameterValues("folder");
    if (folders == null) {
        return null;
    }
    try {
        return JSPHelper.convertRequestParamsToUTF8(folders, true);
    } catch (UnsupportedEncodingException e) {
        // report the exception and try to read whatever folders we can, ignoring it this time
        log.warn("Unsupported encoding exception: " + e);
    }
    try {
        return JSPHelper.convertRequestParamsToUTF8(folders, false);
    } catch (UnsupportedEncodingException e1) {
        log.warn("Should not reach here!" + e1);
        return folders;
    }
}

/**
 * Loads the NER model used for indexing: a {@code DummyNERModel} when the
 * system property {@code muse.dummy.ner} is set, otherwise the model built by
 * {@code SequenceModel.loadModelFromRules}.
 *
 * @param modelFile model name used in log messages only
 * @return the model, or {@code null} if loading failed with an {@link IOException}
 */
private static NERModel loadNerModelOrNull(String modelFile) {
    try {
        if (System.getProperty("muse.dummy.ner") != null) {
            log.info("Using dummy NER model, all CIC patterns will be treated as valid entities");
            return new DummyNERModel();
        }
        // NOTE(review): the message names MODEL_FILENAME but the model is loaded from
        // RULES_DIRNAME -- confirm which one is intended.
        log.info("Loading NER sequence model from: " + modelFile + " ...");
        return SequenceModel.loadModelFromRules(SequenceModel.RULES_DIRNAME);
    } catch (IOException e) {
        Util.print_exception("Could not load the sequence model from: " + modelFile, e, log);
        return null;
    }
}
Aggregations