Use of edu.stanford.muse.exceptions.CancelledException in project epadd by ePADD.
In the class MuseEmailFetcher, method fetchAndIndexEmails:
/**
 * Key method to fetch the actual email messages; can take a long time.
 * Updates the archive's emailDocs, addressBook and blobstore.
 * @param selectedFolders each entry is in the format <account name>^-^<folder name>
 * @param session used only to set the status provider object; callers who do not need to track status can pass null
 * @throws NoDefaultFolderException
 * @throws CancelledException if the fetch operation is cancelled
 */
public void fetchAndIndexEmails(Archive archive, String[] selectedFolders, boolean useDefaultFolders, FetchConfig fetchConfig, HttpSession session) throws MessagingException, InterruptedException, IOException, JSONException, NoDefaultFolderException, CancelledException {
setupFetchers(-1);
long startTime = System.currentTimeMillis();
if (session != null)
session.setAttribute("statusProvider", new StaticStatusProvider("Starting to process messages..."));
boolean op_cancelled = false, out_of_mem = false;
BlobStore attachmentsStore = archive.getBlobStore();
fetchConfig.downloadAttachments = fetchConfig.downloadAttachments && attachmentsStore != null;
if (Util.nullOrEmpty(fetchers)) {
log.warn("Trying to fetch email with no fetchers, setup not called ?");
return;
}
setupFoldersForFetchers(fetchers, selectedFolders, useDefaultFolders);
List<FolderInfo> fetchedFolderInfos = new ArrayList<>();
// one fetcher will aggregate everything
FetchStats stats = new FetchStats();
MTEmailFetcher aggregatingFetcher = null;
// a fetcher is one source, like an account or a top-level mbox dir. A fetcher could include multiple folders.
long startTimeMillis = System.currentTimeMillis();
for (MTEmailFetcher fetcher : fetchers) {
if (session != null)
session.setAttribute("statusProvider", fetcher);
fetcher.setArchive(archive);
fetcher.setFetchConfig(fetchConfig);
log.info("Memory status before fetching emails: " + Util.getMemoryStats());
// this is the big call and can run for a long time. Note: it runs in the same thread; it's not fetcher.start();
List<FolderInfo> foldersFetchedByThisFetcher = fetcher.run();
// but don't abort immediately, only at the end, after the address book has been built for at least the processed messages
if (fetcher.isCancelled()) {
log.info("NOTE: fetcher operation was cancelled");
op_cancelled = true;
break;
}
if (fetcher.mayHaveRunOutOfMemory()) {
log.warn("Fetcher operation ran out of memory " + fetcher);
out_of_mem = true;
break;
}
fetchedFolderInfos.addAll(foldersFetchedByThisFetcher);
if (aggregatingFetcher == null && !Util.nullOrEmpty(foldersFetchedByThisFetcher))
// first non-empty fetcher
aggregatingFetcher = fetcher;
if (aggregatingFetcher != null)
aggregatingFetcher.merge(fetcher);
// add the indexed folders to the stats
EmailStore store = fetcher.getStore();
String fetcherDescription = store.displayName + ":" + store.emailAddress;
for (FolderInfo fi : fetchedFolderInfos) stats.selectedFolders.add(new Pair<>(fetcherDescription, fi));
}
if (op_cancelled)
throw new CancelledException();
if (out_of_mem)
throw new OutOfMemoryError();
if (aggregatingFetcher != null) {
stats.importStats = aggregatingFetcher.stats;
if (aggregatingFetcher.mayHaveRunOutOfMemory())
throw new OutOfMemoryError();
}
// save memory
aggregatingFetcher = null;
long endTimeMillis = System.currentTimeMillis();
long elapsedMillis = endTimeMillis - startTimeMillis;
log.info(elapsedMillis + " ms for fetch+index, Memory status: " + Util.getMemoryStats());
// note: this is all archive docs, not just the ones that may have been just imported
List<EmailDocument> allEmailDocs = (List) archive.getAllDocs();
archive.addFetchedFolderInfos(fetchedFolderInfos);
if (allEmailDocs.size() == 0)
log.warn("0 messages from email fetcher");
EmailUtils.cleanDates(allEmailDocs);
// create a new address book
if (session != null)
session.setAttribute("statusProvider", new StaticStatusProvider("Building address book..."));
AddressBook addressBook = EmailDocument.buildAddressBook(allEmailDocs, archive.ownerEmailAddrs, archive.ownerNames);
log.info("Address book stats: " + addressBook.getStats());
if (session != null)
session.setAttribute("statusProvider", new StaticStatusProvider("Finishing up..."));
archive.setAddressBook(addressBook);
// we shouldn't really have dups now because the archive ensures that only unique docs are added
// move sorting to archive.postprocess?
EmailUtils.removeDupsAndSort(allEmailDocs);
// report stats
stats.lastUpdate = new Date().getTime();
// (String) JSPHelper.getSessionAttribute(session, "userKey");
stats.userKey = "USER KEY UNUSED";
stats.fetchAndIndexTimeMillis = elapsedMillis;
updateStats(archive, addressBook, stats);
if (session != null)
session.removeAttribute("statusProvider");
log.info("Fetch+index complete: " + Util.commatize(System.currentTimeMillis() - startTime) + " ms");
}
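
For context, here is a minimal sketch of how a caller might drive fetchAndIndexEmails and handle the CancelledException it can throw. This is illustrative only: the constructors for MuseEmailFetcher and FetchConfig, the package paths of the imports other than CancelledException, the account/folder setup, and the folder name are assumptions, not taken from the ePADD sources shown above.

import javax.servlet.http.HttpSession;
import edu.stanford.muse.exceptions.CancelledException;
import edu.stanford.muse.email.FetchConfig;       // assumed package path
import edu.stanford.muse.email.MuseEmailFetcher;  // assumed package path
import edu.stanford.muse.index.Archive;           // assumed package path

// Hypothetical helper class, not part of ePADD.
class FetchDriver {
    static void fetchInto(Archive archive, HttpSession session) throws Exception {
        // selectedFolders entries use the <account name>^-^<folder name> format described in the javadoc
        String[] selectedFolders = { "someAccount^-^INBOX" };   // example values only
        MuseEmailFetcher fetcher = new MuseEmailFetcher();      // assumed no-arg constructor; email accounts assumed already registered
        FetchConfig fetchConfig = new FetchConfig();            // assumed no-arg constructor
        fetchConfig.downloadAttachments = true;                 // honoured only if the archive has a blob store (see the code above)
        try {
            fetcher.fetchAndIndexEmails(archive, selectedFolders, false /* useDefaultFolders */, fetchConfig, session);
        } catch (CancelledException ce) {
            // thrown after the fetch loop when any fetcher reports isCancelled()
            System.out.println("Fetch cancelled; messages processed so far remain in the archive");
        }
    }
}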
Use of edu.stanford.muse.exceptions.CancelledException in project epadd by ePADD.
In the class NER, method recognizeArchive:
// main method: trains the model, recognizes the entities and updates the docs.
public void recognizeArchive() throws CancelledException, IOException {
time = 0;
archive.openForRead();
archive.setupForWrite();
if (cancelled) {
status = "Cancelling...";
throw new CancelledException();
}
List<Document> docs = archive.getAllDocs();
if (cancelled) {
status = "Cancelling...";
throw new CancelledException();
}
int di = 0, ds = docs.size();
int ps = 0, ls = 0, os = 0;
long totalTime = 0, updateTime = 0, recTime = 0, duTime = 0, snoTime = 0;
for (Document doc : docs) {
long st1 = System.currentTimeMillis();
long st = System.currentTimeMillis();
org.apache.lucene.document.Document ldoc = archive.getLuceneDoc(doc.getUniqueId());
// pass the lucene doc instead of muse doc, else a major performance penalty
// do not recognise names in original content and content separately
// It's possible to improve the performance further by using a linear kernel
// instead of an RBF kernel, and a classifier instead of a regression model
// (the confidence scores of a regression model can be useful in segmentation)
String originalContent = archive.getContents(ldoc, true);
String content = archive.getContents(ldoc, false);
String title = archive.getTitle(ldoc);
// original content is a substring of content
Span[] names = nerModel.find(content);
Span[] namesT = nerModel.find(title);
recTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();
stats.update(names);
stats.update(namesT);
updateTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();
// !!!!!!SEVERE!!!!!!!!!!
// TODO: an entity name is stored in NAMES, NAMES_ORIGINAL, nameoffsets, and one or more of
// EPER, ELOC, EORG fields, that is a lot of redundancy
// !!!!!!SEVERE!!!!!!!!!!
// storeSerialized(ldoc, NAMES_OFFSETS, mapAndOffsets.second);
// storeSerialized(ldoc, TITLE_NAMES_OFFSETS, mapAndOffsetsTitle.second);
// storeSerialized(ldoc, FINE_ENTITIES, mapAndOffsets.getFirst());
// storeSerialized(ldoc, TITLE_FINE_ENTITIES, mapAndOffsets.getSecond());
Map<Short, Integer> counts = new LinkedHashMap<>();
Map<Short, Integer> countsT = new LinkedHashMap<>();
Arrays.asList(names).stream().map(sp -> NEType.getCoarseType(sp.type).getCode()).forEach(s -> counts.put(s, counts.getOrDefault(s, 0) + 1));
Arrays.asList(namesT).stream().map(sp -> NEType.getCoarseType(sp.type).getCode()).forEach(s -> countsT.put(s, countsT.getOrDefault(s, 0) + 1));
ps += counts.getOrDefault(NEType.Type.PERSON.getCode(), 0) + countsT.getOrDefault(NEType.Type.PERSON.getCode(), 0);
ls += counts.getOrDefault(NEType.Type.PLACE.getCode(), 0) + countsT.getOrDefault(NEType.Type.PLACE.getCode(), 0);
os += counts.getOrDefault(NEType.Type.ORGANISATION.getCode(), 0) + countsT.getOrDefault(NEType.Type.ORGANISATION.getCode(), 0);
snoTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();
ldoc.removeField(NAMES);
ldoc.removeField(NAMES_TITLE);
ldoc.add(new StoredField(NAMES, Util.join(Arrays.asList(names).stream().map(Span::parsablePrint).collect(Collectors.toSet()), Indexer.NAMES_FIELD_DELIMITER)));
ldoc.add(new StoredField(NAMES_TITLE, Util.join(Arrays.asList(namesT).stream().map(Span::parsablePrint).collect(Collectors.toSet()), Indexer.NAMES_FIELD_DELIMITER)));
int ocs = originalContent.length();
List<String> namesOriginal = Arrays.asList(names).stream().filter(sp -> sp.end < ocs).map(Span::parsablePrint).collect(Collectors.toList());
ldoc.add(new StoredField(NAMES_ORIGINAL, Util.join(namesOriginal, Indexer.NAMES_FIELD_DELIMITER)));
// log.info("Found: "+names.size()+" total names and "+names_original.size()+" in original");
// TODO: Sometimes, updating can lead to deleted docs and keeping these deleted docs can bring down the search performance
// Could building a new index be faster?
archive.updateDocument(ldoc);
duTime += System.currentTimeMillis() - st;
di++;
totalTime += System.currentTimeMillis() - st1;
pctComplete = 30 + ((double) di / (double) ds) * 70;
double ems = (double) (totalTime * (ds - di)) / (double) (di * 1000);
status = "Recognized entities in " + Util.commatize(di) + " of " + Util.commatize(ds) + " emails ";
// Util.approximateTimeLeft((long)ems/1000);
eta = (long) ems;
if (di % 100 == 0)
log.info(status);
time += System.currentTimeMillis() - st;
if (cancelled) {
status = "Cancelling...";
throw new CancelledException();
}
}
log.info("Trained and recognised entities in " + di + " docs in " + totalTime + "ms" + "\nPerson: " + ps + "\nOrgs:" + os + "\nLocs:" + ls);
archive.close();
// prepare to read again.
archive.openForRead();
}
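
The same cooperative-cancellation pattern appears in both methods above: a long-running loop periodically checks a cancelled flag and throws CancelledException so the caller can unwind cleanly. Below is a minimal, self-contained sketch of that pattern; the class and its fields are hypothetical, not ePADD code, and only the CancelledException usage mirrors the sources above.

import edu.stanford.muse.exceptions.CancelledException;
import java.util.List;

// Hypothetical class illustrating the cancellation pattern used by recognizeArchive and fetchAndIndexEmails.
class CancellableTask {
    // typically flipped from another thread, e.g. by a "cancel" request handler
    private volatile boolean cancelled = false;
    private String status = "Not started";

    void cancel() { cancelled = true; }

    void processAll(List<String> items) throws CancelledException {
        int done = 0;
        for (String item : items) {
            if (cancelled) {                    // same per-item check recognizeArchive makes for each document
                status = "Cancelling...";
                throw new CancelledException();
            }
            // ... one unit of work on item goes here ...
            done++;
            status = "Processed " + done + " of " + items.size();
        }
    }
}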