Use of edu.stanford.muse.index.Document in project epadd by ePADD.
The class SimpleSessions, method saveArchive.
/**
* saves the archive in the current session to the cachedir. note: no blobs saved.
*/
public static boolean saveArchive(String baseDir, String name, Archive archive) throws IOException {
log.info("Before saving the archive checking if it is still in good shape");
archive.Verify();
String dir = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR;
// just to be safe
new File(dir).mkdirs();
String filename = dir + File.separatorChar + name + SimpleSessions.SESSION_SUFFIX;
log.info("Saving archive to (session) file " + filename);
/*//file path names of addressbook, entitybook and correspondentAuthorityMapper data.
String addressBookPath = dir + File.separatorChar + Archive.ADDRESSBOOK_SUFFIX;
String entityBookPath = dir + File.separatorChar + Archive.ENTITYBOOK_SUFFIX;
String cAuthorityPath = dir + File.separatorChar + Archive.CAUTHORITYMAPPER_SUFFIX;
*/
if (archive.collectionMetadata == null)
archive.collectionMetadata = new Archive.CollectionMetadata();
archive.collectionMetadata.timestamp = new Date().getTime();
archive.collectionMetadata.tz = TimeZone.getDefault().getID();
archive.collectionMetadata.nDocs = archive.getAllDocs().size();
archive.collectionMetadata.nUniqueBlobs = archive.blobStore.uniqueBlobs.size();
int totalAttachments = 0, images = 0, docs = 0, others = 0, sentMessages = 0, receivedMessages = 0, hackyDates = 0;
Date firstDate = null, lastDate = null;
for (Document d : archive.getAllDocs()) {
if (!(d instanceof EmailDocument))
continue;
EmailDocument ed = (EmailDocument) d;
if (ed.date != null) {
if (ed.hackyDate)
hackyDates++;
else {
if (firstDate == null || ed.date.before(firstDate))
firstDate = ed.date;
if (lastDate == null || ed.date.after(lastDate))
lastDate = ed.date;
}
}
int sentOrReceived = ed.sentOrReceived(archive.addressBook);
if ((sentOrReceived & EmailDocument.SENT_MASK) != 0)
sentMessages++;
if ((sentOrReceived & EmailDocument.RECEIVED_MASK) != 0)
receivedMessages++;
if (!Util.nullOrEmpty(ed.attachments)) {
totalAttachments += ed.attachments.size();
for (Blob b : ed.attachments) if (!Util.nullOrEmpty(b.filename)) {
if (Util.is_image_filename(b.filename))
images++;
else if (Util.is_doc_filename(b.filename))
docs++;
else
others++;
}
}
}
archive.collectionMetadata.firstDate = firstDate;
archive.collectionMetadata.lastDate = lastDate;
archive.collectionMetadata.nIncomingMessages = receivedMessages;
archive.collectionMetadata.nOutgoingMessages = sentMessages;
archive.collectionMetadata.nHackyDates = hackyDates;
archive.collectionMetadata.nBlobs = totalAttachments;
archive.collectionMetadata.nUniqueBlobs = archive.blobStore.uniqueBlobs.size();
archive.collectionMetadata.nImageBlobs = images;
archive.collectionMetadata.nDocBlobs = docs;
archive.collectionMetadata.nOtherBlobs = others;
try (ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(filename)))) {
oos.writeObject("archive");
oos.writeObject(archive);
} catch (Exception e1) {
Util.print_exception("Failed to write archive: ", e1, log);
}
// Now write modular transient fields to separate files --
// By Dec 2017 there are three transient fields which will be saved and loaded separately:
// 1. AddressBook -- stored in a gzip file in the same directory as the archive.
// 2. EntityBook
// 3. CorrespondentAuthorityMapper
// Before final release of v5 in Feb 2018, modularize annotation out of archive.
// ///////////////AddressBook Writing -- In human readable form ///////////////////////////////////
SimpleSessions.saveAddressBook(archive);
// //////////////EntityBook Writing -- In human readable form/////////////////////////////////////
SimpleSessions.saveEntityBook(archive);
// /////////////CAuthorityMapper Writing-- Serialized///////////////////////////////
SimpleSessions.saveCorrespondentAuthorityMapper(archive);
// ////////////LabelManager Writing -- Serialized//////////////////////////////////
SimpleSessions.saveLabelManager(archive);
// ////////////AnnotationManager writing-- In human readable form/////////////////////////////////////
SimpleSessions.saveAnnotations(archive);
writeCollectionMetadata(archive.collectionMetadata, baseDir);
/*
// now write out the metadata
String processingFilename = dir + File.separatorChar + name + Config.COLLECTION_METADATA_FILE;
oos = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(processingFilename)));
try {
oos.writeObject(archive.collectionMetadata);
} catch (Exception e1) {
Util.print_exception("Failed to write archive's metadata: ", e1, log);
oos.close();
} finally {
oos.close();
}
*/
/*
if (archive.correspondentAuthorityMapper!= null) {
String authorityMapperFilename = dir + File.separatorChar + name + Config.AUTHORITIES_FILENAME;
oos = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(authorityMapperFilename)));
try {
oos.writeObject(archive.correspondentAuthorityMapper);
} catch (Exception e1) {
Util.print_exception("Failed to write archive's authority mapper: ", e1, log);
oos.close();
} finally {
oos.close();
}
}
*/
archive.close();
// re-open for reading
archive.openForRead();
// note: no need to save archive authorities separately -- they are already saved as part of the archive object
return true;
}
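For orientation, here is a minimal usage sketch for this method. It assumes SimpleSessions and Archive are imported from their ePADD packages and that an Archive instance has already been built or loaded elsewhere; the session name "default" and the helper method name are only illustrative choices, not part of ePADD.

// Sketch only: archive is an already-loaded edu.stanford.muse.index.Archive; "default" is an assumed session name.
public static void checkpoint(String baseDir, Archive archive) throws IOException {
    // writes the serialized archive plus the address book, entity book, authority mapper,
    // label manager and annotations under baseDir/sessions, then reopens the archive for reading
    boolean saved = SimpleSessions.saveArchive(baseDir, "default", archive);
    if (!saved)
        System.err.println("saveArchive reported failure for " + baseDir);
}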
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
The class AddressBook, method fillL1_SummaryObject.
public void fillL1_SummaryObject(Collection<Document> alldocs) {
// clear the summary objects.
L1_Summary_SentDocs.clear();
L1_Summary_ReceivedDocs.clear();
L1_Summary_RecFrmOwnerDocs.clear();
// L1_Summary_totalInDocs.clear();
// Archive archive = ArchiveReaderWriter.getArchiveForArchiveID(archiveID);
AddressBook ab = this;
Contact ownContact = ab.getContactForSelf();
List<Contact> allContacts = ab.sortedContacts((Collection) alldocs);
List<EmailDocument> alldocslist = alldocs.stream().map(e -> (EmailDocument) e).collect(Collectors.toList());
// compute counts
for (EmailDocument ed : alldocslist) {
String senderEmail = ed.getFromEmailAddress();
Contact senderContact = ab.lookupByEmail(senderEmail);
if (senderContact == null)
continue;
// senderContact = ownContact; // should never happen -- we should always have a sender contact. Don't do this, otherwise a message would wrongly be reported as sent by the owner.
// L1_Summary_SentDocs stores the email messages where this contact is the sender.
{
Set<EmailDocument> tmp = L1_Summary_SentDocs.get(senderContact);
if (tmp == null)
tmp = new LinkedHashSet<>();
tmp.add(ed);
L1_Summary_SentDocs.put(senderContact, tmp);
}
// get receiver of this mail.
Collection<Contact> toContacts = ed.getToCCBCCContacts(ab);
for (Contact c : toContacts) {
// add the info that all these contacts received this mail.
Set<EmailDocument> tmp = L1_Summary_ReceivedDocs.get(c);
if (tmp == null)
tmp = new LinkedHashSet<>();
tmp.add(ed);
L1_Summary_ReceivedDocs.put(c, tmp);
}
// if this mail was sent by the owner (i.e. received from the owner by its recipients) then add it to the map L1_Summary_RecFrmOwnerDocs,
// which is used for filling other fields.
int x = ed.sentOrReceived(ab);
// message could be both sent and received
if ((x & EmailDocument.SENT_MASK) != 0) {
// this is a sent email (sent by the owner), each to/cc/bcc gets +1 outcount.
// one of them could be own contact also.
toContacts = ed.getToCCBCCContacts(ab);
for (Contact c : toContacts) {
Set<EmailDocument> tmp = L1_Summary_RecFrmOwnerDocs.get(c);
if (tmp == null)
tmp = new LinkedHashSet<>();
tmp.add(ed);
L1_Summary_RecFrmOwnerDocs.put(c, tmp);
}
}
/* boolean received = (x & EmailDocument.RECEIVED_MASK) != 0 // explicitly received
|| (x & EmailDocument.SENT_MASK) == 0; // its not explicitly sent, so we must count it as received by default
if (received) {
// sender gets a +1 in count (could be ownContact also)
// all others get a mention count.
Set<EmailDocument> tmp = L1_Summary_RecFrmOwnerDocs.get(senderContact);
if(tmp==null)
tmp=new LinkedHashSet<>();
tmp.add(ed);
L1_Summary_RecFrmOwnerDocs.put(senderContact, tmp);
}
*/
// Removed the mention semantics in v7.
/* if ((x & EmailDocument.SENT_MASK) == 0) {
// this message is not sent, its received.
// add mentions for everyone who's not me, who's on the to/cc/bcc of this message.
Collection<Contact> toContacts = ed.getToCCBCCContacts(ab);
for (Contact c : toContacts) {
if (c == ownContact)
continue; // doesn't seem to make sense to give a mention count for sender in addition to incount
Set<EmailDocument> tmp = L1_Summary_contactMentionDocs.get(c);
if(tmp==null)
tmp=new LinkedHashSet<>();
tmp.add(ed);
L1_Summary_contactMentionDocs.put(c, tmp);
}
}*/
}
}
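A short usage sketch, assuming archive is an already-loaded Archive (a placeholder name); it hands the archive's full document list to the method above so that the per-contact summary sets are rebuilt.

// Sketch only: archive is an already-loaded Archive instance.
AddressBook ab = archive.addressBook;           // same field used by the other snippets on this page
ab.fillL1_SummaryObject(archive.getAllDocs());  // rebuilds L1_Summary_SentDocs / L1_Summary_ReceivedDocs / L1_Summary_RecFrmOwnerDocs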
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
The class NameExpansion, method getMatches.
/* Given the string s in emailDocument ed, returns a matches object with candidates matching s */
public static Matches getMatches(String s, Archive archive, EmailDocument ed, int maxResults) {
Matches matches = new Matches(s, maxResults);
AddressBook ab = archive.addressBook;
List<Contact> contactsExceptSelf = ed.getParticipatingContactsExceptOwn(archive.addressBook);
List<Contact> contacts = new ArrayList(contactsExceptSelf);
contacts.add(ab.getContactForSelf());
// check if s matches any contacts on this message
outer: for (Contact c : contacts) {
if (c.getNames() == null)
continue;
for (String name : c.getNames()) {
StringMatchType matchType = Matches.match(s, name);
if (matchType != null) {
float score = 1.0F;
if (matches.addMatch(name, score, matchType, "Name of a contact on this message", true))
return matches;
continue outer;
}
}
}
// check if s matches anywhere else in this message
if (matchAgainstEmailContent(archive, ed, matches, "Mentioned elsewhere in this message", 1.0F)) {
return matches;
}
synchronized (archive) {
if (ed.threadID == 0L) {
archive.assignThreadIds();
}
}
// check if s matches anywhere else in this thread
List<EmailDocument> messagesInThread = (List) archive.docsWithThreadId(ed.threadID);
for (EmailDocument messageInThread : messagesInThread) {
if (matchAgainstEmailContent(archive, messageInThread, matches, "Mentioned in this thread", 0.9F)) {
return matches;
}
}
// check if s matches any other email with any of these correspondents
for (Contact c : contactsExceptSelf) {
if (c.getEmails() != null) {
String correspondentsSearchStr = String.join(";", c.getEmails());
// As the filterForCorrespondents function does not use queryParams, it is fine to instantiate the SearchResult
// object with queryParams as null. After refactoring, filter methods take a SearchResult object as input and modify it
// according to the filter.
SearchResult inputSet = new SearchResult(archive, null);
SearchResult outputSet = SearchResult.filterForCorrespondents(inputSet, correspondentsSearchStr, true, true, true, true);
Set<Document> messagesWithSameCorrespondents = outputSet.getDocumentSet();
for (Document messageWithSameCorrespondents : messagesWithSameCorrespondents) {
EmailDocument edoc = (EmailDocument) messageWithSameCorrespondents;
if (matchAgainstEmailContent(archive, edoc, matches, "Mentioned in other messages with these correspondents", 0.8F)) {
return matches;
}
}
}
}
// search for s anywhere in the archive
Multimap<String, String> params = LinkedHashMultimap.create();
params.put("termSubject", "on");
params.put("termBody", "on");
String term = s;
if (s.contains(" ") && (!s.startsWith("\"") || !s.endsWith("\""))) {
term = "\"" + s + "\"";
}
// To search for terms, create a searchResult object and invoke appropriate filter method on it.
SearchResult inputSet = new SearchResult(archive, params);
SearchResult outputSet = SearchResult.searchForTerm(inputSet, term);
Set<Document> docsWithTerm = outputSet.getDocumentSet();
for (Document docWithTerm : docsWithTerm) {
EmailDocument edoc = (EmailDocument) docWithTerm;
if (matchAgainstEmailContent(archive, edoc, matches, "Mentioned elsewhere in this archive", 0.7F))
return matches;
}
return matches;
}
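A hedged usage sketch for getMatches: picking the first document of the archive and the search string "Bob" is purely illustrative, and the archive variable is assumed to be an already-loaded Archive.

// Sketch only: archive is an already-loaded Archive instance.
EmailDocument ed = (EmailDocument) archive.getAllDocs().get(0);      // illustrative choice of message
Matches matches = NameExpansion.getMatches("Bob", archive, ed, 10);  // up to 10 candidate expansions of "Bob"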
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
The class EntityBookManager, method recalculateCache.
/*
This method recalculates the cache for the entity book of the given type. If the type is Short.MAX_VALUE, it does this for all types at once. The method was carved out mainly to avoid recalculating each type's entity book individually (which involves the expensive operation of a Lucene search for each doc).
*/
private void recalculateCache(Short giventype) {
log.info("Computing EntityBook Cache");
long start = System.currentTimeMillis();
// a subtle issue: if type is Short.MAX_VALUE then we need one docsetmap for each type,
// so create a map of these maps.
Map<Short, Map<MappedEntity, Pair<Double, Set<Document>>>> alldocsetmap = new LinkedHashMap<>();
// now fill this map.
if (giventype == Short.MAX_VALUE) {
for (NEType.Type t : NEType.Type.values()) {
Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
alldocsetmap.put(t.getCode(), docsetmap);
}
} else {
Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
alldocsetmap.put(giventype, docsetmap);
}
// iterate over lucene docs to recalculate the count and other summaries of the modified entity book
// fill cache summary for ebook in other fields of ebook.
double theta = 0.001;
long luceneduration1 = 0;
long luceneduration2 = 0;
long additionduration = 0;
Map<String, Span[]> docEntitiesMap = mArchive.getAllEntities(mArchive.getAllDocs().size());
for (String docid : docEntitiesMap.keySet()) {
Span[] allspans = docEntitiesMap.get(docid);
EmailDocument edoc = mArchive.indexer.docForId(docid);
for (Span span : allspans) {
// bail out if not of the entity type that we're looking for, or not enough confidence; but don't bail out if we have to do it for all types, i.e. type is Short.MAX_VALUE
if (giventype != Short.MAX_VALUE && (span.type != giventype || span.typeScore < theta))
continue;
// if type is Short.MAX_VALUE then set the type to the current span's type; if not, this is effectively a no-op.
Short type = span.type;
Double score = new Double(span.typeScore);
String name = span.getText();
String canonicalizedname = EntityBook.canonicalize(name);
// map the name to its display name. if no mapping, we should get the same name back as its displayName
MappedEntity mappedEntity = (mTypeToEntityBook.get(type).nameToMappedEntity.get(canonicalizedname));
if (mappedEntity == null) {
// It implies that we have erased some names from the entitybook so no need to consider them.
continue;
}
// add this doc in the docsetmap for the mappedEntity.
Double oldscore = Double.valueOf(0);
if (alldocsetmap.get(type).get(mappedEntity) != null)
oldscore = alldocsetmap.get(type).get(mappedEntity).first;
Double finalscore = Double.max(oldscore, score);
Set<Document> docset = new LinkedHashSet<>();
if (alldocsetmap.get(type).get(mappedEntity) != null)
docset = alldocsetmap.get(type).get(mappedEntity).second;
docset.add(edoc);
// docset.add(doc);
alldocsetmap.get(type).put(mappedEntity, new Pair(finalscore, docset));
}
}
// fill cache summary for ebook in other fields of ebook.
// Beware!! what happens if type is MAX (means we need to do this for all types).
long end = System.currentTimeMillis();
log.info("Finished computing entitybook cache in " + (end - start) + " milliseconds");
if (giventype == Short.MAX_VALUE) {
for (NEType.Type t : NEType.Type.values()) {
mTypeToEntityBook.get(t.getCode()).fillSummaryFields(alldocsetmap.get(t.getCode()), mArchive);
}
} else
mTypeToEntityBook.get(giventype).fillSummaryFields(alldocsetmap.get(giventype), mArchive);
// log.info("Luceneduration 1 = "+luceneduration1+" milliseconds, Luceneduration 2 = "+luceneduration2 + " milliseconds, addition duration = "+additionduration+ " milliseconds");
// log.info("Finished filling summary of entitybook cache in "+ (System.currentTimeMillis()-end)+" milliseconds");
log.info("EntityBook Cache computed successfully");
}
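recalculateCache is private, so it is only reachable from inside EntityBookManager; the two-line sketch below simply illustrates the two ways the type argument is used, with NEType taken from the snippets elsewhere on this page.

// Inside EntityBookManager (sketch only):
recalculateCache(NEType.Type.PERSON.getCode());  // recompute the cache for a single entity type
recalculateCache(Short.MAX_VALUE);               // recompute the caches for all types in one pass over the docs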
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
The class NER, method recognizeArchive.
// main method trains the model, recognizes the entities and updates the doc.
public void recognizeArchive() throws CancelledException, IOException {
time = 0;
archive.openForRead();
archive.setupForWrite();
if (cancelled) {
status = "Cancelling...";
throw new CancelledException();
}
List<Document> docs = archive.getAllDocs();
if (cancelled) {
status = "Cancelling...";
throw new CancelledException();
}
int di = 0, ds = docs.size();
int ps = 0, ls = 0, os = 0;
long totalTime = 0, updateTime = 0, recTime = 0, duTime = 0, snoTime = 0;
for (Document doc : docs) {
long st1 = System.currentTimeMillis();
long st = System.currentTimeMillis();
org.apache.lucene.document.Document ldoc = archive.getLuceneDoc(doc.getUniqueId());
// pass the lucene doc instead of muse doc, else a major performance penalty
// do not recognise names in original content and content separately
// It's possible to improve the performance further by using a linear kernel
// instead of an RBF kernel, and a classifier instead of a regression model
// (the confidence scores of the regression model can be useful in segmentation)
String originalContent = archive.getContents(ldoc, true);
String content = archive.getContents(ldoc, false);
String title = archive.getTitle(ldoc);
// original content is substring of content;
Span[] names = nerModel.find(content);
Span[] namesT = nerModel.find(title);
recTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();
stats.update(names);
stats.update(namesT);
updateTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();
// !!!!!!SEVERE!!!!!!!!!!
// TODO: an entity name is stored in NAMES, NAMES_ORIGINAL, nameoffsets, and one or more of
// EPER, ELOC, EORG fields, that is a lot of redundancy
// !!!!!!SEVERE!!!!!!!!!!
// storeSerialized(ldoc, NAMES_OFFSETS, mapAndOffsets.second);
// storeSerialized(ldoc, TITLE_NAMES_OFFSETS, mapAndOffsetsTitle.second);
// storeSerialized(ldoc, FINE_ENTITIES, mapAndOffsets.getFirst());
// storeSerialized(ldoc, TITLE_FINE_ENTITIES, mapAndOffsets.getSecond());
Map<Short, Integer> counts = new LinkedHashMap<>();
Map<Short, Integer> countsT = new LinkedHashMap<>();
Arrays.stream(names).map(sp -> NEType.getCoarseType(sp.type).getCode()).forEach(s -> counts.put(s, counts.getOrDefault(s, 0) + 1));
Arrays.stream(namesT).map(sp -> NEType.getCoarseType(sp.type).getCode()).forEach(s -> countsT.put(s, countsT.getOrDefault(s, 0) + 1));
ps += counts.getOrDefault(NEType.Type.PERSON.getCode(), 0) + countsT.getOrDefault(NEType.Type.PERSON.getCode(), 0);
ls += counts.getOrDefault(NEType.Type.PLACE.getCode(), 0) + countsT.getOrDefault(NEType.Type.PLACE.getCode(), 0);
os += counts.getOrDefault(NEType.Type.ORGANISATION.getCode(), 0) + countsT.getOrDefault(NEType.Type.ORGANISATION.getCode(), 0);
snoTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();
String[] updateFields = new String[] { NAMES, NAMES_ORIGINAL, NAMES_TITLE };
for (String s : updateFields) {
if (ldoc.get(s) != null)
ldoc.removeField(s);
}
// ldoc.removeField(NAMES); ldoc.removeField(NAMES_TITLE); // maybe NAMES_ORIGINAL was left to be deleted, hence deleted docs were added.
ldoc.add(new StoredField(NAMES, Util.join(Arrays.stream(names).map(Span::parsablePrint).collect(Collectors.toSet()), Indexer.NAMES_FIELD_DELIMITER)));
ldoc.add(new StoredField(NAMES_TITLE, Util.join(Arrays.stream(namesT).map(Span::parsablePrint).collect(Collectors.toSet()), Indexer.NAMES_FIELD_DELIMITER)));
int ocs = originalContent.length();
List<String> namesOriginal = Arrays.stream(names).filter(sp -> sp.end < ocs).map(Span::parsablePrint).collect(Collectors.toList());
ldoc.add(new StoredField(NAMES_ORIGINAL, Util.join(namesOriginal, Indexer.NAMES_FIELD_DELIMITER)));
// log.info("Found: "+names.size()+" total names and "+names_original.size()+" in original");
// TODO: Sometimes, updating can lead to deleted docs and keeping these deleted docs can bring down the search performance
// Could building a new index be faster?
archive.updateDocument(ldoc);
duTime += System.currentTimeMillis() - st;
di++;
totalTime += System.currentTimeMillis() - st1;
pctComplete = 30 + ((double) di / (double) ds) * 70;
double ems = (double) (totalTime * (ds - di)) / (double) (di * 1000);
status = "Recognized entities in " + Util.commatize(di) + " of " + Util.commatize(ds) + " emails ";
// Util.approximateTimeLeft((long)ems/1000);
eta = (long) ems;
if (di % 100 == 0)
log.info(status);
time += System.currentTimeMillis() - st;
if (cancelled) {
status = "Cancelling...";
throw new CancelledException();
}
}
log.info("Trained and recognised entities in " + di + " docs in " + totalTime + "ms" + "\nPerson: " + ps + "\nOrgs:" + os + "\nLocs:" + ls);
archive.close();
// prepare to read again.
archive.openForRead();
}
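Finally, a sketch of how this recognition pass might be driven. Constructing the NER instance is not shown on this page, so obtaining ner is left as an assumption; the sketch mainly shows that the method manages the archive's open/close state itself and that user cancellation surfaces as CancelledException.

// Sketch only: ner is an already-constructed NER bound to an archive (construction not shown on this page).
try {
    ner.recognizeArchive();   // runs the model over every doc and updates the stored Lucene documents
} catch (CancelledException ce) {
    System.err.println("entity recognition cancelled by the user");
} catch (IOException ioe) {
    ioe.printStackTrace();
}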