Use of edu.stanford.muse.AddressBookManager.Contact in project epadd by ePADD.
The class IndexUtils, method computeDetailedFacets:
/**
 * Version that stores actual dates instead of just counts for each facet.
 */
public static Map<String, Collection<DetailedFacetItem>> computeDetailedFacets(Collection<Document> docs, Archive archive) {
    AddressBook addressBook = archive.addressBook;
    Map<String, Collection<DetailedFacetItem>> facetMap = new LinkedHashMap<>();
    if (addressBook != null) {
        // people
        Map<Contact, DetailedFacetItem> peopleMap = partitionDocsByPerson(docs, addressBook);
        facetMap.put("correspondent", peopleMap.values());

        // direction
        Map<String, DetailedFacetItem> directionMap = partitionDocsByDirection(docs, addressBook);
        if (directionMap.size() > 1)
            facetMap.put("direction", directionMap.values());

        /*
        -- No longer needed: restriction, reviewed, etc. are handled by labels --
        // flags -- provide them only if they have at least 2 types in these docs. if all docs have the same value for a particular flag, no point showing it.
        Map<String, DetailedFacetItem> doNotTransferMap = partitionDocsByDoNotTransfer(docs);
        if (doNotTransferMap.size() > 1)
            facetMap.put("transfer", doNotTransferMap.values());
        Map<String, DetailedFacetItem> transferWithRestrictionsMap = partitionDocsByTransferWithRestrictions(docs);
        if (transferWithRestrictionsMap.size() > 1)
            facetMap.put("restrictions", transferWithRestrictionsMap.values());
        Map<String, DetailedFacetItem> reviewedMap = partitionDocsByReviewed(docs);
        if (reviewedMap.size() > 1)
            facetMap.put("reviewed", reviewedMap.values());
        */

        // facet for restriction labels
        Map<String, DetailedFacetItem> restrlabels = partitionDocsByLabelTypes(docs, archive, LabelManager.LabType.RESTRICTION);
        facetMap.put("Restriction Labels", restrlabels.values());

        // facet for general labels
        Map<String, DetailedFacetItem> genlabels = partitionDocsByLabelTypes(docs, archive, LabelManager.LabType.GENERAL);
        facetMap.put("General Labels", genlabels.values());

        // facet for accession IDs -- only in modes other than appraisal
        if (!ModeConfig.isAppraisalMode()) {
            Map<String, DetailedFacetItem> accIDs = partitionDocsByAccessionID(docs, archive);
            facetMap.put("Accessions", accIDs.values());
        }

        Map<String, DetailedFacetItem> annotationPresenceMap = partitionDocsByAnnotationPresence(docs, archive);
        facetMap.put("Annotations", annotationPresenceMap.values());

        // attachments
        if (!ModeConfig.isPublicMode()) {
            Map<String, DetailedFacetItem> attachmentTypesMap = partitionDocsByAttachmentType(docs);
            facetMap.put("attachment type", attachmentTypesMap.values());
        }
    }

    if (!ModeConfig.isPublicMode()) {
        Map<String, DetailedFacetItem> folderNameMap = partitionDocsByFolder(docs);
        if (folderNameMap.size() > 0)
            facetMap.put("folders", folderNameMap.values());
    }

    // sort so that in each topic, the heaviest facets are first
    for (String s : facetMap.keySet()) {
        Collection<DetailedFacetItem> detailedFacets = facetMap.get(s);
        List<DetailedFacetItem> list = new ArrayList<>(detailedFacets);
        Collections.sort(list);
        facetMap.put(s, list);
    }
    return facetMap;
}
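A minimal usage sketch, not part of the ePADD source: it assumes an archive variable in scope, and relies only on the signature above plus the DetailedFacetItem.docs field that appears later on this page.

// hypothetical caller: walk the facet map and print how many messages fall under each facet value
Map<String, Collection<DetailedFacetItem>> facets =
        IndexUtils.computeDetailedFacets(archive.getAllDocs(), archive);
for (Map.Entry<String, Collection<DetailedFacetItem>> e : facets.entrySet()) {
    System.out.println("facet group: " + e.getKey());
    for (DetailedFacetItem item : e.getValue()) // items are sorted heaviest-first
        System.out.println("  " + item.docs.size() + " message(s)");
}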
Use of edu.stanford.muse.AddressBookManager.Contact in project epadd by ePADD.
The class Lens, method detailsForTerm:
/**
 * gets details from the index for the given term
 */
public static JSONObject detailsForTerm(String term, float pageScore, Archive archive, AddressBook ab, String baseURL, Collection<EmailDocument> allDocs) throws JSONException, IOException {
    if (term.length() <= 2)
        return null;
    term = JSPHelper.convertRequestParamToUTF8(term);
    JSONObject json = new JSONObject();
    json.put("text", term);
    json.put("pageScore", pageScore);

    int NAME_IN_ADDRESS_BOOK_WEIGHT = 100;
    // look up the term in 2 places -- the address book and the index
    List<EmailDocument> docsForNameInAddressBook = (List) IndexUtils.selectDocsByPersonsAsList(ab, allDocs, new String[] { term });
    List<EmailDocument> docsForTerm = (List) new ArrayList<>(archive.docsForQuery("\"" + term + "\"", -1, Indexer.QueryType.FULL));

    // weigh any docs for a name in the address book hugely more!
    double termScore = docsForNameInAddressBook.size() * NAME_IN_ADDRESS_BOOK_WEIGHT + docsForTerm.size();
    json.put("indexScore", termScore);

    Set<EmailDocument> finalDocSet = new LinkedHashSet<>();
    finalDocSet.addAll(docsForNameInAddressBook);
    finalDocSet.addAll(docsForTerm);
    List<EmailDocument> finalDocList = new ArrayList<>(finalDocSet);
    json.put("nMessages", finalDocList.size());

    // score people: each participant in a matching message gets weight 1/(number of participants in that message)
    Map<Contact, Float> peopleScores = new LinkedHashMap<>();
    for (EmailDocument ed : finalDocSet) {
        Collection<String> addrs = ed.getParticipatingAddrsExcept(ab.getOwnAddrs());
        for (String s : addrs) {
            if ("user".equals(s))
                continue;
            float weight = 1.0f / addrs.size();
            Contact c = ab.lookupByEmail(s);
            peopleScores.merge(c, weight, (a, b) -> a + b);
        }
    }

    // add the top people
    int MAX_PEOPLE = 5;
    List<Pair<Contact, Float>> pairs = Util.sortMapByValue(peopleScores);
    JSONArray people = new JSONArray();
    Contact own = ab.getContactForSelf();
    int count = 0;
    for (Pair<Contact, Float> p : pairs) {
        if (count >= MAX_PEOPLE) // cap at MAX_PEOPLE entries (the original "count > MAX_PEOPLE" let one extra person through)
            break;
        Contact c = p.getFirst();
        if (c == own)
            continue; // ignore own name
        JSONObject person = new JSONObject();
        String displayName = c == null ? "" : c.pickBestName();
        person.put("person", displayName);
        person.put("score", p.getSecond());
        people.put(count, person);
        count++;
    }
    json.put("people", people);

    if (finalDocList.size() > 0 && log.isDebugEnabled())
        log.debug("Term: " + term + " content hits: " + docsForTerm.size() + " header hits: " + docsForNameInAddressBook.size() + " total: " + finalDocList.size());

    String url = baseURL + "/browse?term=\"" + term + "\"";
    json.put("url", url);

    // put up to N_TEASERS teaser messages in the json response
    int N_TEASERS = 5;
    JSONArray messages = new JSONArray();
    for (int i = 0; i < finalDocList.size() && i < N_TEASERS; i++) {
        JSONObject message = finalDocList.get(i).toJSON(0);
        messages.put(i, message);
    }
    json.put("messages", messages);
    return json;
}
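A minimal usage sketch, not part of the ePADD source; the term, URL, and allEmailDocs are made up, and the checked exceptions (JSONException, IOException) are left to the caller:

// hypothetical caller: fetch details for a term and read back the aggregate score
JSONObject details = Lens.detailsForTerm("creeley", 1.0f, archive, archive.addressBook,
        "http://localhost:9099/epadd", allEmailDocs);
if (details != null) // null when the term is 2 characters or shorter
    System.out.println(details.getString("text") + " scored " + details.getDouble("indexScore"));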
Use of edu.stanford.muse.AddressBookManager.Contact in project epadd by ePADD.
The class EmailUtils, method getContactsForMessage:
/* returns a set of contact objects for all to/from/cc/bcc of the message */
public static Set<Contact> getContactsForMessage(AddressBook ab, EmailDocument ed) {
    // gather all addresses present on the message (to/from/cc/bcc), skipping empty fields
    Set<InternetAddress> allAddressesInMessage = new LinkedHashSet<>();
    if (!Util.nullOrEmpty(ed.to)) {
        allAddressesInMessage.addAll((List) Arrays.asList(ed.to));
    }
    if (!Util.nullOrEmpty(ed.from)) {
        allAddressesInMessage.addAll((List) Arrays.asList(ed.from));
    }
    if (!Util.nullOrEmpty(ed.cc)) {
        allAddressesInMessage.addAll((List) Arrays.asList(ed.cc));
    }
    if (!Util.nullOrEmpty(ed.bcc)) {
        allAddressesInMessage.addAll((List) Arrays.asList(ed.bcc));
    }

    Set<Contact> contactsInMessage = new LinkedHashSet<>();
    for (InternetAddress a : allAddressesInMessage) {
        // try to find the contact by email address first, because sometimes (in extreme cases only) the email is not there and we only have a name
        Contact c = ab.lookupByEmail(a.getAddress());
        if (c != null)
            contactsInMessage.add(c);
        else {
            // fall back to a name lookup only if the email lookup failed -- hopefully this is rare
            log.debug("Warning: email lookup failed for " + a);
            Collection<Contact> contacts = ab.lookupByName(a.getPersonal());
            if (!Util.nullOrEmpty(contacts))
                contactsInMessage.addAll(contacts);
        }
    }
    return contactsInMessage;
}
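A minimal usage sketch, not part of the ePADD source, assuming an archive and a collection allEmailDocs in scope:

// hypothetical caller: gather the distinct contacts across a set of messages
Set<Contact> everyone = new LinkedHashSet<>();
for (EmailDocument ed : allEmailDocs)
    everyone.addAll(EmailUtils.getContactsForMessage(archive.addressBook, ed));
System.out.println(everyone.size() + " distinct contacts participate in these messages");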
Use of edu.stanford.muse.AddressBookManager.Contact in project epadd by ePADD.
The class CrossCollectionSearch, method initialize:
/**
 * initializes lookup structures (entity infos and cTokenToInfos) for cross-collection search.
 * reads all archives available in the base dir.
 * synchronized so there's no chance of running it multiple times concurrently.
 */
private static synchronized void initialize(String baseDir) {
    // this is created only once per run. if it has already been created, reuse it.
    // in the future, this may be read from a serialized file, etc.
    cTokenToInfos = LinkedHashMultimap.create();
    File[] files = new File(baseDir).listFiles();
    if (files == null) {
        log.warn("Trying to initialize cross collection search from an invalid directory: " + baseDir);
        return;
    }
    int archiveNum = 0;
    for (File f : files) {
        if (!f.isDirectory())
            continue;
        try {
            String archiveFile = f.getAbsolutePath() + File.separator + Archive.BAG_DATA_FOLDER + File.separator + Archive.SESSIONS_SUBDIR + File.separator + "default" + SimpleSessions.getSessionSuffix();
            if (!new File(archiveFile).exists()) {
                log.warn("Unable to find archive file " + archiveFile + ".. Serious error");
                continue;
            }

            // assumption: this feature is present only in discovery mode. when we add it to processing mode, this needs proper care.
            Archive archive = ArchiveReaderWriter.readArchiveIfPresent(f.getAbsolutePath(), ModeConfig.Mode.DISCOVERY);
            if (archive == null) {
                log.warn("failed to read archive from " + f.getAbsolutePath());
                continue;
            }
            log.info("Loaded archive from " + f.getAbsolutePath());
            log.info("Loaded archive metadata from " + f.getAbsolutePath());

            // process all docs in this archive to set up the centityToInfo map
            String archiveID = ArchiveReaderWriter.getArchiveIDForArchive(archive);
            Map<String, EntityInfo> centityToInfo = new LinkedHashMap<>();
            {
                // get all contacts from the address book, as pairs of (name or email, ((first date, last date), message count))
                Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> correspondentEntities = new LinkedHashSet<>();
                {
                    Map<Contact, DetailedFacetItem> res = IndexUtils.partitionDocsByPerson(archive.getAllDocs(), archive.getAddressBook());
                    res.entrySet().forEach(s -> {
                        Contact c = s.getKey();
                        // duration = first and last doc where this contact appears, falling back to collection-level dates
                        Set<EmailDocument> edocs = s.getValue().docs.stream().map(t -> (EmailDocument) t).collect(Collectors.toSet());
                        Pair<Date, Date> duration = EmailUtils.getFirstLast(edocs);
                        if (duration == null)
                            duration = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
                        if (duration.first == null)
                            duration.first = archive.collectionMetadata.firstDate;
                        if (duration.second == null)
                            duration.second = archive.collectionMetadata.lastDate;
                        // number of messages where this contact appears
                        Integer count = s.getValue().docs.size();
                        if (c.getNames() != null) {
                            Pair<Date, Date> finalDuration = duration;
                            c.getNames().forEach(w -> {
                                if (!Util.nullOrEmpty(w) && finalDuration != null && count != null)
                                    correspondentEntities.add(new Pair<>(canonicalize(w), new Pair<>(finalDuration, count)));
                            });
                        }
                        if (c.getEmails() != null) {
                            Pair<Date, Date> finalDuration1 = duration;
                            c.getEmails().forEach(w -> {
                                if (!Util.nullOrEmpty(w) && finalDuration1 != null && count != null)
                                    correspondentEntities.add(new Pair<>(canonicalize(w), new Pair<>(finalDuration1, count)));
                            });
                        }
                    });
                }

                // get all entities from the entity book manager
                Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> entitiessummary;
                {
                    entitiessummary = archive.getEntityBookManager().getAllEntitiesSummary();
                    // filter out any null or empty strings (just in case)
                    // don't canonicalize right away because we need to keep the original form of the name
                    entitiessummary = entitiessummary.stream().filter(s -> !Util.nullOrEmpty(s.first)).collect(Collectors.toSet());
                }

                // if an entity is present both as a person entity and as a correspondent, the person entity's
                // count should be the final count. therefore process correspondent entities first, so the
                // entity pass below can overwrite their dates and counts.
                correspondentEntities.forEach(entity -> {
                    String centity = canonicalize(entity.first);
                    EntityInfo ei = centityToInfo.get(centity);
                    if (ei == null) {
                        ei = new EntityInfo();
                        ei.archiveID = archiveID;
                        ei.displayName = entity.first;
                        centityToInfo.put(centity, ei);
                    }
                    ei.isCorrespondent = true;
                    ei.firstDate = entity.second.first.first;
                    ei.lastDate = entity.second.first.second;
                    ei.count = entity.second.second;
                });

                // now process the entities (except correspondents).
                entitiessummary.forEach(entity -> {
                    String centity = canonicalize(entity.first);
                    EntityInfo ei = centityToInfo.get(centity);
                    if (ei == null) {
                        ei = new EntityInfo();
                        ei.archiveID = archiveID;
                        ei.displayName = entity.first;
                        centityToInfo.put(centity, ei);
                    }
                    ei.firstDate = entity.second.first.first;
                    ei.lastDate = entity.second.first.second;
                    ei.count = entity.second.second;
                });
            }
            log.info("Archive # " + archiveNum + " read " + centityToInfo.size() + " entities");

            // now index the entities by token
            for (EntityInfo ei : centityToInfo.values()) {
                String entity = ei.displayName;
                String centity = canonicalize(entity);
                allCEntities.add(centity);
                // use a set of tokens because we don't want repeats
                Set<String> ctokens = new LinkedHashSet<>(Util.tokenize(centity));
                for (String ctoken : ctokens)
                    cTokenToInfos.put(ctoken, ei);
            }
        } catch (Exception e) {
            Util.print_exception("Error loading archive in directory " + f.getAbsolutePath(), e, log);
        }
        archiveNum++;
    }
}
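A hypothetical lookup sketch, not ePADD code: it shows how the structures built above would be consulted, assuming it runs inside CrossCollectionSearch (canonicalize, cTokenToInfos, and EntityInfo are internal to that class) and after initialize() has completed. The query string is made up.

// canonicalize and tokenize the query the same way the entities were indexed
String cquery = canonicalize("Robert Creeley");
Set<EntityInfo> candidates = new LinkedHashSet<>();
for (String ctoken : new LinkedHashSet<>(Util.tokenize(cquery)))
    candidates.addAll(cTokenToInfos.get(ctoken)); // multimap: token -> all EntityInfos containing it
for (EntityInfo ei : candidates)
    System.out.println(ei.displayName + " (archive " + ei.archiveID + ", " + ei.count + " messages)");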
Use of edu.stanford.muse.AddressBookManager.Contact in project epadd by ePADD.
The class NameTypes, method computeInfo:
public static void computeInfo(Map<String, NameInfo> nameMap, Collection<EmailDocument> allDocs, Archive archive, Lexicon lex) throws IOException {
    // assign types to all the names
    if (allDocs == null)
        allDocs = (List) archive.getAllDocs();

    // compute name -> nameInfo
    Map<String, Collection<Document>> sentimentToDocs = archive.getSentimentMap(lex, true);
    for (EmailDocument ed : allDocs) {
        String id = ed.getUniqueId();
        List<String> names = archive.getNamesForDocId(id, Indexer.QueryType.FULL);
        List<Address> mentionedAddresses = ed.getToCCBCC();

        Set<String> sentimentsForDoc = new LinkedHashSet<>();
        for (String sentiment : sentimentToDocs.keySet()) {
            if (sentimentToDocs.get(sentiment).contains(ed))
                sentimentsForDoc.add(sentiment);
        }

        for (String name : names) {
            // canonical title
            String cTitle = name.trim().toLowerCase().replaceAll(" ", "_");
            NameInfo I = nameMap.get(cTitle);
            if (I == null) {
                log.info("Warning: null info for name: " + name);
                continue;
            }

            // map each sentiment to its prominence in the document.
            if (I.sentimentCatToCount == null)
                I.sentimentCatToCount = new LinkedHashMap<>();
            for (String sentiment : sentimentsForDoc) {
                if (!I.sentimentCatToCount.containsKey(sentiment)) // the sentiment isn't there yet
                    I.sentimentCatToCount.put(sentiment, 1);
                else {
                    int sum = I.sentimentCatToCount.get(sentiment);
                    I.sentimentCatToCount.put(sentiment, sum + 1);
                }
            }
            I.sentimentCatToCount = Util.reorderMapByValue(I.sentimentCatToCount);

            // count the contacts to whom the email is being sent.
            for (Address adr : mentionedAddresses) {
                InternetAddress emailadr = (InternetAddress) adr;
                String address_string = emailadr.getAddress();
                Contact associatedcontact = archive.addressBook.lookupByEmail(address_string);
                if (I.peopleToCount == null)
                    I.peopleToCount = new LinkedHashMap<>();
                if (!I.peopleToCount.containsKey(associatedcontact)) // the contact is not yet associated
                    I.peopleToCount.put(associatedcontact, 1);
                else {
                    int sum = I.peopleToCount.get(associatedcontact);
                    I.peopleToCount.put(associatedcontact, sum + 1);
                }
            }
            if (I.peopleToCount != null)
                I.peopleToCount = Util.reorderMapByValue(I.peopleToCount);
            // while (I.peopleToCount.containsKey(null)) { // clean peopleToCount
            //     I.peopleToCount.remove(null);
            // }

            // determine start and end dates of the term.
            Date documentDate = ed.getDate();
            if (I.firstDate == null)
                I.firstDate = documentDate;
            if (I.lastDate == null)
                I.lastDate = documentDate;
            if (I.firstDate.after(documentDate))
                I.firstDate = documentDate;
            if (I.lastDate.before(documentDate))
                I.lastDate = documentDate;
        }
    }
    // overall: compute a map of sentiment -> docs for each sentiment in the lexicon; then,
    // for each doc in the archive, get its names, and for each name update the first/last
    // dates, the per-contact counts, and the per-sentiment counts.
}
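A minimal usage sketch, not part of the ePADD source: nameMap and lexicon are assumed to be built elsewhere, "robert_creeley" is a made-up key in the canonical form the method uses (lowercased, spaces replaced by underscores), and the IOException is left to the caller.

// hypothetical caller: fill in the NameInfo map, then inspect one entry
NameTypes.computeInfo(nameMap, null, archive, lexicon); // null allDocs => use all docs in the archive
NameInfo info = nameMap.get("robert_creeley");
if (info != null)
    System.out.println("first: " + info.firstDate + ", last: " + info.lastDate
            + ", sentiments: " + info.sentimentCatToCount);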