use of edu.stanford.muse.util.Span in project epadd by ePADD.
the class EntityFeature method checkIndex.
/**
* Checks for the entity feature index and creates it if required.
*
* @param archive the archive whose documents are analysed
* @param force   force creation of the index irrespective of previous existence of the index
* @return true if successful
*/
private boolean checkIndex(Archive archive, boolean force) {
Boolean exists = indexExists(archive);
int c1 = 0, c2 = 0, c3 = 0;
int g1 = 0, g2 = 0, g3 = 0;
int f1 = 0, f2 = 0, f3 = 0;
boolean istatus = true;
if (force || (!exists)) {
Map<String, EntityFeature> features = new HashMap<>();
Collection<EmailDocument> docs = (Collection) archive.getAllDocs();
int totalEntities = 0;
log.info("No feature index found..., starting to process and index. This can take a while.");
int di = 0;
for (EmailDocument ed : docs) {
if (cancel) {
clean(archive);
return false;
}
if (di % 1000 == 0) {
JSPHelper.log.info("Done analysing documents: " + di + " of: " + docs.size());
status = "Analyzed " + di + "/" + docs.size() + " email documents";
pctComplete = ((double) di * 50) / (double) docs.size();
}
di++;
List<Span> names;
try {
names = Arrays.asList(archive.getAllNamesInDoc(ed, true));
} catch (IOException ioe) {
log.error("Problem accessing entities in " + ed.getUniqueId(), ioe);
continue;
}
List<String> entities = names.stream().filter(n -> n.type == NEType.Type.PERSON.getCode()).map(n -> n.text).collect(Collectors.toList());
List<String> places = names.stream().filter(n -> n.type == NEType.Type.PLACE.getCode()).map(n -> n.text).collect(Collectors.toList());
List<String> orgs = names.stream().filter(n -> n.type == NEType.Type.ORGANISATION.getCode()).map(n -> n.text).collect(Collectors.toList());
if (entities != null)
c1 += entities.size();
if (orgs != null)
c2 += orgs.size();
if (places != null)
c3 += places.size();
Map<String, String> goodNames = new HashMap<>();
List<String> correspondents = ed.getAllNames();
List<String> addresses = ed.getAllAddrs();
if (correspondents != null)
for (String c : correspondents) {
if (c != null && c.contains(" ")) {
// EmailUtils.normalizePersonNameForLookup(c);
String n = IndexUtils.canonicalizeEntity(c);
goodNames.put(n, "person");
}
}
for (String e : entities) {
if (e != null && e.contains(" ")) {
String canonicalEntity = IndexUtils.canonicalizeEntity(e);
if (canonicalEntity == null)
continue;
goodNames.put(canonicalEntity, "person");
g1++;
}
}
for (String o : orgs) {
String canonicalEntity = IndexUtils.canonicalizeEntity(o);
if (canonicalEntity == null)
continue;
goodNames.put(canonicalEntity, "org");
g2++;
}
for (String p : places) {
String canonicalEntity = IndexUtils.canonicalizeEntity(p);
if (canonicalEntity == null)
continue;
goodNames.put(canonicalEntity, "places");
g3++;
}
// O(goodNames.size())
for (String gn : goodNames.keySet()) {
if (features.get(gn) == null) {
if (goodNames.get(gn).equals("person")) {
features.put(gn, new EntityFeature(gn, EntityFeature.PERSON));
f1++;
} else if (goodNames.get(gn).equals("org")) {
features.put(gn, new EntityFeature(gn, EntityFeature.ORG));
f2++;
} else if (goodNames.get(gn).equals("places")) {
features.put(gn, new EntityFeature(gn, EntityFeature.PLACE));
f3++;
}
}
features.get(gn).accountForThis();
features.get(gn).addAllCE(goodNames.keySet());
if (addresses != null)
features.get(gn).addAllEA(addresses);
features.get(gn).priorProbablity = features.get(gn).priorProbablity + 1.0;
totalEntities++;
}
}
log.info("Found: " + c1 + " entities, " + c2 + " orgs and " + c3 + " places");
log.info("Gn: " + g1 + " entities, " + g2 + " orgs and " + g3 + " places");
log.info("Found goodfeatures: " + f1 + " entities, " + f2 + " orgs and " + f3 + " places");
for (String key : features.keySet()) features.get(key).priorProbablity = features.get(key).priorProbablity / (double) totalEntities;
log.info("Done analysing docs. Starting to index.");
istatus = index(features, archive);
}
return istatus;
}
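The per-document Span handling in checkIndex boils down to bucketing the names returned by getAllNamesInDoc by their NEType code. Below is a minimal, illustrative sketch of that step only; Span.text, Span.type and NEType.Type.getCode() are used exactly as above, while the class name, method name and the NEType import path are assumptions.
import edu.stanford.muse.util.Span;
// assumed import path for NEType, which is used unqualified in the method above
import edu.stanford.muse.ner.model.NEType;
import java.util.*;
import java.util.stream.Collectors;

class SpanBuckets {
    // Hypothetical helper: group Span texts by entity type,
    // mirroring the three stream filters at the top of checkIndex's document loop.
    static Map<NEType.Type, List<String>> bucketByType(Span[] names) {
        Map<NEType.Type, List<String>> buckets = new LinkedHashMap<>();
        for (NEType.Type t : new NEType.Type[] { NEType.Type.PERSON, NEType.Type.PLACE, NEType.Type.ORGANISATION }) {
            buckets.put(t, Arrays.stream(names).filter(n -> n.type == t.getCode()).map(n -> n.text).collect(Collectors.toList()));
        }
        return buckets;
    }
}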
use of edu.stanford.muse.util.Span in project epadd by ePADD.
the class ProperNounLinker method main.
public static void main(String[] args) {
// BOWtest();
// test();
Random rand = new Random();
try {
String userDir = System.getProperty("user.home") + File.separator + "epadd-appraisal" + File.separator + "user";
Archive archive = SimpleSessions.readArchiveIfPresent(userDir);
// findMerges(archive);
// SimpleSessions.saveArchive(archive.baseDir, "default", archive);
List<Document> docs = archive.getAllDocs();
long st = System.currentTimeMillis();
int numQ = 0;
for (int i = 0; i < 5; i++) {
Document doc = docs.get(rand.nextInt(docs.size()));
Span[] es = NER.getNames(doc, true, archive);
Arrays.asList(es).stream().filter(s -> !s.text.contains(" ")).forEach(s -> System.out.println(s.text + "<->" + getNearestMatches(new EmailMention(s, doc, new EmailHierarchy()), 5, archive)));
numQ += Arrays.asList(es).stream().filter(s -> !s.text.contains(" ")).count();
}
System.out.println("NumQ:" + numQ + "- Time: " + (System.currentTimeMillis() - st) + "ms" + "- AVG: " + ((float) (System.currentTimeMillis() - st) / numQ) + "ms");
} catch (Exception e) {
e.printStackTrace();
}
}
use of edu.stanford.muse.util.Span in project epadd by ePADD.
the class EntityBookManager method recalculateCache.
/*
This method recalculates the cache for the entitybook of the given type. If the type is Short.MAX_VALUE, it recalculates the caches for all types at once. The method was carved out
mainly to avoid recalculating each type's entitybook individually (which involves an expensive Lucene search for each doc).
*/
private void recalculateCache(Short giventype) {
log.info("Computing EntityBook Cache");
long start = System.currentTimeMillis();
// a subtle issue: if type is Short.MAX_VALUE then we need one docsetmap per type,
// so create a map of these maps.
Map<Short, Map<MappedEntity, Pair<Double, Set<Document>>>> alldocsetmap = new LinkedHashMap<>();
// now fill this map.
if (giventype == Short.MAX_VALUE) {
for (NEType.Type t : NEType.Type.values()) {
Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
alldocsetmap.put(t.getCode(), docsetmap);
}
} else {
Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
alldocsetmap.put(giventype, docsetmap);
}
// iterate over the lucene docs to recalculate the counts and other summaries of the modified entities,
// then fill the cache summary in the other fields of the ebook.
double theta = 0.001;
long luceneduration1 = 0;
long luceneduration2 = 0;
long additionduration = 0;
Map<String, Span[]> docEntitiesMap = mArchive.getAllEntities(mArchive.getAllDocs().size());
for (String docid : docEntitiesMap.keySet()) {
Span[] allspans = docEntitiesMap.get(docid);
EmailDocument edoc = mArchive.indexer.docForId(docid);
for (Span span : allspans) {
// bail out if the span is not of the entity type we're looking for, or not confident enough; but don't bail out if we have to do it for all types, i.e. if the given type is Short.MAX_VALUE
if (giventype != Short.MAX_VALUE && (span.type != giventype || span.typeScore < theta))
continue;
// if the given type is Short.MAX_VALUE then use the span's own type; otherwise this is effectively a no-op.
Short type = span.type;
Double score = new Double(span.typeScore);
String name = span.getText();
String canonicalizedname = EntityBook.canonicalize(name);
// map the name to its display name. if no mapping, we should get the same name back as its displayName
MappedEntity mappedEntity = (mTypeToEntityBook.get(type).nameToMappedEntity.get(canonicalizedname));
if (mappedEntity == null) {
// It implies that we have erased some names from the entitybook so no need to consider them.
continue;
}
// add this doc to the docsetmap for the mappedEntity.
Double oldscore = Double.valueOf(0);
if (alldocsetmap.get(type).get(mappedEntity) != null)
oldscore = alldocsetmap.get(type).get(mappedEntity).first;
Double finalscore = Double.max(oldscore, score);
Set<Document> docset = new LinkedHashSet<>();
if (alldocsetmap.get(type).get(mappedEntity) != null)
docset = alldocsetmap.get(type).get(mappedEntity).second;
docset.add(edoc);
// docset.add(doc);
alldocsetmap.get(type).put(mappedEntity, new Pair(finalscore, docset));
}
}
// fill the cache summary for the ebook in the other fields of the ebook.
// Beware: if type is Short.MAX_VALUE we need to do this for all types.
long end = System.currentTimeMillis();
log.info("Finished computing entitybook cache in " + (end - start) + " milliseconds");
if (giventype == Short.MAX_VALUE) {
for (NEType.Type t : NEType.Type.values()) {
mTypeToEntityBook.get(t.getCode()).fillSummaryFields(alldocsetmap.get(t.getCode()), mArchive);
}
} else
mTypeToEntityBook.get(giventype).fillSummaryFields(alldocsetmap.get(giventype), mArchive);
// log.info("Luceneduration 1 = "+luceneduration1+" milliseconds, Luceneduration 2 = "+luceneduration2 + " milliseconds, addition duration = "+additionduration+ " milliseconds");
// log.info("Finished filling summary of entitybook cache in "+ (System.currentTimeMillis()-end)+" milliseconds");
log.info("EntityBook Cache computed successfully");
}
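The heart of the span loop above is a simple accumulation: per mapped entity, keep the maximum typeScore seen so far and the union of documents it appears in. The sketch below isolates just that pattern; it uses plain String keys instead of MappedEntity for illustration, the class and method names are hypothetical, and the import paths for Pair and Document are assumed from the ePADD codebase.
import edu.stanford.muse.util.Pair;      // assumed import path
import edu.stanford.muse.index.Document; // assumed import path
import java.util.*;

class DocsetAccumulator {
    // per entity: (best score seen so far, set of docs the entity occurs in)
    final Map<String, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();

    // Hypothetical helper mirroring the body of the span loop in recalculateCache.
    void accumulate(String entityKey, double score, Document doc) {
        Pair<Double, Set<Document>> old = docsetmap.get(entityKey);
        double best = (old == null) ? score : Math.max(old.first, score);
        Set<Document> docset = (old == null) ? new LinkedHashSet<>() : old.second;
        docset.add(doc);
        docsetmap.put(entityKey, new Pair<>(best, docset));
    }
}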
use of edu.stanford.muse.util.Span in project epadd by ePADD.
the class ProperNounLinker method getNearestMatches.
/**
* Given an EmailMention, gets the closest possible resolutions in the archive.
* Uses EmailHierarchy to measure the distance between email mentions.
* Use this method with caution! Don't use it for bulk resolutions by calling it repeatedly:
* its response time can be on the order of a fraction of a second per call.
*/
public static List<Pair<EmailMention, Integer>> getNearestMatches(EmailMention mention, int maxMatches, Archive archive) {
// maximum number of documents to consider before giving up on the search
int MAX_DOCS = 1000;
// Collect one year of docs
long WINDOW = 365 * 24 * 3600 * 1000l;
Date st = new Date(mention.date.getTime() - WINDOW / 2), et = new Date(mention.date.getTime() + WINDOW / 2);
Calendar scal = new GregorianCalendar(), ecal = new GregorianCalendar();
scal.setTime(st);
ecal.setTime(et);
Collection<DatedDocument> docs = (Collection) archive.getAllDocs();
List<DatedDocument> sdocs = IndexUtils.selectDocsByDateRange(docs, scal.get(Calendar.YEAR), scal.get(Calendar.MONTH), scal.get(Calendar.DATE), ecal.get(Calendar.YEAR), ecal.get(Calendar.MONTH), ecal.get(Calendar.DATE));
Set<String> docIds = sdocs.stream().map(Document::getUniqueId).collect(Collectors.toSet());
Hierarchy hierarchy = new EmailHierarchy();
String[] vlevels = new String[hierarchy.getNumLevels()];
for (int i = 0; i < hierarchy.getNumLevels(); i++) vlevels[i] = hierarchy.getValue(i, mention.ed);
boolean isAcronym = mention.entity.text.length() > 2 && FeatureGeneratorUtil.tokenFeature(mention.entity.text).equals("ac");
if (mention.entity.text.length() <= 2)
return new ArrayList<>();
long addingTime = 0, ldocTime = 0, pst = 0;
List<Pair<EmailMention, Integer>> matches = new ArrayList<>();
// order the docs by distance from the current doc,
// under the assumption that the hierarchy always imposes distance between two email mentions at doc-level granularity at the least
Map<Integer, List<String>> docDist = new LinkedHashMap<>();
for (String docId : docIds) {
EmailDocument ed = archive.docForId(docId);
int dist = -1;
for (int i = 0; i < hierarchy.getNumLevels(); i++) if (vlevels[i] != null && vlevels[i].equals(hierarchy.getValue(i, ed))) {
dist = i;
break;
}
if (dist == -1)
continue;
if (!docDist.containsKey(dist))
docDist.put(dist, new ArrayList<>());
docDist.get(dist).add(docId);
}
Set<String> fieldsToLoad = new LinkedHashSet<>();
fieldsToLoad.add(NER.NAMES);
fieldsToLoad.add(NER.NAMES_TITLE);
// caches: whether a given name text matched the mention (processed), and which name texts were already added to the results (considered)
Map<String, Boolean> processed = new LinkedHashMap<>();
Set<String> considered = new LinkedHashSet<>();
int docsProcessed = 0;
outer: for (Integer level = 0; level < hierarchy.getNumLevels(); level++) {
if (!docDist.containsKey(level))
continue;
for (String docId : docDist.get(level)) {
long st1 = System.currentTimeMillis();
org.apache.lucene.document.Document ldoc = null;
try {
ldoc = archive.getLuceneDoc(docId, fieldsToLoad);
} catch (IOException e) {
edu.stanford.muse.util.Util.print_exception("Failed to fetch lucene doc for doc id: " + docId, e, log);
continue;
}
ldocTime += System.currentTimeMillis() - st1;
st1 = System.currentTimeMillis();
Span[] entities = NER.getNames(ldoc, true);
pst += (System.currentTimeMillis() - st1);
st1 = System.currentTimeMillis();
List<Span> names = new ArrayList<>();
names.addAll(Arrays.asList(entities));
EmailDocument ed = archive.docForId(docId);
List<String> hpeople = ed.getAllNames();
for (String hp : hpeople) {
Span s = new Span(hp, -1, -1);
s.setType(NEType.Type.PERSON.getCode(), 1.0f);
names.add(s);
}
for (Span name : names) {
if (name == null || name.text == null)
continue;
String tText = mention.entity.text;
boolean match;
Boolean pMatch = processed.get(name.text);
if (pMatch == null) {
match = (isAcronym && !name.text.equals(tText) && Util.getAcronym(name.text).equals(tText)) || (name.text.contains(" " + tText + " ") || name.text.startsWith(tText + " ") || name.text.endsWith(" " + tText));
processed.put(name.text, match);
} else
match = pMatch;
if (match) {
if (!considered.contains(name.text)) {
considered.add(name.text);
matches.add(new Pair<>(new EmailMention(name, ed, hierarchy), level));
if (matches.size() >= maxMatches)
return matches;
}
}
}
addingTime += (System.currentTimeMillis() - st1);
if (docsProcessed++ > MAX_DOCS)
break outer;
}
}
System.out.println("Ldoc get time" + ldocTime + " -- Parsing time: " + pst + " -- Adding time: " + addingTime);
return matches;
}
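The match test buried in the inner loop above decides whether a candidate name resolves the mention either as an acronym expansion or as a whole-word prefix/infix/suffix match. A minimal sketch of just that predicate follows; the class and method names are hypothetical, and Util.getAcronym is the same call used above (imported as in the original class).
class MentionMatch {
    // Hypothetical helper isolating the predicate computed for each candidate name in getNearestMatches.
    static boolean matches(String candidate, String mentionText, boolean mentionIsAcronym) {
        boolean acronymMatch = mentionIsAcronym
                && !candidate.equals(mentionText)
                && Util.getAcronym(candidate).equals(mentionText);
        boolean wordMatch = candidate.contains(" " + mentionText + " ")
                || candidate.startsWith(mentionText + " ")
                || candidate.endsWith(" " + mentionText);
        return acronymMatch || wordMatch;
    }
}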