Use of edu.stanford.muse.ie.variants.EntityBook in project epadd by ePADD.
From the class Archive, method getEntitiesCountMapModuloThersold.
// Returns a map from entity type to the number of distinct entities whose
// type score is at least the given threshold.
public static Map<Short, Integer> getEntitiesCountMapModuloThersold(Archive archive, double thersold) {
    Map<Short, Integer> entityTypeToCount = new LinkedHashMap<>();
    EntityBook entityBook = archive.getEntityBook();
    Set<String> seen = new LinkedHashSet<>();
    for (Document doc : archive.getAllDocs()) {
        Span[] es1 = archive.getEntitiesInDoc(doc, true);
        Span[] es2 = archive.getEntitiesInDoc(doc, false);
        // take the union of the two entity sets extracted for this doc
        Set<Span> ss = Arrays.stream(es1).collect(Collectors.toSet());
        Set<Span> ss1 = Arrays.stream(es2).collect(Collectors.toSet());
        ss.addAll(ss1);
        for (Span span : ss) {
            if (span.typeScore < thersold)
                continue;
            String name = span.getText();
            String displayName = name;
            // map the name to its display name; if there is no mapping, we get the same name back as its displayName
            if (entityBook != null)
                displayName = entityBook.getDisplayName(name, span.type);
            displayName = displayName.trim();
            // count an entity only once (case-insensitive on display name)
            if (seen.contains(displayName.toLowerCase()))
                continue;
            seen.add(displayName.toLowerCase());
            if (!entityTypeToCount.containsKey(span.getType()))
                entityTypeToCount.put(span.getType(), 0);
            entityTypeToCount.put(span.getType(), entityTypeToCount.get(span.getType()) + 1);
        }
    }
    return entityTypeToCount;
}
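For context, a minimal sketch of how this helper might be invoked, assuming an already-loaded Archive instance; the variable name archive and the 0.5 score threshold are illustrative, not taken from the project source.

Map<Short, Integer> counts = Archive.getEntitiesCountMapModuloThersold(archive, 0.5);
for (Map.Entry<Short, Integer> entry : counts.entrySet()) {
    // keys are the short entity-type codes returned by Span.getType();
    // values are counts of distinct display names at or above the threshold
    System.out.println("type " + entry.getKey() + ": " + entry.getValue() + " entities");
}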
Use of edu.stanford.muse.ie.variants.EntityBook in project epadd by ePADD.
From the class SimpleSessions, method loadSessionAsMap.
/**
 * Loads a session from the given filename, and returns the map of loaded attributes.
 * If readOnly is false, the caller MUST make sure to call packIndex.
 * baseDir is the Indexer's baseDir (the path before "indexes/").
 *
 * @throws IOException
 * @throws LockObtainFailedException
 * @throws CorruptIndexException
 *
 * Change as of Nov 2017:
 * Earlier, the whole archive was serialized and deserialized as one big entity. Now it is broken into
 * four main parts: the address book, the entity book, the correspondentAuthorityMapper, and the rest of
 * the object. saveArchive saves these four components separately, so when reading we need to read each
 * of them from the appropriate file.
 */
public static Map<String, Object> loadSessionAsMap(String filename, String baseDir, boolean readOnly) throws IOException {
    log.info("Loading session from file " + filename + " size: " + Util.commatize(new File(filename).length() / 1024) + " KB");
    ObjectInputStream ois = null;
    // keep reading key/value pairs till an EOF exception is thrown
    Map<String, Object> result = new LinkedHashMap<>();
    try {
        ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename)));
        while (true) {
            String key = (String) ois.readObject();
            log.info("loading key: " + key);
            try {
                Object value = ois.readObject();
                if (value == null)
                    break;
                result.put(key, value);
            } catch (InvalidClassException ice) {
                log.error("Bad version for value of key " + key + ": " + ice + "\nContinuing, but this key is not set...");
            } catch (ClassNotFoundException cnfe) {
                log.error("Class not found for value of key " + key + ": " + cnfe + "\nContinuing, but this key is not set...");
            }
        }
    } catch (EOFException eof) {
        log.info("end of session file reached");
    } catch (Exception e) {
        log.warn("Warning: unable to load session: " + Util.stackTrace(e));
        result.clear();
    }
    if (ois != null) {
        try {
            ois.close();
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
    }
    // no need to set up sentiments explicitly any more, since the lexicon is part of the session
    log.info("Memory status: " + Util.getMemoryStats());
    Archive archive = (Archive) result.get("archive");
    // no groups in public mode
    if (archive != null) {
        /*
         * Read the other modules of the Archive object, which were marked transient and hence
         * were not serialized along with it.
         */
        // file paths of the separately stored addressbook, entitybook, correspondentAuthorityMapper,
        // label map and annotation data
        String dir = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR;
        String addressBookPath = dir + File.separatorChar + Archive.ADDRESSBOOK_SUFFIX;
        String entityBookPath = dir + File.separatorChar + Archive.ENTITYBOOK_SUFFIX;
        String cAuthorityPath = dir + File.separatorChar + Archive.CAUTHORITYMAPPER_SUFFIX;
        String labMapDirPath = dir + File.separatorChar + Archive.LABELMAPDIR;
        String annotationMapPath = dir + File.separatorChar + Archive.ANNOTATION_SUFFIX;
        // if any of the first three files is missing, give up and start afresh by importing the
        // email archive again in processing mode
        if (!(new File(addressBookPath).exists()) || !(new File(entityBookPath).exists()) || !(new File(cAuthorityPath).exists())) {
            result.put("archive", null);
            return result;
        }
        // ///////////////AddressBook////////////////////////////////////////////
        BufferedReader br = new BufferedReader(new FileReader(addressBookPath));
        AddressBook ab = AddressBook.readObjectFromStream(br);
        archive.addressBook = ab;
        br.close();
        // //////////////EntityBook/////////////////////////////////////
        br = new BufferedReader(new FileReader(entityBookPath));
        EntityBook eb = EntityBook.readObjectFromStream(br);
        archive.setEntityBook(eb);
        br.close();
        // /////////////CorrespondentAuthorityMapper/////////////////////////////
        CorrespondentAuthorityMapper cmapper = CorrespondentAuthorityMapper.readObjectFromStream(cAuthorityPath);
        archive.correspondentAuthorityMapper = cmapper;
        // ///////////////Label Mapper/////////////////////////////////////////////////////
        LabelManager labelManager = null;
        try {
            labelManager = LabelManager.readObjectFromStream(labMapDirPath);
        } catch (Exception e) {
            Util.print_exception("Exception in reading label manager from archive, assigning a new label manager", e, log);
            labelManager = new LabelManager();
        }
        archive.setLabelManager(labelManager);
        // /////////////Annotation Manager///////////////////////////////////////////////////////
        AnnotationManager annotationManager = AnnotationManager.readObjectFromStream(annotationMapPath);
        archive.setAnnotationManager(annotationManager);
        // this is useful when we import a legacy archive into processing, where we've updated the pm file directly without updating the archive
        try {
            archive.collectionMetadata = readCollectionMetadata(baseDir);
        } catch (Exception e) {
            Util.print_exception("Error trying to read processing metadata file", e, log);
        }
        // ///////////////////////////Done reading//////////////////////////////////////////////////////
        // most of this code should probably move inside Archive, maybe into a function called "postDeserialized()"
        archive.postDeserialized(baseDir, readOnly);
        result.put("emailDocs", archive.getAllDocs());
    }
    return result;
}
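A minimal sketch of calling this loader (which throws IOException, so the caller handles that), assuming the session file lives under baseDir; the /path/to directory and the "default.session" file name are hypothetical, while the "archive" and "emailDocs" keys are the ones used by the method above.

String baseDir = "/path/to/epadd/archive"; // hypothetical location
String sessionFile = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR + File.separatorChar + "default.session"; // hypothetical file name
Map<String, Object> session = SimpleSessions.loadSessionAsMap(sessionFile, baseDir, true /* readOnly */);
Archive archive = (Archive) session.get("archive");
if (archive == null) {
    // the addressbook/entitybook/authority files were missing: re-import the email archive in processing mode
    System.err.println("Session could not be fully restored");
} else {
    System.out.println("Loaded " + ((Collection<?>) session.get("emailDocs")).size() + " documents");
}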