Use of edu.stanford.muse.util.DetailedFacetItem in project epadd by ePADD.
Class CrossCollectionSearch, method initialize.
/**
 * Initializes the lookup structures (entity infos and cTokenToInfos) for cross-collection search.
 * Reads all archives available in the base dir.
 * Synchronized so there is no chance of running it multiple times concurrently.
 */
private static synchronized void initialize(String baseDir) {
    // this is created only once in one run. if it has already been created, reuse it.
    // in the future, this may be read from a serialized file, etc.
    // cTokenToInfos is a Guava multimap: each canonicalized token maps to every EntityInfo whose name contains it
    cTokenToInfos = LinkedHashMultimap.create();

    File[] files = new File(baseDir).listFiles();
    if (files == null) {
        log.warn("Trying to initialize cross collection search from an invalid directory: " + baseDir);
        return;
    }

    int archiveNum = 0;
    for (File f : files) {
        if (!f.isDirectory())
            continue;
        try {
            String archiveFile = f.getAbsolutePath() + File.separator + Archive.BAG_DATA_FOLDER + File.separator + Archive.SESSIONS_SUBDIR + File.separator + "default" + SimpleSessions.getSessionSuffix();
            if (!new File(archiveFile).exists()) {
                log.warn("Unable to find archive file " + archiveFile + ".. Serious error");
                continue;
            }
            // Assumption: this feature is present only in discovery mode. If we later add it to processing mode, this will need proper care.
            Archive archive = ArchiveReaderWriter.readArchiveIfPresent(f.getAbsolutePath(), ModeConfig.Mode.DISCOVERY);
            if (archive == null) {
                log.warn("Failed to read archive from " + f.getAbsolutePath());
                continue;
            }
            log.info("Loaded archive from " + f.getAbsolutePath());
            log.info("Loaded archive metadata from " + f.getAbsolutePath());

            // process all docs in this archive to set up the centityToInfo map (canonicalized entity name -> EntityInfo)
            String archiveID = ArchiveReaderWriter.getArchiveIDForArchive(archive);
            Map<String, EntityInfo> centityToInfo = new LinkedHashMap<>();
            {
                // get all contacts from the addressbook
                Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> correspondentEntities = new LinkedHashSet<>();
                {
                    Map<Contact, DetailedFacetItem> res = IndexUtils.partitionDocsByPerson(archive.getAllDocs(), archive.getAddressBook());
                    res.entrySet().forEach(s -> {
                        // get contact name
                        Contact c = s.getKey();

                        // get duration (first and last doc where this contact was used)
                        Set<EmailDocument> edocs = s.getValue().docs.stream().map(t -> (EmailDocument) t).collect(Collectors.toSet());
                        Pair<Date, Date> duration = EmailUtils.getFirstLast(edocs);
                        if (duration == null) {
                            duration = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
                        }
                        if (duration.first == null)
                            duration.first = archive.collectionMetadata.firstDate;
                        if (duration.second == null)
                            duration.second = archive.collectionMetadata.lastDate;

                        // get number of messages where this contact was used
                        Integer count = s.getValue().docs.size();
                        if (c.getNames() != null) {
                            Pair<Date, Date> finalDuration = duration;
                            c.getNames().forEach(w -> {
                                if (!Util.nullOrEmpty(w) && finalDuration != null && count != null)
                                    correspondentEntities.add(new Pair<>(canonicalize(w), new Pair<>(finalDuration, count)));
                            });
                        }
                        if (c.getEmails() != null) {
                            Pair<Date, Date> finalDuration1 = duration;
                            c.getEmails().forEach(w -> {
                                if (!Util.nullOrEmpty(w) && finalDuration1 != null && count != null)
                                    correspondentEntities.add(new Pair<>(canonicalize(w), new Pair<>(finalDuration1, count)));
                            });
                        }
                    });
                }
                // get all entities from entitybookmanager
                Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> entitiessummary = new LinkedHashSet<>();
                {
                    entitiessummary = archive.getEntityBookManager().getAllEntitiesSummary();
                    // filter out any null or empty strings (just in case)
                    // don't canonicalize right away because we need to keep the original form of the name
                    entitiessummary = entitiessummary.stream().filter(s -> !Util.nullOrEmpty(s.first)).collect(Collectors.toSet());
                }
                // if an entity is present both as a person entity and as a correspondent, the count of the person entity is taken as the final count;
                // therefore process correspondent entities first (the entity pass below overwrites their counts and dates)
                correspondentEntities.forEach(entity -> {
                    String centity = canonicalize(entity.first);
                    EntityInfo ei = centityToInfo.get(centity);
                    if (ei == null) {
                        ei = new EntityInfo();
                        ei.archiveID = archiveID;
                        ei.displayName = entity.first;
                        centityToInfo.put(centity, ei);
                    }
                    ei.isCorrespondent = true;
                    ei.firstDate = entity.second.first.first;
                    ei.lastDate = entity.second.first.second;
                    ei.count = entity.second.second;
                });
                // Now process entities (except correspondents).
                entitiessummary.forEach(entity -> {
                    String centity = canonicalize(entity.first);
                    EntityInfo ei = centityToInfo.get(centity);
                    if (ei == null) {
                        ei = new EntityInfo();
                        ei.archiveID = archiveID;
                        ei.displayName = entity.first;
                        centityToInfo.put(centity, ei);
                    }
                    // ei.isCorrespondent = true;
                    ei.firstDate = entity.second.first.first;
                    ei.lastDate = entity.second.first.second;
                    ei.count = entity.second.second;
                });
            }
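            // at this point centityToInfo maps each canonicalized entity name in this archive to a single EntityInfo
            // carrying its display name, date range, message count and correspondent flag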
log.info("Archive # " + archiveNum + " read " + centityToInfo.size() + " entities");
// now set up this map as a token map
for (EntityInfo ei : centityToInfo.values()) {
String entity = ei.displayName;
String centity = canonicalize(entity);
allCEntities.add(centity);
// consider a set of tokens because we don't want repeats
Set<String> ctokens = new LinkedHashSet<>(Util.tokenize(centity));
for (String ctoken : ctokens) cTokenToInfos.put(ctoken, ei);
}
} catch (Exception e) {
Util.print_exception("Error loading archive in directory " + f.getAbsolutePath(), e, log);
}
archiveNum++;
}
}
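For context, here is a minimal sketch of how the structures built above could be consulted at query time. This is not the actual ePADD search code: the lookupEntities method below is hypothetical and assumes it sits in the same class, so that cTokenToInfos, canonicalize and Util.tokenize are available exactly as used in initialize.

// Hypothetical illustration only -- not part of CrossCollectionSearch.
// Canonicalize and tokenize the query the same way initialize() tokenized entity names,
// then collect every EntityInfo that shares at least one token with the query.
private static Set<EntityInfo> lookupEntities(String query) {
    Set<EntityInfo> hits = new LinkedHashSet<>();
    if (Util.nullOrEmpty(query))
        return hits;
    String cquery = canonicalize(query);
    Set<String> ctokens = new LinkedHashSet<>(Util.tokenize(cquery));
    for (String ctoken : ctokens)
        hits.addAll(cTokenToInfos.get(ctoken)); // multimap: one token can map to EntityInfos from many archives
    return hits;
}

Because cTokenToInfos is keyed by individual tokens, a single query term such as a surname can surface EntityInfo records from several archives at once; a caller could then group the hits by archiveID for display.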