Use of edu.stanford.muse.ie.variants.EntityBookManager in project epadd by ePADD.
Class Archive, method export.
/**
* Exports the archive. A fresh archive is created under out_dir; name is the name of the session
* under it, and blobs are exported into this archive dir. Destructive, but only in memory:
* the original files on disk should remain unmodified.
*
* @param retainedDocs the documents to retain in the exported archive
* @throws Exception
*/
public synchronized String export(Collection<? extends Document> retainedDocs, Export_Mode export_mode, String out_dir, String name, Consumer<StatusProvider> setStatusProvider) throws Exception {
if (Util.nullOrEmpty(out_dir))
return null;
File dir = new File(out_dir);
if (dir.exists() && dir.isDirectory()) {
log.warn("Overwriting existing directory '" + out_dir + "' (it may already exist)");
FileUtils.deleteDirectory(dir);
} else if (!dir.mkdirs()) {
log.warn("Unable to create directory: " + out_dir);
return null;
}
String statusmsg = export_mode == Export_Mode.EXPORT_APPRAISAL_TO_PROCESSING ? "Exporting to Processing" : (export_mode == Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY ? "Exporting to Discovery" : "Exporting to Delivery");
boolean exportInPublicMode = export_mode == Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY;
setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Preparing base directory.."));
prepareBaseDir(out_dir);
if (!exportInPublicMode && new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + LEXICONS_SUBDIR).exists())
FileUtils.copyDirectory(new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + LEXICONS_SUBDIR), new File(out_dir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + LEXICONS_SUBDIR));
// copy normalization file if it exists
if (!exportInPublicMode && new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR + File.separator + Archive.BLOBLNORMALIZATIONFILE_SUFFIX).exists())
FileUtils.copyFile(new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR + File.separator + Archive.BLOBLNORMALIZATIONFILE_SUFFIX), new File(out_dir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR + File.separator + Archive.BLOBLNORMALIZATIONFILE_SUFFIX));
if (new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + IMAGES_SUBDIR).exists())
FileUtils.copyDirectory(new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + IMAGES_SUBDIR), new File(out_dir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + IMAGES_SUBDIR));
// internal disambiguation cache
if (new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + FEATURES_SUBDIR).exists())
FileUtils.copyDirectory(new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + FEATURES_SUBDIR), new File(out_dir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + FEATURES_SUBDIR));
if (new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME).exists())
FileUtils.copyFile(new File(baseDir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME), new File(out_dir + File.separator + Archive.BAG_DATA_FOLDER + File.separatorChar + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME));
// save the states that may get modified
List<Document> savedAllDocs = allDocs;
LabelManager oldLabelManager = getLabelManager();
// change state of the current archive -temporarily//////////
if (exportInPublicMode) {
// replace descriptions with names
allDocs = new ArrayList<>(retainedDocs);
replaceDescriptionWithNames(allDocs, this);
// also redact the attachment details stored in the EmailDocument objects
redactAttachmentDetailsFromDocs(allDocs, this);
} else {
allDocs = new ArrayList<>(retainedDocs);
}
Set<String> retainedDocIDs = retainedDocs.stream().map(Document::getUniqueId).collect(Collectors.toSet());
LabelManager newLabelManager = getLabelManager().getLabelManagerForExport(retainedDocIDs, export_mode);
setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting LabelManager.."));
setLabelManager(newLabelManager);
// copy the index; in public mode, also redact the body and title fields
final boolean redact_body_instead_of_remove = true;
/* Set<String> docIdSet = new LinkedHashSet<>();
for (Document d : allDocs)
docIdSet.add(d.getUniqueId());
final Set<String> retainedDocIds = docIdSet;*/
Indexer.FilterFunctor emailFilter = doc -> {
if (!retainedDocIDs.contains(doc.get("docId")))
return false;
if (exportInPublicMode) {
String text = null;
if (redact_body_instead_of_remove) {
text = doc.get("body");
}
doc.removeFields("body");
doc.removeFields("body_original");
if (text != null) {
String redacted_text = IndexUtils.retainOnlyNames(text, doc);
doc.add(new Field("body", redacted_text, Indexer.full_ft));
// this uses the standard analyzer, not stemming, because redacted bodies contain only names.
}
String title = doc.get("title");
doc.removeFields("title");
if (title != null) {
String redacted_title = IndexUtils.retainOnlyNames(title, doc);
doc.add(new Field("title", redacted_title, Indexer.full_ft));
}
}
return true;
};
/*
Moved to the end, after changing the baseDir of the archive, because the addressbook gets saved
after maskEmailDomain:
if (exportInPublicMode) {
List<Document> docs = this.getAllDocs();
List<EmailDocument> eds = new ArrayList<>();
for (Document doc : docs)
eds.add((EmailDocument) doc);
EmailUtils.maskEmailDomain(eds, this.addressBook);
}
*/
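// attachment-level filter: drop all attachments in public mode; otherwise keep only those
// whose parent message (emailDocId) is in the retained set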
Indexer.FilterFunctor attachmentFilter = doc -> {
if (exportInPublicMode) {
return false;
}
String docId = doc.get("emailDocId");
if (docId == null) {
int di = Integer.parseInt(doc.get("docId"));
// don't want to print too many messages
if (di < 10)
log.error("Looks like this is an old archive, filtering all the attachments!!\n" + "Consider re-indexing with the latest version for a proper export.");
return false;
}
return retainedDocIDs.contains(docId);
};
setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting Index.."));
indexer.copyDirectoryWithDocFilter(out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER, emailFilter, attachmentFilter);
log.info("Completed exporting indexes");
setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting Blobs.."));
// save the blobs in a new blobstore
if (!exportInPublicMode) {
log.info("Starting to export blobs, old blob store is: " + blobStore);
Set<Blob> blobsToKeep = new LinkedHashSet<>();
for (Document d : allDocs)
if (d instanceof EmailDocument && !Util.nullOrEmpty(((EmailDocument) d).attachments))
blobsToKeep.addAll(((EmailDocument) d).attachments);
String blobsDir = out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + BLOBS_SUBDIR;
new File(blobsDir).mkdirs();
BlobStore newBlobStore = blobStore.createCopy(blobsDir, blobsToKeep);
log.info("Completed exporting blobs, newBlobStore in dir: " + blobsDir + " is: " + newBlobStore);
// switch to the new blob store (important -- the urls and indexes in the new blob store are different from the old one!)
blobStore = newBlobStore;
}
String oldBaseDir = baseDir;
// change base directory
setBaseDir(out_dir);
if (exportInPublicMode) {
List<Document> docs = this.getAllDocs();
List<EmailDocument> eds = new ArrayList<>();
for (Document doc : docs) eds.add((EmailDocument) doc);
EmailUtils.maskEmailDomain(eds, this.addressBook);
}
setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting EntityBook Manager.."));
// now read the entitybook manager as well (or build it from the lucene index)
String outdir = out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR;
String entityBookPath = outdir + File.separatorChar + Archive.ENTITYBOOKMANAGER_SUFFIX;
EntityBookManager entityBookManager = ArchiveReaderWriter.readEntityBookManager(this, entityBookPath);
this.setEntityBookManager(entityBookManager);
// recompute entity counts because some documents may have been removed or redacted
double theta = 0.001;
this.collectionMetadata.entityCounts = this.getEntityBookManager().getEntitiesCountMapModuloThreshold(theta);
// write out the archive file. Note that this is a fresh creation of the archive in the exported folder.
setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Export done. Saving Archive.."));
// save .session file.
ArchiveReaderWriter.saveArchive(out_dir, name, this, Save_Archive_Mode.FRESH_CREATION);
log.info("Completed saving archive object");
// restore states
setBaseDir(oldBaseDir);
allDocs = savedAllDocs;
setLabelManager(oldLabelManager);
return out_dir;
}
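For orientation, here is a minimal hypothetical sketch of a call site (the archive and log variables are assumed to be in scope; the mode constant is one of the three handled above):

// a minimal sketch, not taken from the ePADD sources
Collection<? extends Document> retained = archive.getAllDocs(); // retain everything
String exportedDir = archive.export(
    retained,
    Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY, // public mode: bodies/titles are redacted, blobs are skipped
    "/path/to/export", // careful: an existing directory at this path is deleted first
    "default", // session name under out_dir
    status -> { /* surface progress, e.g. to a UI */ });
if (exportedDir == null)
    log.warn("Export failed; see earlier warnings");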
Use of edu.stanford.muse.ie.variants.EntityBookManager in project epadd by ePADD.
Class ArchiveReaderWriter, method loadSessionAsMap.
// #############################################End: Weak reference cache for the archive object and archive#####################################
// #############################################Start: Reading/loading an archive bag###########################################################
/**
* Loads a session from the given filename and returns the map of loaded attributes.
* If readOnly is false, the caller MUST make sure to call packIndex.
* baseDir is the Indexer's baseDir (the path before "indexes/").
*
* Change as of Nov 2017:
* Earlier the whole archive was serialized and deserialized as one big entity. Now it is broken into
* four main parts: the addressbook, the entitybook, the correspondentAuthorityMapper, and the rest of
* the object. We save these four components separately in saveArchive; therefore, while reading, we
* need to read them separately from the appropriate files.
*
* @throws IOException
* @throws LockObtainFailedException
* @throws CorruptIndexException
*/
private static Map<String, Object> loadSessionAsMap(String filename, String baseDir, boolean readOnly, ModeConfig.Mode mode) throws IOException {
log.info("Loading session from file " + filename + " size: " + Util.commatize(new File(filename).length() / 1024) + " KB");
ObjectInputStream ois = null;
long startTime = System.currentTimeMillis();
// keep reading till eof exception
Map<String, Object> result = new LinkedHashMap<>();
try {
ois = new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream(filename))));
while (true) {
String key = (String) ois.readObject();
log.info("loading key: " + key);
try {
Object value = ois.readObject();
if (value == null)
break;
result.put(key, value);
} catch (InvalidClassException ice) {
log.error("Bad version for value of key " + key + ": " + ice + "\nContinuing but this key is not set...");
} catch (ClassNotFoundException cnfe) {
log.error("Class not found for value of key " + key + ": " + cnfe + "\nContinuing but this key is not set...");
}
}
} catch (EOFException eof) {
log.info("end of session file reached");
} catch (Exception e) {
log.warn("Warning unable to load session: " + Util.stackTrace(e));
result.clear();
}
if (ois != null)
try {
ois.close();
} catch (Exception e) {
Util.print_exception(e, log);
}
log.info("Session loaded successfully");
// need to set up sentiments explicitly -- now no need since lexicon is part of the session
log.info("Memory status: " + Util.getMemoryStats());
Archive archive = (Archive) result.get("archive");
// no groups in public mode
if (archive != null) {
long deserializationTime = System.currentTimeMillis();
log.info("Time taken to read and deserialize archive object: " + (deserializationTime - startTime) + " milliseconds");
/*
Read the other three modules of the Archive object, which were marked transient and hence were not serialized.
*/
// file path names of addressbook, entitybook and correspondentAuthorityMapper data.
String dir = baseDir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR;
String addressBookPath = dir + File.separatorChar + Archive.ADDRESSBOOK_SUFFIX;
String entityBookPath = dir + File.separatorChar + Archive.ENTITYBOOKMANAGER_SUFFIX;
String cAuthorityPath = dir + File.separatorChar + Archive.CAUTHORITYMAPPER_SUFFIX;
String labMapDirPath = dir + File.separatorChar + Archive.LABELMAPDIR;
String annotationMapPath = dir + File.separatorChar + Archive.ANNOTATION_SUFFIX;
String blobNormalizationMapPath = dir + File.separatorChar + Archive.BLOBLNORMALIZATIONFILE_SUFFIX;
// if the required files (addressbook, correspondent authority mapper) are missing, start afresh by
// importing the email archive again in processing mode.
if (!(new File(addressBookPath).exists()) || /* !(new File(entityBookPath).exists()) || */
!(new File(cAuthorityPath).exists())) {
result.put("archive", null);
return result;
}
log.info("Setting up post-deserialization action");
archive.postDeserialized(baseDir, readOnly);
long postDeserializationDuration = System.currentTimeMillis();
log.info("Post-deserialization action completed in " + (postDeserializationDuration - deserializationTime) + " milliseconds");
// /////////////Processing metadata////////////////////////////////////////////////
// Read the collection metadata first because some of it may be used while loading other modules, e.g. an
// archive's first and last dates are used when a doc's dates are found to be corrupted.
// override the PM inside the archive with the one in the PM file
// update: since v5 no pm will be inside the archive.
// this is useful when we import a legacy archive into processing, where we've updated the pm file directly, without updating the archive.
log.info("Loading collection metadata");
try {
archive.collectionMetadata = readCollectionMetadata(baseDir);
} catch (Exception e) {
Util.print_exception("Error trying to read processing metadata file", e, log);
}
long collectionMetadataDuration = System.currentTimeMillis();
if (archive.collectionMetadata != null) {
log.info("Collection metadata loaded successfully in " + (collectionMetadataDuration - postDeserializationDuration) + " milliseconds");
}
// ///////////////AddressBook////////////////////////////////////////////
log.info("Loading address book");
archive.addressBook = readAddressBook(addressBookPath, archive.getAllDocs());
long addressBookLoading = System.currentTimeMillis();
log.info("Addressbook loaded successfully in " + (addressBookLoading - collectionMetadataDuration) + " milliseconds");
// //////////////EntityBook/////////////////////////////////////
log.info("Loading EntityBook Manager");
EntityBookManager eb = readEntityBookManager(archive, entityBookPath);
long entityBookLoading = System.currentTimeMillis();
archive.setEntityBookManager(eb);
log.info("EntityBook Manager loaded successfully in " + (entityBookLoading - addressBookLoading) + " milliseconds");
// /////////////CorrespondentAuthorityMapper/////////////////////////////
long correspondentAuthorityLoading, labelManagerLoading, annotationManagerLoading, blobLoading;
if (mode != ModeConfig.Mode.DISCOVERY) {
CorrespondentAuthorityMapper cmapper = null;
log.info("Loading Correspondent authority mapper");
cmapper = CorrespondentAuthorityMapper.readObjectFromStream(cAuthorityPath);
correspondentAuthorityLoading = System.currentTimeMillis();
log.info("Correspondent authority mapper loaded successfully in " + (correspondentAuthorityLoading - entityBookLoading) + " milliseconds");
archive.correspondentAuthorityMapper = cmapper;
} else {
correspondentAuthorityLoading = entityBookLoading;
}
// ///////////////Label Mapper/////////////////////////////////////////////////////
if (mode != ModeConfig.Mode.DISCOVERY) {
log.info("Loading Label Manager");
LabelManager labelManager = readLabelManager(ArchiveReaderWriter.getArchiveIDForArchive(archive), labMapDirPath);
archive.setLabelManager(labelManager);
labelManagerLoading = System.currentTimeMillis();
log.info("Label Manager loaded successfully in " + (labelManagerLoading - correspondentAuthorityLoading) + " milliseconds");
} else {
labelManagerLoading = correspondentAuthorityLoading;
}
// /////////////Annotation Manager///////////////////////////////////////////////////////
if (mode != ModeConfig.Mode.DISCOVERY) {
log.info("Loading Annotation Manager");
AnnotationManager annotationManager = AnnotationManager.readObjectFromStream(annotationMapPath);
archive.setAnnotationManager(annotationManager);
annotationManagerLoading = System.currentTimeMillis();
log.info("Annotation Manager loaded successfully in " + (annotationManagerLoading - labelManagerLoading));
} else {
annotationManagerLoading = labelManagerLoading;
}
// ///////////////////Blob Normalization map (IF exists)//////////////////////////////////////////////////////
if (new File(blobNormalizationMapPath).exists()) {
log.info("Computing blob normalization map (An artifact of AMatica tool)");
archive.getBlobStore().setNormalizationMap(blobNormalizationMapPath);
blobLoading = System.currentTimeMillis();
log.info("Blob normalization map computed successfully in " + (blobLoading - annotationManagerLoading) + " milliseconds");
} else {
blobLoading = annotationManagerLoading;
}
// ///////////////////////////Done reading//////////////////////////////////////////////////////
// most of this code should probably move inside Archive, maybe a function called "postDeserialized()"
result.put("emailDocs", archive.getAllDocs());
log.info("Assigning thread IDs");
archive.assignThreadIds();
log.info("Thread IDs assigned successfully");
log.info("Total time spent in archive loading is " + (System.currentTimeMillis() - startTime) + " milliseconds");
}
return result;
}
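The read loop above implies the on-disk session format: a GZIP-compressed Java object stream of alternating (String key, Object value) pairs, read until EOF (or until a null value). As an illustration only, not the actual saveArchive code, a writer compatible with that loop might look like:

import java.io.*;
import java.util.Map;
import java.util.zip.GZIPOutputStream;

// hypothetical writer for the (key, value) session stream that loadSessionAsMap consumes
static void writeSession(String filename, Map<String, Object> state) throws IOException {
    try (ObjectOutputStream oos = new ObjectOutputStream(
            new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename))))) {
        for (Map.Entry<String, Object> e : state.entrySet()) {
            oos.writeObject(e.getKey()); // String key, e.g. "archive"
            oos.writeObject(e.getValue()); // value must be Serializable
        }
        // no explicit terminator is needed: the reader stops at EOFException
    }
}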