Search in sources:

Example 1 with CorrespondentAuthorityMapper

Use of edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper in project epadd by ePADD.

From the class SimpleSessions, the method loadSessionAsMap:

/**
 * Loads a session from the given filename and returns the map of loaded
 * attributes.
 * If readOnly is false, the caller MUST make sure to call packIndex.
 * baseDir is the Indexer's baseDir (the path before "indexes/").
 *
 * @throws IOException
 * @throws LockObtainFailedException
 * @throws CorruptIndexException
 * Change as of Nov 2017:
 * Earlier the whole archive was serialized and deserialized as one big entity. Now it is broken into
 * four main parts: the address book, entity book, correspondentAuthorityMapper, and the rest of the object.
 * We save all four components separately in saveArchive, so while reading we need to read
 * each of them from its own file.
 */
public static Map<String, Object> loadSessionAsMap(String filename, String baseDir, boolean readOnly) throws IOException {
    log.info("Loading session from file " + filename + " size: " + Util.commatize(new File(filename).length() / 1024) + " KB");
    ObjectInputStream ois = null;
    // keep reading till eof exception
    Map<String, Object> result = new LinkedHashMap<>();
    try {
        ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename)));
        while (true) {
            String key = (String) ois.readObject();
            log.info("loading key: " + key);
            try {
                Object value = ois.readObject();
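                // a null value marks the end of the key/value records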
                if (value == null)
                    break;
                result.put(key, value);
            } catch (InvalidClassException ice) {
                log.error("Bad version for value of key " + key + ": " + ice + "\nContinuing but this key is not set...");
            } catch (ClassNotFoundException cnfe) {
                log.error("Class not found for value of key " + key + ": " + cnfe + "\nContinuing but this key is not set...");
            }
        }
    } catch (EOFException eof) {
        log.info("end of session file reached");
    } catch (Exception e) {
        log.warn("Warning unable to load session: " + Util.stackTrace(e));
        result.clear();
    }
    if (ois != null)
        try {
            ois.close();
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
    // need to set up sentiments explicitly -- now no need since lexicon is part of the session
    log.info("Memory status: " + Util.getMemoryStats());
    Archive archive = (Archive) result.get("archive");
    // no groups in public mode
    if (archive != null) {
        /*
         * Read the other three modules of the Archive object, which were marked transient and hence did not serialize.
         */
        // file path names of addressbook, entitybook and correspondentAuthorityMapper data.
        String dir = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR;
        String addressBookPath = dir + File.separatorChar + Archive.ADDRESSBOOK_SUFFIX;
        String entityBookPath = dir + File.separatorChar + Archive.ENTITYBOOK_SUFFIX;
        String cAuthorityPath = dir + File.separatorChar + Archive.CAUTHORITYMAPPER_SUFFIX;
        String labMapDirPath = dir + File.separatorChar + Archive.LABELMAPDIR;
        String annotationMapPath = dir + File.separatorChar + Archive.ANNOTATION_SUFFIX;
        // If the above three files are not present, start afresh by importing the email archive again in processing mode.
        if (!(new File(addressBookPath).exists()) || !(new File(entityBookPath).exists()) || !(new File(cAuthorityPath).exists())) {
            result.put("archive", null);
            return result;
        }
        // ///////////////AddressBook////////////////////////////////////////////
        try (BufferedReader br = new BufferedReader(new FileReader(addressBookPath))) {
            archive.addressBook = AddressBook.readObjectFromStream(br);
        }
        // //////////////EntityBook/////////////////////////////////////
        try (BufferedReader br = new BufferedReader(new FileReader(entityBookPath))) {
            archive.setEntityBook(EntityBook.readObjectFromStream(br));
        }
        // /////////////CorrespondentAuthorityMapper/////////////////////////////
        CorrespondentAuthorityMapper cmapper = CorrespondentAuthorityMapper.readObjectFromStream(cAuthorityPath);
        archive.correspondentAuthorityMapper = cmapper;
        // ///////////////Label Mapper/////////////////////////////////////////////////////
        LabelManager labelManager = null;
        try {
            labelManager = LabelManager.readObjectFromStream(labMapDirPath);
        } catch (Exception e) {
            Util.print_exception("Exception in reading label manager from archive, assigning a new label manager", e, log);
            labelManager = new LabelManager();
        }
        archive.setLabelManager(labelManager);
        // /////////////Annotation Manager///////////////////////////////////////////////////////
        AnnotationManager annotationManager = AnnotationManager.readObjectFromStream(annotationMapPath);
        archive.setAnnotationManager(annotationManager);
        // this is useful when we import a legacy archive into processing, where we've updated the pm file directly, without updating the archive.
        try {
            archive.collectionMetadata = readCollectionMetadata(baseDir);
        } catch (Exception e) {
            Util.print_exception("Error trying to read processing metadata file", e, log);
        }
        // ///////////////////////////Done reading//////////////////////////////////////////////////////
        // most of this code should probably move inside Archive, maybe a function called "postDeserialized()"
        archive.postDeserialized(baseDir, readOnly);
        result.put("emailDocs", archive.getAllDocs());
    }
    return result;
}
Also used: AnnotationManager (edu.stanford.muse.AnnotationManager.AnnotationManager), Archive (edu.stanford.muse.index.Archive), EntityBook (edu.stanford.muse.ie.variants.EntityBook), ParseException (org.apache.lucene.queryparser.classic.ParseException), LockObtainFailedException (org.apache.lucene.store.LockObtainFailedException), CorruptIndexException (org.apache.lucene.index.CorruptIndexException), GZIPInputStream (java.util.zip.GZIPInputStream), AddressBook (edu.stanford.muse.AddressBookManager.AddressBook), CorrespondentAuthorityMapper (edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper), LabelManager (edu.stanford.muse.LabelManager.LabelManager)
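For orientation, here is a minimal caller sketch (imports as in the listing above). loadSessionAsMap, Archive.SESSIONS_SUBDIR, and the "archive" key are taken from the listing; the session file name "default.session" is a hypothetical placeholder, not ePADD's confirmed naming scheme.

static Archive loadArchiveFromSession(String baseDir) throws IOException {
    // hypothetical session file name; ePADD's real naming scheme may differ
    String sessionFile = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR
            + File.separatorChar + "default.session";
    Map<String, Object> session = SimpleSessions.loadSessionAsMap(sessionFile, baseDir, /* readOnly */ true);
    // "archive" comes back null when the addressbook, entitybook, or authority files are missing
    return (Archive) session.get("archive");
}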

Example 2 with CorrespondentAuthorityMapper

Use of edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper in project epadd by ePADD.

From the class ArchiveReaderWriter, the method loadSessionAsMap:

// #############################################End: Weak reference cache for the archive object and archive#####################################
// #############################################Start: Reading/loading an archive bag###########################################################
/**
 * Loads a session from the given filename and returns the map of loaded
 * attributes.
 * If readOnly is false, the caller MUST make sure to call packIndex.
 * baseDir is the Indexer's baseDir (the path before "indexes/").
 *
 * @throws IOException
 * @throws LockObtainFailedException
 * @throws CorruptIndexException
 * Change as of Nov 2017:
 * Earlier the whole archive was serialized and deserialized as one big entity. Now it is broken into
 * four main parts: the address book, entity book, correspondentAuthorityMapper, and the rest of the object.
 * We save all four components separately in saveArchive, so while reading we need to read
 * each of them from its own file.
 */
private static Map<String, Object> loadSessionAsMap(String filename, String baseDir, boolean readOnly, ModeConfig.Mode mode) throws IOException {
    log.info("Loading session from file " + filename + " size: " + Util.commatize(new File(filename).length() / 1024) + " KB");
    ObjectInputStream ois = null;
    long startTime = System.currentTimeMillis();
    // keep reading till eof exception
    Map<String, Object> result = new LinkedHashMap<>();
    try {
        ois = new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream(filename))));
        while (true) {
            String key = (String) ois.readObject();
            log.info("loading key: " + key);
            try {
                Object value = ois.readObject();
                if (value == null)
                    break;
                result.put(key, value);
            } catch (InvalidClassException ice) {
                log.error("Bad version for value of key " + key + ": " + ice + "\nContinuing but this key is not set...");
            } catch (ClassNotFoundException cnfe) {
                log.error("Class not found for value of key " + key + ": " + cnfe + "\nContinuing but this key is not set...");
            }
        }
    } catch (EOFException eof) {
        log.info("end of session file reached");
    } catch (Exception e) {
        log.warn("Warning unable to load session: " + Util.stackTrace(e));
        result.clear();
    }
    if (ois != null)
        try {
            ois.close();
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
    log.info("Session loaded successfully");
    // need to set up sentiments explicitly -- now no need since lexicon is part of the session
    log.info("Memory status: " + Util.getMemoryStats());
    Archive archive = (Archive) result.get("archive");
    // no groups in public mode
    if (archive != null) {
        long deserializationTime = System.currentTimeMillis();
        log.info("Time taken to read and deserialize archive object: " + (deserializationTime - startTime) + " milliseconds");
        /*
         * Read the other three modules of the Archive object, which were marked transient and hence did not serialize.
         */
        // file path names of addressbook, entitybook and correspondentAuthorityMapper data.
        String dir = baseDir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR;
        String addressBookPath = dir + File.separatorChar + Archive.ADDRESSBOOK_SUFFIX;
        String entityBookPath = dir + File.separatorChar + Archive.ENTITYBOOKMANAGER_SUFFIX;
        String cAuthorityPath = dir + File.separatorChar + Archive.CAUTHORITYMAPPER_SUFFIX;
        String labMapDirPath = dir + File.separatorChar + Archive.LABELMAPDIR;
        String annotationMapPath = dir + File.separatorChar + Archive.ANNOTATION_SUFFIX;
        String blobNormalizationMapPath = dir + File.separatorChar + Archive.BLOBLNORMALIZATIONFILE_SUFFIX;
        // If the address book or correspondent authority files are not present, start afresh by importing the
        // email archive again in processing mode. (The entity book existence check is currently commented out.)
        if (!(new File(addressBookPath).exists()) || /* !(new File(entityBookPath).exists()) || */
                !(new File(cAuthorityPath).exists())) {
            result.put("archive", null);
            return result;
        }
        log.info("Setting up post-deserialization action");
        archive.postDeserialized(baseDir, readOnly);
        long postDeserializationDuration = System.currentTimeMillis();
        log.info("Post-deserialization action completed in " + (postDeserializationDuration - deserializationTime) + " milliseconds");
        // /////////////Processing metadata////////////////////////////////////////////////
        // Read collection metadata first because some of the collection's information might be used while loading
        // other modules; e.g., an archive's first and last dates are used when a doc's dates are found corrupted.
        // override the PM inside the archive with the one in the PM file
        // update: since v5 no pm will be inside the archive.
        // this is useful when we import a legacy archive into processing, where we've updated the pm file directly, without updating the archive.
        log.info("Loading collection metadata");
        try {
            archive.collectionMetadata = readCollectionMetadata(baseDir);
        } catch (Exception e) {
            Util.print_exception("Error trying to read processing metadata file", e, log);
        }
        long collectionMetadataDuration = System.currentTimeMillis();
        if (archive.collectionMetadata != null) {
            log.info("Collection metadata loaded successfully in " + (collectionMetadataDuration - postDeserializationDuration) + " milliseconds");
        }
        // ///////////////AddressBook////////////////////////////////////////////
        log.info("Loading address book");
        archive.addressBook = readAddressBook(addressBookPath, archive.getAllDocs());
        long addressBookLoading = System.currentTimeMillis();
        log.info("Addressbook loaded successfully in " + (addressBookLoading - collectionMetadataDuration) + " milliseconds");
        // //////////////EntityBook/////////////////////////////////////
        log.info("Loading EntityBook Manager");
        EntityBookManager eb = readEntityBookManager(archive, entityBookPath);
        long entityBookLoading = System.currentTimeMillis();
        archive.setEntityBookManager(eb);
        log.info("EntityBook Manager loaded successfully in " + (entityBookLoading - addressBookLoading) + " milliseconds");
        // /////////////CorrespondentAuthorityMapper/////////////////////////////
        long correspondentAuthorityLoading, labelManagerLoading, annotationManagerLoading, blobLoading;
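        // in DISCOVERY mode the authority mapper, label manager, and annotation manager are not loaded;
        // the corresponding timestamps simply carry forward so the timing log stays consistent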
        if (mode != ModeConfig.Mode.DISCOVERY) {
            log.info("Loading Correspondent authority mapper");
            CorrespondentAuthorityMapper cmapper = CorrespondentAuthorityMapper.readObjectFromStream(cAuthorityPath);
            correspondentAuthorityLoading = System.currentTimeMillis();
            log.info("Correspondent authority mapper loaded successfully in " + (correspondentAuthorityLoading - entityBookLoading) + " milliseconds");
            archive.correspondentAuthorityMapper = cmapper;
        } else {
            correspondentAuthorityLoading = entityBookLoading;
        }
        // ///////////////Label Mapper/////////////////////////////////////////////////////
        if (mode != ModeConfig.Mode.DISCOVERY) {
            log.info("Loading Label Manager");
            LabelManager labelManager = readLabelManager(ArchiveReaderWriter.getArchiveIDForArchive(archive), labMapDirPath);
            archive.setLabelManager(labelManager);
            labelManagerLoading = System.currentTimeMillis();
            log.info("Label Manager loaded successfully in " + (labelManagerLoading - correspondentAuthorityLoading) + " milliseconds");
        } else {
            labelManagerLoading = correspondentAuthorityLoading;
        }
        // /////////////Annotation Manager///////////////////////////////////////////////////////
        if (mode != ModeConfig.Mode.DISCOVERY) {
            log.info("Loading Annotation Manager");
            AnnotationManager annotationManager = AnnotationManager.readObjectFromStream(annotationMapPath);
            archive.setAnnotationManager(annotationManager);
            annotationManagerLoading = System.currentTimeMillis();
            log.info("Annotation Manager loaded successfully in " + (annotationManagerLoading - labelManagerLoading));
        } else {
            annotationManagerLoading = labelManagerLoading;
        }
        // ///////////////////Blob Normalization map (IF exists)//////////////////////////////////////////////////////
        if (new File(blobNormalizationMapPath).exists()) {
            log.info("Computing blob normalization map (An artifact of AMatica tool)");
            archive.getBlobStore().setNormalizationMap(blobNormalizationMapPath);
            blobLoading = System.currentTimeMillis();
            log.info("Blob normalization map computed successfully in " + (blobLoading - annotationManagerLoading) + " milliseconds");
        } else {
            blobLoading = annotationManagerLoading;
        }
        // ///////////////////////////Done reading//////////////////////////////////////////////////////
        // most of this code should probably move inside Archive, maybe a function called "postDeserialized()"
        result.put("emailDocs", archive.getAllDocs());
        log.info("Assigning thread IDs");
        archive.assignThreadIds();
        log.info("Thread IDs assigned successfully");
        log.info("Total time spent in archive loading is " + (System.currentTimeMillis() - startTime) + " milliseconds");
    }
    return result;
}
Also used: AnnotationManager (edu.stanford.muse.AnnotationManager.AnnotationManager), EntityBookManager (edu.stanford.muse.ie.variants.EntityBookManager), ParseException (org.apache.lucene.queryparser.classic.ParseException), LockObtainFailedException (org.apache.lucene.store.LockObtainFailedException), CorruptIndexException (org.apache.lucene.index.CorruptIndexException), NoSuchAlgorithmException (java.security.NoSuchAlgorithmException), GZIPInputStream (java.util.zip.GZIPInputStream), CorrespondentAuthorityMapper (edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper), LabelManager (edu.stanford.muse.LabelManager.LabelManager)
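The read loop in both examples implies the on-disk session format: a GZIP-compressed object stream of alternating (String key, Object value) pairs, terminated by a pair whose value is null (or simply by end of file). Below is a minimal writer sketch under that assumption; it is illustrative only, not ePADD's actual saveArchive implementation, and the sentinel key "__end__" is hypothetical.

static void writeSession(String filename, Map<String, Object> attrs) throws IOException {
    try (ObjectOutputStream oos = new ObjectOutputStream(
            new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename))))) {
        for (Map.Entry<String, Object> e : attrs.entrySet()) {
            oos.writeObject(e.getKey());
            oos.writeObject(e.getValue()); // each value must be Serializable
        }
        oos.writeObject("__end__"); // hypothetical sentinel key; the null value below ends the reader's loop
        oos.writeObject(null);
    }
}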

Aggregations

CorrespondentAuthorityMapper (edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper): 2 uses
AnnotationManager (edu.stanford.muse.AnnotationManager.AnnotationManager): 2 uses
LabelManager (edu.stanford.muse.LabelManager.LabelManager): 2 uses
GZIPInputStream (java.util.zip.GZIPInputStream): 2 uses
CorruptIndexException (org.apache.lucene.index.CorruptIndexException): 2 uses
ParseException (org.apache.lucene.queryparser.classic.ParseException): 2 uses
LockObtainFailedException (org.apache.lucene.store.LockObtainFailedException): 2 uses
AddressBook (edu.stanford.muse.AddressBookManager.AddressBook): 1 use
EntityBook (edu.stanford.muse.ie.variants.EntityBook): 1 use
EntityBookManager (edu.stanford.muse.ie.variants.EntityBookManager): 1 use
Archive (edu.stanford.muse.index.Archive): 1 use
NoSuchAlgorithmException (java.security.NoSuchAlgorithmException): 1 use