Search in sources :

Example 11 with Pair

use of edu.stanford.muse.util.Pair in project epadd by ePADD.

the class SequenceModelTest method readFromDir.

/**
 * Deserializes the train/test DBpedia model pair from
 * {@code <dirName>/trainTestDBpedia.ser.gz}.
 *
 * @param dirName directory containing the serialized, gzipped model file
 * @return the deserialized (train, test) map pair, or {@code null} if the file
 *         is missing, unreadable, or does not contain the expected object
 */
private static Pair<Map<String, String>, Map<String, String>> readFromDir(String dirName) {
    // try-with-resources guarantees the stream is closed even when readObject()
    // or the cast throws; the original explicit close() was skipped on failure.
    // NOTE: the buffer size can be much higher than default 512 for GZIPInputStream.
    try (ObjectInputStream ois = new ObjectInputStream(
            new GZIPInputStream(new FileInputStream(dirName + File.separator + "trainTestDBpedia.ser.gz")))) {
        @SuppressWarnings("unchecked")
        Pair<Map<String, String>, Map<String, String>> model =
                (Pair<Map<String, String>, Map<String, String>>) ois.readObject();
        return model;
    } catch (Exception e) {
        // best-effort load: log and signal failure with null; callers must check
        e.printStackTrace();
        return null;
    }
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) Pair(edu.stanford.muse.util.Pair)

Example 12 with Pair

use of edu.stanford.muse.util.Pair in project epadd by ePADD.

the class POSTokenizer method tokenize.

/**
 * {@inheritDoc}
 *
 * Splits {@code content} into sentences, POS-tags each sentence, and stitches
 * consecutive tokens with allowed POS tags (noun/adjective/preposition-like)
 * into candidate entity segments. Each returned Triple is
 * (segment text, start offset, end offset).
 * NOTE(review): the offsets come from posTagWithOffsets(sent), so they appear
 * to be relative to the containing sentence, not to {@code content} — confirm
 * against callers before relying on them.
 */
@Override
public List<Triple<String, Integer, Integer>> tokenize(String content) {
    // sentPosDetect returns character spans of sentences within content
    Span[] sents = NLPUtils.sentenceDetector.sentPosDetect(content);
    List<Triple<String, Integer, Integer>> ret = new ArrayList<>();
    for (Span span : sents) {
        String sent = span.getCoveredText(content).toString();
        // skip pathological / overly long sentences
        if (sent == null || sent.length() > MAX_SENT_LENGTH)
            continue;
        // each element: (token text, (POS tag, token start, token end)) within sent
        List<Pair<String, Triple<String, Integer, Integer>>> posTags = NLPUtils.posTagWithOffsets(sent);
        // tags a segment may be composed of (proper/common nouns, adjectives,
        // prepositions, possessive marker)
        List<String> allowedPOSTags = Arrays.asList("NNP", "NNS", "NN", "JJ", "IN", "POS");
        int startOffset = 0;
        int endOffset = 0;
        // segment text accumulated so far
        String str = "";
        // whether str currently ends with inter-token padding that must be
        // trimmed before emitting the segment
        boolean padded = false;
        // length of that trailing padding
        int padL = 0;
        for (int pi = 0; pi < posTags.size(); pi++) {
            Pair<String, Triple<String, Integer, Integer>> p = posTags.get(pi);
            String tag = p.second.first;
            // look ahead one token to decide whether the segment ends here
            String nxtTag = null;
            if (pi < posTags.size() - 1)
                nxtTag = posTags.get(pi + 1).second.first;
            // POS for 's
            // should not end or start in improper tags
            // !!Think twice before making changes here, dont mess up the offsets!!
            // a segment may not START with a possessive, preposition, apostrophe,
            // or the salutation words "Dear"/"from"
            boolean startCond = str.equals("") && (tag.equals("POS") || tag.equals("IN") || p.getFirst().equals("'") || p.getFirst().equals("Dear") || p.getFirst().equals("from"));
            // a segment may not END with a possessive, preposition or apostrophe
            boolean endCond = ((nxtTag == null || !allowedPOSTags.contains(nxtTag)) && (tag.equals("POS") || tag.equals("IN") || p.getFirst().equals("'")));
            boolean isEnd = nxtTag == null || !allowedPOSTags.contains(nxtTag);
            if (allowedPOSTags.contains(tag) && !startCond && !endCond) {
                str += p.getFirst();
                // the test for end is not trivial, hence the check for if the string is padded
                if (!isEnd) {
                    // append the original inter-token text (whitespace/punctuation)
                    // so offsets into sent stay consistent with str's length
                    String pad = sent.substring(p.second.getThird(), ((pi + 1) < posTags.size()) ? posTags.get(pi + 1).getSecond().getSecond() : sent.length());
                    str += pad;
                    padL = pad.length();
                    padded = true;
                } else
                    padded = false;
            } else {
                // token breaks the segment: flush what we have accumulated
                if (!str.equals("")) {
                    if (padded)
                        str = str.substring(0, str.length() - padL);
                    ret.add(new Triple<>(str, startOffset, endOffset));
                    str = "";
                }
                // next segment (if any) starts at the next token's start offset
                if (pi < posTags.size() - 1)
                    startOffset = posTags.get(pi + 1).second.getSecond();
            }
            // always track the end of the last token seen
            endOffset = p.second.getThird();
        }
        // flush the trailing segment of the sentence, if any
        if (!str.equals("")) {
            if (padded)
                str = str.substring(0, str.length() - padL);
            // sentence ending is the segment ending
            ret.add(new Triple<>(str, startOffset, endOffset));
        }
    }
    return ret;
}
Also used : Span(opennlp.tools.util.Span) Triple(edu.stanford.muse.util.Triple) Pair(edu.stanford.muse.util.Pair)

Example 13 with Pair

use of edu.stanford.muse.util.Pair in project epadd by ePADD.

the class EntityBook method fillSummaryFields.

/**
 * Fills the per-entity summary cache ({@code summary_L1_entityCountMap}) and the
 * {@code summary_JSON} array from the given entity -> (score, document set) map.
 * For each entity the summary records score, message set, and a date range taken
 * from the messages, falling back to the archive's collection-wide first/last
 * dates when the messages yield no range.
 *
 * @param docsetmap maps each entity to its score and the set of documents it occurs in
 * @param archive   source of collection-level fallback dates
 */
public void fillSummaryFields(Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap, Archive archive) {
    JSONArray resultArray = new JSONArray();
    // trick to use count (modifiable variable) inside for each.
    final Integer[] count = { 0 };
    // Hoisted out of the lambda: previously a SimpleDateFormat was constructed up
    // to twice per entry. SimpleDateFormat is not thread-safe, but this forEach
    // runs on a single thread.
    final SimpleDateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");
    summary_L1_entityCountMap.clear();
    docsetmap.entrySet().forEach(entry -> {
        count[0] = count[0] + 1;
        Summary_L1 summary = new Summary_L1();
        summary.score = entry.getValue().first;
        summary.messages = entry.getValue().second;
        // get date range of the entity's messages
        Collection<EmailDocument> emaildocs = summary.messages.stream().map(s -> (EmailDocument) s).collect(Collectors.toList());
        Pair<Date, Date> daterange = EmailUtils.getFirstLast(emaildocs, true);
        if (daterange == null) {
            daterange = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
        }
        // patch missing endpoints individually with collection-level fallbacks
        if (daterange.first == null)
            daterange.first = archive.collectionMetadata.firstDate;
        if (daterange.second == null)
            daterange.second = archive.collectionMetadata.lastDate;
        summary.startDate = daterange.first;
        summary.endDate = daterange.second;
        summary_L1_entityCountMap.put(entry.getKey(), summary);
        String entity = entry.getKey().getDisplayName();
        JSONArray j = new JSONArray();
        Set<String> altNamesSet = entry.getKey().getAltNames();
        String altNames = (altNamesSet == null) ? "" : "Alternate names: " + Util.join(altNamesSet, ";");
        j.put(0, Util.escapeHTML(entity));
        j.put(1, summary.score);
        j.put(2, summary.messages.size());
        j.put(3, altNames);
        // dates are formatted when present; a null date is stored as-is
        if (summary.startDate != null)
            j.put(4, dateFormat.format(summary.startDate));
        else
            j.put(4, summary.startDate);
        if (summary.endDate != null)
            j.put(5, dateFormat.format(summary.endDate));
        else
            j.put(5, summary.endDate);
        // add entity type as well..
        j.put(6, NEType.getTypeForCode(entityType).getDisplayName());
        resultArray.put(count[0] - 1, j);
    });
    summary_JSON = resultArray;
}
Also used : java.util(java.util) BufferedWriter(java.io.BufferedWriter) Util(edu.stanford.muse.util.Util) SimpleDateFormat(java.text.SimpleDateFormat) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Doc(javax.print.Doc) Serializable(java.io.Serializable) Document(edu.stanford.muse.index.Document) NameTypes(edu.stanford.muse.ie.NameTypes) Pair(edu.stanford.muse.util.Pair) Logger(org.apache.logging.log4j.Logger) Archive(edu.stanford.muse.index.Archive) NEType(edu.stanford.muse.ner.model.NEType) EmailDocument(edu.stanford.muse.index.EmailDocument) BufferedReader(java.io.BufferedReader) Entity(edu.stanford.muse.ner.Entity) EmailUtils(edu.stanford.muse.util.EmailUtils) LogManager(org.apache.logging.log4j.LogManager) JSONArray(org.json.JSONArray) EmailDocument(edu.stanford.muse.index.EmailDocument) JSONArray(org.json.JSONArray) SimpleDateFormat(java.text.SimpleDateFormat)

Example 14 with Pair

use of edu.stanford.muse.util.Pair in project epadd by ePADD.

the class EntityBookManager method getAllEntitiesSummary.

/**
 * Collects a flat summary of all entities across every entity type.
 * Each element is (display name, ((start date, end date), message count)).
 *
 * @return insertion-ordered set of per-entity summaries, one entry per
 *         entity per type
 */
public Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> getAllEntitiesSummary() {
    Set<Pair<String, Pair<Pair<Date, Date>, Integer>>> result = new LinkedHashSet<>();
    for (NEType.Type t : NEType.Type.values()) {
        EntityBook ebook = this.getEntityBookForType(t.getCode());
        ebook.summary_L1_entityCountMap.entrySet().forEach(s -> {
            // diamond operator instead of raw Pair — eliminates unchecked warnings
            result.add(new Pair<>(s.getKey().getDisplayName(),
                    new Pair<>(new Pair<>(s.getValue().startDate, s.getValue().endDate),
                            s.getValue().messages.size())));
        });
    }
    return result;
}
Also used : Pair(edu.stanford.muse.util.Pair) NEType(edu.stanford.muse.ner.model.NEType)

Example 15 with Pair

use of edu.stanford.muse.util.Pair in project epadd by ePADD.

the class EntityBookManager method fillEntityBookFromLucene.

/*
    This is a slow path but the assumption is that it must be used only once when porting the old archives (where entitybooks are not factored out as files). After that only the other
    path 'fillEntityBookFromText' will be used repetitively (when loading the archive)
     */
/*
    This is a slow path but the assumption is that it must be used only once when porting the old archives (where entitybooks are not factored out as files). After that only the other
    path 'fillEntityBookFromText' will be used repetitively (when loading the archive)
     */
/**
 * Rebuilds the EntityBook for the given entity type by scanning Lucene-stored
 * entity spans over all documents of the archive, then fills the book's summary
 * fields. Scores for an entity seen in multiple documents are merged by taking
 * the maximum.
 *
 * @param type entity type code (see NEType) to build the book for
 */
private void fillEntityBookFromLucene(Short type) {
    EntityBook ebook = new EntityBook(type);
    mTypeToEntityBook.put(type, ebook);
    // minimum confidence for a span to be counted
    double theta = 0.001;
    // docset map maps a mapped entity to its score and the set of documents.
    Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
    for (Document doc : mArchive.getAllDocs()) {
        Span[] spansbody = getEntitiesInDocFromLucene(doc, true);
        Span[] spans = getEntitiesInDocFromLucene(doc, false);
        Span[] allspans = ArrayUtils.addAll(spans, spansbody);
        Set<String> seenInThisDoc = new LinkedHashSet<>();
        for (Span span : allspans) {
            // bail out if not of entity type that we're looking for, or not enough confidence
            if (span.type != type || span.typeScore < theta)
                continue;
            String name = span.getText();
            String canonicalizedname = EntityBook.canonicalize(name);
            // autobox instead of the deprecated Double(double) constructor
            Double score = span.typeScore;
            // map the name to its display name. if no mapping, we should get the same name back as its displayName
            MappedEntity mappedEntity = (ebook.nameToMappedEntity.get(canonicalizedname));
            if (mappedEntity == null) {
                // add this name as a mapped entity in the entitybook.
                mappedEntity = new MappedEntity();
                // Don't canonicalize for the display purpose otherwise 'University of Florida' becomes 'florida of university'
                mappedEntity.setDisplayName(name);
                mappedEntity.setEntityType(type);
                mappedEntity.addAltNames(name);
                ebook.nameToMappedEntity.put(canonicalizedname, mappedEntity);
                Set<Document> docset = new LinkedHashSet<>();
                docsetmap.put(mappedEntity, new Pair<>(score, docset));
                // No doc exists already for this mapped entity
                docset.add(doc);
            } else {
                // add it in the docset. // what about the score? For now take the score as max of all scores..
                Double oldscore = docsetmap.get(mappedEntity).first;
                Double finalscore = Double.max(oldscore, score);
                Set<Document> docset = docsetmap.get(mappedEntity).second;
                docset.add(doc);
                docsetmap.put(mappedEntity, new Pair<>(finalscore, docset));
            }
        }
    }
    // fill cache summary for ebook in other fields of ebook.
    ebook.fillSummaryFields(docsetmap, mArchive);
}
Also used : Document(edu.stanford.muse.index.Document) EmailDocument(edu.stanford.muse.index.EmailDocument) Span(edu.stanford.muse.util.Span) Pair(edu.stanford.muse.util.Pair)

Aggregations

Pair (edu.stanford.muse.util.Pair)35 Blob (edu.stanford.muse.datacache.Blob)7 EmailDocument (edu.stanford.muse.index.EmailDocument)6 IOException (java.io.IOException)6 NEType (edu.stanford.muse.ner.model.NEType)5 AnnotationManager (edu.stanford.muse.AnnotationManager.AnnotationManager)4 BlobStore (edu.stanford.muse.datacache.BlobStore)4 Util (edu.stanford.muse.util.Util)4 java.util (java.util)4 Pattern (java.util.regex.Pattern)4 Collectors (java.util.stream.Collectors)4 AddressBook (edu.stanford.muse.AddressBookManager.AddressBook)3 Document (edu.stanford.muse.index.Document)3 Triple (edu.stanford.muse.util.Triple)3 ModeConfig (edu.stanford.muse.webapp.ModeConfig)3 File (java.io.File)3 Matcher (java.util.regex.Matcher)3 Span (opennlp.tools.util.Span)3 LogManager (org.apache.logging.log4j.LogManager)3 Contact (edu.stanford.muse.AddressBookManager.Contact)2