
Example 21 with Log

use of org.apache.commons.logging.Log in project pentaho-platform by pentaho.

the class LocaleImportHandlerTest method setUp.

@Before
public void setUp() throws Exception {
    NameBaseMimeResolver nameResolver = new NameBaseMimeResolver();
    PentahoSystem.registerObject(nameResolver);
    IRepositoryContentConverterHandler converterHandler = new DefaultRepositoryContentConverterHandler(new HashMap<String, Converter>());
    List<IMimeType> localeMimeList = new ArrayList<IMimeType>();
    localeMimeList.add(new MimeType("text/locale", "locale"));
    nameResolver.addMimeType(new MimeType("text/prptMimeType", "prpt"));
    nameResolver.addMimeType(new MimeType("text/xactionMimeType", "xaction"));
    MimeType mimeType = new MimeType("text/xml", "xml");
    mimeType.setHidden(true);
    nameResolver.addMimeType(mimeType);
    mimeType = new MimeType("image/png", "png");
    mimeType.setHidden(true);
    nameResolver.addMimeType(mimeType);
    List<String> allowedArtifacts = new ArrayList<String>();
    allowedArtifacts.add("xaction");
    allowedArtifacts.add("url");
    LocaleImportHandler localeImportHandler = new LocaleImportHandler(localeMimeList, allowedArtifacts);
    LocaleImportHandler spylocaleImportHandler = spy(localeImportHandler);
    Log log = mock(Log.class);
    doReturn(log).when(spylocaleImportHandler).getLogger();
    List<IPlatformImportHandler> handlers = new ArrayList<IPlatformImportHandler>();
    handlers.add(spylocaleImportHandler);
    importer = new PentahoPlatformImporter(handlers, converterHandler);
    importer.setRepositoryImportLogger(new Log4JRepositoryImportLogger());
    localeFilesProcessor = new LocaleFilesProcessor();
}
Also used : Log(org.apache.commons.logging.Log) ArrayList(java.util.ArrayList) Matchers.anyString(org.mockito.Matchers.anyString) IMimeType(org.pentaho.platform.api.mimetype.IMimeType) MimeType(org.pentaho.platform.core.mimetype.MimeType) IRepositoryContentConverterHandler(org.pentaho.platform.api.repository2.unified.IRepositoryContentConverterHandler) Log4JRepositoryImportLogger(org.pentaho.platform.plugin.services.importexport.Log4JRepositoryImportLogger) Converter(org.pentaho.platform.api.repository2.unified.Converter) Before(org.junit.Before)
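The pattern worth noting above is doReturn(log).when(spylocaleImportHandler).getLogger(): a Mockito spy lets the handler's getLogger() hook return a mocked commons-logging Log, so its logging can be silenced or verified without any real log configuration. Below is a minimal, self-contained sketch of that pattern; ImportHandlerStub, importFile and MockedLoggerSketch are hypothetical names introduced only for illustration.

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;

class ImportHandlerStub {
    protected Log getLogger() {
        return LogFactory.getLog(ImportHandlerStub.class);
    }

    void importFile(String name) {
        getLogger().debug("importing " + name);
    }
}

public class MockedLoggerSketch {
    public static void main(String[] args) {
        Log log = mock(Log.class);                               // no real logging backend needed
        ImportHandlerStub handler = spy(new ImportHandlerStub());
        doReturn(log).when(handler).getLogger();                 // same stubbing style as in setUp() above

        handler.importFile("messages_en.properties");
        verify(log).debug("importing messages_en.properties");   // the handler logged through the mock
    }
}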

Example 22 with Log

use of org.apache.commons.logging.Log in project pentaho-platform by pentaho.

the class PentahoVersionCheckReflectHelperTest method logVersionCheckTest.

@Test
public void logVersionCheckTest() {
    Log mockLog = Mockito.mock(Log.class);
    List<String> results = new ArrayList<>();
    results.add(XML_TEXT);
    PentahoVersionCheckReflectHelper.logVersionCheck(results, mockLog);
    verify(mockLog, times(2)).info(anyObject());
}
Also used : Log(org.apache.commons.logging.Log) ArrayList(java.util.ArrayList) Test(org.junit.Test)
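The test above only counts the calls with verify(mockLog, times(2)).info(anyObject()). A mocked Log also allows capturing what was actually logged. The sketch below shows that with Mockito's ArgumentCaptor; reportVersions is a hypothetical stand-in for PentahoVersionCheckReflectHelper.logVersionCheck, not its real implementation.

import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.mockito.ArgumentCaptor;

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;

public class LogCaptorSketch {
    // hypothetical stand-in for the method under test: logs one info line per result
    static void reportVersions(List<String> results, Log log) {
        for (String r : results) {
            log.info("version check: " + r);
        }
    }

    public static void main(String[] args) {
        Log mockLog = mock(Log.class);
        reportVersions(Arrays.asList("8.3.0", "9.0.0"), mockLog);

        // count the calls, as in the test above...
        verify(mockLog, times(2)).info(any());
        // ...and additionally capture the actual messages
        ArgumentCaptor<Object> captor = ArgumentCaptor.forClass(Object.class);
        verify(mockLog, times(2)).info(captor.capture());
        System.out.println(captor.getAllValues()); // [version check: 8.3.0, version check: 9.0.0]
    }
}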

Example 23 with Log

use of org.apache.commons.logging.Log in project epadd by ePADD.

the class Highlighter method getHTMLAnnotatedDocumentContents.

/**
 * A convenience method that does the bulk of the work of annotating all the terms in termsToHighlight, termsToHyperlink and entitiesWithId.
 * It also hyperlinks any URLs found in the content.
 * @param contents the content to be annotated, typically the text in an email body
 * @param regexToHighlight the output will highlight all strings matching this regex
 * @param showDebugInfo when set, appends some debug info about the entities present in the content (passed via entitiesWithId) to the output
 *
 * Note: do NOT modify any of the objects passed as parameters;
 *       if one needs to be modified, clone it and modify the local copy
 */
// TODO: can also get rid of termsToHyperlink
public static String getHTMLAnnotatedDocumentContents(String contents, Date d, String docId, String regexToHighlight, Set<String> termsToHighlight, Map<String, EmailRenderer.Entity> entitiesWithId, Set<String> termsToHyperlink, boolean showDebugInfo) {
    Set<String> highlightTerms = new LinkedHashSet<>(), hyperlinkTerms = new LinkedHashSet<>();
    if (termsToHighlight != null)
        highlightTerms.addAll(termsToHighlight);
    if (termsToHyperlink != null)
        hyperlinkTerms.addAll(termsToHyperlink);
    if (log.isDebugEnabled())
        log.debug("DocId: " + docId + "; Highlight terms: " + highlightTerms + "; Entities: " + entitiesWithId + "; Hyperlink terms: " + hyperlinkTerms);
    // System.err.println("DocId: " + docId + "; Highlight terms: " + highlightTerms + "; Entities: " + entitiesWithId + "; Hyperlink terms: " + hyperlinkTerms);
    short HIGHLIGHT = 0, HYPERLINK = 1;
    // pp stands for post-process, since we cannot add complex tags while highlighting
    String preHighlightTag = "<span class='hilitedTerm rounded' >", postHighlightTag = "</span>";
    String preHyperlinkTag = "<span data-process='pp'>", postHyperlinkTag = "</span>";
    // since URLs are not tokenized as a single token, it is not possible to highlight them with the Lucene highlighter.
    Pattern p = Pattern.compile("https?://[^\\s\\n]*");
    Matcher m = p.matcher(contents);
    StringBuffer sb = new StringBuffer();
    while (m.find()) {
        String link = m.group();
        String url = link;
        if (d != null) {
            Calendar c = new GregorianCalendar();
            c.setTime(d);
            String archiveDate = c.get(Calendar.YEAR) + String.format("%02d", c.get(Calendar.MONTH)) + String.format("%02d", c.get(Calendar.DATE)) + "120000";
            url = "http://web.archive.org/web/" + archiveDate + "/" + link;
        }
        m.appendReplacement(sb, Matcher.quoteReplacement("<a target=\"_blank\" href=\"" + url + "\">" + link + "</a> "));
    }
    m.appendTail(sb);
    contents = sb.toString();
    if (!Util.nullOrEmpty(regexToHighlight)) {
        contents = annotateRegex(contents, regexToHighlight, preHighlightTag, postHighlightTag);
    }
    List<String> catchTerms = Arrays.asList("class", "span", "data", "ignore");
    Set<String> ignoreTermsForHyperlinking = catchTerms.stream().map(String::toLowerCase).collect(Collectors.toSet());
    // the entitiesWithId keys are already canonicalized by the tokenizer used with the analyzer
    if (entitiesWithId != null)
        hyperlinkTerms.addAll(entitiesWithId.keySet().stream().filter(term -> !ignoreTermsForHyperlinking.contains(term.trim().toLowerCase())).map(term -> "\"" + term + "\"").collect(Collectors.toSet()));
    // If there are overlapping annotations, they need to be applied in a well-defined order.
    // This list records that order for such annotations.
    // It maps each string to be annotated -> a flag denoting whether to highlight or hyperlink it.
    List<Pair<String, Short>> order = new ArrayList<>();
    // should preserve order so that highlight terms are seen before hyperlink
    Set<String> allTerms = new LinkedHashSet<>();
    allTerms.addAll(highlightTerms);
    /*
     * We want to assign the order in which terms are highlighted or hyperlinked.
     * For example: if we want to annotate both "Robert" and "Robert Creeley", and we annotate "Robert" first, then we may miss "Robert Creeley";
     * so we assign an order over strings that share any common words, as done in the loop below.
     * TODO:
     * This can still miss cases where a regular expression eventually matches a word that is already annotated, or
     * where two terms like "Robert Creeley" and "Mr Robert" should both match a text like "Mr Robert Creeley";
     * in such cases one of the terms may not be annotated.
     * TODO: Give preference to the highlighter over the hyperlinker
     * TODO: remove order and simplify
     * Terms that are added to o are those that share at least one word.
     */
    // should preserve order so that highlight terms that are added first stay that way
    Map<Pair<String, Short>, Integer> o = new LinkedHashMap<>();
    // prioritised terms
    // Note that a term can be marked both for highlight and hyperlink
    Set<String> consTermsHighlight = new HashSet<>(), consTermsHyperlink = new HashSet<>();
    for (String at : allTerms) {
        // Catch: if we are trying to highlight terms like class, span, etc.,
        // we had better annotate them first, as they may end up inside span tags and get annotated there, breaking the highlighter
        Set<String> substrs = IndexUtils.computeAllSubstrings(at);
        for (String substr : substrs) {
            if (at.equals(substr) || at.equals("\"" + substr + "\""))
                continue;
            boolean match = catchTerms.contains(substr.toLowerCase());
            int val = match ? Integer.MAX_VALUE : substr.length();
            // The highlight or hyperlink terms may have quotes; the special handling below is for that... is there a better way?
            if (highlightTerms.contains(substr) || highlightTerms.contains("\"" + substr + "\"")) {
                highlightTerms.remove(substr);
                highlightTerms.remove("\"" + substr + "\"");
                // there should be no repetitions in the order array, else it leads to multiple annotations i.e. two spans around one single element
                if (!consTermsHighlight.contains(substr)) {
                    o.put(new Pair<>(substr, HIGHLIGHT), val);
                    consTermsHighlight.add(substr);
                }
            }
            if (hyperlinkTerms.contains(substr) || hyperlinkTerms.contains("\"" + substr + "\"")) {
                hyperlinkTerms.remove(substr);
                hyperlinkTerms.remove("\"" + substr + "\"");
                if (!consTermsHyperlink.contains(substr)) {
                    o.put(new Pair<>(substr, HYPERLINK), val);
                    consTermsHyperlink.add(substr);
                }
            }
        }
    }
    // now sort the phrases from longest length to smallest length
    List<Pair<Pair<String, Short>, Integer>> os = Util.sortMapByValue(o);
    order.addAll(os.stream().map(pair -> pair.first).collect(Collectors.toSet()));
    // System.err.println(order+" hit: "+highlightTerms+" -- hyt: "+hyperlinkTerms);
    // annotate whatever is left in highlight and hyperlink Terms.
    // String result = contents;
    String result = highlightBatch(contents, highlightTerms.toArray(new String[highlightTerms.size()]), preHighlightTag, postHighlightTag);
    result = highlightBatch(result, hyperlinkTerms.toArray(new String[hyperlinkTerms.size()]), preHyperlinkTag, postHyperlinkTag);
    // now highlight terms in order.
    for (Pair<String, Short> ann : order) {
        short type = ann.second;
        String term = ann.first;
        String preTag = null, postTag = null;
        if (type == HYPERLINK) {
            preTag = preHyperlinkTag;
            postTag = postHyperlinkTag;
        } else if (type == HIGHLIGHT) {
            preTag = preHighlightTag;
            postTag = postHighlightTag;
        }
        try {
            result = highlight(result, term, preTag, postTag);
        } catch (IOException | InvalidTokenOffsetsException | ParseException e) {
            Util.print_exception("Exception while adding html annotation: " + ann.first, e, log);
            e.printStackTrace();
        }
    }
    // do some line breaking and show overflow.
    String[] lines = result.split("\\n");
    StringBuilder htmlResult = new StringBuilder();
    boolean overflow = false;
    for (String line : lines) {
        htmlResult.append(line);
        htmlResult.append("\n<br/>");
    }
    if (overflow) {
        htmlResult.append("</div>\n");
        // the nojog class ensures that the jog doesn't pop up when the more
        // button is clicked
        htmlResult.append("<span class=\"nojog\" style=\"color:#500050;text-decoration:underline;font-size:12px\" onclick=\"muse.reveal(this, false);\">More</span><br/>\n");
    }
    // Now do post-processing to add complex tags that depend on the text inside. title, link and cssclass
    org.jsoup.nodes.Document doc = Jsoup.parse(htmlResult.toString());
    Elements elts = doc.select("[data-process]");
    for (int j = 0; j < elts.size(); j++) {
        Element elt = elts.get(j);
        Element par = elt.parent();
        // Do not touch nested entities
        if (par != null && par.attr("data-process") == null)
            // (preHighlightTag.contains(par.tagName())||preHyperlinkTag.contains(par.tagName())))
            continue;
        String entity = elt.text();
        int span_j = j;
        String link = "browse?adv-search=1&termBody=on&termSubject=on&termAttachments=on&termOriginalBody=on&term=\"" + Util.escapeHTML(entity) + "\"";
        // note &quot here because the quotes have to survive
        // through the html page and reflect back in the URL
        // may need to URI escape docId?
        link += "&initDocId=" + docId;
        String title = "";
        try {
            String cssclass = "";
            EmailRenderer.Entity info = entitiesWithId.get(entity);
            if (info != null) {
                if (info.ids != null) {
                    title += "<div id=\"fast_" + info.ids + "\"></div>";
                    title += "<script>getFastData(\"" + info.ids + "\");</script>";
                    cssclass = "resolved";
                } else {
                    // the last three are the OpenNLPs'
                    // overlapping sub-classes could have been defined, which would have reduced code repetition in the css file; but this way gives more flexibility
                    String[] types = new String[] { "cp", "cl", "co", "person", "org", "place", "acr" };
                    String[] cssclasses = new String[] { "custom-people", "custom-loc", "custom-org", "opennlp-person", "opennlp-org", "opennlp-place", "acronym" };
                    outer: for (String et : info.types) {
                        for (int t = 0; t < types.length; t++) {
                            String type = types[t];
                            if (type.equals(et)) {
                                if (t < 3) {
                                    cssclass += cssclasses[t] + " ";
                                    // consider no other class
                                    continue outer;
                                } else {
                                    cssclass += cssclasses[t] + " ";
                                }
                            }
                        }
                    }
                }
            } else {
                cssclass += " unresolved";
            }
            // enables completion (expansion) of words while browsing messages.
            if (entity != null) {
                // enable for only few types
                if (cssclass.contains("custom-people") || cssclass.contains("acronym") || cssclass.contains("custom-org") || cssclass.contains("custom-loc")) {
                    // TODO: remove regexs
                    entity = entity.replaceAll("(^\\s+|\\s+$)", "");
                    if (!entity.contains(" ")) {
                        // String rnd = rand.nextInt() + "";
                        // <img src="images/spinner.gif" style="height:15px"/>
                        // <script>expand("" + entity + "\",\"" + StringEscapeUtils.escapeJava(docId) + "\",\"" + rnd + "");</script>
                        // if(info.expandsTo!=null)
                        // title += "<div class=\"resolutions\" id=\"expand_" + rnd + "\"><a href='browse?term=\""+info.expandsTo+"\"'>"+info.expandsTo+"</a></div>";
                        cssclass += " expand";
                    }
                }
            }
            for (int k = j; k <= span_j; k++) {
                elt = elts.get(k);
                // don't annotate nested tags-- double check if the parent tag is highlight related tag or entity related annotation
                if (elt.parent().tag().getName().toLowerCase().equals("span") && elt.parent().classNames().toString().contains("custom")) {
                    continue;
                }
                String cc = elt.attr("class");
                elt.attr("class", cc + " " + cssclass);
                elt.attr("title", title);
                elt.attr("onclick", "window.location='" + link + "'");
                // A tag may have nested tags in it, and extracting the text inside it is involved.
                elt.attr("data-text", entity);
                elt.attr("data-docId", StringEscapeUtils.escapeHtml(docId));
            }
        } catch (Exception e) {
            Util.print_exception("Some unknown error while highlighting", e, log);
        }
    }
    // Jsoup's .html() output will put each tag on a separate line
    String html = doc.html();
    if (showDebugInfo) {
        String debug_html = html + "<br>";
        debug_html += "<div class='debug' style='display:none'>";
        debug_html += "docId: " + docId;
        debug_html += "<br>-------------------------------------------------<br>";
        for (String str : entitiesWithId.keySet()) debug_html += str + ":" + entitiesWithId.get(str).types + ";;; ";
        debug_html += "<br>-------------------------------------------------<br>";
        String[] opennlp = new String[] { "person", "place", "org" };
        String[] custom = new String[] { "cp", "cl", "co" };
        for (int j = 0; j < opennlp.length; j++) {
            String t1 = opennlp[j];
            String t2 = custom[j];
            Set<String> e1 = new HashSet<>();
            Set<String> e2 = new HashSet<>();
            for (String str : entitiesWithId.keySet()) {
                Set<String> types = entitiesWithId.get(str).types;
                if (types.contains(t1) && !types.contains(t2))
                    e1.add(entitiesWithId.get(str).name);
                else if (types.contains(t2) && !types.contains(t1))
                    e2.add(entitiesWithId.get(str).name);
            }
            debug_html += opennlp[j] + " entities recognised by only opennlp: " + e1;
            debug_html += "<br>";
            debug_html += opennlp[j] + " entities recognised by only custom: " + e2;
            debug_html += "<br><br>";
        }
        debug_html += "-------------------------------------------------<br>";
        lines = contents.split("\\n");
        for (String line : lines) debug_html += line + "<br>";
        debug_html += "</div>";
        debug_html += "<button onclick='$(\".debug\").style(\"display\",\"block\");'>Show Debug Info</button>";
        return debug_html;
    }
    return html;
}
Also used : ParseException(org.apache.lucene.queryparser.classic.ParseException) org.apache.lucene.search.highlight(org.apache.lucene.search.highlight) java.util(java.util) CharArraySet(org.apache.lucene.analysis.CharArraySet) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) Matcher(java.util.regex.Matcher) Formatter(org.apache.lucene.search.highlight.Formatter) Element(org.jsoup.nodes.Element) SimpleSessions(edu.stanford.muse.webapp.SimpleSessions) EmailRenderer(edu.stanford.muse.webapp.EmailRenderer) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) Util(edu.stanford.muse.util.Util) FileWriter(java.io.FileWriter) IOException(java.io.IOException) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Version(org.apache.lucene.util.Version) Collectors(java.util.stream.Collectors) File(java.io.File) BooleanClause(org.apache.lucene.search.BooleanClause) Pair(edu.stanford.muse.util.Pair) BooleanQuery(org.apache.lucene.search.BooleanQuery) StringReader(java.io.StringReader) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) NEType(edu.stanford.muse.ner.model.NEType) Log(org.apache.commons.logging.Log) LogFactory(org.apache.commons.logging.LogFactory) Jsoup(org.jsoup.Jsoup) Elements(org.jsoup.select.Elements) Pattern(java.util.regex.Pattern) ModeConfig(edu.stanford.muse.webapp.ModeConfig) StringEscapeUtils(org.apache.commons.lang.StringEscapeUtils)
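The long comment inside getHTMLAnnotatedDocumentContents explains why overlapping terms must be annotated longest-first: annotating "Robert" before "Robert Creeley" would break the longer match. The sketch below illustrates only that ordering idea on plain strings; it is a hypothetical simplification (no word-boundary handling, no Lucene highlighter, no Pair-based order as in the real code), with class and method names introduced just for this example.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class LongestFirstAnnotationSketch {
    static String annotate(String text, List<String> terms, String pre, String post) {
        // sort longest-first so "Robert Creeley" is preferred over its substring "Robert"
        List<String> ordered = new ArrayList<>(terms);
        ordered.sort(Comparator.comparingInt(String::length).reversed());
        StringBuilder out = new StringBuilder();
        int i = 0;
        while (i < text.length()) {
            String matched = null;
            for (String t : ordered) {
                if (text.startsWith(t, i)) { matched = t; break; } // first hit is the longest term at i
            }
            if (matched != null) {
                out.append(pre).append(matched).append(post);
                i += matched.length();
            } else {
                out.append(text.charAt(i++));
            }
        }
        return out.toString();
    }

    public static void main(String[] args) {
        String body = "Mr Robert Creeley wrote to Robert.";
        List<String> terms = Arrays.asList("Robert", "Robert Creeley");
        System.out.println(annotate(body, terms, "<span class='hilitedTerm rounded'>", "</span>"));
        // Mr <span class='hilitedTerm rounded'>Robert Creeley</span> wrote to <span class='hilitedTerm rounded'>Robert</span>.
    }
}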

Example 24 with Log

use of org.apache.commons.logging.Log in project epadd by ePADD.

the class SequenceModelTest method testCONLL.

// we are missing F.C's like F.C. La Valletta
/**
 * Tested on 28th Jan. 2016 on what is believed to be the testa.dat file of the original CONLL data.
 * I procured this data-set from a prof's home page (a UMass prof, I don't remember the name) where he provided the test files for a homework; guess who topped the assignment :)
 * (So, don't use this data to report results at any serious venue)
 * The results on multi-word names are as follows.
 * Note that the test only considered PERSON, LOCATION and ORG. Also, it does not distinguish between the types, because the type assigned by the sequence labeler is almost always right; importantly, this avoids any scuffle over the mapping from fine-grained types to the coarse types.
 *  -------------
 *  Found: 8861 -- Total: 7781 -- Correct: 6675
 *  Precision: 0.75330096
 *  Recall: 0.8578589
 *  F1: 0.80218726
 *  ------------
 * I went through 2691 sentences, of which only 200 had any unrecognised entities, and identified various sources of error.
 * The sources of missing names are listed below in (approximately) decreasing order of their contribution, with some examples for each source. The example phrases are recognized as one chunk with a type.
 * Obviously, this list is not exhaustive; USE IT WITH CAUTION!
 *  1. Bad segmentation -- which is minor for ePADD and depends on training data and principles.
 *     For example: "Overseas Development Minister <PERSON>Lynda Chalker</PERSON>",Czech <PERSON>Daniel Vacek</PERSON>, "Frenchman <PERSON>Cedric Pioline</PERSON>"
 *     "President <PERSON>Nelson Mandela</PERSON>","<BANK>Reserve Bank of India</BANK> Governor <PERSON>Chakravarty Rangarajan</PERSON>"
 *     "Third-seeded <PERSON>Wayne Ferreira</PERSON>",
 *     Hong Kong Newsroom -- we got only Hong Kong, <BANK>Hong Kong Interbank</BANK> Offered Rate, Privately-owned <BANK>Bank Duta</BANK>
 *     [SERIOUS]
 *  2. Bad training data -- since our training data (DBpedia instances) contain phrases like "of Romania" a lot
 *     Ex: <PERSON>Yayuk Basuki</PERSON> of Indonesia, <PERSON>Karim Alami</PERSON> of Morocc
 *     This also leads to errors such as when National Bank of Holand is segmented as just National Bank
 *     [SERIOUS]
 *  3. Some unknown names, mostly personal -- we see very weird names in CONLL; Hopefully, we can avoid this problem in ePADD by considering the address book of the archive.
 *     Ex: NOVYE ATAGI, Hans-Otto Sieg, NS Kampfruf, Marie-Jose Perec, Billy Mayfair--Paul Goydos--Hidemichi Tanaki
 *     we miss many (almost all) names of the form "M. Dowman" because of uncommon or unknown last name.
 *  4. Bad segmentation due to limitations of CIC
 *     Ex: Hassan al-Turabi, National Democratic party, Department of Humanitarian affairs, Reserve bank of India, Saint of the Gutters, Queen of the South, Queen's Park
 *  5. Very Long entities -- we refrain from seq. labelling if the #tokens>7
 *     Ex: National Socialist German Workers ' Party Foreign Organisation
 *  6. We are missing OCEANs?!
 *     Ex: Atlantic Ocean, Indian Ocean
 *  7. Bad segments -- why are some segments starting with weird chars like '&'
 *     Ex: Goldman Sachs & Co Wertpapier GmbH -> {& Co Wertpapier GmbH, Goldman Sachs}
 *  8. We are missing Times of London?! We get nothing that contains "Newsroom" -- "Amsterdam Newsroom", "Hong Kong News Room"
 *     Why are we getting "Students of South Korea" instead of "South Korea"?
 *
 * 1/50th on only MWs
 * 13 Feb 13:24:54 BMMModel INFO  - -------------
 * 13 Feb 13:24:54 BMMModel INFO  - Found: 4238 -- Total: 4236 -- Correct: 3242 -- Missed due to wrong type: 358
 * 13 Feb 13:24:54 BMMModel INFO  - Precision: 0.7649835
 * 13 Feb 13:24:54 BMMModel INFO  - Recall: 0.7653447
 * 13 Feb 13:24:54 BMMModel INFO  - F1: 0.765164
 * 13 Feb 13:24:54 BMMModel INFO  - ------------
 *
 * Best performance on testa with [ignore segmentation] and single word with CONLL data is
 * 25 Sep 13:27:03 SequenceModel INFO  - -------------
 * 25 Sep 13:27:03 SequenceModel INFO  - Found: 4117 -- Total: 4236 -- Correct: 3368 -- Missed due to wrong type: 266
 * 25 Sep 13:27:03 SequenceModel INFO  - Precision: 0.8180714
 * 25 Sep 13:27:03 SequenceModel INFO  - Recall: 0.7950897
 * 25 Sep 13:27:03 SequenceModel INFO  - F1: 0.80641687
 * 25 Sep 13:27:03 SequenceModel INFO  - ------------
 *
 * on testa, *not* ignoring segmentation (exact match), any number of words
 * 25 Sep 17:23:14 SequenceModel INFO  - -------------
 * 25 Sep 17:23:14 SequenceModel INFO  - Found: 6006 -- Total: 7219 -- Correct: 4245 -- Missed due to wrong type: 605
 * 25 Sep 17:23:14 SequenceModel INFO  - Precision: 0.7067932
 * 25 Sep 17:23:14 SequenceModel INFO  - Recall: 0.5880316
 * 25 Sep 17:23:14 SequenceModel INFO  - F1: 0.6419659
 * 25 Sep 17:23:14 SequenceModel INFO  - ------------
 *
 * on testa, exact matches, multi-word names
 * 25 Sep 17:28:04 SequenceModel INFO  - -------------
 * 25 Sep 17:28:04 SequenceModel INFO  - Found: 4117 -- Total: 4236 -- Correct: 3096 -- Missed due to wrong type: 183
 * 25 Sep 17:28:04 SequenceModel INFO  - Precision: 0.7520039
 * 25 Sep 17:28:04 SequenceModel INFO  - Recall: 0.7308782
 * 25 Sep 17:28:04 SequenceModel INFO  - F1: 0.74129057
 * 25 Sep 17:28:04 SequenceModel INFO  - ------------
 *
 * With a model that is not trained on CONLL lists
 * On testa, ignoring segmentation, any number of words.
 * Sep 19:22:26 SequenceModel INFO  - -------------
 * 25 Sep 19:22:26 SequenceModel INFO  - Found: 6129 -- Total: 7219 -- Correct: 4725 -- Missed due to wrong type: 964
 * 25 Sep 19:22:26 SequenceModel INFO  - Precision: 0.7709251
 * 25 Sep 19:22:26 SequenceModel INFO  - Recall: 0.6545228
 * 25 Sep 19:22:26 SequenceModel INFO  - F1: 0.7079712
 * 25 Sep 19:22:26 SequenceModel INFO  - ------------
 *
 * testa -- model trained on CONLL, ignore segmentation, any phrase
 * 26 Sep 20:23:58 SequenceModelTest INFO  - -------------
 * Found: 6391 -- Total: 7219 -- Correct: 4900 -- Missed due to wrong type: 987
 * Precision: 0.7667032
 * Recall: 0.67876434
 * F1: 0.7200588
 * ------------
 *
 * testb -- model trained on CONLL, ignore segmentation, any phrase
 * 26 Sep 20:24:01 SequenceModelTest INFO  - -------------
 * Found: 2198 -- Total: 2339 -- Correct: 1597 -- Missed due to wrong type: 425
 * Precision: 0.7265696
 * Recall: 0.68277043
 * F1: 0.7039894
 * ------------
 */
public static PerfStats testCONLL(SequenceModel seqModel, boolean verbose, ParamsCONLL params) {
    PerfStats stats = new PerfStats();
    try {
        // only multi-word names are considered
        boolean onlyMW = params.onlyMultiWord;
        // use ignoreSegmentation=true only with onlyMW=true; it is not tested otherwise
        boolean ignoreSegmentation = params.ignoreSegmentation;
        String test = params.testType.toString();
        InputStream in = Config.getResourceAsStream("CONLL" + File.separator + "annotation" + File.separator + test + "spacesep.txt");
        // 7==0111 PER, LOC, ORG
        Conll03NameSampleStream sampleStream = new Conll03NameSampleStream(Conll03NameSampleStream.LANGUAGE.EN, in, 7);
        Set<String> correct = new LinkedHashSet<>(), found = new LinkedHashSet<>(), real = new LinkedHashSet<>(), wrongType = new LinkedHashSet<>();
        Multimap<String, String> matchMap = ArrayListMultimap.create();
        Map<String, String> foundTypes = new LinkedHashMap<>(), benchmarkTypes = new LinkedHashMap<>();
        NameSample sample = sampleStream.read();
        CICTokenizer tokenizer = new CICTokenizer();
        while (sample != null) {
            String[] words = sample.getSentence();
            String sent = "";
            for (String s : words) sent += s + " ";
            sent = sent.substring(0, sent.length() - 1);
            Map<String, String> names = new LinkedHashMap<>();
            opennlp.tools.util.Span[] nspans = sample.getNames();
            for (opennlp.tools.util.Span nspan : nspans) {
                String n = "";
                for (int si = nspan.getStart(); si < nspan.getEnd(); si++) {
                    if (si < words.length - 1 && words[si + 1].equals("'s"))
                        n += words[si];
                    else
                        n += words[si] + " ";
                }
                if (n.endsWith(" "))
                    n = n.substring(0, n.length() - 1);
                if (!onlyMW || n.contains(" "))
                    names.put(n, nspan.getType());
            }
            Span[] chunks = seqModel.find(sent);
            Map<String, String> foundSample = new LinkedHashMap<>();
            if (chunks != null)
                for (Span chunk : chunks) {
                    String text = chunk.text;
                    Short type = chunk.type;
                    if (type == NEType.Type.DISEASE.getCode() || type == NEType.Type.EVENT.getCode() || type == NEType.Type.AWARD.getCode())
                        continue;
                    Short coarseType = NEType.getCoarseType(type).getCode();
                    String typeText;
                    if (coarseType == NEType.Type.PERSON.getCode())
                        typeText = "person";
                    else if (coarseType == NEType.Type.PLACE.getCode())
                        typeText = "location";
                    else
                        typeText = "organization";
                    double s = chunk.typeScore;
                    if (s > 0 && (!onlyMW || text.contains(" ")))
                        foundSample.put(text, typeText);
                }
            Set<String> foundNames = new LinkedHashSet<>();
            Map<String, String> localMatchMap = new LinkedHashMap<>();
            for (Map.Entry<String, String> entry : foundSample.entrySet()) {
                foundTypes.put(entry.getKey(), entry.getValue());
                boolean foundEntry = false;
                String foundType = null;
                for (String name : names.keySet()) {
                    String cname = EmailUtils.uncanonicaliseName(name).toLowerCase();
                    String ek = EmailUtils.uncanonicaliseName(entry.getKey()).toLowerCase();
                    if (cname.equals(ek) || (ignoreSegmentation && ((cname.startsWith(ek + " ") || cname.endsWith(" " + ek) || ek.startsWith(cname + " ") || ek.endsWith(" " + cname))))) {
                        foundEntry = true;
                        foundType = names.get(name);
                        matchMap.put(entry.getKey(), name);
                        localMatchMap.put(entry.getKey(), name);
                        break;
                    }
                }
                if (foundEntry) {
                    if (entry.getValue().equals(foundType)) {
                        foundNames.add(entry.getKey());
                        correct.add(entry.getKey());
                    } else {
                        wrongType.add(entry.getKey());
                    }
                }
            }
            if (verbose) {
                log.info("CIC tokens: " + tokenizer.tokenizeWithoutOffsets(sent));
                log.info(chunks);
                String fn = "Found names:";
                for (String f : foundNames) fn += f + "[" + foundSample.get(f) + "] with " + localMatchMap.get(f) + "--";
                if (fn.endsWith("--"))
                    log.info(fn);
                String extr = "Extra names: ";
                for (String f : foundSample.keySet()) if (!localMatchMap.containsKey(f))
                    extr += f + "[" + foundSample.get(f) + "]--";
                if (extr.endsWith("--"))
                    log.info(extr);
                String miss = "Missing names: ";
                for (String name : names.keySet()) if (!localMatchMap.values().contains(name))
                    miss += name + "[" + names.get(name) + "]--";
                if (miss.endsWith("--"))
                    log.info(miss);
                String misAssign = "Mis-assigned Types: ";
                for (String f : foundSample.keySet()) if (matchMap.containsKey(f)) {
                    // log.warn("This is not expected: " + f + " in matchMap not found names -- " + names);
                    if (names.get(matchMap.get(f)) != null && !names.get(matchMap.get(f)).equals(foundSample.get(f)))
                        misAssign += f + "[" + foundSample.get(f) + "] Expected [" + names.get(matchMap.get(f)) + "]--";
                }
                if (misAssign.endsWith("--"))
                    log.info(misAssign);
                log.info(sent + "\n------------------");
            }
            for (String name : names.keySet()) benchmarkTypes.put(name, names.get(name));
            real.addAll(names.keySet());
            found.addAll(foundSample.keySet());
            sample = sampleStream.read();
        }
        float prec = (float) correct.size() / (float) found.size();
        float recall = (float) correct.size() / (float) real.size();
        if (verbose) {
            log.info("----Correct names----");
            for (String str : correct) log.info(str + " with " + new LinkedHashSet<>(matchMap.get(str)));
            log.info("----Missed names----");
            real.stream().filter(str -> !matchMap.values().contains(str)).forEach(log::info);
            log.info("---Extra names------");
            found.stream().filter(str -> !matchMap.keySet().contains(str)).forEach(log::info);
            log.info("---Assigned wrong type------");
            for (String str : wrongType) {
                Set<String> bMatches = new LinkedHashSet<>(matchMap.get(str));
                for (String bMatch : bMatches) {
                    String ft = foundTypes.get(str);
                    String bt = benchmarkTypes.get(bMatch);
                    if (!ft.equals(bt))
                        log.info(str + "[" + ft + "] expected " + bMatch + "[" + bt + "]");
                }
            }
        }
        stats.f1 = (2 * prec * recall / (prec + recall));
        stats.precision = prec;
        stats.recall = recall;
        stats.numFound = found.size();
        stats.numReal = real.size();
        stats.numCorrect = correct.size();
        stats.numWrongType = wrongType.size();
        log.info(stats.toString());
    } catch (IOException e) {
        e.printStackTrace();
    }
    return stats;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) Span(edu.stanford.muse.util.Span) Config(edu.stanford.muse.Config) java.util(java.util) GZIPInputStream(java.util.zip.GZIPInputStream) DecimalFormat(java.text.DecimalFormat) Test(org.junit.Test) Multimap(com.google.common.collect.Multimap) SequenceModel(edu.stanford.muse.ner.model.SequenceModel) Collectors(java.util.stream.Collectors) Pair(edu.stanford.muse.util.Pair) Stream(java.util.stream.Stream) java.io(java.io) NEType(edu.stanford.muse.ner.model.NEType) CICTokenizer(edu.stanford.muse.ner.tokenize.CICTokenizer) Conll03NameSampleStream(opennlp.tools.formats.Conll03NameSampleStream) Log(org.apache.commons.logging.Log) GZIPOutputStream(java.util.zip.GZIPOutputStream) NERModel(edu.stanford.muse.ner.model.NERModel) LogFactory(org.apache.commons.logging.LogFactory) EmailUtils(edu.stanford.muse.util.EmailUtils) Assert(org.junit.Assert) NameSample(opennlp.tools.namefind.NameSample)
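The metrics reported in the javadoc follow directly from the formulas in the stats block at the end of testCONLL (P = correct/found, R = correct/real, F1 = 2PR/(P+R)). A quick check against the first reported run (Found: 8861, Total: 7781, Correct: 6675):

public class PerfStatsCheck {
    public static void main(String[] args) {
        float found = 8861f, total = 7781f, correct = 6675f;
        float precision = correct / found;                          // 6675 / 8861 ≈ 0.7533
        float recall = correct / total;                             // 6675 / 7781 ≈ 0.8579
        float f1 = 2 * precision * recall / (precision + recall);   // ≈ 0.8022
        System.out.printf("Precision: %f%nRecall: %f%nF1: %f%n", precision, recall, f1);
    }
}

These reproduce the Precision: 0.75330096, Recall: 0.8578589 and F1: 0.80218726 quoted above.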

Example 25 with Log

use of org.apache.commons.logging.Log in project pdi-platform-plugin by pentaho.

the class PdiActionTest method testSetParamsIntoExecuteConfigInExecuteTrans.

@Test
public void testSetParamsIntoExecuteConfigInExecuteTrans() throws ActionExecutionException {
    PdiAction action = spy(new PdiAction());
    TransMeta meta = mock(TransMeta.class);
    LogWriter logWriter = mock(LogWriter.class);
    Trans trans = mock(Trans.class);
    Log log = mock(Log.class);
    TransExecutionConfiguration transExecutionConfiguration = mock(TransExecutionConfiguration.class);
    action.setLogger(log);
    action.setLogLevel(TEST_LOG_LEVEL_PARAM);
    action.setClearLog(TEST_TRUE_BOOLEAN_PARAM);
    action.setRunSafeMode(TEST_FALSE_BOOLEAN_PARAM);
    action.setGatheringMetrics(TEST_FALSE_BOOLEAN_PARAM);
    doReturn(trans).when(action).newTrans(meta);
    doReturn(true).when(action).customizeTrans(trans, logWriter);
    doReturn(false).when(log).isDebugEnabled();
    doReturn(transExecutionConfiguration).when(action).newTransExecutionConfiguration();
    action.executeTransformation(meta, logWriter);
    verify(transExecutionConfiguration).setLogLevel(LogLevel.getLogLevelForCode(TEST_LOG_LEVEL_PARAM));
    verify(transExecutionConfiguration).setClearingLog(Boolean.valueOf(TEST_TRUE_BOOLEAN_PARAM));
    verify(transExecutionConfiguration).setSafeModeEnabled(Boolean.valueOf(TEST_FALSE_BOOLEAN_PARAM));
    verify(transExecutionConfiguration).setGatheringMetrics(Boolean.valueOf(TEST_FALSE_BOOLEAN_PARAM));
}
Also used : TransExecutionConfiguration(org.pentaho.di.trans.TransExecutionConfiguration) LogWriter(org.pentaho.di.core.logging.LogWriter) Log(org.apache.commons.logging.Log) TransMeta(org.pentaho.di.trans.TransMeta) Trans(org.pentaho.di.trans.Trans) Test(org.junit.Test)
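The test stubs doReturn(false).when(log).isDebugEnabled(), which suggests the code under test wraps its debug output in the usual commons-logging guard. Below is a minimal, hypothetical sketch of that pattern with an injectable Log so a mock can be passed in tests, as above; GuardedLoggingSketch and execute are illustrative names, not part of PdiAction.

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class GuardedLoggingSketch {
    private final Log log;

    GuardedLoggingSketch(Log log) {        // a mocked Log can be injected in tests
        this.log = log;
    }

    GuardedLoggingSketch() {
        this(LogFactory.getLog(GuardedLoggingSketch.class));
    }

    void execute(String configDescription) {
        if (log.isDebugEnabled()) {
            // the message string is only built when debug logging is actually enabled
            log.debug("Executing with configuration: " + configDescription);
        }
        // ... the actual work would go here ...
    }
}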

Aggregations

Log (org.apache.commons.logging.Log): 188
Test (org.junit.Test): 51
Test (org.junit.jupiter.api.Test): 40
DirectFieldAccessor (org.springframework.beans.DirectFieldAccessor): 35
ArgumentMatchers.anyString (org.mockito.ArgumentMatchers.anyString): 19
BeanFactory (org.springframework.beans.factory.BeanFactory): 17
CountDownLatch (java.util.concurrent.CountDownLatch): 15
LogConfigurationException (org.apache.commons.logging.LogConfigurationException): 15
ArrayList (java.util.ArrayList): 12
File (java.io.File): 11
QueueChannel (org.springframework.integration.channel.QueueChannel): 11
MethodInvocation (org.aopalliance.intercept.MethodInvocation): 10
IOException (java.io.IOException): 9
AtomicReference (java.util.concurrent.atomic.AtomicReference): 9
Log4JLogger (org.apache.commons.logging.impl.Log4JLogger): 9
Message (org.springframework.messaging.Message): 8
List (java.util.List): 7
ApplicationEventPublisher (org.springframework.context.ApplicationEventPublisher): 7
InputStream (java.io.InputStream): 6
LogFactory (org.apache.commons.logging.LogFactory): 6