Search in sources :

Example 6 with ArrayCoreMap

use of edu.stanford.nlp.util.ArrayCoreMap in project CoreNLP by stanfordnlp.

the class ParsedGigawordReader method toAnnotation.

/*
   * Old implementation based on JDOM.
   * No longer maintained due to JDOM licensing issues.
  private static Annotation toAnnotation(String xml) throws IOException {
    Element docElem;
    try {
      docElem = new SAXBuilder().build(new StringReader(xml)).getRootElement();
    } catch (JDOMException e) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml));
    }
    Element textElem = docElem.getChild("TEXT");
    StringBuilder text = new StringBuilder();
    int offset = 0;
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    for (Object sentObj: textElem.getChildren("SENT")) {
      CoreMap sentence = new ArrayCoreMap();
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
      Element sentElem = (Element)sentObj;
      Tree tree = Tree.valueOf(sentElem.getText());
      List<CoreLabel> tokens = new ArrayList<CoreLabel>();
      List<Tree> preTerminals = preTerminals(tree);
      for (Tree preTerminal: preTerminals) {
        String posTag = preTerminal.value();
        for (Tree wordTree: preTerminal.children()) {
          String word = wordTree.value();
          CoreLabel token = new CoreLabel();
          token.set(CoreAnnotations.TextAnnotation.class, word);
          token.set(CoreAnnotations.TextAnnotation.class, word);
          token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
          token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
          offset += word.length();
          token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
          text.append(word);
          text.append(' ');
          offset += 1;
          tokens.add(token);
        }
      }
      if (preTerminals.size() > 0) {
        text.setCharAt(text.length() - 1, '\n');
      }
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset - 1);
      sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
      sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
      sentences.add(sentence);
    }

    String docID = docElem.getAttributeValue("id");
    Matcher matcher = datePattern.matcher(docID);
    matcher.find();
    Calendar docDate = new Timex(matcher.group(1)).getDate();

    Annotation document = new Annotation(text.toString());
    document.set(CoreAnnotations.DocIDAnnotation.class, docID);
    document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
    document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
    return document;
  }
  */
/**
 * Converts the XML of one parsed document into an {@link Annotation}.
 *
 * <p>The root element's {@code TEXT} child is scanned for {@code SENT} elements. The value of the
 * first child node of each {@code SENT} element is read as a bracketed parse tree. Every word
 * found under a pre-terminal of that tree becomes a {@link CoreLabel} carrying its text, the
 * pre-terminal's value as POS tag, and begin/end character offsets. The document text is rebuilt
 * from the words: each word is followed by a single space, and the last space of a sentence that
 * has at least one pre-terminal is replaced by a newline.
 *
 * <p>The document id is the root element's {@code id} attribute; the document date is built from
 * group 1 of {@code datePattern} applied to that id.
 *
 * @param xml XML source of a single document
 * @return an annotation with the doc id, calendar date and sentence list set
 * @throws IOException kept in the signature for compatibility; I/O failures raised while parsing
 *     {@code xml} are rethrown as {@link RuntimeException}
 * @throws RuntimeException if {@code xml} cannot be parsed (the parser failure is the cause)
 * @throws IllegalStateException if the document id does not match {@code datePattern}
 */
private static Annotation toAnnotation(String xml) throws IOException {
    Element docElem;
    try {
        docElem = new Builder().build(new StringReader(xml)).getRootElement();
    } catch (ParsingException | IOException e) {
        // Same message as before, but the original exception is now chained as the cause.
        throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml), e);
    }
    Element textElem = docElem.getFirstChildElement("TEXT");
    StringBuilder text = new StringBuilder();
    int offset = 0;
    List<CoreMap> sentences = new ArrayList<>();
    Elements sentenceElements = textElem.getChildElements("SENT");
    for (int crtsent = 0; crtsent < sentenceElements.size(); crtsent++) {
        Element sentElem = sentenceElements.get(crtsent);
        CoreMap sentence = new ArrayCoreMap();
        sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
        // NOTE(review): only the first child node's value is used here; it is unconfirmed
        // whether this matches what sentElem.getText() returned in the old JDOM version.
        Tree tree = Tree.valueOf(sentElem.getChild(0).getValue());
        List<CoreLabel> tokens = new ArrayList<>();
        List<Tree> preTerminals = preTerminals(tree);
        for (Tree preTerminal : preTerminals) {
            String posTag = preTerminal.value();
            for (Tree wordTree : preTerminal.children()) {
                String word = wordTree.value();
                CoreLabel token = new CoreLabel();
                token.set(CoreAnnotations.TextAnnotation.class, word);
                token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
                token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
                offset += word.length();
                token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
                text.append(word);
                text.append(' ');
                // Account for the separator that was just appended.
                offset += 1;
                tokens.add(token);
            }
        }
        if (!preTerminals.isEmpty()) {
            // Turn the trailing separator of the sentence into a line break.
            text.setCharAt(text.length() - 1, '\n');
        }
        // The sentence ends before the trailing separator counted in offset.
        sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset - 1);
        sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
        sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
        sentences.add(sentence);
    }
    String docID = docElem.getAttributeValue("id");
    Matcher matcher = datePattern.matcher(docID);
    if (!matcher.find()) {
        // Same exception type that matcher.group(1) would raise, with a useful message.
        throw new IllegalStateException("No date found in document id: " + docID);
    }
    Calendar docDate = new Timex("DATE", matcher.group(1)).getDate();
    Annotation document = new Annotation(text.toString());
    document.set(CoreAnnotations.DocIDAnnotation.class, docID);
    document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
    document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
    return document;
}
Also used : ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap) Matcher(java.util.regex.Matcher) Element(nu.xom.Element) Builder(nu.xom.Builder) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) IOException(java.io.IOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) Elements(nu.xom.Elements) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ParsingException(nu.xom.ParsingException) StringReader(java.io.StringReader) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Tree(edu.stanford.nlp.trees.Tree) CoreMap(edu.stanford.nlp.util.CoreMap) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap)

Example 7 with ArrayCoreMap

use of edu.stanford.nlp.util.ArrayCoreMap in project CoreNLP by stanfordnlp.

the class POSTaggerAnnotatorITest method testSentencesAnnotation.

/**
 * Checks that the tagger handles an annotation whose SentencesAnnotation
 * holds exactly one sentence, and that the expected tags end up on its labels.
 */
public void testSentencesAnnotation() {
    final List<CoreLabel> tokens = makeSentence(testSentences[0]);

    // Wrap the labels in a single sentence map.
    final CoreMap onlySentence = new ArrayCoreMap();
    onlySentence.set(CoreAnnotations.TokensAnnotation.class, tokens);

    final List<CoreMap> sentenceList = new ArrayList<>();
    sentenceList.add(onlySentence);

    final Annotation doc = new Annotation(shortText);
    doc.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);

    tagger.annotate(doc);

    checkLabels(tokens, "PRP$", "NN", "VBZ", "JJ", "CC", "JJ", ".");
}
Also used : ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) ArrayList(java.util.ArrayList) CoreMap(edu.stanford.nlp.util.CoreMap) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap)

Example 8 with ArrayCoreMap

use of edu.stanford.nlp.util.ArrayCoreMap in project CoreNLP by stanfordnlp.

the class POSTaggerAnnotatorITest method makeAnnotation.

/**
 * Builds an annotation containing one sentence per input string.
 *
 * <p>Each string is converted to a label list by {@code makeSentence} and stored under
 * TokensAnnotation in its own sentence map. The annotation text is the result of
 * {@code StringUtils.join(testText)}.
 *
 * @param testText the sentence strings, in order
 * @return the annotation with SentencesAnnotation set
 */
private static Annotation makeAnnotation(String... testText) {
    final List<CoreMap> sentenceMaps = new ArrayList<>();
    for (int i = 0; i < testText.length; i++) {
        final CoreMap sentenceMap = new ArrayCoreMap();
        sentenceMap.set(CoreAnnotations.TokensAnnotation.class, makeSentence(testText[i]));
        sentenceMaps.add(sentenceMap);
    }

    final Annotation result = new Annotation(StringUtils.join(testText));
    result.set(CoreAnnotations.SentencesAnnotation.class, sentenceMaps);
    return result;
}
Also used : ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ArrayList(java.util.ArrayList) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap)

Example 9 with ArrayCoreMap

use of edu.stanford.nlp.util.ArrayCoreMap in project CoreNLP by stanfordnlp.

the class RegexNERAnnotatorITest method testBasicMatching.

/**
 * Runs the annotator over one pre-tokenized sentence in which some tokens already
 * carry PERSON / LOCATION tags, then checks the resulting tag of every token.
 */
public void testBasicMatching() {
    final String tokenized = "President Barack Obama lives in Chicago , Illinois , " + "and is a practicing Christian .";
    final List<CoreLabel> labels = SentenceUtils.toCoreLabelList(tokenized.split(" "));

    // Pre-assign NER tags: "Barack" and "Obama" as PERSON, "Chicago" and "Illinois" as LOCATION.
    for (int personIndex : new int[] {1, 2}) {
        labels.get(personIndex).set(CoreAnnotations.NamedEntityTagAnnotation.class, "PERSON");
    }
    for (int locationIndex : new int[] {5, 7}) {
        labels.get(locationIndex).set(CoreAnnotations.NamedEntityTagAnnotation.class, "LOCATION");
    }

    final CoreMap onlySentence = new ArrayCoreMap();
    onlySentence.set(CoreAnnotations.TokensAnnotation.class, labels);

    final List<CoreMap> sentenceList = new ArrayList<>();
    sentenceList.add(onlySentence);

    final Annotation doc = new Annotation("President Barack Obama lives in Chicago, Illinois," + "and is a practicing Christian.");
    doc.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);

    annotator.annotate(doc);

    checkTags(labels, "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE", "O", "O", "O", "O", "O", "IDEOLOGY", "O");
}
Also used : ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) ArrayList(java.util.ArrayList) CoreMap(edu.stanford.nlp.util.CoreMap) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap)

Example 10 with ArrayCoreMap

use of edu.stanford.nlp.util.ArrayCoreMap in project CoreNLP by stanfordnlp.

the class AnnotatedTextReader method parseFile.

/**
 * Reads inline-labelled text, one record per line, and converts it into sentence maps.
 *
 * <p>A line is either {@code id<TAB>text} or just {@code text}; in the second form the zero-based
 * line number is used as the id. {@code sentIDprefix} is prepended to the id. The text is split
 * into sentences and tokens by a {@link DocumentPreprocessor}. A token that is exactly
 * {@code <CATEGORY>} (for a category in {@code categoriesAllowed}) switches the current label to
 * that category, and a token that is exactly {@code </CATEGORY>} switches it back to {@code "O"};
 * such marker tokens are not added to the sentence. The current label starts as {@code "O"} on
 * every line and carries over between sentences of the same line.
 *
 * <p>Each produced sentence map holds the space-joined token text (TextAnnotation), the token
 * list (TokensAnnotation) and {@code id + "-" + sentenceIndex} (DocIDAnnotation).
 *
 * @param reader source of the lines; it is read until exhausted and is not closed here
 * @param categoriesAllowed category names, joined with {@code |} inside the marker patterns
 * @param setClassForTheseLabels optional map from label to an annotation key; when it contains
 *     the current label, that key is set to the label on the token. May be {@code null}.
 * @param setGoldClass whether to store the current label as GoldAnswerAnnotation on each token
 * @param sentIDprefix prefix for every sentence id
 * @return the sentence maps in reading order
 * @throws IOException if reading from {@code reader} fails
 */
public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) throws IOException {
    String categoryAlternation = StringUtils.join(categoriesAllowed, "|");
    Pattern startingLabelToken = Pattern.compile("<(" + categoryAlternation + ")>");
    Pattern endLabelToken = Pattern.compile("</(" + categoryAlternation + ")>");
    String backgroundSymbol = "O";
    // The factory depends only on these constant options, so it is built once and reused.
    PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
    List<CoreMap> sentences = new ArrayList<>();
    int lineNum = -1;
    String line;
    while ((line = reader.readLine()) != null) {
        lineNum++;
        // With a limit of 2 the split always yields one or two fields.
        String[] fields = line.split("\t", 2);
        String id;
        String text;
        if (fields.length == 2) {
            id = fields[0];
            text = fields[1];
        } else {
            text = fields[0];
            id = String.valueOf(lineNum);
        }
        id = sentIDprefix + id;
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        dp.setTokenizerFactory(tokenizerFactory);
        String label = backgroundSymbol;
        int sentNum = -1;
        for (List<HasWord> sentence : dp) {
            sentNum++;
            StringBuilder sentText = new StringBuilder();
            List<CoreLabel> sent = new ArrayList<>();
            for (HasWord tokw : sentence) {
                String tok = tokw.word();
                Matcher startingMatcher = startingLabelToken.matcher(tok);
                if (startingMatcher.matches()) {
                    // Opening marker: subsequent tokens get this category.
                    label = startingMatcher.group(1);
                    continue;
                }
                if (endLabelToken.matcher(tok).matches()) {
                    // Closing marker: fall back to the background label.
                    label = backgroundSymbol;
                    continue;
                }
                sentText.append(' ').append(tok);
                CoreLabel c = new CoreLabel();
                c.setWord(tok);
                c.setLemma(tok);
                c.setValue(tok);
                c.set(CoreAnnotations.TextAnnotation.class, tok);
                c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
                if (setGoldClass) {
                    c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
                }
                if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
                    c.set(setClassForTheseLabels.get(label), label);
                }
                sent.add(c);
            }
            CoreMap sentcm = new ArrayCoreMap();
            sentcm.set(CoreAnnotations.TextAnnotation.class, sentText.toString().trim());
            sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
            sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
            sentences.add(sentcm);
        }
    }
    return sentences;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap) Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) CoreMap(edu.stanford.nlp.util.CoreMap) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap)

Aggregations

CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)13 CoreLabel (edu.stanford.nlp.ling.CoreLabel)13 ArrayCoreMap (edu.stanford.nlp.util.ArrayCoreMap)13 CoreMap (edu.stanford.nlp.util.CoreMap)13 ArrayList (java.util.ArrayList)7 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)2 StringReader (java.io.StringReader)2 Matcher (java.util.regex.Matcher)2 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)1 HasWord (edu.stanford.nlp.ling.HasWord)1 DepPattern (edu.stanford.nlp.patterns.dep.DepPattern)1 CreatePatterns (edu.stanford.nlp.patterns.surface.CreatePatterns)1 Annotation (edu.stanford.nlp.pipeline.Annotation)1 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)1 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)1 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)1 Tree (edu.stanford.nlp.trees.Tree)1 IOException (java.io.IOException)1 Pattern (java.util.regex.Pattern)1 Builder (nu.xom.Builder)1