Search in sources:

Example 31 with HasWord

use of edu.stanford.nlp.ling.HasWord in project lucida by claritylab.

In class StanfordPosTagger: the method tokenize.

/**
	 * Splits the sentence into individual tokens.
	 * 
	 * @param sentence Input sentence
	 * @return Array of tokens
	 */
/**
 * Splits the sentence into individual tokens.
 *
 * Delegates sentence splitting and tokenization to
 * {@code MaxentTagger.tokenizeText}, then flattens every resulting
 * sentence into a single token array.
 *
 * @param sentence Input sentence
 * @return Array of tokens, in order of appearance
 */
public static String[] tokenize(String sentence) {
    // NOTE(review): raw List retained deliberately — this targets an old
    // CoreNLP API where tokenizeText yields Sentence objects; confirm the
    // element type before adding generics.
    List t = MaxentTagger.tokenizeText(new StringReader(sentence));
    List<String> tokens = new ArrayList<String>();
    for (int j = 0; j < t.size(); j++) {
        Sentence s1 = (Sentence) t.get(j);
        for (int i = 0; i < s1.length(); i++) {
            HasWord w = s1.getHasWord(i);
            tokens.add(w.word());
        }
    }
    // toArray(String[]) already returns String[]; the old (String[]) cast
    // was redundant and has been removed.
    return tokens.toArray(new String[tokens.size()]);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) Sentence(edu.stanford.nlp.ling.Sentence)

Example 32 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

In class ChineseEscaper: the method apply.

/** <i>Note:</i> At present this clobbers the input list items.
   *  This should be fixed.
   */
/** <i>Note:</i> At present this clobbers the input list items.
   *  This should be fixed.
   */
public List<HasWord> apply(List<HasWord> arg) {
    // Shallow copy: the returned list is new, but its elements are the
    // same HasWord objects as the input's (hence the clobbering above).
    List<HasWord> result = new ArrayList<>(arg);
    for (int i = 0; i < result.size(); i++) {
        HasWord item = result.get(i);
        String original = item.word();
        Matcher matcher = p2.matcher(original);
        // Strip the pattern down to its first capture group when it matches.
        String stripped = matcher.find() ? matcher.replaceAll("$1") : original;
        // Normalize full-width/ASCII-equivalent characters, then write back
        // in place on the shared token.
        item.setWord(UTF8EquivalenceFunction.replaceAscii(stripped));
    }
    return result;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) ArrayList(java.util.ArrayList)

Example 33 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

In class SpanishXMLTreeReader: the method buildEllipticNode.

/**
   * Build a parse tree node corresponding to an elliptic node in the parse XML.
   */
/**
   * Build a parse tree node corresponding to an elliptic node in the parse XML.
   */
private Tree buildEllipticNode(Node root) {
    Element eRoot = (Element) root;
    // A single placeholder leaf stands in for the elided material.
    Tree leaf = treeFactory.newLeaf(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
    if (leaf.label() instanceof HasWord) {
        ((HasWord) leaf.label()).setWord(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
    }
    List<Tree> children = new ArrayList<>();
    children.add(leaf);
    // The XML element's tag name becomes the constituent label.
    return treeFactory.newTreeNode(eRoot.getNodeName(), children);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Element(org.w3c.dom.Element) Tree(edu.stanford.nlp.trees.Tree)

Example 34 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

In class StringUtils: the method joinWords.

/**
 * Joins the {@code word()} values of the given items into one string,
 * separated by {@code glue}.
 *
 * @param l    Items to join; only their {@code word()} text is used
 * @param glue Separator placed between consecutive words
 * @return The concatenated words
 */
public static String joinWords(Iterable<? extends HasWord> l, String glue) {
    // Presize from the collection when we can; otherwise a small default.
    int capacity = (l instanceof Collection) ? ((Collection) l).size() : 64;
    StringBuilder out = new StringBuilder(capacity);
    // Empty separator before the first word, glue before every later one.
    String separator = "";
    for (HasWord item : l) {
        out.append(separator).append(item.word());
        separator = glue;
    }
    return out.toString();
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord)

Example 35 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

In class DocumentPreprocessorTest: the method testPlainTextIterator.

public void testPlainTextIterator() {
    String text = "This is a one line test . \n";
    String[] expected = { "This", "is", "a", "one", "line", "test", "." };
    DocumentPreprocessor processor =
            new DocumentPreprocessor(new BufferedReader(new StringReader(text)));
    processor.setTokenizerFactory(null);
    processor.setSentenceDelimiter("\n");
    Iterator<List<HasWord>> it = processor.iterator();
    // hasNext() must be idempotent — a second call must not consume text.
    assertTrue(it.hasNext());
    assertTrue(it.hasNext());
    List<HasWord> sentence = it.next();
    assertEquals(expected.length, sentence.size());
    int index = 0;
    for (HasWord word : sentence) {
        assertEquals(expected[index++], word.word());
    }
    // Once exhausted, repeated hasNext() calls must keep returning false.
    assertFalse(it.hasNext());
    assertFalse(it.hasNext());
    try {
        it.next();
        throw new AssertionError("iterator.next() should have blown up");
    } catch (NoSuchElementException e) {
        // expected: the iterator is exhausted
    }
    // Still exhausted after the failed next() call.
    assertFalse(it.hasNext());
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) List(java.util.List) ArrayList(java.util.ArrayList) NoSuchElementException(java.util.NoSuchElementException)

Aggregations

HasWord (edu.stanford.nlp.ling.HasWord)58 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)15 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)12 StringReader (java.io.StringReader)12 Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader (java.io.BufferedReader)6 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File (java.io.File)5 PrintWriter (java.io.PrintWriter)5 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3