Use of edu.stanford.nlp.ling.HasWord in the project lucida by claritylab:
class StanfordPosTagger, method tokenize.
/**
 * Splits the sentence into individual tokens.
 *
 * @param sentence Input sentence
 * @return Array of tokens, in the order they appear in the sentence
 */
public static String[] tokenize(String sentence) {
    // MaxentTagger splits the text into sentences; each element is a Sentence
    // (the pre-3.5 CoreNLP sentence type). Use List<?> instead of a raw List
    // and cast each element explicitly.
    List<?> sentences = MaxentTagger.tokenizeText(new StringReader(sentence));
    List<String> tokens = new ArrayList<String>();
    for (Object o : sentences) {
        Sentence s = (Sentence) o;
        for (int i = 0; i < s.length(); i++) {
            tokens.add(s.getHasWord(i).word());
        }
    }
    // toArray(String[]) already returns String[]; no cast needed.
    return tokens.toArray(new String[tokens.size()]);
}
Use of edu.stanford.nlp.ling.HasWord in the project CoreNLP by stanfordnlp:
class ChineseEscaper, method apply.
/** <i>Note:</i> At present this clobbers the input list items.
 * This should be fixed.
 */
public List<HasWord> apply(List<HasWord> arg) {
    // Shallow copy: the list is new, but the HasWord items are shared with
    // the caller and are mutated below (see the note above this method).
    List<HasWord> result = new ArrayList<>(arg);
    for (HasWord token : result) {
        String text = token.word();
        Matcher matcher = p2.matcher(text);
        if (matcher.find()) {
            // Replace every match of p2 with its first capture group.
            text = matcher.replaceAll("$1");
        }
        // Map the (possibly rewritten) word to its ASCII-equivalent form.
        token.setWord(UTF8EquivalenceFunction.replaceAscii(text));
    }
    return result;
}
Use of edu.stanford.nlp.ling.HasWord in the project CoreNLP by stanfordnlp:
class SpanishXMLTreeReader, method buildEllipticNode.
/**
 * Build a parse tree node corresponding to an elliptic node in the parse XML.
 */
private Tree buildEllipticNode(Node root) {
    String label = ((Element) root).getNodeName();
    // An elliptic constituent is represented by a single empty-valued leaf.
    Tree emptyLeaf = treeFactory.newLeaf(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
    if (emptyLeaf.label() instanceof HasWord) {
        ((HasWord) emptyLeaf.label()).setWord(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
    }
    List<Tree> children = new ArrayList<>();
    children.add(emptyLeaf);
    return treeFactory.newTreeNode(label, children);
}
Use of edu.stanford.nlp.ling.HasWord in the project CoreNLP by stanfordnlp:
class StringUtils, method joinWords.
/**
 * Joins the {@code word()} of each item into a single string, with the given
 * glue inserted between consecutive words.
 *
 * @param l Items supplying the words
 * @param glue Separator placed between consecutive words
 * @return The joined words; empty string if {@code l} has no elements
 */
public static String joinWords(Iterable<? extends HasWord> l, String glue) {
    // Use the element count as a capacity hint when it is cheaply available;
    // wildcard cast avoids the raw-type warning of the original (Collection).
    StringBuilder sb = new StringBuilder(l instanceof Collection ? ((Collection<?>) l).size() : 64);
    boolean first = true;
    for (HasWord o : l) {
        if (!first) {
            sb.append(glue);
        } else {
            first = false;
        }
        sb.append(o.word());
    }
    return sb.toString();
}
Use of edu.stanford.nlp.ling.HasWord in the project CoreNLP by stanfordnlp:
class DocumentPreprocessorTest, method testPlainTextIterator.
public void testPlainTextIterator() {
    String input = "This is a one line test . \n";
    String[] expected = { "This", "is", "a", "one", "line", "test", "." };
    DocumentPreprocessor document =
            new DocumentPreprocessor(new BufferedReader(new StringReader(input)));
    document.setTokenizerFactory(null);
    document.setSentenceDelimiter("\n");
    Iterator<List<HasWord>> it = document.iterator();
    // hasNext() is called twice on purpose: it must not consume any text.
    assertTrue(it.hasNext());
    assertTrue(it.hasNext());
    List<HasWord> sentence = it.next();
    assertEquals(expected.length, sentence.size());
    int i = 0;
    for (HasWord token : sentence) {
        assertEquals(expected[i++], token.word());
    }
    // Once exhausted, repeated hasNext() calls must keep returning false.
    assertFalse(it.hasNext());
    assertFalse(it.hasNext());
    try {
        it.next();
        throw new AssertionError("iterator.next() should have blown up");
    } catch (NoSuchElementException e) {
        // expected: advancing past the end must throw
    }
    // just in case
    assertFalse(it.hasNext());
}
Aggregations