Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
In the class AnnotatedTextReader, method parseFile.
/**
 * Reads annotated text, one document per line, and converts it into tokenized sentences.
 *
 * <p>Each line is either {@code id<TAB>text} or just {@code text}; in the latter case the
 * zero-based line number is used as the id. Tokens of the form {@code <LABEL>} and
 * {@code </LABEL>}, where LABEL is one of {@code categoriesAllowed}, are not emitted as
 * tokens: an opening tag switches the label applied to the following tokens, a closing tag
 * switches it back to the background symbol {@code "O"}. The current label is reset at the
 * start of every line but carries over sentence boundaries within a line.
 *
 * @param reader source of the lines; this method does not close it
 * @param categoriesAllowed label names recognised inside the tags; they are joined verbatim
 *     with {@code |} into a regular expression
 * @param setClassForTheseLabels optional (may be {@code null}) map from a label to the
 *     annotation key that is set, with the label as value, on every token carrying that label
 * @param setGoldClass whether to store each token's label under {@code GoldAnswerAnnotation}
 * @param sentIDprefix prefix prepended to every line id
 * @return one {@code CoreMap} per sentence holding the space-joined token text, the tokens,
 *     and a document id of the form {@code sentIDprefix + id + "-" + sentenceIndex}
 * @throws IOException if reading from {@code reader} fails
 */
public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) throws IOException {
  Pattern startingLabelToken = Pattern.compile("<(" + StringUtils.join(categoriesAllowed, "|") + ")>");
  Pattern endLabelToken = Pattern.compile("</(" + StringUtils.join(categoriesAllowed, "|") + ")>");
  String backgroundSymbol = "O";
  // The factory does not depend on the line being read, so build it once instead of per line.
  PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
  List<CoreMap> sentences = new ArrayList<>();
  int lineNum = -1;
  String l;
  while ((l = reader.readLine()) != null) {
    lineNum++;
    // With a limit of 2 the split always yields exactly one or two parts.
    String[] t = l.split("\t", 2);
    String id;
    String text;
    if (t.length == 2) {
      id = t[0];
      text = t[1];
    } else {
      text = t[0];
      id = String.valueOf(lineNum);
    }
    id = sentIDprefix + id;
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
    dp.setTokenizerFactory(tokenizerFactory);
    // Declared outside the sentence loop on purpose: a label opened in one sentence stays
    // active in the following sentences of the same line until it is closed.
    String label = backgroundSymbol;
    int sentNum = -1;
    for (List<HasWord> sentence : dp) {
      sentNum++;
      StringBuilder sentStr = new StringBuilder();
      List<CoreLabel> sent = new ArrayList<>();
      for (HasWord tokw : sentence) {
        String tok = tokw.word();
        Matcher startingMatcher = startingLabelToken.matcher(tok);
        if (startingMatcher.matches()) {
          label = startingMatcher.group(1);
          continue;
        }
        if (endLabelToken.matcher(tok).matches()) {
          label = backgroundSymbol;
          continue;
        }
        // An ordinary token: record it together with the label currently in force.
        sentStr.append(' ').append(tok);
        CoreLabel c = new CoreLabel();
        c.setWord(tok);
        c.setLemma(tok);
        c.setValue(tok);
        c.set(CoreAnnotations.TextAnnotation.class, tok);
        c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
        if (setGoldClass) {
          c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
        }
        if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
          c.set(setClassForTheseLabels.get(label), label);
        }
        sent.add(c);
      }
      CoreMap sentcm = new ArrayCoreMap();
      sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.toString().trim());
      sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
      sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
      sentences.add(sentcm);
    }
  }
  return sentences;
}
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
In the class WordSegmentingTokenizer, method getNext.
/**
 * Hands out the next segmented word, pulling a fresh token from the underlying tokenizer
 * whenever the words obtained from the previous token have been used up.
 *
 * @return the next word, or {@code null} once the underlying tokenizer is exhausted or
 *     delivers a token whose word is {@code null}
 */
@Override
protected HasWord getNext() {
  for (;;) {
    // Serve from the words already segmented, if any remain.
    if (wordIter != null && wordIter.hasNext()) {
      return wordIter.next();
    }
    if (!tok.hasNext()) {
      return null;
    }
    CoreLabel rawToken = tok.next();
    String text = rawToken.word();
    if (text == null) {
      return null;
    }
    // A newline token is passed through as-is rather than segmented, so that significant
    // newlines reach the caller; everything else goes through the word segmenter.
    List<HasWord> pieces = text.equals(WhitespaceLexer.NEWLINE)
        ? Collections.<HasWord>singletonList(rawToken)
        : wordSegmenter.segment(text);
    wordIter = pieces.iterator();
  }
}
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
In the class PTBEscapingProcessor, method fixQuotes.
/**
 * Rewrites every straight double-quote token ({@code "}) in place as an opening
 * ({@code ``}) or closing ({@code ''}) quote, alternating between the two.
 *
 * <p>If the last token is a straight quote, the alternation runs from the end of the list
 * and starts with a closing quote; otherwise it runs from the beginning and starts with an
 * opening quote. The words of the input elements themselves are modified.
 *
 * @param input the tokens to process
 * @return a new list holding the same elements as {@code input}, in the same order
 */
private static List<HasWord> fixQuotes(List<HasWord> input) {
  LinkedList<HasWord> fixed = new LinkedList<>();
  int lastIndex = input.size() - 1;
  if (lastIndex < 0) {
    return fixed;
  }
  if (input.get(lastIndex).word().equals("\"")) {
    // Sentence ends on a quote: walk backwards so that this final quote becomes a closer.
    boolean openNext = false;
    int idx = lastIndex;
    while (idx >= 0) {
      HasWord current = input.get(idx--);
      openNext = swapStraightQuote(current, openNext);
      fixed.addFirst(current);
    }
  } else {
    // Otherwise walk forwards, treating the first quote as an opener.
    boolean openNext = true;
    for (HasWord current : input) {
      openNext = swapStraightQuote(current, openNext);
      fixed.addLast(current);
    }
  }
  return fixed;
}

/**
 * Replaces the word of {@code hw} if it is a straight double quote and reports which kind
 * of quote the next straight quote should become; other words are left untouched.
 */
private static boolean swapStraightQuote(HasWord hw, boolean openNext) {
  String word = hw.word();
  if (!word.equals("\"")) {
    return openNext;
  }
  hw.setWord(openNext ? "``" : "\'\'");
  return !openNext;
}
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
In the class PlainTextDocumentReaderAndWriter, method init.
/**
 * Initialises this reader/writer from the given flags by choosing a tokenizer factory and
 * delegating to {@code init(flags, factory)}.
 *
 * <p>The tokenizer options always start with {@code tokenizeNLs=false,invertible=true};
 * {@code flags.tokenizerOptions}, when set, is appended after a comma. If
 * {@code flags.tokenizerFactory} names a class, its static
 * {@code newCoreLabelTokenizerFactory(String)} method is invoked reflectively with those
 * options; otherwise the PTB tokenizer factory is used.
 *
 * @param flags the classifier flags to read the tokenizer settings from
 * @throws RuntimeException wrapping any exception raised while reflectively creating the factory
 */
@Override
public void init(SeqClassifierFlags flags) {
  final String baseOptions = "tokenizeNLs=false,invertible=true";
  final String options = (flags.tokenizerOptions == null)
      ? baseOptions
      : baseOptions + ',' + flags.tokenizerOptions;

  TokenizerFactory<IN> factory;
  if (flags.tokenizerFactory == null) {
    factory = ErasureUtils.uncheckedCast(PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(options));
  } else {
    try {
      // Look the factory class up by name and call its static creator method.
      Class<TokenizerFactory<? extends HasWord>> factoryClass = ErasureUtils.uncheckedCast(Class.forName(flags.tokenizerFactory));
      Method creator = factoryClass.getMethod("newCoreLabelTokenizerFactory", String.class);
      factory = ErasureUtils.uncheckedCast(creator.invoke(null, options));
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  init(flags, factory);
}
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
In the class Tree, method yield.
/**
 * Collects the labels of this tree's leaves into the supplied list, visiting children in
 * array order (left to right). A leaf's label is appended as it is, so {@code null} labels
 * end up in the list as well. Before a leaf label that implements {@code HasWord} is
 * appended, its word is set to the label's own value. The same list instance is handed down
 * through the recursion, so no intermediate lists are created.
 *
 * @param y The list that receives the yield. Usually empty on entry; if it is not, the
 *          leaf labels are appended after the elements already present.
 * @return the list {@code y} that was passed in, now holding the leaf labels
 */
@SuppressWarnings("unchecked")
public <T> List<T> yield(List<T> y) {
  if (!isLeaf()) {
    // Interior node: let every child append its own leaves in turn.
    for (Tree child : children()) {
      child.yield(y);
    }
    return y;
  }
  if (label() instanceof HasWord) {
    HasWord wordLabel = (HasWord) label();
    wordLabel.setWord(label().value());
  }
  y.add((T) label());
  return y;
}
Aggregations