use of java.io.StringReader in project CoreNLP by stanfordnlp.
the class DcorefBenchmarkSlowITest method getCorefResults.
/**
 * Scans scorer output text line by line and collects the metric values it contains.
 *
 * <p>Each line is tested against every metric pattern (mention, MUC, B-cubed, CEAF-M,
 * CEAF-E, BLANC and CoNLL). For a pattern that matches the whole line, its capture
 * groups are parsed as doubles and stored with {@code setCount} under the
 * corresponding keys. Lines matching no pattern are ignored.
 *
 * @param resultsString the complete scorer output to parse
 * @return a counter holding one entry per metric value found in the text
 * @throws IOException if reading from the in-memory text fails
 * @throws NumberFormatException if a captured group is not a parseable double
 */
public static Counter<String> getCorefResults(String resultsString) throws IOException {
  Counter<String> results = new ClassicCounter<>();
  // try-with-resources closes the reader even when parsing throws part-way through.
  try (BufferedReader r = new BufferedReader(new StringReader(resultsString))) {
    for (String line; (line = r.readLine()) != null; ) {
      recordScores(results, MENTION_PATTERN.matcher(line), MENTION_TP, MENTION_F1);
      recordScores(results, MUC_PATTERN.matcher(line), MUC_TP, MUC_F1);
      recordScores(results, BCUBED_PATTERN.matcher(line), BCUBED_TP, BCUBED_F1);
      recordScores(results, CEAFM_PATTERN.matcher(line), CEAFM_TP, CEAFM_F1);
      recordScores(results, CEAFE_PATTERN.matcher(line), CEAFE_TP, CEAFE_F1);
      // BLANC and CoNLL lines expose a single captured value each.
      recordScores(results, BLANC_PATTERN.matcher(line), BLANC_F1);
      recordScores(results, CONLL_PATTERN.matcher(line), CONLL_SCORE);
    }
  }
  return results;
}

/**
 * Stores capture group {@code i + 1} of {@code matcher} under {@code keys[i]} when the
 * matcher matches its entire input; does nothing otherwise.
 */
private static void recordScores(Counter<String> results, Matcher matcher, String... keys) {
  if (!matcher.matches()) {
    return;
  }
  for (int i = 0; i < keys.length; i++) {
    // Capture groups are 1-based; group 0 is the whole match.
    results.setCount(keys[i], Double.parseDouble(matcher.group(i + 1)));
  }
}
use of java.io.StringReader in project CoreNLP by stanfordnlp.
the class SpanishTokenizerITest method testOffsetsSpacing.
/**
 * Tokenizes a Spanish sentence containing irregular whitespace (tab, blank line) with
 * the {@code splitAll=true} option and checks the token count plus the original text,
 * normalized text and character offsets of selected tokens, including the pieces that
 * the verb-plus-clitics form "escribámosela" is split into.
 */
public void testOffsetsSpacing() {
  // The two "guide" rows below are presumably a tens/units column ruler for the input text.
  // guide 1 2 3 4 5 6 7 8 9 0 1 2 3
  // guide 0123456789012345678901234567890123456789012345678 90123456789012345678901234567 8 901234567890123456789012345678901234567890123456789012345
  final String input = " La combinación consonántica ss es ajena a la\tortografía castellana: \n\n traigámosela, mandémoselos, escribámosela, comprémoselo.";

  final TokenizerFactory<CoreLabel> factory = SpanishTokenizer.coreLabelFactory();
  factory.setOptions("");
  factory.setOptions("splitAll=true");
  final List<CoreLabel> tokens = factory.getTokenizer(new StringReader(input)).tokenize();
  System.err.println(tokens);
  assertEquals(27, tokens.size());

  // Disabled whitespace checks, kept for reference:
  // assertEquals(" ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  // assertEquals("\t", tokens.get(8).get(CoreAnnotations.AfterAnnotation.class));

  // First token: "La".
  final CoreLabel article = tokens.get(0);
  assertEquals("Begin char offset", 2, (int) article.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 4, (int) article.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals("La", article.get(CoreAnnotations.OriginalTextAnnotation.class));

  // note: after(x) and before(x+1) are the same
  // assertEquals(" ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  // assertEquals(" ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));

  // Verb stem of "escribámosela": the original text keeps the accent, the token text is normalized.
  final CoreLabel verbStem = tokens.get(19);
  assertEquals("escribámo", verbStem.get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals("escribamos", verbStem.get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 108, (int) verbStem.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 117, (int) verbStem.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));

  // First clitic: "se".
  final CoreLabel firstClitic = tokens.get(20);
  assertEquals("se", firstClitic.get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals("se", firstClitic.get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 117, (int) firstClitic.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 119, (int) firstClitic.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));

  // Second clitic: "la".
  final CoreLabel secondClitic = tokens.get(21);
  assertEquals("la", secondClitic.get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals("la", secondClitic.get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 119, (int) secondClitic.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 121, (int) secondClitic.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));

  // Comma following the split word.
  final CoreLabel comma = tokens.get(22);
  assertEquals(",", comma.get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals(",", comma.get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 121, (int) comma.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 122, (int) comma.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
}
use of java.io.StringReader in project CoreNLP by stanfordnlp.
the class Morphology method stem.
/**
 * Runs the given word through this instance's lexer, started in the {@code Morpha.any}
 * state, and returns the first result the lexer produces.
 *
 * @param word the word to process
 * @return the lexer's first output for {@code word}, or {@code word} itself (after a
 *     warning is logged) if the lexer raises an {@link IOException}
 */
public String stem(String word) {
  try {
    // Point the shared lexer at the new input before selecting its start state.
    lexer.yyreset(new StringReader(word));
    lexer.yybegin(Morpha.any);
    return lexer.next();
  } catch (IOException e) {
    log.warning("Morphology.stem() had error on word " + word);
    return word;
  }
}
use of java.io.StringReader in project CoreNLP by stanfordnlp.
the class Morphology method lemmatize.
/** Lemmatize the word, being sensitive to the tag, using the
 * passed in lexer.
 *
 * <p>The lexer is fed the text {@code word + '_' + tag}. Because underscore, space
 * and newline inside the word would interfere with that encoding, they are swapped
 * for three placeholder characters before lexing and swapped back in the result.
 *
 * @param word The word to lemmatize.
 * @param tag The tag appended to the word (after an underscore) for the lexer.
 * @param lexer The lexer to run; it is reset and reconfigured by this call.
 * @param lowercase If this is true, words other than proper nouns will
 * be changed to all lowercase.
 * @return The first token produced by the lexer with any placeholder characters
 * restored, or {@code word} unchanged if the lexer raises an {@link IOException}.
 */
private static String lemmatize(String word, String tag, Morpha lexer, boolean lowercase) {
  boolean wordHasForbiddenChar = word.indexOf('_') >= 0 || word.indexOf(' ') >= 0 || word.indexOf('\n') >= 0;
  String quotedWord = word;
  if (wordHasForbiddenChar) {
    // choose something unlikely. Classical Vedic!
    // Plain character replacement: no regular expression is needed for single characters.
    quotedWord = quotedWord.replace('_', 'ᳰ')
        .replace(' ', 'ᳱ')
        .replace('\n', 'ᳲ');
  }
  String wordtag = quotedWord + '_' + tag;
  if (DEBUG) {
    log.info("Trying to normalize |" + wordtag + '|');
  }
  try {
    lexer.setOption(1, lowercase);
    lexer.yyreset(new StringReader(wordtag));
    lexer.yybegin(Morpha.scan);
    String wordRes = lexer.next();
    // go past tag
    lexer.next();
    if (wordHasForbiddenChar) {
      if (DEBUG) {
        log.info("Restoring forbidden chars");
      }
      wordRes = wordRes.replace('ᳰ', '_')
          .replace('ᳱ', ' ')
          .replace('ᳲ', '\n');
    }
    return wordRes;
  } catch (IOException e) {
    // Name this method (not stem()) so the log points at the real failure site.
    log.warning("Morphology.lemmatize() had error on word " + word + '/' + tag);
    return word;
  }
}
use of java.io.StringReader in project liquibase by liquibase.
the class StreamUtilTest method testGetReaderContents.
/**
 * Checks that {@code StreamUtil.getReaderContents} returns exactly the text supplied
 * by a {@link StringReader}.
 */
@Test
public void testGetReaderContents() throws IOException {
  final String expected = "TEST";
  assertEquals(expected, StreamUtil.getReaderContents(new StringReader(expected)));
}
Aggregations