Use of edu.stanford.nlp.util.ArrayCoreMap in the CoreNLP project by stanfordnlp.
From the class ParsedGigawordReader, method toAnnotation.
/*
* Old implementation based on JDOM.
* No longer maintained due to JDOM licensing issues.
private static Annotation toAnnotation(String xml) throws IOException {
Element docElem;
try {
docElem = new SAXBuilder().build(new StringReader(xml)).getRootElement();
} catch (JDOMException e) {
throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml));
}
Element textElem = docElem.getChild("TEXT");
StringBuilder text = new StringBuilder();
int offset = 0;
List<CoreMap> sentences = new ArrayList<CoreMap>();
for (Object sentObj: textElem.getChildren("SENT")) {
CoreMap sentence = new ArrayCoreMap();
sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
Element sentElem = (Element)sentObj;
Tree tree = Tree.valueOf(sentElem.getText());
List<CoreLabel> tokens = new ArrayList<CoreLabel>();
List<Tree> preTerminals = preTerminals(tree);
for (Tree preTerminal: preTerminals) {
String posTag = preTerminal.value();
for (Tree wordTree: preTerminal.children()) {
String word = wordTree.value();
CoreLabel token = new CoreLabel();
token.set(CoreAnnotations.TextAnnotation.class, word);
token.set(CoreAnnotations.TextAnnotation.class, word);
token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
offset += word.length();
token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
text.append(word);
text.append(' ');
offset += 1;
tokens.add(token);
}
}
if (preTerminals.size() > 0) {
text.setCharAt(text.length() - 1, '\n');
}
sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset - 1);
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
sentences.add(sentence);
}
String docID = docElem.getAttributeValue("id");
Matcher matcher = datePattern.matcher(docID);
matcher.find();
Calendar docDate = new Timex(matcher.group(1)).getDate();
Annotation document = new Annotation(text.toString());
document.set(CoreAnnotations.DocIDAnnotation.class, docID);
document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
return document;
}
*/
/**
 * Builds an {@link Annotation} from the XML of a single parsed document.
 *
 * <p>The root element must have an {@code id} attribute containing a date that
 * {@code datePattern} can find, and a {@code TEXT} child whose {@code SENT} children each
 * hold one bracketed parse tree. For every sentence the tree's pre-terminals supply the
 * part-of-speech tags and their children supply the words. The document text is rebuilt by
 * joining words with single spaces and ending each non-empty sentence with a newline;
 * character offsets on tokens and sentences index into that rebuilt text.
 *
 * @param xml the XML source of one document
 * @return an annotation carrying the rebuilt text, doc id, document date and sentences
 * @throws RuntimeException if the XML cannot be read or parsed (the parser exception is the cause)
 * @throws IllegalStateException if the document id does not contain a date
 * @throws IOException declared for interface compatibility; parse-time I/O failures are
 *     wrapped in a {@code RuntimeException}
 */
private static Annotation toAnnotation(String xml) throws IOException {
  Element docElem;
  try {
    Builder parser = new Builder();
    StringReader in = new StringReader(xml);
    docElem = parser.build(in).getRootElement();
  } catch (ParsingException | IOException e) {
    // Same message as before, but chain the cause so the parser's stack trace is not lost.
    throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml), e);
  }

  Element textElem = docElem.getFirstChildElement("TEXT");
  StringBuilder text = new StringBuilder();
  int offset = 0;
  List<CoreMap> sentences = new ArrayList<>();
  Elements sentenceElements = textElem.getChildElements("SENT");
  for (int crtsent = 0; crtsent < sentenceElements.size(); crtsent++) {
    Element sentElem = sentenceElements.get(crtsent);
    CoreMap sentence = new ArrayCoreMap();
    int sentenceBegin = offset;
    sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, sentenceBegin);
    // XXX ms: is this the same as sentElem.getText() in JDOM?
    Tree tree = Tree.valueOf(sentElem.getChild(0).getValue());
    List<CoreLabel> tokens = new ArrayList<>();
    List<Tree> preTerminals = preTerminals(tree);
    for (Tree preTerminal : preTerminals) {
      String posTag = preTerminal.value();
      for (Tree wordTree : preTerminal.children()) {
        String word = wordTree.value();
        CoreLabel token = new CoreLabel();
        token.set(CoreAnnotations.TextAnnotation.class, word);
        token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
        token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
        offset += word.length();
        token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
        text.append(word);
        // One separator character after every word; it is counted in the offsets.
        text.append(' ');
        offset += 1;
        tokens.add(token);
      }
    }
    if (!preTerminals.isEmpty()) {
      // Turn the separator after the sentence's last word into a line break.
      text.setCharAt(text.length() - 1, '\n');
    }
    // The sentence ends before its trailing separator. A sentence without tokens appended
    // nothing, so clamp to the begin offset instead of producing end < begin.
    sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, Math.max(sentenceBegin, offset - 1));
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
    sentences.add(sentence);
  }

  String docID = docElem.getAttributeValue("id");
  Matcher matcher = datePattern.matcher(docID);
  if (!matcher.find()) {
    // Previously surfaced as a bare "No match found" from matcher.group(1).
    throw new IllegalStateException("No date found in document id: " + docID);
  }
  Calendar docDate = new Timex("DATE", matcher.group(1)).getDate();

  Annotation document = new Annotation(text.toString());
  document.set(CoreAnnotations.DocIDAnnotation.class, docID);
  document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
  document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  return document;
}
Use of edu.stanford.nlp.util.ArrayCoreMap in the CoreNLP project by stanfordnlp.
From the class POSTaggerAnnotatorITest, method testSentencesAnnotation.
/**
 * Runs the tagger over an annotation holding exactly one sentence and checks the
 * part-of-speech tag assigned to each of its tokens.
 */
public void testSentencesAnnotation() {
  List<CoreLabel> tokens = makeSentence(testSentences[0]);

  // Wrap the tokens in a sentence-level map, the unit stored under SentencesAnnotation.
  CoreMap onlySentence = new ArrayCoreMap();
  onlySentence.set(CoreAnnotations.TokensAnnotation.class, tokens);

  List<CoreMap> sentenceList = new ArrayList<>();
  sentenceList.add(onlySentence);

  Annotation doc = new Annotation(shortText);
  doc.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);

  tagger.annotate(doc);

  // The labels are checked on the same token objects that were handed to the tagger.
  checkLabels(tokens, "PRP$", "NN", "VBZ", "JJ", "CC", "JJ", ".");
}
Use of edu.stanford.nlp.util.ArrayCoreMap in the CoreNLP project by stanfordnlp.
From the class POSTaggerAnnotatorITest, method makeAnnotation.
/**
 * Builds an annotation with one sentence per input string.
 *
 * <p>Each string is turned into tokens by {@code makeSentence} and stored as the
 * TokensAnnotation of its own sentence map. The annotation text is whatever
 * {@code StringUtils.join(testText)} produces.
 *
 * @param testText the sentence strings, in document order
 * @return an annotation whose SentencesAnnotation holds one entry per input string
 */
private static Annotation makeAnnotation(String... testText) {
  List<CoreMap> sentenceMaps = new ArrayList<>();
  for (int i = 0; i < testText.length; i++) {
    List<CoreLabel> tokens = makeSentence(testText[i]);
    CoreMap sentenceMap = new ArrayCoreMap();
    sentenceMap.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentenceMaps.add(sentenceMap);
  }

  String joinedText = StringUtils.join(testText);
  Annotation result = new Annotation(joinedText);
  result.set(CoreAnnotations.SentencesAnnotation.class, sentenceMaps);
  return result;
}
Use of edu.stanford.nlp.util.ArrayCoreMap in the CoreNLP project by stanfordnlp.
From the class RegexNERAnnotatorITest, method testBasicMatching.
/**
 * Annotates one pre-tokenized sentence, some of whose tokens already carry PERSON or
 * LOCATION tags, and checks the named-entity tag on every token afterwards.
 */
public void testBasicMatching() {
  String tokenized = "President Barack Obama lives in Chicago , Illinois , " + "and is a practicing Christian .";
  List<CoreLabel> tokens = SentenceUtils.toCoreLabelList(tokenized.split(" "));

  // Tags that are already on the tokens before the annotator runs.
  int[] personPositions = {1, 2};
  for (int position : personPositions) {
    tokens.get(position).set(CoreAnnotations.NamedEntityTagAnnotation.class, "PERSON");
  }
  int[] locationPositions = {5, 7};
  for (int position : locationPositions) {
    tokens.get(position).set(CoreAnnotations.NamedEntityTagAnnotation.class, "LOCATION");
  }

  CoreMap sentenceMap = new ArrayCoreMap();
  sentenceMap.set(CoreAnnotations.TokensAnnotation.class, tokens);
  List<CoreMap> sentenceList = new ArrayList<>();
  sentenceList.add(sentenceMap);

  Annotation corpus = new Annotation("President Barack Obama lives in Chicago, Illinois," + "and is a practicing Christian.");
  corpus.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);

  annotator.annotate(corpus);

  checkTags(tokens, "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE", "O", "O", "O", "O", "O", "IDEOLOGY", "O");
}
Use of edu.stanford.nlp.util.ArrayCoreMap in the CoreNLP project by stanfordnlp.
From the class AnnotatedTextReader, method parseFile.
/**
 * Reads inline-labelled text, one document per line, into sentence-level {@link CoreMap}s.
 *
 * <p>A line is either {@code id<TAB>text} or just {@code text}; in the second case the
 * zero-based line number is used as the id. The text is tokenized and sentence-split.
 * A token of the form {@code <CATEGORY>} switches the current label to that category and a
 * token of the form {@code </CATEGORY>} switches it back to the background symbol
 * {@code "O"}; neither marker token is kept. The current label is reset at the start of
 * each line but carries over between sentences of the same line.
 *
 * <p>NOTE(review): category names are inserted into the marker regexes unquoted, so a name
 * containing regex metacharacters is interpreted as a pattern; confirm callers only pass
 * plain names.
 *
 * @param reader source of the lines; not closed by this method
 * @param categoriesAllowed category names recognised inside {@code <...>} markers
 * @param setClassForTheseLabels optional map from label to the annotation key that should
 *     also be set (to the label) on tokens carrying that label; may be {@code null}
 * @param setGoldClass whether to store the current label as each token's GoldAnswerAnnotation
 * @param sentIDprefix prefix prepended to every line id
 * @return one map per sentence with TextAnnotation (tokens joined by single spaces),
 *     TokensAnnotation, and DocIDAnnotation ({@code prefix + id + "-" + sentenceIndex})
 * @throws IOException if reading from {@code reader} fails
 */
public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) throws IOException {
  String categoryAlternation = StringUtils.join(categoriesAllowed, "|");
  Pattern startingLabelToken = Pattern.compile("<(" + categoryAlternation + ")>");
  Pattern endLabelToken = Pattern.compile("</(" + categoryAlternation + ")>");
  String backgroundSymbol = "O";
  // The tokenizer options never change, so build the factory once instead of once per line.
  PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
  List<CoreMap> sentences = new ArrayList<>();
  int lineNum = -1;
  String l;
  while ((l = reader.readLine()) != null) {
    lineNum++;
    // split with limit 2 yields either [id, text] or [text].
    String[] t = l.split("\t", 2);
    String id;
    String text;
    if (t.length == 2) {
      id = t[0];
      text = t[1];
    } else {
      text = t[0];
      id = String.valueOf(lineNum);
    }
    id = sentIDprefix + id;
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
    dp.setTokenizerFactory(tokenizerFactory);
    String label = backgroundSymbol;
    int sentNum = -1;
    for (List<HasWord> sentence : dp) {
      sentNum++;
      StringBuilder sentStr = new StringBuilder();
      List<CoreLabel> sent = new ArrayList<>();
      for (HasWord tokw : sentence) {
        String tok = tokw.word();
        Matcher startingMatcher = startingLabelToken.matcher(tok);
        if (startingMatcher.matches()) {
          // Opening marker: following tokens take this category.
          label = startingMatcher.group(1);
        } else if (endLabelToken.matcher(tok).matches()) {
          // Closing marker: back to the background label.
          label = backgroundSymbol;
        } else {
          sentStr.append(' ').append(tok);
          CoreLabel c = new CoreLabel();
          c.setWord(tok);
          c.setLemma(tok);
          c.setValue(tok);
          c.set(CoreAnnotations.TextAnnotation.class, tok);
          c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
          if (setGoldClass) {
            c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
          }
          if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
            c.set(setClassForTheseLabels.get(label), label);
          }
          sent.add(c);
        }
      }
      CoreMap sentcm = new ArrayCoreMap();
      sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.toString().trim());
      sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
      sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
      sentences.add(sentcm);
    }
  }
  return sentences;
}
Aggregations