Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp: the class PatternsSimpleThreadedITest, method runTest.
/**
 * Runs the SPIED pattern-extraction pipeline end to end with the given number of
 * threads, writing all intermediate files to a fresh temp directory.
 * <p>
 * The temp directory is deleted on success and deliberately left in place on
 * failure so the intermediate files can be inspected.
 *
 * @param numThreads value stored under the {@code numThreads} property and
 *                   parsed downstream by GetPatternsFromDataMultiClass
 */
void runTest(String numThreads) {
  Properties spiedProperties = new Properties();
  final Path tempPath;
  try {
    tempPath = Files.createTempDirectory(null);
    // try-with-resources so the properties file handle is always closed
    // (the original code leaked the FileInputStream).
    try (InputStreamReader propsIn = new InputStreamReader(
        new FileInputStream(new File("data/edu/stanford/nlp/patterns/patterns_itest.properties")),
        StandardCharsets.UTF_8)) {
      spiedProperties.load(propsIn);
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  Path outputPath = Paths.get(tempPath.toString(), "output");
  Path modelPath = Paths.get(tempPath.toString(), "model");
  Path docsentsPath = Paths.get(tempPath.toString(), "docsents.ser");
  System.out.println("Test " + numThreads + " writing to " + tempPath);
  spiedProperties.setProperty("seedWordsFiles", "VACCINE_PREVENTABLE_DISEASE,data/edu/stanford/nlp/patterns/VACCINE_PREVENTABLE_DISEASE.txt");
  // We generate this file below
  spiedProperties.setProperty("file", docsentsPath.toString());
  spiedProperties.setProperty("fileFormat", "ser");
  spiedProperties.setProperty("outDir", outputPath.toString());
  spiedProperties.setProperty("patternsWordsDir", modelPath.toString());
  spiedProperties.setProperty("loadSavedPatternsWordsDir", Boolean.toString(false));
  spiedProperties.setProperty("numThreads", numThreads);
  // Run the pipeline on an input document
  // Algorithm based on
  // https://github.com/stanfordnlp/CoreNLP/blob/a9a4c2d75b177790a24c0f46188810668d044cd8/src/edu/stanford/nlp/patterns/GetPatternsFromDataMultiClass.java#L702
  // useTargetParserParentRestriction is false
  final Annotation document = new Annotation("** If you survive measles without complications ** I love these . " + "Why would n't you survive without complications , Immunologist ?");
  nlpPipeline.annotate(document);
  // Convert annotation to map to serialize, similarly to the original code algorithm
  int i = 0;
  final Map<String, DataInstance> sentenceMap = new HashMap<>();
  for (final CoreMap sentence : document.get(SentencesAnnotation.class)) {
    sentenceMap.put(Integer.toString(i++), DataInstance.getNewInstance(PatternFactory.PatternType.SURFACE, sentence));
  }
  // Serialize the sentence map so the pipeline can consume it via the "file" property.
  try (final ObjectOutputStream sentenceMapStream = new ObjectOutputStream(new FileOutputStream(docsentsPath.toString()))) {
    sentenceMapStream.writeObject(sentenceMap);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  try {
    GetPatternsFromDataMultiClass.<SurfacePattern>run(spiedProperties);
  } catch (Exception e) {
    // Leave the temp directory behind for debugging on failure.
    System.out.println("Test " + numThreads + " FAILED");
    System.out.println(" Intermediate files in " + tempPath);
    throw new RuntimeException(e);
  }
  System.out.println("Cleaning up temp files from " + tempPath);
  FileSystem.deleteDir(tempPath.toFile());
}
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp: the class CoNLL2011DocumentReader, method main.
/**
 * Reads a CoNLL-2011 corpus and dumps a tab-separated rendering of each
 * sentence, mainly for debugging.
 * <p>
 * Properties: {@code -i} input path (required), {@code -o} output file
 * (required), {@code -ext} file-extension filter (optional),
 * {@code -debug true|false} for verbose per-document dumps.
 *
 * @param args command-line arguments parsed into properties
 * @throws IOException if the corpus cannot be read or the output cannot be written
 */
public static void main(String[] args) throws IOException {
  Properties props = StringUtils.argsToProperties(args);
  boolean debug = Boolean.parseBoolean(props.getProperty("debug", "false"));
  String filepath = props.getProperty("i");
  String outfile = props.getProperty("o");
  if (filepath == null || outfile == null) {
    usage();
    System.exit(-1);
  }
  logger.info("Writing to " + outfile);
  String ext = props.getProperty("ext");
  Options options;
  if (ext != null) {
    // Restrict the reader to files ending with the given extension.
    options = new Options(".*" + ext + "$");
  } else {
    options = new Options();
  }
  options.annotateTreeCoref = true;
  options.annotateTreeNer = true;
  CorpusStats corpusStats = new CorpusStats();
  CoNLL2011DocumentReader reader = new CoNLL2011DocumentReader(filepath, options);
  int docCnt = 0;
  int sentCnt = 0;
  int tokenCnt = 0;
  // try-with-resources guarantees the writer is flushed and closed even when
  // processing throws (the original leaked fout on any exception, which could
  // silently drop buffered output).
  try (PrintWriter fout = new PrintWriter(outfile)) {
    for (Document doc; (doc = reader.getNextDocument()) != null; ) {
      corpusStats.process(doc);
      docCnt++;
      Annotation anno = doc.getAnnotation();
      if (debug) {
        System.out.println("Document " + docCnt + ": " + anno.get(CoreAnnotations.DocIDAnnotation.class));
      }
      for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
        if (debug) {
          System.out.println("Parse: " + sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
          System.out.println("Sentence Tokens: " + StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), ","));
        }
        writeTabSep(fout, sentence, doc.corefChainMap);
        sentCnt++;
        tokenCnt += sentence.get(CoreAnnotations.TokensAnnotation.class).size();
      }
      if (debug) {
        for (CoreMap ner : doc.nerChunks) {
          System.out.println("NER Chunk: " + ner);
        }
        for (String id : doc.corefChainMap.keySet()) {
          System.out.println("Coref: " + id + " = " + StringUtils.join(doc.corefChainMap.get(id), ";"));
        }
      }
    }
  }
  System.out.println("Total document count: " + docCnt);
  System.out.println("Total sentence count: " + sentCnt);
  System.out.println("Total token count: " + tokenCnt);
  System.out.println(corpusStats);
}
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp: the class RuleBasedCorefMentionFinder, method parse.
/**
 * Parses a single tokenized sentence under the given parser constraints.
 * The tokens are wrapped in a one-sentence document so the parser annotator
 * can be applied, and the resulting parse tree is extracted.
 *
 * @param tokens      the tokens of the sentence to parse
 * @param constraints parser constraints to impose on the parse
 * @return the parse tree produced for the sentence
 */
private Tree parse(List<CoreLabel> tokens, List<ParserConstraint> constraints) {
  // Build a one-sentence annotation carrying the tokens and the constraints.
  CoreMap sentence = new Annotation("");
  sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
  sentence.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
  List<CoreMap> sentenceList = new ArrayList<>(1);
  sentenceList.add(sentence);
  Annotation document = new Annotation("");
  document.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);
  // Run the parser annotator over the wrapper document.
  getParser().annotate(document);
  // Re-read the sentence list (the annotator may have replaced it) and
  // return the tree attached to the single sentence.
  List<CoreMap> annotated = document.get(CoreAnnotations.SentencesAnnotation.class);
  return annotated.get(0).get(TreeCoreAnnotations.TreeAnnotation.class);
}
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp: the class PhraseTable, method splitText.
/**
 * Splits a phrase into words, using the configured tokenizer when one is
 * available and otherwise falling back to regex-based splitting.
 *
 * @param phraseText the phrase to split
 * @return the words of the phrase, in order
 */
public String[] splitText(String phraseText) {
  if (tokenizer == null) {
    // Fallback: normalize possessives, then split on the delimiter pattern.
    String normalized = possPattern.matcher(phraseText).replaceAll(" 's$1");
    return delimPattern.split(normalized);
  }
  // Tokenize via the annotator pipeline and copy the token words out.
  Annotation annotation = new Annotation(phraseText);
  tokenizer.annotate(annotation);
  List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
  String[] words = new String[tokens.size()];
  int idx = 0;
  for (CoreLabel token : tokens) {
    words[idx++] = token.word();
  }
  return words;
}
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp: the class XMLToAnnotation, method readXMLFormat.
// Reads a gold-annotated XML file (chapters containing <quote> and <mention>
// elements), runs CoreNLP over its plain text, and aligns the XML spans to the
// pipeline's tokens and extracted quotes to build gold quote-attribution data.
// Throws if the number of pipeline-extracted quotes does not match the gold list.
public static Data readXMLFormat(String fileName) throws Exception {
// Extract character list, gold quote speaker and mention information from the XML document.
Document doc = XMLUtils.readDocumentFromFile(fileName);
Node text = doc.getDocumentElement().getElementsByTagName("text").item(0);
String docText = getJustText(text);
// Annotate the raw text so XML spans can be aligned to tokens and quotes below.
Annotation document = getAnnotatedFile(docText, fileName, getProcessedCoreNLPProperties());
List<CoreMap> quotes = document.get(CoreAnnotations.QuotationsAnnotation.class);
List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
// One GoldQuoteInfo per extracted quote, built at the end from mentionIdToSpeakerList.
List<GoldQuoteInfo> goldList = new ArrayList<>();
// XML mention id -> token-span Mention, filled while walking the document.
Map<Integer, Mention> idToMention = new HashMap<>();
List<Person> personList = readXMLCharacterList(doc);
Map<String, List<Person>> personMap = QuoteAttributionUtils.readPersonMap(personList);
// NOTE(review): personMap is computed but not referenced below — confirm whether it is needed.
// One (mention id, speaker name) pair per extracted quote, in quote order; mention id may be null.
List<Pair<Integer, String>> mentionIdToSpeakerList = new ArrayList<>();
// there is at least 1 case in which the XML quote does not match up with the automatically-extracted quote. (Ex: quote by Mr. Collins that begins, "Hunsford, near Westerham, Kent, ...")
// as the dirty solution, we treat all quotes encapsulated within an XML quote as the same speaker (although this is not 100% accurate!)
int quoteIndex = 0;
NodeList textElems = text.getChildNodes();
// tokenIndex tracks our current position in the pipeline token list as we walk
// the XML in document order; each span advances it past the span's last token.
int tokenIndex = 0;
for (int i = 0; i < textElems.getLength(); i++) {
Node chapterNode = textElems.item(i);
if (chapterNode.getNodeName().equals("chapter")) {
NodeList chapElems = chapterNode.getChildNodes();
for (int j = 0; j < chapElems.getLength(); j++) {
Node child = chapElems.item(j);
if (child.getNodeName().equals("quote")) {
// search for nested mentions
NodeList quoteChildren = child.getChildNodes();
for (int k = 0; k < quoteChildren.getLength(); k++) {
Node quoteChild = quoteChildren.item(k);
if (quoteChild.getNodeName().equals("mention")) {
String mentionText = quoteChild.getTextContent();
// XML ids look like "m123"; strip the leading letter to get the numeric id.
int id = Integer.parseInt(quoteChild.getAttributes().getNamedItem("id").getTextContent().substring(1));
List<Integer> connections = readConnection(quoteChild.getAttributes().getNamedItem("connection").getNodeValue());
int endIndex = getEndIndex(tokenIndex, tokens, mentionText);
// mentions.put(id, new XMLMention(quoteChild.getTextContent(), tokenIndex, endIndex, id, connections));
idToMention.put(id, new Mention(mentionText, tokenIndex, endIndex));
tokenIndex = endIndex + 1;
} else {
// Non-mention content inside the quote: just advance tokenIndex past it.
String quoteText = quoteChild.getTextContent();
// trim unnecessary newlines
quoteText = quoteText.replaceAll("\n(?!\n)", " ");
quoteText = quoteText.replaceAll("_", "");
tokenIndex = getEndIndex(tokenIndex, tokens, quoteText) + 1;
}
}
String quoteText = child.getTextContent();
// tokenIndex = getEndIndex(tokenIndex, tokens, quoteText) + 1;
// trim unnecessary newlines
quoteText = quoteText.replaceAll("\n(?!\n)", " ");
quoteText = quoteText.replaceAll("_", "");
// NOTE(review): quotationOffset is computed but only used by the commented-out
// code below — confirm whether it can be removed.
int quotationOffset = 1;
if (quoteText.startsWith("``"))
quotationOffset = 2;
List<Integer> connections = readConnection(child.getAttributes().getNamedItem("connection").getTextContent());
int id = Integer.parseInt(child.getAttributes().getNamedItem("id").getTextContent().substring(1));
// The first connection, when present, is taken as the speaker mention for this quote.
Integer mention_id = null;
if (connections.size() > 0)
mention_id = connections.get(0);
else {
System.out.println("quote w/ no mention. ID: " + id);
}
// Pair<Integer, Integer> mentionPair = idToMentionPair.get(mention_id);
mentionIdToSpeakerList.add(new Pair<>(mention_id, child.getAttributes().getNamedItem("speaker").getTextContent()));
String annotatedQuoteText = quotes.get(quoteIndex).get(CoreAnnotations.TextAnnotation.class);
// One XML quote may cover several pipeline-extracted quotes; attribute every
// extracted quote up to (and including) the one ending this XML quote to the
// same speaker. Relies on the XML quote text ending with the last extracted quote.
while (!quoteText.endsWith(annotatedQuoteText)) {
quoteIndex++;
annotatedQuoteText = quotes.get(quoteIndex).get(CoreAnnotations.TextAnnotation.class);
mentionIdToSpeakerList.add(new Pair<>(mention_id, child.getAttributes().getNamedItem("speaker").getTextContent()));
}
// idToMentionPair.put(id, new Pair<>(-1, -1));
// imention_id = connections.get(0);
// quotes.add(new XMLQuote(quoteText.substring(quotationOffset, quoteText.length() - quotationOffset), child.getAttributes().getNamedItem("speaker").getTextContent(), id, chapterIndex, mention_id));
quoteIndex++;
} else if (child.getNodeName().equals("mention")) {
// Top-level (non-quoted) mention: record its token span.
String mentionText = child.getTextContent();
int id = Integer.parseInt(child.getAttributes().getNamedItem("id").getTextContent().substring(1));
List<Integer> connections = readConnection(child.getAttributes().getNamedItem("connection").getNodeValue());
int endIndex = getEndIndex(tokenIndex, tokens, mentionText);
idToMention.put(id, new Mention(mentionText, tokenIndex, endIndex));
// mentions.put(id, new XMLMention(child.getTextContent(), tokenIndex, endIndex, id, connections));
tokenIndex = endIndex + 1;
} else {
// #text
String nodeText = child.getTextContent();
nodeText = nodeText.replaceAll("\n(?!\n)", " ");
nodeText = nodeText.replaceAll("_", "");
// All tokens consumed; nothing left to align this text against.
if (tokenIndex >= tokens.size()) {
continue;
}
tokenIndex = getEndIndex(tokenIndex, tokens, nodeText) + 1;
}
}
}
}
// Convert the per-quote (mention id, speaker) pairs into GoldQuoteInfo records;
// quotes with no resolvable mention get (-1, -1) spans and a null mention text.
for (Pair<Integer, String> item : mentionIdToSpeakerList) {
Mention mention = idToMention.get(item.first);
if (mention == null) {
goldList.add(new GoldQuoteInfo(-1, -1, item.second, null));
} else {
goldList.add(new GoldQuoteInfo(mention.begin, mention.end, item.second, mention.text));
}
}
// verify: every pipeline-extracted quote must have a gold entry
if (document.get(CoreAnnotations.QuotationsAnnotation.class).size() != goldList.size()) {
throw new RuntimeException("Quotes size and gold size don't match!");
}
return new Data(goldList, personList, document);
}
Aggregations