Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class DependencyParser, method genDictionaries.
/**
* Scan a corpus and store all words, part-of-speech tags, and
* dependency relation labels observed. Prepare other structures
* which support word / POS / label lookup at train- / run-time.
*/
private void genDictionaries(List<CoreMap> sents, List<DependencyTree> trees) {
  // Collect all words, POS tags, and labels in lists, tacking on one
  // sentence after the other
  List<String> word = new ArrayList<>();
  List<String> pos = new ArrayList<>();
  List<String> label = new ArrayList<>();
  for (CoreMap sentence : sents) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {
      word.add(token.word());
      pos.add(token.tag());
    }
  }
  // A token whose head index is 0 attaches to the root; record its label
  // separately from the other relation labels
  String rootLabel = null;
  for (DependencyTree tree : trees) {
    for (int k = 1; k <= tree.n; ++k) {
      if (tree.getHead(k) == 0)
        rootLabel = tree.getLabel(k);
      else
        label.add(tree.getLabel(k));
    }
  }
  // Generate "dictionaries," possibly with frequency cutoff
  knownWords = Util.generateDict(word, config.wordCutOff);
  knownPos = Util.generateDict(pos);
  knownLabels = Util.generateDict(label);
  knownLabels.add(0, rootLabel);
  // Avoid the case where rootLabel equals one of the other labels
  for (int k = 1; k < knownLabels.size(); ++k) {
    if (knownLabels.get(k).equals(rootLabel)) {
      knownLabels.remove(k);
      break;
    }
  }
  knownWords.add(0, Config.UNKNOWN);
  knownWords.add(1, Config.NULL);
  knownWords.add(2, Config.ROOT);
  knownPos.add(0, Config.UNKNOWN);
  knownPos.add(1, Config.NULL);
  knownPos.add(2, Config.ROOT);
  knownLabels.add(0, Config.NULL);
  generateIDs();
  log.info(Config.SEPARATOR);
  log.info("#Word: " + knownWords.size());
  log.info("#POS: " + knownPos.size());
  log.info("#Label: " + knownLabels.size());
}
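The generateDict calls above are where the frequency cutoff is applied, but Util.generateDict itself is not shown in this excerpt. As a minimal sketch of what such a frequency-cutoff dictionary builder could look like (an illustration only, not the actual CoreNLP implementation; the name generateDictSketch and the counting strategy are assumptions):

// Illustrative sketch, not CoreNLP's Util.generateDict: keep each distinct
// string whose corpus frequency is at least `cutoff`.
// Requires java.util.{ArrayList, HashMap, List, Map}.
static List<String> generateDictSketch(List<String> items, int cutoff) {
  Map<String, Integer> counts = new HashMap<>();
  for (String item : items)
    counts.merge(item, 1, Integer::sum);  // count occurrences
  List<String> dict = new ArrayList<>();
  for (Map.Entry<String, Integer> e : counts.entrySet())
    if (e.getValue() >= cutoff)           // apply the cutoff
      dict.add(e.getKey());
  return dict;
}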
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class Util, method writeConllFile.
public static void writeConllFile(String outFile, List<CoreMap> sentences, List<DependencyTree> trees) {
  // try-with-resources ensures the writer is closed even if writing fails
  try (PrintWriter output = IOUtils.getPrintWriter(outFile)) {
    for (int i = 0; i < sentences.size(); i++) {
      CoreMap sentence = sentences.get(i);
      DependencyTree tree = trees.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 1, size = tokens.size(); j <= size; ++j) {
        CoreLabel token = tokens.get(j - 1);
        // One CoNLL-X row per token; unused columns are written as "_"
        output.printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n",
            j, token.word(), token.tag(), token.tag(), tree.getHead(j), tree.getLabel(j));
      }
      // A blank line terminates each sentence
      output.println();
    }
  } catch (Exception e) {
    throw new RuntimeIOException(e);
  }
}
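For reference, each printf above emits one ten-column CoNLL-X row (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL), with underscores for the columns this writer leaves unfilled. A hypothetical two-word sentence would serialize roughly like this (tags, heads, and labels are illustrative):

1	Dogs	_	NNS	NNS	_	2	nsubj	_	_
2	bark	_	VBP	VBP	_	0	root	_	_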
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class DependencyParserCoreNLPDemo, method main.
public static void main(String[] args) {
  String text;
  if (args.length > 0) {
    text = IOUtils.slurpFileNoExceptions(args[0], "utf-8");
  } else {
    text = "I can almost always tell when movies use fake dinosaurs.";
  }
  Annotation ann = new Annotation(text);
  Properties props = PropertiesUtils.asProperties(
      "annotators", "tokenize,ssplit,pos,depparse",
      "depparse.model", DependencyParser.DEFAULT_MODEL);
  AnnotationPipeline pipeline = new StanfordCoreNLP(props);
  pipeline.annotate(ann);
  for (CoreMap sent : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
    SemanticGraph sg = sent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    log.info(IOUtils.eolChar + sg.toString(SemanticGraph.OutputFormat.LIST));
  }
}
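If you need programmatic access to the parse rather than the printed list, the SemanticGraph can be traversed edge by edge. A minimal sketch (this loop is an illustration appended to the demo, not part of the original; it assumes an import of edu.stanford.nlp.semgraph.SemanticGraphEdge):

// Print each dependency as "relation(governor, dependent)"
for (SemanticGraphEdge edge : sg.edgeIterable()) {
  log.info(edge.getRelation() + "(" + edge.getGovernor().word()
      + ", " + edge.getDependent().word() + ")");
}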
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class NumberNormalizerITest, method testNumbers.
public void testNumbers() throws IOException {
  // Set up test text
  String testText = "two dozen\n" + "six hundred,\n" + "four hundred, and twelve.\n" +
      "4 million six hundred fifty thousand, two hundred and eleven.\n" +
      "6 hundred billion, five million six hundred fifty thousand, three hundred and seventy six\n" +
      "5,786,345\n" + "twenty-five.\n" +
      // "one and half million\n" +
      "1.3 million.\n" + "one thousand two hundred and twenty four\n" + "10 thousand million.\n" +
      "3.625\n" + "zero\n" + "-15\n" + "one two three four.\n" +
      "one hundred and fifty five\n" + "a hundred\n";
  // Set up expected results
  Iterator<? extends Number> expectedNumbers = Arrays.asList(
      24.0, 600.0, 412.0, 4650211.0, 600005650376.0, 5786345, 25.0, 1300000.0, 1224.0,
      10000000000.0, 3.625, 0, -15.0, 1, 2, 3, 4, 155.0, 100).iterator();
  Iterator<String> expectedTexts = Arrays.asList(
      "two dozen", "six hundred", "four hundred, and twelve",
      "4 million six hundred fifty thousand, two hundred and eleven",
      "6 hundred billion, five million six hundred fifty thousand, three hundred and seventy six",
      "5,786,345", "twenty-five",
      // "one and half million",
      "1.3 million", "one thousand two hundred and twenty four", "10 thousand million",
      "3.625", "zero", "-15", "one", "two", "three", "four",
      "one hundred and fifty five", "hundred").iterator();
  // Create document
  Annotation document = createDocument(testText);
  // Annotate numbers
  NumberNormalizer.findAndAnnotateNumericExpressions(document);
  // Check answers
  for (CoreMap num : document.get(CoreAnnotations.NumerizedTokensAnnotation.class)) {
    if (num.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class)) {
      Number expectedNumber = expectedNumbers.next();
      String expectedType = "NUMBER";
      String expectedText = expectedTexts.next();
      String text = document.get(CoreAnnotations.TextAnnotation.class).substring(
          num.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
          num.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
      assertEquals(expectedText, text);
      assertEquals(expectedType, num.get(CoreAnnotations.NumericCompositeTypeAnnotation.class));
      assertEquals(expectedNumber.toString(),
          num.get(CoreAnnotations.NumericCompositeValueAnnotation.class).toString());
    }
  }
  assertFalse(expectedNumbers.hasNext());
}
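The createDocument helper is referenced but not shown in this excerpt. A plausible minimal version (a sketch; the exact annotator configuration of the real test is an assumption, though NumberNormalizer needs at least tokens, sentence splits, and POS tags):

private static Annotation createDocument(String text) {
  Annotation annotation = new Annotation(text);
  // tokenize/ssplit/pos are assumed; the real helper may configure more
  Properties props = PropertiesUtils.asProperties("annotators", "tokenize,ssplit,pos");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  pipeline.annotate(annotation);
  return annotation;
}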
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class NumberNormalizerITest, method testOrdinals.
public void testOrdinals() throws IOException {
  // Set up test text
  String testText = "0th, 1st, 2nd, 3rd, 4th, 5th, 6th, 7th, 8th, 9th, 10th\n" +
      "zeroth, first, second, third, fourth, fifth, sixth, seventh, eighth, ninth, tenth\n" +
      "11th, 12th, 13th, 14th, 15th, 16th, 17th, 18th, 19th, 20th\n" +
      "Eleventh, twelfth, thirteenth, Fourteenth, fifteenth, Sixteenth, seventeenth, eighteenth, nineteenth, twentieth\n" +
      "Twenty-first, twenty first, twenty second, twenty third, twenty fourth\n" +
      "thirtieth, thirty first, thirty-second," +
      "fortieth, one hundredth, two hundredth, one hundred and fifty first, one hundred fifty first";
  // TODO: Fix consistency of number representation
  // Set up expected results
  Iterator<? extends Number> expectedNumbers = Arrays.asList(
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
      11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
      11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
      21.0, 21.0, 22.0, 23.0, 24.0,
      30, 31.0, 32.0, 40, 100.0, 200.0, 151.0, 151.0).iterator();
  // Expected surface strings are the comma/newline-separated chunks of the input
  Iterator<String> expectedTexts = Arrays.asList(testText.split("\\s*[,\\n]+\\s*")).iterator();
  // Create document
  Annotation document = createDocument(testText);
  // Annotate numbers
  NumberNormalizer.findAndAnnotateNumericExpressions(document);
  // Check answers
  for (CoreMap num : document.get(CoreAnnotations.NumerizedTokensAnnotation.class)) {
    if (num.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class)) {
      Number expectedNumber = expectedNumbers.next();
      String expectedType = "ORDINAL";
      String expectedText = expectedTexts.next();
      String text = document.get(CoreAnnotations.TextAnnotation.class).substring(
          num.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
          num.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
      assertEquals(expectedText, text);
      assertEquals("Type for " + expectedText, expectedType,
          num.get(CoreAnnotations.NumericCompositeTypeAnnotation.class));
      assertEquals(expectedNumber.toString(),
          num.get(CoreAnnotations.NumericCompositeValueAnnotation.class).toString());
    }
  }
  assertFalse(expectedNumbers.hasNext());
}
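Note how this test derives its expected surface strings directly from the input: the regex \s*[,\n]+\s* splits the test text on runs of commas and newlines plus any surrounding whitespace, so each ordinal expression becomes one expected string. A small standalone illustration:

// "thirtieth, thirty first,\nfortieth" splits into three expressions
String[] parts = "thirtieth, thirty first,\nfortieth".split("\\s*[,\\n]+\\s*");
// parts == { "thirtieth", "thirty first", "fortieth" }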