Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class BulkTokenizer, method main.
/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    parseArgs(args);
    if (file == null) {
        System.err.println("Must provide a file or directory name on the command line.");
        return;
    }
    File[] files;
    File nf = new File(file);
    if (nf.isDirectory())
        files = nf.listFiles();
    else {
        files = new File[1];
        files[0] = nf;
    }
    ArrayList<String> datas = readAllFiles(files);
    BufferedWriter fw = new BufferedWriter(new FileWriter(new File("tokenizerdiffs.out")));
    final TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    if (profile) {
        System.out.println("Starting profiling");
        while (true) {
            for (String data : datas) {
                stab.createTextAnnotation(data);
            }
        }
    } else {
        System.out.println("Starting new annotations");
        long nt = System.currentTimeMillis();
        ArrayList<TextAnnotation> newannotations = new ArrayList<TextAnnotation>();
        final TextAnnotationBuilder ntab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        for (String data : datas) {
            TextAnnotation ta = ntab.createTextAnnotation(data);
            newannotations.add(ta);
        }
        nt = System.currentTimeMillis() - nt;
        System.out.println("Starting old annotations");
        long ot = System.currentTimeMillis();
        ArrayList<TextAnnotation> oldannotations = new ArrayList<TextAnnotation>();
        final TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());
        for (String data : datas) {
            TextAnnotation ta = tab.createTextAnnotation(data);
            oldannotations.add(ta);
        }
        ot = System.currentTimeMillis() - ot;
        System.out.println("new way = " + nt + ", old way = " + ot);
        int good = 0, bad = 0;
        for (int i = 0; i < oldannotations.size(); i++) {
            File file = files[i];
            TextAnnotation newone = newannotations.get(i);
            TextAnnotation oldone = oldannotations.get(i);
            if (newone.sentences().equals(oldone.sentences())) {
                good++;
            } else {
                bad++;
                fw.write("-" + file + "\n");
                if (verbose) {
                    List<Sentence> newsentences = newone.sentences();
                    List<Sentence> oldsentences = oldone.sentences();
                    int max = newsentences.size() > oldsentences.size() ? newsentences.size() : oldsentences.size();
                    boolean sentencewritten = false;
                    for (int j = 0; j < max; j++) {
                        String news = newsentences.size() > j ? newsentences.get(j).toString() : "???";
                        String olds = oldsentences.size() > j ? oldsentences.get(j).toString() : "???";
                        if (!compareSentences(olds, news)) {
                            if (!sentencewritten) {
                                sentencewritten = true;
                                fw.write("-" + file + "\n");
                                fw.write(newone.toString() + "\n");
                            }
                            fw.write(" new : " + news + "\n old : " + olds + "\n");
                        }
                    }
                }
            }
        }
        fw.close();
        System.out.println(good + " correct, " + bad + " wrong.");
    }
}
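All of these examples share the same entry point: wrap a StatefulTokenizer in a TokenizerTextAnnotationBuilder and call createTextAnnotation. Below is a minimal, self-contained sketch of just that step; the class name and sample sentence are invented, and the builder's package name is assumed from the cogcomp-nlp module layout.

import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;

public class StatefulTokenizerSketch {
    public static void main(String[] args) {
        // The builder handles sentence splitting and tokenization, and returns a
        // TextAnnotation with the sentence and token views populated.
        TextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta = builder.createTextAnnotation("corpus", "doc0",
                "John Smith visited Chicago. He returned on Friday.");
        System.out.println(ta.getTokens().length + " tokens in " + ta.sentences().size() + " sentences.");
    }
}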
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class ACERelationTester, method testRandomText.
public static void testRandomText(String text) {
    String corpus = "";
    String textId = "";
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);
    try {
        POSAnnotator pos_annotator = new POSAnnotator();
        ChunkerAnnotator chunker = new ChunkerAnnotator(true);
        chunker.initialize(new ChunkerConfigurator().getDefaultConfig());
        Properties stanfordProps = new Properties();
        stanfordProps.put("annotators", "pos, parse");
        stanfordProps.put("parse.originalDependencies", true);
        stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
        stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
        POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
        ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
        StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator);
        MentionAnnotator mentionAnnotator = new MentionAnnotator("ACE_TYPE");
        RelationAnnotator relationAnnotator = new RelationAnnotator();
        ta.addView(pos_annotator);
        stanfordDepHandler.addView(ta);
        chunker.addView(ta);
        mentionAnnotator.addView(ta);
        relationAnnotator.addView(ta);
        for (Relation r : ta.getView(ViewNames.RELATION).getRelations()) {
            IOHelper.printRelation(r);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
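IOHelper.printRelation is internal to this test harness; a hypothetical stand-in that relies only on the public Relation and Constituent API might look like the following (the class and method names below are made up).

import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation;

final class RelationPrinter {
    // Print one extracted relation as label(sourceMention, targetMention).
    static void print(Relation r) {
        Constituent source = r.getSource();
        Constituent target = r.getTarget();
        System.out.println(r.getRelationName() + "(" + source.getSurfaceForm() + ", " + target.getSurfaceForm() + ")");
    }
}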
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class Demo, method main.
public static void main(String[] args) throws IOException, AnnotatorException {
    Options options = new Options();
    Option inputtext = new Option("t", "text", true, "input text to be processed");
    inputtext.setRequired(false);
    options.addOption(inputtext);
    CommandLineParser parser = new DefaultParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
        CommandLine cmd = parser.parse(options, args);
        String defaultText = "The flu season is winding down, and it has killed 105 children so far - about the average toll.\n" + "\n"
                + "The season started about a month earlier than usual, sparking concerns it might turn into the worst in "
                + "a decade. It ended up being very hard on the elderly, but was moderately severe overall, according to "
                + "the Centers for Disease Control and Prevention.\n" + "\n"
                + "Six of the pediatric deaths were reported in the last week, and it's possible there will be more, said "
                + "the CDC's Dr. Michael Jhung said Friday.\n" + "\n"
                + "Roughly 100 children die in an average flu season. One exception was the swine flu pandemic of "
                + "2009-2010, when 348 children died.\n" + "\n"
                + "The CDC recommends that all children ages 6 months and older be vaccinated against flu each season, "
                + "though only about half get a flu shot or nasal spray.\n" + "\n"
                + "All but four of the children who died were old enough to be vaccinated, but 90 percent of them did "
                + "not get vaccinated, CDC officials said.\n" + "\n"
                + "This year's vaccine was considered effective in children, though it didn't work very well in older "
                + "people. And the dominant flu strain early in the season was one that tends to "
                + "cause more severe illness.\n" + "\n"
                + "The government only does a national flu death count for children. But it does track hospitalization "
                + "rates for people 65 and older, and those statistics have been grim.\n" + "\n"
                + "In that group, 177 out of every 100,000 were hospitalized with flu-related illness in the past "
                + "several months. That's more than 2 1/2 times higher than any other recent season.\n" + "\n"
                + "This flu season started in early December, a month earlier than usual, and peaked by the end "
                + "of year. Since then, flu reports have been dropping off throughout the country.\n" + "\n"
                + "\"We appear to be getting close to the end of flu season,\" Jhung said.";
        String text = cmd.getOptionValue("text", defaultText);
        TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta = tab.createTextAnnotation("corpus", "id", text);
        POSAnnotator annotator = new POSAnnotator();
        try {
            annotator.getView(ta);
        } catch (AnnotatorException e) {
            fail("AnnotatorException thrown!\n" + e.getMessage());
        }
        Properties rmProps = new TemporalChunkerConfigurator().getDefaultConfig().getProperties();
        rmProps.setProperty("useHeidelTime", "False");
        TemporalChunkerAnnotator tca = new TemporalChunkerAnnotator(new ResourceManager(rmProps));
        tca.addView(ta);
        View temporalViews = ta.getView(ViewNames.TIMEX3);
        List<Constituent> constituents = temporalViews.getConstituents();
        System.out.printf("There're %d time expressions (TIMEX) in total.\n", constituents.size());
        for (Constituent c : constituents) {
            System.out.printf("TIMEX #%d: Text=%s, Type=%s, Value=%s\n", constituents.indexOf(c), c, c.getAttribute("type"), c.getAttribute("value"));
        }
    } catch (ParseException e) {
        System.out.println(e.getMessage());
        formatter.printHelp("Temporal Normalizer Demo", options);
        System.exit(1);
    }
}
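Beyond the type and value attributes printed above, each TIMEX3 constituent also exposes its token span and character offsets. The loop below is a short hedged continuation of the snippet, reusing the ta it builds:

// Continues from the snippet above: report where each time expression sits in the text.
for (Constituent c : ta.getView(ViewNames.TIMEX3).getConstituents()) {
    System.out.printf("%s tokens [%d, %d) chars [%d, %d) type=%s value=%s%n",
            c.getSurfaceForm(), c.getStartSpan(), c.getEndSpan(),
            c.getStartCharOffset(), c.getEndCharOffset(),
            c.getAttribute("type"), c.getAttribute("value"));
}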
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class SemEvalMentionReader, method readTrainFile.
public List<TextAnnotation> readTrainFile(String fileName, String mode) {
    List<String> sentences = new ArrayList<>();
    List<String> types = new ArrayList<>();
    List<TextAnnotation> ret = new ArrayList<>();
    int counter = 0;
    if (mode.equals("TRAIN")) {
        try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (counter % 4 == 0) {
                    String curSentence = line.split("\t")[1];
                    if (curSentence.charAt(0) == '"') {
                        curSentence = curSentence.substring(1);
                    }
                    if (curSentence.charAt(curSentence.length() - 1) == '"') {
                        curSentence = curSentence.substring(0, curSentence.length() - 1);
                    }
                    sentences.add(curSentence);
                }
                if (counter % 4 == 1) {
                    types.add(line);
                }
                counter++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    if (mode.equals("TEST")) {
        try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = br.readLine()) != null) {
                String curSentence = line.split("\t")[1];
                if (curSentence.charAt(0) == '"') {
                    curSentence = curSentence.substring(1);
                }
                if (curSentence.charAt(curSentence.length() - 1) == '"') {
                    curSentence = curSentence.substring(0, curSentence.length() - 1);
                }
                sentences.add(curSentence);
                types.add("UNKNOWN");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    for (int i = 0; i < sentences.size(); i++) {
        List<String[]> tokens = new ArrayList<>();
        String sentence = sentences.get(i);
        String type = types.get(i);
        Pair<String[], IntPair[]> tokenizedSentence = statefulTokenizer.tokenizeSentence(sentence);
        List<String> curTokens = new LinkedList<>(Arrays.asList(tokenizedSentence.getFirst()));
        int firstArgStart = 0;
        int firstArgEnd = 0;
        int secondArgStart = 0;
        int secondArgEnd = 0;
        for (int j = 0; j < curTokens.size(); j++) {
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("e1") && curTokens.get(j + 2).equals(">")) {
                firstArgStart = j;
                for (int k = j; k < j + 3; k++) {
                    curTokens.remove(j);
                }
            }
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("/") && curTokens.get(j + 2).equals("e1") && curTokens.get(j + 3).equals(">")) {
                firstArgEnd = j;
                for (int k = j; k < j + 4; k++) {
                    curTokens.remove(j);
                }
            }
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("e2") && curTokens.get(j + 2).equals(">")) {
                secondArgStart = j;
                for (int k = j; k < j + 3; k++) {
                    curTokens.remove(j);
                }
            }
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("/") && curTokens.get(j + 2).equals("e2") && curTokens.get(j + 3).equals(">")) {
                secondArgEnd = j;
                for (int k = j; k < j + 4; k++) {
                    curTokens.remove(j);
                }
            }
        }
        tokens.add(curTokens.toArray(new String[curTokens.size()]));
        TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(tokens);
        try {
            ta.addView(_posAnnotator);
            __chunker.addView(ta);
            __stanfordDep.addView(ta);
            __mentionAnnotator.addView(ta);
            View annotatedTokenView = new SpanLabelView("RE_ANNOTATED", ta);
            for (Constituent co : ta.getView(ViewNames.TOKENS).getConstituents()) {
                Constituent c = co.cloneForNewView("RE_ANNOTATED");
                for (String s : co.getAttributeKeys()) {
                    c.addAttribute(s, co.getAttribute(s));
                }
                c.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(_wordnet, c));
                c.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(_wordnet, c));
                annotatedTokenView.addConstituent(c);
            }
            ta.addView("RE_ANNOTATED", annotatedTokenView);
        } catch (Exception e) {
            e.printStackTrace();
        }
        SpanLabelView mentionView = new SpanLabelView("MENTIONS", "MENTIONS", ta, 1.0f);
        Constituent firstArg = new Constituent("MENTION", 1.0f, "MENTIONS", ta, firstArgStart, firstArgEnd);
        Constituent secondArg = new Constituent("MENTION", 1.0f, "MENTIONS", ta, secondArgStart, secondArgEnd);
        firstArg.addAttribute("GAZ", _gazetteers.annotatePhrase(firstArg));
        secondArg.addAttribute("GAZ", _gazetteers.annotatePhrase(secondArg));
        View annotatedMentionView = ta.getView(ViewNames.MENTION);
        List<Constituent> firstMentions = annotatedMentionView.getConstituentsCoveringToken(firstArg.getStartSpan());
        List<Constituent> secondMentions = annotatedMentionView.getConstituentsCoveringToken(secondArg.getStartSpan());
        if (firstMentions.size() == 0) {
            firstArg.addAttribute("EntityType", "UNKNOWN");
        } else {
            firstArg.addAttribute("EntityType", firstMentions.get(0).getAttribute("EntityType"));
        }
        if (secondMentions.size() == 0) {
            secondArg.addAttribute("EntityType", "UNKNOWN");
        } else {
            secondArg.addAttribute("EntityType", secondMentions.get(0).getAttribute("EntityType"));
        }
        mentionView.addConstituent(firstArg);
        mentionView.addConstituent(secondArg);
        if (type.contains("e1,e2")) {
            Relation relation = new Relation(type.split("[(]")[0], firstArg, secondArg, 1.0f);
            relation.addAttribute("RelationSubtype", relation.getRelationName());
            mentionView.addRelation(relation);
        } else if (type.contains("e2,e1")) {
            Relation relation = new Relation(type.split("[(]")[0], secondArg, firstArg, 1.0f);
            relation.addAttribute("RelationSubtype", relation.getRelationName());
            mentionView.addRelation(relation);
        } else {
            Relation relationLeft = new Relation(type, secondArg, firstArg, 1.0f);
            Relation relationRight = new Relation(type, firstArg, secondArg, 1.0f);
            relationLeft.addAttribute("RelationSubtype", relationLeft.getRelationName());
            relationRight.addAttribute("RelationSubtype", relationRight.getRelationName());
            mentionView.addRelation(relationLeft);
            mentionView.addRelation(relationRight);
        }
        ta.addView("MENTIONS", mentionView);
        ret.add(ta);
    }
    return ret;
}
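The reader above keeps only the token strings from tokenizeSentence and discards the second half of the returned Pair, which carries each token's character offsets. Below is a minimal sketch of reading both halves; the class name and sample sentence are invented:

import edu.illinois.cs.cogcomp.core.datastructures.IntPair;
import edu.illinois.cs.cogcomp.core.datastructures.Pair;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;

public class TokenizeSentenceSketch {
    public static void main(String[] args) {
        StatefulTokenizer tokenizer = new StatefulTokenizer();
        // First element: surface tokens; second element: [start, end) character offsets.
        Pair<String[], IntPair[]> result = tokenizer.tokenizeSentence("The <e1> lawsuit </e1> was dismissed.");
        String[] tokenStrings = result.getFirst();
        IntPair[] offsets = result.getSecond();
        for (int i = 0; i < tokenStrings.length; i++) {
            System.out.println(tokenStrings[i] + " -> chars [" + offsets[i].getFirst() + ", " + offsets[i].getSecond() + ")");
        }
    }
}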
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class NerOntonotesTest, method testOntonotesNer.
@Test
public void testOntonotesNer() {
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    Properties props = new Properties();
    NERAnnotator nerOntonotes = NerAnnotatorManager.buildNerAnnotator(new ResourceManager(props), ViewNames.NER_ONTONOTES);
    TextAnnotation taOnto = tab.createTextAnnotation("", "", TEST_INPUT);
    try {
        nerOntonotes.getView(taOnto);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    View v = taOnto.getView(nerOntonotes.getViewName());
    assertEquals(3, v.getConstituents().size());
}