Use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
Example: method testNormalizationWithTrueExtraction in class TemporalNormalizerBenchmark.
/**
 * Normalize the dataset using real extraction: tokenize and POS-tag every
 * document, extract TIMEX3 chunks with the temporal chunker, and write one
 * .tml file per document into {@code outputFolder}. Prints timing statistics
 * and the total number of extracted TIMEX3 expressions when done.
 *
 * @param outputFolder directory the .tml output files are written to; created
 *        (including missing parent directories) if it does not exist
 * @param verbose if true, print each document id as it is processed
 * @throws Exception if preprocessing or extraction fails
 */
public void testNormalizationWithTrueExtraction(String outputFolder, boolean verbose) throws Exception {
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(false, false));
    System.out.println("Working Directory = " + System.getProperty("user.dir"));
    ResourceManager nerRm = new TemporalChunkerConfigurator().getDefaultConfig();
    IOUtilities.existsInClasspath(TemporalChunkerAnnotator.class, nerRm.getString("modelDirPath"));
    // Silence HeidelTime's console logging for the duration of the run.
    java.util.logging.Logger.getLogger("HeidelTimeStandalone").setLevel(Level.OFF);
    long preprocessTime = System.currentTimeMillis();
    List<TextAnnotation> taList = new ArrayList<>();
    POSAnnotator annotator = new POSAnnotator();
    for (int j = 0; j < te3inputText.size(); j++) {
        String text = testText.get(j);
        // Collapse newlines so the annotators see a single continuous text.
        text = text.replaceAll("\\n", " ");
        TextAnnotation ta = tab.createTextAnnotation("corpus", "id", text);
        try {
            annotator.getView(ta);
        } catch (AnnotatorException e) {
            fail("AnnotatorException thrown!\n" + e.getMessage());
        }
        taList.add(ta);
    }
    long startTime = System.currentTimeMillis();
    int numTimex = 0;
    File outDir = new File(outputFolder);
    if (!outDir.exists()) {
        // mkdirs (not mkdir) so a nested output path is created as a whole.
        outDir.mkdirs();
    }
    for (int j = 0; j < te3inputText.size(); j++) {
        TextAnnotation ta = taList.get(j);
        tca.addDocumentCreationTime(DCTs.get(j));
        if (verbose) {
            System.out.println(docIDs.get(j));
        }
        try {
            List<TimexChunk> timex = tca.extractTimexFromFile(te3inputText.get(j), testText.get(j), ta, verbose);
            tca.setTimex(timex);
            String outputFileName = outputFolder + "/" + docIDs.get(j) + ".tml";
            tca.write2Text(outputFileName, docIDs.get(j), testText.get(j));
            numTimex += timex.size();
            // Clear per-document state before the next iteration.
            tca.deleteTimex();
        } catch (AnnotatorException e) {
            // Concatenating e.getStackTrace() would only print an array reference;
            // report the exception itself (class + message) instead.
            fail("Exception while adding TIMEX3 VIEW " + e);
        }
    }
    long endTime = System.currentTimeMillis();
    long totalTime = endTime - startTime;
    System.out.println("Process time: " + totalTime);
    System.out.println("Preprocess + process time: " + (endTime - preprocessTime));
    System.out.println("Total timex3: " + numTimex);
}
Use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
Example: method testTemporalChunkerWithPlainText in class TestTemporalChunker.
/**
 * End-to-end check of the temporal chunker on plain text: tokenize the
 * document, run POS tagging, add the TIMEX3 view, and verify the label of the
 * first extracted temporal expression.
 */
@Test
public void testTemporalChunkerWithPlainText() throws Exception {
    String text = "The flu season is winding down, and it has killed 105 children so far - about the average toll.\n" + "\n" + "The season started about a month earlier than usual, sparking concerns it might turn into the worst in " + "a decade. It ended up being very hard on the elderly, but was moderately severe overall, according to " + "the Centers for Disease Control and Prevention.\n" + "\n" + "Six of the pediatric deaths were reported in the last week, and it's possible there will be more, said " + "the CDC's Dr. Michael Jhung said Friday.\n" + "\n" + "Roughly 100 children die in an average flu season. One exception was the swine flu pandemic of " + "2009-2010, when 348 children died.\n" + "\n" + "The CDC recommends that all children ages 6 months and older be vaccinated against flu each season, " + "though only about half get a flu shot or nasal spray.\n" + "\n" + "All but four of the children who died were old enough to be vaccinated, but 90 percent of them did " + "not get vaccinated, CDC officials said.\n" + "\n" + "This year's vaccine was considered effective in children, though it didn't work very well in older " + "people. And the dominant flu strain early in the season was one that tends to " + "cause more severe illness.\n" + "\n" + "The government only does a national flu death count for children. But it does track hospitalization " + "rates for people 65 and older, and those statistics have been grim.\n" + "\n" + "In that group, 177 out of every 100,000 were hospitalized with flu-related illness in the past " + "several months. That's more than 2 1/2 times higher than any other recent season.\n" + "\n" + "This flu season started in early December, a month earlier than usual, and peaked by the end " + "of year. Since then, flu reports have been dropping off throughout the country.\n" + "\n" + "\"We appear to be getting close to the end of flu season,\" Jhung said.";
    TextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation annotation = builder.createTextAnnotation("corpus", "id", text);
    POSAnnotator posAnnotator = new POSAnnotator();
    try {
        posAnnotator.getView(annotation);
    } catch (AnnotatorException e) {
        fail("AnnotatorException thrown!\n" + e.getMessage());
    }
    tca.addView(annotation);
    View timexView = annotation.getView(ViewNames.TIMEX3);
    List<Constituent> timexSpans = timexView.getConstituents();
    // The first TIMEX3 found in this text should be a one-month DURATION.
    assertEquals("<TIMEX3 type=\"DURATION\" value=\"P1M\">", timexSpans.get(0).getLabel());
}
Use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
Example: method main in class BulkTokenizer.
/**
 * Tokenizes every input file with both the new ({@code StatefulTokenizer}) and
 * the old ({@code IllinoisTokenizer}) pipeline, compares the resulting sentence
 * splits, reports timing, and writes any differences to {@code tokenizerdiffs.out}.
 * When the profile flag is set it instead loops forever tokenizing, providing a
 * steady workload for an external profiler.
 *
 * @param args the command line arguments; interpreted by {@code parseArgs}
 * @throws IOException if an input file cannot be read or the diff file written
 */
public static void main(String[] args) throws IOException {
    parseArgs(args);
    if (file == null) {
        System.err.println("Must provide a file or directory name on the command line.");
        return;
    }
    File[] files;
    File nf = new File(file);
    if (nf.isDirectory())
        // Fix: list the already-parsed path; args[0] may not be the file argument
        // when flags precede it on the command line.
        files = nf.listFiles();
    else {
        files = new File[1];
        files[0] = nf;
    }
    ArrayList<String> datas = readAllFiles(files);
    // try-with-resources: the diff report is flushed and closed even if a
    // tokenizer throws part-way through (the original leaked the writer).
    try (BufferedWriter fw = new BufferedWriter(new FileWriter(new File("tokenizerdiffs.out")))) {
        final TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        if (profile) {
            System.out.println("Starting profiling");
            // Intentional endless loop: keeps tokenizing for profiler attachment.
            while (true) {
                for (String data : datas) {
                    stab.createTextAnnotation(data);
                }
            }
        } else {
            System.out.println("Starting new annotations");
            long nt = System.currentTimeMillis();
            ArrayList<TextAnnotation> newannotations = new ArrayList<TextAnnotation>();
            final TextAnnotationBuilder ntab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            for (String data : datas) {
                newannotations.add(ntab.createTextAnnotation(data));
            }
            nt = System.currentTimeMillis() - nt;
            System.out.println("Starting old annotations");
            long ot = System.currentTimeMillis();
            ArrayList<TextAnnotation> oldannotations = new ArrayList<TextAnnotation>();
            final TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());
            for (String data : datas) {
                oldannotations.add(tab.createTextAnnotation(data));
            }
            ot = System.currentTimeMillis() - ot;
            System.out.println("new way = " + nt + ", old way = " + ot);
            int good = 0, bad = 0;
            for (int i = 0; i < oldannotations.size(); i++) {
                File file = files[i];
                TextAnnotation newone = newannotations.get(i);
                TextAnnotation oldone = oldannotations.get(i);
                if (newone.sentences().equals(oldone.sentences())) {
                    good++;
                } else {
                    bad++;
                    fw.write("-" + file + "\n");
                    if (verbose) {
                        List<Sentence> newsentences = newone.sentences();
                        List<Sentence> oldsentences = oldone.sentences();
                        // Walk to the longer of the two lists so extra sentences
                        // on either side are still reported.
                        int max = Math.max(newsentences.size(), oldsentences.size());
                        boolean sentencewritten = false;
                        for (int j = 0; j < max; j++) {
                            String news = newsentences.size() > j ? newsentences.get(j).toString() : "???";
                            String olds = oldsentences.size() > j ? oldsentences.get(j).toString() : "???";
                            if (!compareSentences(olds, news)) {
                                // Write the file header and full annotation only once
                                // per differing document.
                                if (!sentencewritten) {
                                    sentencewritten = true;
                                    fw.write("-" + file + "\n");
                                    fw.write(newone.toString() + "\n");
                                }
                                fw.write(" new : " + news + "\n old : " + olds + "\n");
                            }
                        }
                    }
                }
            }
            System.out.println(good + " correct, " + bad + " wrong.");
        }
    }
}
Use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
Example: method main in class TokenizerValidation.
/**
 * Validates the {@code StatefulTokenizer} against the legacy
 * {@code IllinoisTokenizer}: first a small smoke test on a sentence with tricky
 * numeric tokens, then a sweep over every English sentence in the database,
 * printing each tokenization mismatch and a final good/bad summary.
 *
 * @param args the command line arguments; interpreted by {@code parseArgs}
 * @throws SQLException if the sentence query, row retrieval, or close fails
 */
public static void main(String[] args) throws SQLException {
    parseArgs(args);
    // create both tokenizers
    TextAnnotationBuilder statefulBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotationBuilder ilBuilder = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());
    // Smoke test: print both tokenizations of a sentence with leading-dot numbers.
    String snt = "At .5 or 3.5 decibles.";
    TextAnnotation tr = statefulBuilder.createTextAnnotation("test1", "state", snt);
    List<Constituent> t = tr.getView(ViewNames.TOKENS).getConstituents();
    TextAnnotation tr2 = ilBuilder.createTextAnnotation("test1", "state", snt);
    List<Constituent> t2 = tr2.getView(ViewNames.TOKENS).getConstituents();
    for (Constituent ntt : t) System.out.println(ntt.getSurfaceForm());
    System.err.println();
    for (Constituent ntt : t2) System.out.println(ntt.getSurfaceForm());
    String sentencequery = "SELECT s.id,s.no_trace_string FROM sentence s, document d, subcorpus c \n" + " where s.document_id = d.id AND d.subcorpus_id = c.id AND c.language_id = 'en'";
    // issue the query, process one string at a time.
    int counter = 0;
    int bad = 0;
    // try-with-resources closes the statement and connection as well as the
    // result set (the original leaked the connection).
    try (Connection con = getConnection();
            ResultSet rs1 = con.createStatement().executeQuery(sentencequery)) {
        while (rs1.next()) {
            counter++;
            String sentence = rs1.getString(2);
            if (sentence.length() == 0)
                continue;
            // Undo the corpus's split-clitic spacing so both tokenizers see
            // naturally spelled text.
            sentence = sentence.replaceAll(" 's ", "'s ");
            sentence = sentence.replaceAll(" 'S ", "'S ");
            sentence = sentence.replaceAll(" 'm ", "'m ");
            sentence = sentence.replaceAll(" 're ", "'re ");
            sentence = sentence.replaceAll(" 'nt ", "'nt ");
            sentence = sentence.replaceAll(" 've ", "'ve ");
            sentence = sentence.replaceAll(" 'd ", "'d ");
            sentence = sentence.replaceAll(" 'll ", "'ll ");
            sentence = sentence.replaceAll(" do n't ", " don't ");
            TextAnnotation stateful = null;
            try {
                stateful = statefulBuilder.createTextAnnotation("test1", "state", sentence);
            } catch (ArrayIndexOutOfBoundsException aioobe) {
                System.err.println("Bad Sentence : " + sentence);
                System.exit(1);
            }
            TextAnnotation il = ilBuilder.createTextAnnotation("test2", "il", sentence);
            List<Constituent> statefulToks = stateful.getView(ViewNames.TOKENS).getConstituents();
            List<Constituent> ilToks = il.getView(ViewNames.TOKENS).getConstituents();
            // Walk both token lists in lock step; report the first mismatch, or a
            // length difference once one list is exhausted.
            int sidx = 0;
            int iidx = 0;
            while (true) {
                if (sidx < statefulToks.size() && iidx < ilToks.size()) {
                    String stok = statefulToks.get(sidx).getSurfaceForm();
                    String itok = ilToks.get(iidx).getSurfaceForm();
                    if (!stok.equals(itok)) {
                        System.out.println(sentence);
                        System.out.println("stateful:" + stok + " il:" + itok);
                        bad++;
                        break;
                    }
                } else {
                    if (statefulToks.size() != ilToks.size()) {
                        System.out.println(sentence);
                        System.out.println("stateful size:" + statefulToks.size() + " il size:" + ilToks.size());
                        bad++;
                    }
                    break;
                }
                sidx++;
                iidx++;
            }
        }
    }
    System.out.println("Done of " + counter + ", " + bad + " were bad.");
}
Use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
Example: method testRandomText in class ACERelationTester.
/**
 * Runs the full ACE relation-extraction pipeline (POS, shallow chunking,
 * Stanford dependency parsing, mention detection, relation annotation) over an
 * arbitrary piece of text and prints every relation that was found.
 */
public static void testRandomText(String text) {
    String corpusName = "";
    String documentId = "";
    TextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation annotation = taBuilder.createTextAnnotation(corpusName, documentId, text);
    try {
        // Stanford pipeline pieces, configured for original dependencies with
        // sentence-length and parse-time caps.
        Properties stanfordProps = new Properties();
        stanfordProps.put("annotators", "pos, parse");
        stanfordProps.put("parse.originalDependencies", true);
        stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
        stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
        POSTaggerAnnotator stanfordPos = new POSTaggerAnnotator("pos", stanfordProps);
        ParserAnnotator stanfordParse = new ParserAnnotator("parse", stanfordProps);
        StanfordDepHandler depHandler = new StanfordDepHandler(stanfordPos, stanfordParse);

        // CogComp annotators.
        POSAnnotator posAnnotator = new POSAnnotator();
        ChunkerAnnotator chunker = new ChunkerAnnotator(true);
        chunker.initialize(new ChunkerConfigurator().getDefaultConfig());
        MentionAnnotator mentionAnnotator = new MentionAnnotator("ACE_TYPE");
        RelationAnnotator relationAnnotator = new RelationAnnotator();

        // Add views in the same order as the original pipeline: POS, dependency
        // parse, chunks, mentions, then relations.
        annotation.addView(posAnnotator);
        depHandler.addView(annotation);
        chunker.addView(annotation);
        mentionAnnotator.addView(annotation);
        relationAnnotator.addView(annotation);

        for (Relation r : annotation.getView(ViewNames.RELATION).getRelations()) {
            IOHelper.printRelation(r);
        }
    } catch (Exception e) {
        // Best-effort tester: report the failure rather than propagate it.
        e.printStackTrace();
    }
}
Aggregations