Use of edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer in project cogcomp-nlp by CogComp.
In class TestWriteSVMLightFormat, method runBeforeAllTests:
/**
 * One-time fixture setup, annotated to run before the tests of this class.
 *
 * <p>Tokenizes two fixed sample texts with an {@code IllinoisTokenizer}-backed builder and
 * collects, for every token position of each text, the features returned by the 1000-cluster
 * Brown cluster extractor into {@code feats} and {@code feats2} respectively. Any
 * {@code EdisonException} raised during extraction is printed and reported through
 * {@code fail}.
 */
@BeforeClass
public static void runBeforeAllTests() {
    // "brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
    // (presumably the resource behind instance1000 -- confirm against the extractor)
    bcfex = BrownClusterFeatureExtractor.instance1000;
    taBldr = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());

    final String firstText = "This test sentence has Joynt and Lieberknecht and Fibonnaci in it "
            + "just to exercise possible brown cluster hits in resources used by NER.";
    final String secondText = "Why Joynt should have anything to do beyond JFK and Jimmy Carter "
            + "is beyond your oh-so-humble British writer.";
    ta = taBldr.createTextAnnotation("test", "test", firstText);
    ta2 = taBldr.createTextAnnotation("test", "test2", secondText);

    // Features for every token position of the first text.
    feats = new ArrayList<>();
    for (int tokenId = 0; tokenId < ta.size(); tokenId++) {
        try {
            feats.addAll(bcfex.getWordFeatures(ta, tokenId));
        } catch (EdisonException ex) {
            ex.printStackTrace();
            fail(ex.getMessage());
        }
    }

    // Features for every token position of the second text.
    feats2 = new ArrayList<>();
    for (int tokenId = 0; tokenId < ta2.size(); tokenId++) {
        try {
            feats2.addAll(bcfex.getWordFeatures(ta2, tokenId));
        } catch (EdisonException ex) {
            ex.printStackTrace();
            fail(ex.getMessage());
        }
    }
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer in project cogcomp-nlp by CogComp.
In class BulkTokenizer, method main:
/**
 * Command-line entry point that runs the {@code StatefulTokenizer} and the
 * {@code IllinoisTokenizer} over the same inputs and reports where their sentence splits
 * differ.
 *
 * <p>After {@code parseArgs(args)} the method reads the {@code file}, {@code profile} and
 * {@code verbose} fields (presumably populated by {@code parseArgs}; not visible here).
 * {@code file} may name a single file or a directory, in which case every entry of the
 * directory is processed.
 *
 * <ul>
 * <li>In profile mode the inputs are annotated with the stateful tokenizer in an endless
 * loop; the method never returns and no output file is touched.</li>
 * <li>Otherwise both tokenizers annotate every input, the elapsed times are printed, and each
 * input whose sentence lists differ is recorded in {@code tokenizerdiffs.out} (with
 * per-sentence detail when {@code verbose} is set).</li>
 * </ul>
 *
 * @param args command-line arguments, forwarded to {@code parseArgs}
 * @throws IOException if reading the inputs or writing {@code tokenizerdiffs.out} fails
 */
public static void main(String[] args) throws IOException {
    parseArgs(args);
    if (file == null) {
        System.err.println("Must provide a file or directory name on the command line.");
        return;
    }

    final File nf = new File(file);
    final File[] files;
    if (nf.isDirectory()) {
        // List the directory that was actually parsed into `file`; args[0] is not
        // guaranteed to be the path (e.g. when an option precedes it).
        files = nf.listFiles();
        if (files == null) {
            // File.listFiles() returns null when the directory cannot be read.
            System.err.println("Unable to list the contents of directory " + nf + ".");
            return;
        }
    } else {
        files = new File[] {nf};
    }
    final List<String> datas = readAllFiles(files);

    if (profile) {
        final TextAnnotationBuilder stab =
                new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        System.out.println("Starting profiling");
        // Intentionally endless: keeps the tokenizer busy until the process is killed.
        while (true) {
            for (String data : datas) {
                stab.createTextAnnotation(data);
            }
        }
    } else {
        int good = 0;
        int bad = 0;
        // The writer is only needed (and only opened) in comparison mode, and is closed even
        // if annotation or writing throws.
        try (BufferedWriter fw =
                new BufferedWriter(new FileWriter(new File("tokenizerdiffs.out")))) {
            System.out.println("Starting new annotations");
            long nt = System.currentTimeMillis();
            final List<TextAnnotation> newannotations = new ArrayList<TextAnnotation>();
            final TextAnnotationBuilder ntab =
                    new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            for (String data : datas) {
                newannotations.add(ntab.createTextAnnotation(data));
            }
            nt = System.currentTimeMillis() - nt;

            System.out.println("Starting old annotations");
            long ot = System.currentTimeMillis();
            final List<TextAnnotation> oldannotations = new ArrayList<TextAnnotation>();
            final TextAnnotationBuilder tab =
                    new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());
            for (String data : datas) {
                oldannotations.add(tab.createTextAnnotation(data));
            }
            ot = System.currentTimeMillis() - ot;
            System.out.println("new way = " + nt + ", old way = " + ot);

            // NOTE(review): files[i] is paired with the i-th annotation, which assumes
            // readAllFiles returns exactly one entry per file, in order -- confirm.
            for (int i = 0; i < oldannotations.size(); i++) {
                final File source = files[i];
                final TextAnnotation newone = newannotations.get(i);
                final TextAnnotation oldone = oldannotations.get(i);
                if (newone.sentences().equals(oldone.sentences())) {
                    good++;
                    continue;
                }

                bad++;
                fw.write("-" + source + "\n");
                if (!verbose) {
                    continue;
                }

                final List<Sentence> newsentences = newone.sentences();
                final List<Sentence> oldsentences = oldone.sentences();
                final int max = Math.max(newsentences.size(), oldsentences.size());
                boolean sentencewritten = false;
                for (int j = 0; j < max; j++) {
                    // "???" stands in for a sentence missing on the shorter side.
                    final String news =
                            newsentences.size() > j ? newsentences.get(j).toString() : "???";
                    final String olds =
                            oldsentences.size() > j ? oldsentences.get(j).toString() : "???";
                    if (!compareSentences(olds, news)) {
                        if (!sentencewritten) {
                            sentencewritten = true;
                            // NOTE(review): this repeats the "-file" header already written
                            // above for this input; kept as-is -- confirm it is intended.
                            fw.write("-" + source + "\n");
                            fw.write(newone.toString() + "\n");
                        }
                        fw.write(" new : " + news + "\n old : " + olds + "\n");
                    }
                }
            }
        }
        System.out.println(good + " correct, " + bad + " wrong.");
    }
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer in project cogcomp-nlp by CogComp.
In class LlmStringComparator, method initialize:
/**
 * Sets up the fields this comparator works with: tokenizer, comparator, word-list filter,
 * the two aligners and the alignment scorer.
 *
 * <p>The supplied resource manager is first passed through {@code SimConfigurator.getConfig},
 * and the resulting configuration provides both the entailment threshold for the scorer and
 * the settings for the {@code WordListFilter}.
 *
 * @param rm_ resource manager handed to {@code SimConfigurator.getConfig}
 * @param comparator comparator stored on this instance and used by the main aligner
 * @throws IOException declared by this method; which of the calls below raises it is not
 *         visible from here
 */
private void initialize(ResourceManager rm_, Comparator<String, EntailmentResult> comparator) throws IOException {
    final ResourceManager config = new SimConfigurator().getConfig(rm_);
    final double entailmentThreshold =
            config.getDouble(SimConfigurator.LLM_ENTAILMENT_THRESHOLD.key);

    this.tokenizer = new IllinoisTokenizer();
    this.comparator = comparator;
    this.filter = new WordListFilter(config);

    // Both aligners share the same filter; only the comparator differs.
    this.neAligner = new Aligner<String, EntailmentResult>(new NEComparator(), this.filter);
    this.aligner = new Aligner<String, EntailmentResult>(comparator, this.filter);

    this.scorer = new GreedyAlignmentScorer<String>(entailmentThreshold);
}
Aggregations