Usage example of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: the method tokenizeSentence of the class ThaiTokenizer.
/**
 * Tokenizes a Thai sentence into tokens and their character offsets, delegating word
 * segmentation to the JDK's dictionary-based {@link BreakIterator} for the Thai locale
 * (Thai text has no inter-word spaces).
 *
 * @param text The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(text);
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String sur = text.substring(start, end);
        // Whitespace-only segments are token boundaries, not tokens.
        if (sur.trim().isEmpty())
            continue;
        surfaces.add(sur);
        offsets.add(new IntPair(start, end));
    }
    return new Pair<>(surfaces.toArray(new String[surfaces.size()]),
            offsets.toArray(new IntPair[offsets.size()]));
}
Usage example of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: the method tokenizeSentence of the class WhiteSpaceTokenizer.
/**
 * Tokenizes a sentence on whitespace, additionally splitting off a small set of
 * punctuation characters (, " ' ( ) ; :) as single-character tokens.
 *
 * @param sentence The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    // Punctuation characters that are always emitted as their own one-character tokens.
    final String splitPunctuation = ",\"'();:";
    StringBuilder token = new StringBuilder(); // current in-progress token
    int tokenStart = -1; // start offset of the current token (valid while token is non-empty)
    int i;
    for (i = 0; i < sentence.length(); i++) {
        char c = sentence.charAt(i);
        if (c <= ' ') { // same character class String.trim() strips (code points <= U+0020)
            if (token.length() > 0) {
                surfaces.add(token.toString());
                offsets.add(new IntPair(tokenStart, i));
                token.setLength(0);
            }
        } else if (splitPunctuation.indexOf(c) >= 0) {
            // Flush any pending token, then emit the punctuation character itself.
            if (token.length() > 0) {
                surfaces.add(token.toString());
                offsets.add(new IntPair(tokenStart, i));
            }
            surfaces.add(String.valueOf(c));
            offsets.add(new IntPair(i, i + 1));
            token.setLength(0);
        } else {
            if (token.length() == 0)
                tokenStart = i;
            token.append(c);
        }
    }
    // Flush a trailing token that runs to the end of the sentence.
    if (token.length() > 0) {
        surfaces.add(token.toString());
        offsets.add(new IntPair(tokenStart, i));
    }
    return new Pair<>(surfaces.toArray(new String[surfaces.size()]),
            offsets.toArray(new IntPair[offsets.size()]));
}
Usage example of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: the method tokenizeSentence of the class ChineseTokenizer.
/**
 * Tokenizes a Chinese sentence into tokens and their character offsets. The text is
 * first converted to simplified characters, split into lines and then into sub-sentences
 * on the Chinese full stop, run through the segmenter, and any segment containing Han
 * script is emitted one character per token.
 *
 * @param text The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
    if (text.trim().isEmpty())
        return new Pair<>(new String[] {}, new IntPair[] {});
    // NOTE(review): offsets below are computed against the simplified text; this assumes
    // trad2simp is a position-preserving 1-to-1 character mapping — confirm.
    text = trad2simp(text);
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    String[] lines = text.split("\n");
    int idx = 0; // running search cursor into `text` used to recover segment offsets
    for (String line : lines) {
        if (line.trim().isEmpty())
            continue;
        String[] sentences = line.split("。");
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            if (sentence.trim().isEmpty())
                continue;
            List<String> segs = segmenter.segmentString(sentence);
            for (String seg : segs) {
                // Locate this segment at or after the cursor to get its character offset.
                idx = text.indexOf(seg, idx);
                if (!containsHanScript(seg)) {
                    surfaces.add(seg);
                    offsets.add(new IntPair(idx, idx + seg.length()));
                } else {
                    // Han-script segments become single-character tokens.
                    for (int j = 0; j < seg.length(); j++) {
                        surfaces.add(seg.substring(j, j + 1));
                        offsets.add(new IntPair(idx + j, idx + j + 1));
                    }
                }
                idx += seg.length();
            }
            // split("。") dropped the full stop; re-emit it between sub-sentences.
            if (i < sentences.length - 1) {
                surfaces.add("。");
                idx = text.indexOf("。", idx);
                offsets.add(new IntPair(idx, ++idx));
            }
        }
    }
    return new Pair<>(surfaces.toArray(new String[surfaces.size()]),
            offsets.toArray(new IntPair[offsets.size()]));
}
Usage example of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: the method printConstrainedClassifierPerformance of the class ClassifierComparison.
/**
 * Trains a local comma classifier with k-fold cross-validation and accumulates the
 * performance of several constrained variants over all folds, printing each constrained
 * classifier's name and an overall statistic at the end.
 *
 * @param parser the data source providing the comma examples to fold and evaluate over
 */
public static void printConstrainedClassifierPerformance(Parser parser) {
    // Each constrained classifier is paired with its own evaluation accumulator.
    List<Pair<Classifier, EvaluateDiscrete>> classifiers = new ArrayList<>();
    LocalCommaClassifier learner = new LocalCommaClassifier();
    EvaluateDiscrete unconstrainedPerformance = new EvaluateDiscrete();
    learner.setLTU(new SparseAveragedPerceptron(0.003, 0, 3.5));
    classifiers.add(new Pair<Classifier, EvaluateDiscrete>(new SubstitutePairConstrainedCommaClassifier(), new EvaluateDiscrete()));
    classifiers.add(new Pair<Classifier, EvaluateDiscrete>(new LocativePairConstrainedCommaClassifier(), new EvaluateDiscrete()));
    classifiers.add(new Pair<Classifier, EvaluateDiscrete>(new ListCommasConstrainedCommaClassifier(), new EvaluateDiscrete()));
    classifiers.add(new Pair<Classifier, EvaluateDiscrete>(new OxfordCommaConstrainedCommaClassifier(), new EvaluateDiscrete()));
    int k = 5;
    parser.reset();
    // Sequential k-fold split; the pivot selects which fold is held out in each round.
    FoldParser foldParser = new FoldParser(parser, k, SplitPolicy.sequential, 0, false);
    for (int i = 0; i < k; foldParser.setPivot(++i)) {
        // Train on everything except the pivot fold.
        foldParser.setFromPivot(false);
        foldParser.reset();
        learner.forget(); // discard weights learned in the previous fold
        BatchTrainer bt = new BatchTrainer(learner, foldParser);
        Lexicon lexicon = bt.preExtract(null);
        learner.setLexicon(lexicon);
        bt.train(250);
        // Saved so the constrained classifiers below pick up this fold's model —
        // presumably they load the learner from disk; verify against their implementation.
        learner.save();
        // Evaluate on the held-out pivot fold.
        foldParser.setFromPivot(true);
        foldParser.reset();
        unconstrainedPerformance.reportAll(EvaluateDiscrete.evaluateDiscrete(learner, learner.getLabeler(), foldParser));
        for (Pair<Classifier, EvaluateDiscrete> pair : classifiers) {
            foldParser.reset();
            pair.getSecond().reportAll(EvaluateDiscrete.evaluateDiscrete(pair.getFirst(), learner.getLabeler(), foldParser));
        }
    }
    // Index 2 of getOverallStats() is printed as the headline number — presumably
    // accuracy or F1; confirm against EvaluateDiscrete.
    for (Pair<Classifier, EvaluateDiscrete> pair : classifiers) {
        System.out.println(pair.getFirst().name + " " + pair.getSecond().getOverallStats()[2]);
    }
}
Aggregations