Search in sources:

Example 56 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

In class ThaiTokenizer, method tokenizeSentence.

/**
 * Splits a Thai sentence into word tokens and records the character span of each token.
 *
 * <p>Word boundaries come from a {@link BreakIterator} word instance created for the
 * {@code ("th", "TH", "TH")} locale. Segments that are empty after {@link String#trim()}
 * (i.e. whitespace-only) are dropped; every other segment is kept verbatim.
 *
 * @param text The sentence string
 * @return A {@link Pair} whose first element is the array of token surface strings and whose
 *         second element is the parallel array of offsets; each offset holds the
 *         {@code (start, end)} boundaries that were passed to {@code text.substring(start, end)}
 *         to obtain the token
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(text);
    int start = boundary.first();
    // Walk consecutive boundary pairs; [start, end) is one candidate segment.
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String surface = text.substring(start, end);
        if (surface.trim().isEmpty()) {
            // The break iterator also reports runs of whitespace as segments; skip them.
            continue;
        }
        surfaces.add(surface);
        offsets.add(new IntPair(start, end));
    }
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    return new Pair<>(surfs, offs);
}
Also used : Locale(java.util.Locale) ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) BreakIterator(java.text.BreakIterator) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 57 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

In class WhiteSpaceTokenizer, method tokenizeSentence.

/**
 * Splits a sentence on whitespace, additionally emitting each of the characters
 * {@code , " ' ( ) ; :} as its own single-character token.
 *
 * <p>A character counts as whitespace when {@link String#trim()} would strip it, i.e. when
 * its code unit is {@code <= ' '}. Any run of characters that are neither whitespace nor
 * one of the punctuation characters above forms one token.
 *
 * @param sentence The sentence string
 * @return A {@link Pair} whose first element is the array of token surface strings and whose
 *         second element is the parallel array of {@code (start, end)} character offsets
 *         into {@code sentence}, with {@code end} being the index just past the token
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    // Characters that always form a token of their own, even when glued to a word.
    final String standalonePunctuation = ",\"'();:";
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    final int length = sentence.length();
    // Index where the token currently being scanned began, or -1 when between tokens.
    int tokenStart = -1;
    for (int i = 0; i < length; i++) {
        char ch = sentence.charAt(i);
        // Same test as "single-character string is empty after trim()".
        boolean whitespace = ch <= ' ';
        boolean punctuation = !whitespace && standalonePunctuation.indexOf(ch) >= 0;
        if (!whitespace && !punctuation) {
            if (tokenStart < 0) {
                tokenStart = i;
            }
            continue;
        }
        // A delimiter ends the pending token; slice it out once instead of
        // growing a string one character at a time.
        if (tokenStart >= 0) {
            surfaces.add(sentence.substring(tokenStart, i));
            offsets.add(new IntPair(tokenStart, i));
            tokenStart = -1;
        }
        if (punctuation) {
            surfaces.add(String.valueOf(ch));
            offsets.add(new IntPair(i, i + 1));
        }
    }
    // Flush a token that runs up to the end of the sentence.
    if (tokenStart >= 0) {
        surfaces.add(sentence.substring(tokenStart, length));
        offsets.add(new IntPair(tokenStart, length));
    }
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    return new Pair<>(surfs, offs);
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 58 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

In class ChineseTokenizer, method tokenizeSentence.

/**
 * Tokenizes text into tokens with character offsets, using the configured segmenter.
 *
 * <p>The input is first passed through {@code trad2simp}; all offsets refer to positions in
 * that converted string. The text is split into lines on {@code "\n"} and each line into
 * sentences on {@code "。"}. Each non-blank sentence is handed to
 * {@code segmenter.segmentString}. A returned segment for which {@code containsHanScript}
 * is false is kept as one token; otherwise it is emitted one {@code char} at a time.
 * A {@code "。"} token is emitted after every sentence of a line except the last one.
 *
 * @param text The sentence string
 * @return A {@link Pair} whose first element is the array of token surface strings and whose
 *         second element is the parallel array of {@code (start, end)} character offsets;
 *         both arrays are empty when {@code text} is blank
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
    if (text.trim().isEmpty()) {
        return new Pair<>(new String[0], new IntPair[0]);
    }
    // NOTE(review): offsets below index into the converted text; this presumably relies on
    // trad2simp preserving string length -- confirm against its implementation.
    text = trad2simp(text);
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    // Search cursor into text; only moves forward so repeated segments map to
    // successive occurrences.
    int idx = 0;
    for (String line : text.split("\n")) {
        if (line.trim().isEmpty()) {
            continue;
        }
        String[] sentences = line.split("。");
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            if (sentence.trim().isEmpty()) {
                continue;
            }
            List<String> segs = segmenter.segmentString(sentence);
            for (String seg : segs) {
                // NOTE(review): indexOf returns -1 if the segmenter output does not occur
                // verbatim in text at or after idx; that case is not handled here.
                idx = text.indexOf(seg, idx);
                if (!containsHanScript(seg)) {
                    surfaces.add(seg);
                    offsets.add(new IntPair(idx, idx + seg.length()));
                } else {
                    // Segments containing Han script are emitted one char per token.
                    for (int j = 0; j < seg.length(); j++) {
                        surfaces.add(seg.substring(j, j + 1));
                        offsets.add(new IntPair(idx + j, idx + j + 1));
                    }
                }
                idx += seg.length();
            }
            // NOTE(review): String.split drops trailing empty strings, so a "。" that ends
            // the line is not emitted as a token; kept as-is to preserve existing output.
            if (i < sentences.length - 1) {
                surfaces.add("。");
                idx = text.indexOf("。", idx);
                offsets.add(new IntPair(idx, idx + 1));
                idx++;
            }
        }
    }
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    return new Pair<>(surfs, offs);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 59 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

In class ClassifierComparison, method printConstrainedClassifierPerformance.

/**
 * Runs a 5-fold evaluation over {@code parser} and prints one line per constrained comma
 * classifier.
 *
 * <p>For each fold a {@link LocalCommaClassifier} is reset, trained on the non-pivot part
 * via a {@link BatchTrainer} and saved; then the learner itself and each constrained
 * classifier are evaluated on the pivot part against {@code learner.getLabeler()}.
 * Results are accumulated across folds. Only the constrained classifiers are printed, as
 * {@code name + " " + getOverallStats()[2]}; the learner's own accumulated results are
 * collected but not printed by this method.
 *
 * @param parser source of examples; it is reset before being wrapped in a {@link FoldParser}
 */
public static void printConstrainedClassifierPerformance(Parser parser) {
    List<Pair<Classifier, EvaluateDiscrete>> constrained = new ArrayList<>();
    LocalCommaClassifier learner = new LocalCommaClassifier();
    EvaluateDiscrete unconstrainedPerformance = new EvaluateDiscrete();
    learner.setLTU(new SparseAveragedPerceptron(0.003, 0, 3.5));

    // Each constrained classifier is paired with its own result accumulator.
    Pair<Classifier, EvaluateDiscrete> substitute =
            new Pair<Classifier, EvaluateDiscrete>(new SubstitutePairConstrainedCommaClassifier(), new EvaluateDiscrete());
    constrained.add(substitute);
    Pair<Classifier, EvaluateDiscrete> locative =
            new Pair<Classifier, EvaluateDiscrete>(new LocativePairConstrainedCommaClassifier(), new EvaluateDiscrete());
    constrained.add(locative);
    Pair<Classifier, EvaluateDiscrete> listCommas =
            new Pair<Classifier, EvaluateDiscrete>(new ListCommasConstrainedCommaClassifier(), new EvaluateDiscrete());
    constrained.add(listCommas);
    Pair<Classifier, EvaluateDiscrete> oxford =
            new Pair<Classifier, EvaluateDiscrete>(new OxfordCommaConstrainedCommaClassifier(), new EvaluateDiscrete());
    constrained.add(oxford);

    int folds = 5;
    parser.reset();
    FoldParser foldParser = new FoldParser(parser, folds, SplitPolicy.sequential, 0, false);

    int fold = 0;
    while (fold < folds) {
        // Training phase: read from everything except the pivot fold.
        foldParser.setFromPivot(false);
        foldParser.reset();
        learner.forget();
        BatchTrainer trainer = new BatchTrainer(learner, foldParser);
        Lexicon lexicon = trainer.preExtract(null);
        learner.setLexicon(lexicon);
        trainer.train(250);
        learner.save();

        // Evaluation phase: read from the pivot fold only.
        foldParser.setFromPivot(true);
        foldParser.reset();
        unconstrainedPerformance.reportAll(
                EvaluateDiscrete.evaluateDiscrete(learner, learner.getLabeler(), foldParser));
        for (Pair<Classifier, EvaluateDiscrete> entry : constrained) {
            foldParser.reset();
            EvaluateDiscrete accumulator = entry.getSecond();
            accumulator.reportAll(
                    EvaluateDiscrete.evaluateDiscrete(entry.getFirst(), learner.getLabeler(), foldParser));
        }

        // Advance the pivot after every fold, including the last one.
        fold++;
        foldParser.setPivot(fold);
    }

    for (Pair<Classifier, EvaluateDiscrete> entry : constrained) {
        Classifier classifier = entry.getFirst();
        System.out.println(classifier.name + " " + entry.getSecond().getOverallStats()[2]);
    }
}
Also used : ListCommasConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.ListCommasConstrainedCommaClassifier) OxfordCommaConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.OxfordCommaConstrainedCommaClassifier) LocativePairConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.LocativePairConstrainedCommaClassifier) Lexicon(edu.illinois.cs.cogcomp.lbjava.learn.Lexicon) ArrayList(java.util.ArrayList) OxfordCommaConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.OxfordCommaConstrainedCommaClassifier) Classifier(edu.illinois.cs.cogcomp.lbjava.classify.Classifier) LocativePairConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.LocativePairConstrainedCommaClassifier) StructuredCommaClassifier(edu.illinois.cs.cogcomp.comma.sl.StructuredCommaClassifier) SubstitutePairConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.SubstitutePairConstrainedCommaClassifier) LocalCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.LocalCommaClassifier) ListCommasConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.ListCommasConstrainedCommaClassifier) BatchTrainer(edu.illinois.cs.cogcomp.lbjava.learn.BatchTrainer) EvaluateDiscrete(edu.illinois.cs.cogcomp.comma.utils.EvaluateDiscrete) SubstitutePairConstrainedCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.SubstitutePairConstrainedCommaClassifier) SparseAveragedPerceptron(edu.illinois.cs.cogcomp.lbjava.learn.SparseAveragedPerceptron) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) LocalCommaClassifier(edu.illinois.cs.cogcomp.comma.lbj.LocalCommaClassifier) FoldParser(edu.illinois.cs.cogcomp.lbjava.parse.FoldParser)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair) 59; IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair) 35; ArrayList (java.util.ArrayList) 17; Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) 10; Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) 10; TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) 7; Matcher (java.util.regex.Matcher) 7; Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph) 6; HashMap (java.util.HashMap) 6; Pattern (java.util.regex.Pattern) 6; TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView) 3; SenseInstance (edu.illinois.cs.cogcomp.verbsense.jlis.SenseInstance) 3; SenseStructure (edu.illinois.cs.cogcomp.verbsense.jlis.SenseStructure) 3; JsonObject (com.google.gson.JsonObject) 2; AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException) 2; ITransformer (edu.illinois.cs.cogcomp.core.transformers.ITransformer) 2; IndexedWord (edu.stanford.nlp.ling.IndexedWord) 2; Annotation (edu.stanford.nlp.pipeline.Annotation) 2; CoreMap (edu.stanford.nlp.util.CoreMap) 2; LinkedHashSet (java.util.LinkedHashSet) 2