Search in sources :

Example 96 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StringTransformationTest method testSequentialSequence.

/**
     * Runs the same set of ops as testSequence, but flushes the pending edits (by calling
     * getTransformedText()) after each transformation.
     * Ensures that the behavior is the same whether edits are done in a single pass, or over multiple passes.
     */
@Test
public void testSequentialSequence() {
    //        SEQUENCE= "The http://theonlyway.org {only}^@^@^@ way___";
    //        MODSEQUENCE= "The WWW -LCB-only-RCB- way-";
    StringTransformation st = new StringTransformation(SEQUENCE);
    st.transformString(4, 25, "WWW");
    // force edits to be flushed; later offsets are therefore relative to the already-modified text
    st.getTransformedText();
    st.transformString(8, 9, "-LCB-");
    st.getTransformedText();
    st.transformString(17, 18, "-RCB-");
    st.getTransformedText();
    st.transformString(22, 28, "");
    st.getTransformedText();
    st.transformString(26, 29, "-");
    st.getTransformedText();
    String modifiedStr = st.getTransformedText();

    // the original text must be untouched; only the transformed view changes
    assertEquals(SEQUENCE, st.getOrigText());
    assertEquals(SEQUENCE.length() - 18, modifiedStr.length());
    assertEquals(MODSEQUENCE, modifiedStr);

    // the URL replacement: original [4, 25) maps to modified [4, 7)
    int modStart = st.computeModifiedOffsetFromOriginal(4);
    int modEnd = st.computeModifiedOffsetFromOriginal(25);
    assertEquals(4, modStart);
    assertEquals(7, modEnd);
    String transfSeq = modifiedStr.substring(4, 7);
    String origSeq = st.getOrigText().substring(4, 25);
    // expected value goes first so that a failure report labels the values correctly
    assertEquals("WWW", transfSeq);
    assertEquals("http://theonlyway.org", origSeq);
    /*
         * what happens if we query a char in the middle of a deleted sequence?
         * -- should map to beginning of that modification
         */
    int modMid = st.computeModifiedOffsetFromOriginal(20);
    assertEquals(7, modMid);
    IntPair origOffsets = st.getOriginalOffsets(4, 7);
    assertEquals(4, origOffsets.getFirst());
    assertEquals(25, origOffsets.getSecond());
    // intermediate edit chars map to same offsets, treated like replacements
    origOffsets = st.getOriginalOffsets(1, 2);
    assertEquals(1, origOffsets.getFirst());
    assertEquals(2, origOffsets.getSecond());
    // span ending in the middle of the replaced text
    origOffsets = st.getOriginalOffsets(1, 6);
    assertEquals(6, origOffsets.getSecond());
    // check expand edit
    origOffsets = st.getOriginalOffsets(17, 22);
    assertEquals(31, origOffsets.getFirst());
    assertEquals(32, origOffsets.getSecond());
    transfSeq = modifiedStr.substring(17, 22);
    origSeq = st.getOrigText().substring(31, 32);
    assertEquals("-RCB-", transfSeq);
    // combines expand + delete for contiguous spans
    assertEquals("}", origSeq);
    // intermediate edit chars map to same offsets, treated like replacements.
    // note that this could be weird in case of multiple edits at same index
    //   (e.g. insertion, then deletion)
    // Note that these don't really make sense as substrings, and nor are the mapped substrings likely to make sense
    origOffsets = st.getOriginalOffsets(19, 20);
    assertEquals(29, origOffsets.getFirst());
    assertEquals(30, origOffsets.getSecond());
    // reverse direction: the original '}' at [31, 32) maps onto the expanded "-RCB-" at [17, 22)
    modStart = st.computeModifiedOffsetFromOriginal(31);
    modEnd = st.computeModifiedOffsetFromOriginal(32);
    assertEquals(17, modStart);
    assertEquals(22, modEnd);
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 97 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class XmlFragmentWhitespacingReaderTest method testReader.

/**
 * Reads the test corpus with an XmlFragmentWhitespacingDocumentReader and checks that
 * (a) exactly the two reference files are listed, and (b) every regex match of a word
 * containing "Sun" in the first raw file corresponds to a token with identical character
 * offsets in the TextAnnotation built from that file.
 */
@Test
public void testReader() {
    XmlFragmentWhitespacingDocumentReader reader = null;
    try {
        reader = new XmlFragmentWhitespacingDocumentReader("ERE_BEST", TEST_DIR, ".txt", ".txt");
    } catch (Exception e) {
        // a single handler suffices: every failure (including IOException) just fails the test
        e.printStackTrace();
        fail(e.getMessage());
    }
    List<List<Path>> files = null;
    try {
        files = reader.getFileListing();
    } catch (IOException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    assertEquals(2, files.size());

    // collect the bare file names of the first path of each listing entry
    Set<String> names = new TreeSet<>();
    for (List<Path> file : files) {
        names.add(file.get(0).getFileName().toString());
    }
    assertTrue(names.contains(REF_FILE_ONE));
    assertTrue(names.contains(REF_FILE_TWO));

    // build one TextAnnotation per file, keyed by file name
    Map<String, TextAnnotation> tas = new HashMap<>();
    for (List<Path> file : files) {
        try {
            tas.put(file.get(0).getFileName().toString(), reader.getAnnotationsFromFile(file).get(0));
        } catch (Exception e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
    }
    logger.info("----\n" + tas.get(REF_FILE_ONE).getText() + "----\n");
    logger.info("----\n" + tas.get(REF_FILE_TWO).getText() + "----\n");

    String FIRST_ERE_FILE = RAW_FILE_DIR + "/" + REF_FILE_ONE;
    String firstRawText = null;
    try {
        firstRawText = LineIO.slurp(FIRST_ERE_FILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }

    // character spans of every word containing "Sun" in the raw (unprocessed) text
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(firstRawText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) {
        sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    }

    // every "Sun" token must line up with one of the raw-text spans; matched spans are removed
    TextAnnotation ta = tas.get(REF_FILE_ONE);
    for (Constituent c : ta.getView(ViewNames.TOKENS).getConstituents()) {
        if (c.getSurfaceForm().contains("Sun")) {
            IntPair cCharSpan = new IntPair(c.getStartCharOffset(), c.getEndCharOffset());
            assertTrue(sunSpans.contains(cCharSpan));
            sunSpans.remove(cCharSpan);
            // a successful match is informational, not an error
            logger.info("FOUND OVERLAPPING SPAN: '" + printSpanInContext(firstRawText, cCharSpan) + "'.");
        }
    }

    // anything left over is a raw-text match that no token covered
    for (IntPair missedSpan : sunSpans) {
        logger.error("MISSED SPAN: '" + printSpanInContext(firstRawText, missedSpan) + "'.");
    }
    assertTrue(sunSpans.isEmpty());
}
Also used : Path(java.nio.file.Path) Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Test(org.junit.Test)

Example 98 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class PathLSTMHandler method getSRL.

/**
 * Runs the SRL pipeline over the tokens of {@code ta} and collects the resulting
 * predicate/argument structures into a new {@link PredicateArgumentView}.
 * Predicates whose POS tag starts with "N" are skipped.
 *
 * @param ta the pre-tokenized text to label
 * @return the view holding one predicate constituent (with lemma and sense attributes) and
 *         its argument constituents per retained predicate
 * @throws Exception propagated from the underlying pipeline
 */
private PredicateArgumentView getSRL(TextAnnotation ta) throws Exception {
    log.debug("Input: {}", ta.getText());
    PredicateArgumentView view = new PredicateArgumentView(viewName, "PathLSTMGenerator", ta, 1.0);

    // the parser input is a dummy ROOT token followed by the pre-tokenized text
    List<String> parserInput = new LinkedList<>();
    parserInput.add("<ROOT>");
    parserInput.addAll(Arrays.asList(ta.getTokens()));

    Sentence parsed = SRLpipeline.parse(parserInput);
    for (Predicate pred : parsed.getPredicates()) {
        if (pred.getPOS().startsWith("N")) {
            // nominal predicates are not reported
            continue;
        }

        // parser indices are shifted by one because of the ROOT token prepended above
        int predStart = pred.getIdx() - 1;
        int predEnd = pred.getIdx();
        Constituent predicateNode = new Constituent("Predicate", viewName, ta, predStart, predEnd);
        predicateNode.addAttribute(PredicateArgumentView.LemmaIdentifier, pred.getLemma());
        predicateNode.addAttribute(PredicateArgumentView.SenseIdentifer, pred.getSense());

        List<Constituent> argumentNodes = new ArrayList<>();
        List<String> argumentLabels = new ArrayList<>();
        for (Word argHead : pred.getArgMap().keySet()) {
            Set<Word> exclusions = new TreeSet<>();
            String tag = pred.getArgumentTag(argHead);
            Yield argYield = argHead.getYield(pred, tag, exclusions);
            int argStart = argYield.first().getIdx() - 1;
            int argEnd = argYield.last().getIdx();
            assert argStart <= argEnd : ta;
            argumentNodes.add(new Constituent(tag, viewName, ta, argStart, argEnd));
            argumentLabels.add(tag);
        }

        // scores are left at their default of 0.0, one per relation
        String[] labelArray = argumentLabels.toArray(new String[0]);
        double[] scores = new double[argumentLabels.size()];
        view.addPredicateArguments(predicateNode, argumentNodes, labelArray, scores);
    }
    return view;
}
Also used : Word(se.lth.cs.srl.corpus.Word) ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) PredicateArgumentView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView) LinkedList(java.util.LinkedList) Predicate(se.lth.cs.srl.corpus.Predicate) TreeSet(java.util.TreeSet) Yield(se.lth.cs.srl.corpus.Yield) Sentence(se.lth.cs.srl.corpus.Sentence) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 99 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ThaiTokenizer method tokenizeSentence.

/**
     * Given a sentence, return its tokens and their character offsets.
     * Word boundaries come from a {@link BreakIterator} word instance for the Thai locale;
     * segments that are empty after trimming whitespace are dropped.
     *
     * @param text The sentence string
     * @return A {@link Pair} containing the array of tokens and the parallel array of their
     *         character offsets (start inclusive, end exclusive) within {@code text}
     */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(text);
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String sur = text.substring(start, end);
        if (sur.trim().isEmpty()) {
            // whitespace-only segment between words: not a token
            continue;
        }
        surfaces.add(sur);
        offsets.add(new IntPair(start, end));
    }
    String[] surfs = surfaces.toArray(new String[0]);
    IntPair[] offs = offsets.toArray(new IntPair[0]);
    // diamond instead of the raw Pair type, so the result is checked against the declared generics
    return new Pair<>(surfs, offs);
}
Also used : Locale(java.util.Locale) ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) BreakIterator(java.text.BreakIterator) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 100 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ThaiTokenizer method tokenizeTextSpan.

/**
     * Given a span of text, return a {@link Tokenization} built from the ordered tokens, their
     * character offsets with respect to <b>the given text</b>, and the sentence-end token
     * indices.
     * Word boundaries come from a {@link BreakIterator} word instance for the Thai locale;
     * segments that are empty after trimming whitespace are dropped. No sentence splitting is
     * performed: when at least one token is found, the whole span is reported as a single
     * sentence ending at the token count; otherwise there are no sentence ends.
     *
     * @param textSpan the text to tokenize
     * @return the tokens, their offsets (start inclusive, end exclusive) and sentence ends
     */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(textSpan);
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String sur = textSpan.substring(start, end);
        if (sur.trim().isEmpty()) {
            // whitespace-only segment between words: not a token
            continue;
        }
        surfaces.add(sur);
        offsets.add(new IntPair(start, end));
    }
    String[] surfs = surfaces.toArray(new String[0]);
    IntPair[] offs = offsets.toArray(new IntPair[0]);
    // single sentence covering all tokens, or no sentence at all for token-free input
    int[] ends = surfaces.isEmpty() ? new int[0] : new int[] {surfaces.size()};
    return new Tokenization(surfs, offs, ends);
}
Also used : Locale(java.util.Locale) ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) BreakIterator(java.text.BreakIterator)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)103 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)32 Test (org.junit.Test)20 ArrayList (java.util.ArrayList)19 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)18 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)14 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)13 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)6 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)5 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 Matcher (java.util.regex.Matcher)4 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 Annotation (edu.stanford.nlp.pipeline.Annotation)3