use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class StringTransformationTest method testSequentialSequence.
/**
 * Runs the same set of ops as testSequence, but flushes (applies) edits after each
 * transformation via {@code getTransformedText()}.
 * Ensures that the behavior is the same whether edits are done in a single pass, or over
 * multiple passes.
 */
@Test
public void testSequentialSequence() {
// SEQUENCE= "The http://theonlyway.org {only}^@^@^@ way___";
// MODSEQUENCE= "The WWW -LCB-only-RCB- way-";
StringTransformation st = new StringTransformation(SEQUENCE);
// replace the URL with "WWW"
st.transformString(4, 25, "WWW");
// force edits to be flushed before applying the next transformation
st.getTransformedText();
// expand "{" to "-LCB-"
st.transformString(8, 9, "-LCB-");
st.getTransformedText();
// expand "}" to "-RCB-"
st.transformString(17, 18, "-RCB-");
st.getTransformedText();
// delete the control-character run
st.transformString(22, 28, "");
st.getTransformedText();
// collapse trailing underscores to a single hyphen
st.transformString(26, 29, "-");
String modifiedStr = st.getTransformedText();
assertEquals(SEQUENCE, st.getOrigText());
assertEquals(SEQUENCE.length() - 18, modifiedStr.length());
assertEquals(MODSEQUENCE, modifiedStr);
// original span [4, 25) ("http://theonlyway.org") maps to modified span [4, 7) ("WWW")
int modStart = st.computeModifiedOffsetFromOriginal(4);
int modEnd = st.computeModifiedOffsetFromOriginal(25);
assertEquals(4, modStart);
assertEquals(7, modEnd);
String transfSeq = modifiedStr.substring(4, 7);
String origSeq = st.getOrigText().substring(4, 25);
// NOTE: JUnit assertEquals takes (expected, actual) — expected value goes first
assertEquals("WWW", transfSeq);
assertEquals("http://theonlyway.org", origSeq);
/*
 * what happens if we query a char in the middle of a deleted sequence?
 * -- should map to beginning of that modification
 */
int modMid = st.computeModifiedOffsetFromOriginal(20);
assertEquals(7, modMid);
IntPair origOffsets = st.getOriginalOffsets(4, 7);
assertEquals(4, origOffsets.getFirst());
assertEquals(25, origOffsets.getSecond());
// intermediate edit chars map to same offsets, treated like replacements
origOffsets = st.getOriginalOffsets(1, 2);
assertEquals(1, origOffsets.getFirst());
assertEquals(2, origOffsets.getSecond());
// in the middle of the replaced
origOffsets = st.getOriginalOffsets(1, 6);
assertEquals(6, origOffsets.getSecond());
// check expand edit: modified "-RCB-" at [17, 22) maps back to original "}" at [31, 32)
origOffsets = st.getOriginalOffsets(17, 22);
assertEquals(31, origOffsets.getFirst());
assertEquals(32, origOffsets.getSecond());
transfSeq = modifiedStr.substring(17, 22);
origSeq = st.getOrigText().substring(31, 32);
assertEquals("-RCB-", transfSeq);
// combines expand + delete for contiguous spans
assertEquals("}", origSeq);
// intermediate edit chars map to same offsets, treated like replacements.
// note that this could be weird in case of multiple edits at same index
// (e.g. insertion, then deletion)
// Note that these don't really make sense as substrings, and nor are the mapped substrings likely to make sense
origOffsets = st.getOriginalOffsets(19, 20);
assertEquals(29, origOffsets.getFirst());
assertEquals(30, origOffsets.getSecond());
// mapping the original "}" span forward lands on the expanded "-RCB-" span
modStart = st.computeModifiedOffsetFromOriginal(31);
modEnd = st.computeModifiedOffsetFromOriginal(32);
assertEquals(17, modStart);
assertEquals(22, modEnd);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class XmlFragmentWhitespacingReaderTest method testReader.
/**
 * Reads the test corpus with {@link XmlFragmentWhitespacingDocumentReader} and verifies that
 * token character offsets in the resulting {@link TextAnnotation}s line up with the raw source
 * text (checked here against every occurrence of the substring "Sun").
 */
@Test
public void testReader() {
XmlFragmentWhitespacingDocumentReader reader = null;
try {
    reader = new XmlFragmentWhitespacingDocumentReader("ERE_BEST", TEST_DIR, ".txt", ".txt");
} catch (Exception e) {
    // single catch suffices: the former IOException arm had an identical body
    e.printStackTrace();
    fail(e.getMessage());
}
List<List<Path>> files = null;
try {
    files = reader.getFileListing();
} catch (IOException e) {
    e.printStackTrace();
    fail(e.getMessage());
}
assertEquals(2, files.size());
// collect the leaf file names of the first path in each file group
Set<String> names = new TreeSet<>();
for (List<Path> file : files) names.add(file.get(0).getName(file.get(0).getNameCount() - 1).toString());
assertTrue(names.contains(REF_FILE_ONE));
assertTrue(names.contains(REF_FILE_TWO));
Map<String, TextAnnotation> tas = new HashMap<>();
for (List<Path> file : files) {
    try {
        tas.put(file.get(0).getName(file.get(0).getNameCount() - 1).toString(), reader.getAnnotationsFromFile(file).get(0));
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
logger.info("----\n" + tas.get(REF_FILE_ONE).getText() + "----\n");
logger.info("----\n" + tas.get(REF_FILE_TWO).getText() + "----\n");
String firstEreFile = RAW_FILE_DIR + "/" + REF_FILE_ONE;
String firstRawText = null;
try {
    firstRawText = LineIO.slurp(firstEreFile);
} catch (FileNotFoundException e) {
    e.printStackTrace();
    fail(e.getMessage());
}
// find every character span of a word containing "Sun" in the raw text
Pattern sun = Pattern.compile("\\w*Sun\\w*");
Matcher sunMatcher = sun.matcher(firstRawText);
Set<IntPair> sunSpans = new HashSet<>();
while (sunMatcher.find()) sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
TextAnnotation ta = tas.get(REF_FILE_ONE);
// every "Sun" token produced by the reader must coincide with a raw-text span
for (Constituent c : ta.getView(ViewNames.TOKENS).getConstituents()) {
    if (c.getSurfaceForm().contains("Sun")) {
        IntPair cCharSpan = new IntPair(c.getStartCharOffset(), c.getEndCharOffset());
        assertTrue(sunSpans.contains(cCharSpan));
        sunSpans.remove(cCharSpan);
        // fixed unbalanced quote in the log message
        logger.error("FOUND OVERLAPPING SPAN: '" + printSpanInContext(firstRawText, cCharSpan) + "'.");
    }
}
// any spans left over were missed by the tokenizer
for (IntPair missedSpan : sunSpans) logger.error("MISSED SPAN: '" + printSpanInContext(firstRawText, missedSpan) + "'.");
assertTrue(sunSpans.isEmpty());
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class PathLSTMHandler method getSRL.
/**
 * Runs the PathLSTM semantic-role labeler over a pre-tokenized {@link TextAnnotation} and
 * converts its verbal predicates and arguments into a {@link PredicateArgumentView}.
 * Nominal predicates (POS starting with "N") are skipped.
 *
 * @param ta annotation whose tokens are fed to the SRL pipeline
 * @return a view holding one predicate constituent (with lemma and sense attributes) per
 *         verbal predicate, linked to its argument constituents
 * @throws Exception if the underlying SRL pipeline fails
 */
private PredicateArgumentView getSRL(TextAnnotation ta) throws Exception {
log.debug("Input: {}", ta.getText());
PredicateArgumentView pav = new PredicateArgumentView(viewName, "PathLSTMGenerator", ta, 1.0);
// ArrayList over LinkedList: sequential appends + downstream iteration, no mid-list edits
List<String> words = new ArrayList<>(ta.getTokens().length + 1);
// dummy ROOT token expected by the parser at index 0
words.add("<ROOT>");
// pre-tokenized text
words.addAll(Arrays.asList(ta.getTokens()));
// run SRL
Sentence parsed = SRLpipeline.parse(words);
for (Predicate p : parsed.getPredicates()) {
    // skip nominal predicates
    if (p.getPOS().startsWith("N"))
        continue;
    // shift by -1 to undo the <ROOT> offset; span end is exclusive
    IntPair predicateSpan = new IntPair(p.getIdx() - 1, p.getIdx());
    String predicateLemma = p.getLemma();
    Constituent predicate = new Constituent("Predicate", viewName, ta, predicateSpan.getFirst(), predicateSpan.getSecond());
    predicate.addAttribute(PredicateArgumentView.LemmaIdentifier, predicateLemma);
    String sense = p.getSense();
    predicate.addAttribute(PredicateArgumentView.SenseIdentifer, sense);
    List<Constituent> args = new ArrayList<>();
    List<String> relations = new ArrayList<>();
    for (Word a : p.getArgMap().keySet()) {
        Set<Word> singleton = new TreeSet<>();
        String label = p.getArgumentTag(a);
        Yield y = a.getYield(p, label, singleton);
        // again -1 to strip the ROOT offset; yield end index is already exclusive
        IntPair span = new IntPair(y.first().getIdx() - 1, y.last().getIdx());
        assert span.getFirst() <= span.getSecond() : ta;
        args.add(new Constituent(label, viewName, ta, span.getFirst(), span.getSecond()));
        relations.add(label);
    }
    // new String[0] lets the JVM size the array; scores default to 0.0
    pav.addPredicateArguments(predicate, args, relations.toArray(new String[0]), new double[relations.size()]);
}
return pav;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ThaiTokenizer method tokenizeSentence.
/**
 * given a sentence, return a set of tokens and their character offsets
 *
 * @param text The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
List<IntPair> offsets = new ArrayList<>();
List<String> surfaces = new ArrayList<>();
// Thai has no inter-word spaces; use the locale-aware dictionary-based word iterator
BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
boundary.setText(text);
int start = boundary.first();
for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
    String sur = text.substring(start, end);
    // skip whitespace-only segments; they are boundaries, not tokens
    if (sur.trim().isEmpty()) {
        continue;
    }
    surfaces.add(sur);
    offsets.add(new IntPair(start, end));
}
// (removed dead sen_ends bookkeeping: sentence ends are not part of this return value)
IntPair[] offs = offsets.toArray(new IntPair[0]);
String[] surfs = surfaces.toArray(new String[0]);
// parameterized construction avoids the raw-type unchecked warning
return new Pair<>(surfs, offs);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ThaiTokenizer method tokenizeTextSpan.
/**
 * given a span of text, return a list of Pair{@literal < String[], IntPair[] >} corresponding
 * to tokenized sentences, where the String[] is the ordered list of sentence tokens and the
 * IntPair[] is the corresponding list of character offsets with respect to <b>the original
 * text</b>.
 *
 * @param textSpan the raw text to tokenize
 * @return a {@link Tokenization} bundling token surfaces, character offsets, and
 *         sentence-end token indices (a single sentence covering all tokens, if any)
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
List<IntPair> offsets = new ArrayList<>();
List<String> surfaces = new ArrayList<>();
List<Integer> sen_ends = new ArrayList<>();
// Thai has no inter-word spaces; use the locale-aware dictionary-based word iterator
BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
boundary.setText(textSpan);
int start = boundary.first();
for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
    String sur = textSpan.substring(start, end);
    // skip whitespace-only segments; they are boundaries, not tokens
    if (sur.trim().isEmpty()) {
        continue;
    }
    surfaces.add(sur);
    offsets.add(new IntPair(start, end));
}
// close the final sentence at the last token, if not already closed
if (surfaces.size() > 0 && (sen_ends.size() == 0 || sen_ends.get(sen_ends.size() - 1) != surfaces.size()))
    sen_ends.add(surfaces.size());
IntPair[] offs = offsets.toArray(new IntPair[0]);
String[] surfs = surfaces.toArray(new String[0]);
// unbox sentence-end indices via a primitive stream instead of a manual copy loop
int[] ends = sen_ends.stream().mapToInt(Integer::intValue).toArray();
return new Tokenization(surfs, offs, ends);
}
Aggregations