use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ParseHelper method getTokenIndexedParseTreeNodeCovering.
/**
 * Returns the token-indexed parse subtree obtained for the span of {@code c}, with every
 * node's span expressed relative to the whole text rather than to the sentence.
 *
 * @param parseViewName name of the parse view the sentence tree is read from
 * @param c the constituent whose start/end span selects the subtree
 * @return a tree whose node labels pair the original label string with a span shifted by the
 *         start token of the sentence containing {@code c}
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedParseTreeNodeCovering(String parseViewName, Constituent c) {
    TextAnnotation ta = c.getTextAnnotation();
    int sentenceId = ta.getSentenceId(c);
    Tree<String> sentenceTree = getParseTree(parseViewName, ta, sentenceId);

    // The sentence tree is looked up with sentence-relative token positions, so the
    // constituent's span is first shifted left by the sentence's start token.
    final int sentenceOffset = ta.getSentence(sentenceId).getStartSpan();
    int relativeStart = c.getStartSpan() - sentenceOffset;
    int relativeEnd = c.getEndSpan() - sentenceOffset;
    Tree<Pair<String, IntPair>> relativeTree =
            getTokenIndexedTreeCovering(sentenceTree, relativeStart, relativeEnd);

    // Undo the shift on every node so the returned spans use the same coordinates as c.
    ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>> shiftBack =
            new ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>>() {
                @Override
                public Pair<String, IntPair> transform(Tree<Pair<String, IntPair>> node) {
                    Pair<String, IntPair> nodeLabel = node.getLabel();
                    IntPair relativeSpan = nodeLabel.getSecond();
                    int absoluteStart = relativeSpan.getFirst() + sentenceOffset;
                    int absoluteEnd = relativeSpan.getSecond() + sentenceOffset;
                    return new Pair<>(nodeLabel.getFirst(), new IntPair(absoluteStart, absoluteEnd));
                }
            };
    return Mappers.mapTree(relativeTree, shiftBack);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class GazetteerViewGenerator method hashAllSpans.
/**
 * Indexes every token span of {@code ta} that is at most {@code maxLength} tokens long by
 * the hash code of its normalized surface string.
 *
 * <p>For each span the tokens are normalized (PTB-style quote tokens {@code ``} and
 * {@code ''} become {@code "}, then {@code SentenceUtils.convertFromPTBBrackets} is applied),
 * joined with single spaces, and the hash code of the resulting string is used as the key.
 * Spans whose strings share a hash code are collected in the same list.
 *
 * @param ta the text whose token spans are indexed
 * @return a map from string hash code to the {@code (start, end)} token spans (end exclusive)
 *         whose normalized text has that hash code
 */
private TIntObjectHashMap<ArrayList<IntPair>> hashAllSpans(TextAnnotation ta) {
    TIntObjectHashMap<ArrayList<IntPair>> allSpans = new TIntObjectHashMap<>();
    final int tokenCount = ta.size();
    // Every token, including the final one, can start a span. The previous bound
    // (tokenCount - 1) never indexed the one-token span at the last position, so a
    // one-token text produced no spans at all.
    for (int start = 0; start < tokenCount; start++) {
        int last = Math.min(tokenCount, start + maxLength);
        StringBuilder sb = new StringBuilder();
        for (int end = start; end < last; end++) {
            // Literal replacement; no regular expression is needed for these patterns.
            String token = ta.getToken(end).replace("``", "\"").replace("''", "\"");
            token = SentenceUtils.convertFromPTBBrackets(token);
            sb.append(token).append(" ");
            // The builder always carries a trailing space; trim it before hashing.
            int hash = sb.toString().trim().hashCode();
            if (!allSpans.containsKey(hash)) {
                allSpans.put(hash, new ArrayList<>());
            }
            List<IntPair> spansForHash = allSpans.get(hash);
            spansForHash.add(new IntPair(start, end + 1));
        }
    }
    return allSpans;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class CurrencyIndicator method addCurrencyView.
/**
 * Adds the {@code VIEW_NAME} span-label view to {@code ta}, labelling every span that matches
 * one of the {@code currencies} patterns as {@code "CURRENCY"}.
 *
 * <p>The currency list is loaded on first use: {@code loaded} is checked, and checked again
 * while holding this object's monitor, before {@code loadCurrency} is called. The rest of the
 * work runs while holding the monitor of {@code ta}; if {@code ta} already has the view the
 * method returns without changes.
 *
 * <p>A match is skipped when a span that was already added fully contains it (which also
 * covers an identical span reported by another pattern).
 *
 * @param ta the text annotation to add the view to
 * @throws Exception declared by this method; presumably propagated from
 *         {@code loadCurrency} (not visible here)
 */
private void addCurrencyView(TextAnnotation ta) throws Exception {
    // NOTE(review): whether `loaded` is declared volatile is not visible from here; the
    // unsynchronized first check relies on it for cross-thread visibility.
    if (!loaded) {
        synchronized (this) {
            // now its changed to be loaded from datastore.
            if (!loaded) {
                loadCurrency(gzip, true);
            }
        }
    }
    synchronized (ta) {
        if (ta.hasView(VIEW_NAME)) {
            return;
        }
        // Collect the matches of every currency pattern, in pattern order.
        List<IntPair> matches = new ArrayList<>();
        for (String pattern : currencies) {
            matches.addAll(ta.getSpansMatching(pattern));
        }
        SpanLabelView view = new SpanLabelView(VIEW_NAME, "Gazetteer", ta, 1.0);
        Set<IntPair> added = new LinkedHashSet<>();
        for (IntPair p : matches) {
            // don't add nested constituents of the same type
            boolean foundContainer = false;
            for (IntPair p1 : added) {
                // Identity check: never treat the very same object as its own container.
                if (p1 == p) {
                    continue;
                }
                if (p1.getFirst() <= p.getFirst() && p1.getSecond() >= p.getSecond()) {
                    foundContainer = true;
                    break;
                }
            }
            if (!foundContainer) {
                view.addSpanLabel(p.getFirst(), p.getSecond(), "CURRENCY", 1.0);
                added.add(p);
            }
        }
        ta.addView(VIEW_NAME, view);
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class Comma method getStrippedNotation.
/**
 * Builds the notation string for a constituent.
 *
 * <p>The notation starts with the part of the constituent's label before the first
 * {@code '-'}. When {@code NERlexicalise} is set, {@code "-"} and the result of
 * {@code getNamedEntityTag(c)} are appended. When {@code POSlexicalise} is set, {@code "-"}
 * is appended, followed by a space and the POS tag for each token in the constituent's span.
 *
 * @param c the constituent to describe; may be {@code null}
 * @return the notation string, or the literal {@code "NULL"} when {@code c} is {@code null}
 */
public String getStrippedNotation(Constituent c) {
    if (c == null) {
        return "NULL";
    }
    StringBuilder notation = new StringBuilder(c.getLabel().split("-")[0]);
    if (NERlexicalise) {
        notation.append("-").append(getNamedEntityTag(c));
    }
    if (POSlexicalise) {
        notation.append("-");
        IntPair tokenSpan = c.getSpan();
        TextAnnotation text = c.getTextAnnotation();
        int firstToken = tokenSpan.getFirst();
        // Each tag is preceded by a space, so the tag list itself starts with one.
        for (int position = firstToken; position < tokenSpan.getSecond(); position++) {
            notation.append(" ").append(POSUtils.getPOS(text, position));
        }
    }
    return notation.toString();
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class CuratorDataStructureInterface method getTextAnnotationFromRecord.
/**
 * Builds a {@link TextAnnotation} from a record's raw text, its token spans and its sentence
 * spans.
 *
 * <p>Each token's text is the substring of the raw text between the token span's start and
 * ending offsets. A sentence is considered finished at the token whose ending offset equals
 * the sentence span's ending offset; the sentence's end position is recorded as the number of
 * tokens seen up to and including that token. Entries of sentences that are never finished
 * keep the placeholder {@code -1}, except that a sentence still open after the last token is
 * closed at the total token count.
 *
 * <p>NOTE(review): {@code sentenceLabels.get(0)} is read unconditionally, so a sentence
 * labeling without any labels fails before tokens are processed.
 *
 * @param corpusId corpus identifier passed through to the {@code TextAnnotation}
 * @param textId text identifier passed through to the {@code TextAnnotation}
 * @param record supplies the raw text
 * @param tokensLabeling labeling whose spans are the token character offsets
 * @param sentenceLabeling labeling whose spans are the sentence character offsets
 * @return the text annotation built from the raw text, token offsets, token strings and
 *         sentence end positions
 */
public static TextAnnotation getTextAnnotationFromRecord(String corpusId, String textId, Record record, Labeling tokensLabeling, Labeling sentenceLabeling) {
    final String rawText = record.getRawText();
    final List<Span> sentenceLabels = sentenceLabeling.getLabels();
    final int[] sentenceEndPositions = new int[sentenceLabels.size()];
    Arrays.fill(sentenceEndPositions, -1);
    final List<Span> labels = tokensLabeling.getLabels();
    // One offset pair and one token string per token span, filled in step with the loop.
    final IntPair[] characterOffsets = new IntPair[labels.size()];
    final String[] tokensArray = new String[labels.size()];
    int tokenId = 0;
    int sentenceId = 0;
    int nextSentenceEnd = sentenceLabels.get(sentenceId).getEnding();
    // Whether the most recently processed token finished a sentence. This must be tracked
    // per token: a flag that stays true after the first sentence boundary would leave an
    // unaligned final sentence of a multi-sentence text without an end position.
    boolean lastTokenEndedSentence = false;
    for (final Span token : labels) {
        // The raw token spans are corrected for sentence indexing, no translation needed
        final int rawTokenStart = token.getStart();
        final int rawTokenEnd = token.getEnding();
        tokensArray[tokenId] = rawText.substring(rawTokenStart, rawTokenEnd);
        characterOffsets[tokenId] = new IntPair(rawTokenStart, rawTokenEnd);
        lastTokenEndedSentence = rawTokenEnd == nextSentenceEnd;
        if (lastTokenEndedSentence) {
            // We found a sentence. Mark its end point in terms of number of tokens.
            sentenceEndPositions[sentenceId] = tokenId + 1;
            sentenceId++;
            if (sentenceId < sentenceLabels.size()) {
                nextSentenceEnd = sentenceLabels.get(sentenceId).getEnding();
            } else {
                // We have run out of sentences, so there should be no more tokens.
                // As a sanity check, report it if there are.
                if (tokenId != labels.size() - 1) {
                    log.error("Found tokens that don't belong to any sentence for input: " + rawText);
                }
            }
        }
        tokenId++;
    }
    // If the last token did not finish a sentence and a sentence is still open, close it
    // at the expected boundary: the total number of tokens. The index guard covers the case
    // where tokens continue past the last sentence (already reported above).
    if (!lastTokenEndedSentence && sentenceId < sentenceEndPositions.length) {
        sentenceEndPositions[sentenceId] = tokenId;
    }
    return new TextAnnotation(corpusId, textId, rawText, characterOffsets, tokensArray, sentenceEndPositions);
}
Aggregations