Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class BrownClusters, method printOovData:
public final void printOovData(Data data) {
    HashMap<String, Boolean> tokensHash = new HashMap<>();
    HashMap<String, Boolean> tokensHashIC = new HashMap<>();
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    // Gather every sentence of every document into a single list.
    for (int docid = 0; docid < data.documents.size(); docid++)
        for (int sid = 0; sid < data.documents.get(docid).sentences.size(); sid++)
            sentences.add(data.documents.get(docid).sentences.get(sid));
    // Record each surface form, both case-sensitively and lowercased.
    for (LinkedVector sentence : sentences)
        for (int j = 0; j < sentence.size(); j++) {
            String form = ((NEWord) sentence.get(j)).form;
            tokensHash.put(form, true);
            tokensHashIC.put(form.toLowerCase(), true);
        }
    /*
     * System.out.println("Data statistics:");
     * System.out.println("\t\t- Total tokens with repetitions ="+ totalTokens);
     * System.out.println("\t\t- Total unique tokens ="+ tokensHash.size());
     * System.out.println("\t\t- Total unique tokens ignore case ="+ tokensHashIC.size());
     */
    // For each Brown-clusters resource, collect the tokens it does not cover.
    for (THashMap<String, String> wordToPath : wordToPathByResource) {
        HashMap<String, Boolean> oovCaseSensitiveHash = new HashMap<>();
        HashMap<String, Boolean> oovAfterLowercasingHash = new HashMap<>();
        for (LinkedVector sentence : sentences) {
            for (int j = 0; j < sentence.size(); j++) {
                String form = ((NEWord) sentence.get(j)).form;
                if (!wordToPath.containsKey(form)) {
                    oovCaseSensitiveHash.put(form, true);
                }
                if ((!wordToPath.containsKey(form)) && (!wordToPath.containsKey(form.toLowerCase()))) {
                    oovAfterLowercasingHash.put(form.toLowerCase(), true);
                }
            }
        }
    }
}
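In this excerpt the per-resource OOV maps are built but never reported (the statistics output is commented out above). A minimal sketch of how such a summary could be printed from the collected maps; this helper is hypothetical and not part of cogcomp-nlp:

import java.util.HashMap;

// Hypothetical helper: report what fraction of the unique tokens is out of
// vocabulary for one Brown-clusters resource, given the maps built above.
public class OovReport {
    public static void print(HashMap<String, Boolean> tokensHash,
                             HashMap<String, Boolean> oovCaseSensitive,
                             HashMap<String, Boolean> oovAfterLowercasing) {
        // Guard against an empty token set to avoid division by zero.
        double total = Math.max(1, tokensHash.size());
        System.out.println("OOV case-sensitive:      "
                + String.format("%.1f%%", 100.0 * oovCaseSensitive.size() / total));
        System.out.println("OOV after lowercasing:   "
                + String.format("%.1f%%", 100.0 * oovAfterLowercasing.size() / total));
    }
}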
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class TwoLayerPredictionAggregationFeatures, method setLevel1AggregationFeatures:
// Are we aggregating to the right or to the left?
public static void setLevel1AggregationFeatures(Data data, boolean useGoldData) {
    logger.debug("Extracting features for level 2 inference");
    // Walk every token of every sentence of every document and attach
    // the level-1 aggregation features to it.
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector twords : sentences) {
            for (int j = 0; j < twords.size(); j++) {
                setLevel1AggregationFeatures((NEWord) twords.get(j), useGoldData);
            }
        }
    }
    logger.debug("Done - Extracting features for level 2 inference");
}
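A hypothetical call site (the variable names are illustrative): judging from the parameter name, useGoldData presumably selects gold labels during training and level-1 predictions at inference time.

// Training: aggregate over gold labels. Inference: aggregate over level-1 output.
TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(trainData, true);
TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(testData, false);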
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class POSBracketToVector, method parsePOSBracketForm:
/**
* Given a single line of textual input (containing all and only the words
* in a single sentence) in the format shown above, this method parses and
* returns a <code>LinkedVector</code>.
*
* @param line A single line of text.
* @return A <code>LinkedVector</code> representing the input text.
**/
public static LinkedVector parsePOSBracketForm(String line) {
    String[] tokens = line.trim().split(" ");
    if (tokens.length == 0 || tokens.length == 1 && (tokens[0] == null || tokens[0].length() == 0))
        return new LinkedVector();
    // Tokens come in pairs of the form "(POS" "word)"; the spaces in the
    // original line are used to recover character offsets for each word.
    int spaceIndex = line.indexOf(' ');
    spaceIndex = line.indexOf(' ', spaceIndex + 1);
    Word w = new Word(tokens[1].substring(0, tokens[1].length() - 1), tokens[0].substring(1), 0, spaceIndex - 1);
    for (int i = 2; i < tokens.length; i += 2) {
        int start = spaceIndex + 1;
        spaceIndex = line.indexOf(' ', spaceIndex + 1);
        spaceIndex = line.indexOf(' ', spaceIndex + 1);
        w.next = new Word(tokens[i + 1].substring(0, tokens[i + 1].length() - 1), tokens[i].substring(1), w, start, spaceIndex - 1);
        w = (Word) w.next;
    }
    return new LinkedVector(w);
}
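A short usage sketch; the input string is illustrative, and the form and partOfSpeech field names are assumed from the Word constructor calls above:

// Hypothetical input: each token written in bracketed POS form, "(POS word)".
LinkedVector sentence = POSBracketToVector.parsePOSBracketForm("(DT The) (NN dog) (VBD barked)");
for (int i = 0; i < sentence.size(); i++) {
    Word word = (Word) sentence.get(i);
    System.out.println(word.form + "\t" + word.partOfSpeech);
}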
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class Sentence, method wordSplit:
/**
* Creates and returns a <code>LinkedVector</code> representation of this
* sentence in which every <code>LinkedChild</code> is a <code>Word</code>.
* Offset information is respected and propagated.
*
* @return A <code>LinkedVector</code> representation of this sentence.
* @see Word
**/
public LinkedVector wordSplit() {
    LinkedList<Integer> boundaries = new LinkedList<>();
    // Whitespace always signals a word boundary.
    Matcher m = pSpaces.matcher(text);
    while (m.find()) {
        myAdd(boundaries, m.start() - 1);
        myAdd(boundaries, m.end());
    }
    // Drop a boundary that points past the end of the text; there's whitespace there.
    if (boundaries.size() > 0 && boundaries.getLast() >= text.length())
        boundaries.removeLast();
    else
        myAdd(boundaries, text.length() - 1);
    // Likewise, drop a boundary that points before the start of the text.
    if (boundaries.size() > 1 && boundaries.getFirst() == -1)
        boundaries.removeFirst();
    else
        myAdd(boundaries, 0);
    // Commas are separate words unless they're part of a number.
    for (int i = text.indexOf(','); i != -1; i = text.indexOf(',', i + 1)) {
        if (i > 0 && text.charAt(i - 1) != ',' && (pNoSpaceOrDigit.matcher(text.substring(i - 1, i)).find() || i + 1 == text.length() && pDigit.matcher(text.substring(i - 1, i)).find() || i + 1 < text.length() && pDigitCommaNoDigit.matcher(text.substring(i - 1, i + 2)).find())) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i + 1 < text.length() && (pNoSpaceOrDigit.matcher(text.substring(i + 1, i + 2)).find() || i == 0 && pDigit.matcher(text.substring(i + 1, i + 2)).find() || i > 0 && pNoDigitCommaDigit.matcher(text.substring(i - 1, i + 2)).find())) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Apostrophes: split off n't (as in "can't", "won't", "shouldn't", "aren't")
    // and clitics such as 's, 'd, 'm, 'll, 're and 've.
    for (int i = text.indexOf('\''); i != -1; i = text.indexOf('\'', i + 1)) {
        if (i - 1 > 0 && Character.isLetter(text.charAt(i - 2)) && text.charAt(i - 1) == 'n' && i + 1 < text.length() && text.charAt(i + 1) == 't' && (i + 2 == text.length() || !Character.isLetter(text.charAt(i + 2)) && text.charAt(i + 2) != '\'')) {
            myAdd(boundaries, i - 2);
            myAdd(boundaries, i - 1);
        } else if (i > 0 && (pApostropheMask.matcher(text.substring(i - 1, i)).find() && i + 1 < text.length() && text.charAt(i + 1) == '\'' || text.charAt(i - 1) == 's' && (i + 1 == text.length() || !Character.isLetter(text.charAt(i + 1)) && text.charAt(i + 1) != '\'') || Character.isLetter(text.charAt(i - 1)) && (i + 1 < text.length() && (i + 2 == text.length() || !Character.isLetter(text.charAt(i + 2)) && text.charAt(i + 2) != '\'') && (text.charAt(i + 1) == 'd' || text.charAt(i + 1) == 'm' || text.charAt(i + 1) == 's') || i + 2 < text.length() && (i + 3 == text.length() || !Character.isLetter(text.charAt(i + 3)) && text.charAt(i + 3) != '\'') && (text.substring(i + 1, i + 3).equals("ll") || text.substring(i + 1, i + 3).equals("re") || text.substring(i + 1, i + 3).equals("ve"))) || text.charAt(i - 1) == '.' && i - 1 > 0 && Character.isLetter(text.charAt(i - 2)) && i + 1 < text.length() && (i + 2 == text.length() || !Character.isLetter(text.charAt(i + 2)) && text.charAt(i + 2) != '\'') && text.charAt(i + 1) == 's')) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i + 1 < text.length() && pApostropheMask.matcher(text.substring(i + 1, i + 2)).find() && (!Character.isLetter(text.charAt(i + 1)) || i > 0 && text.charAt(i - 1) == '\'')) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Colons are separate words unless they are part of a larger structure such as a URL.
    for (int i = text.indexOf(':'); i != -1; i = text.indexOf(':', i + 1))
        if (!(i >= 2 && i + 2 < text.length() && pColonSeparator.matcher(text.substring(i - 2, i + 3)).find() || i > 2 && i + 2 < text.length() && (text.substring(i - 2, i + 3).equals("tp://") || text.substring(i - 2, i + 3).equals("TP://")) || partOfURL(i))) {
            if (i >= 1 && pColonMask.matcher(text.substring(i - 1, i)).find()) {
                myAdd(boundaries, i - 1);
                myAdd(boundaries, i);
            }
            if (i + 1 < text.length() && pColonMask.matcher(text.substring(i + 1, i + 2)).find()) {
                myAdd(boundaries, i);
                myAdd(boundaries, i + 1);
            }
        }
    // Slashes are separate words unless they are part of a larger structure such as a URL.
    for (int i = text.indexOf('/'); i != -1; i = text.indexOf('/', i + 1))
        if (!(i >= 2 && i + 2 < text.length() && pSlashSeparator.matcher(text.substring(i - 2, i + 3)).find() || i > 3 && i + 1 < text.length() && (text.substring(i - 3, i + 2).equals("tp://") || text.substring(i - 3, i + 2).equals("TP://")) || i > 4 && (text.substring(i - 4, i + 1).equals("tp://") || text.substring(i - 4, i + 1).equals("TP://")) || partOfURL(i))) {
            if (i >= 1 && pSlashMask.matcher(text.substring(i - 1, i)).find()) {
                myAdd(boundaries, i - 1);
                myAdd(boundaries, i);
            }
            if (i + 1 < text.length() && pSlashMask.matcher(text.substring(i + 1, i + 2)).find()) {
                myAdd(boundaries, i);
                myAdd(boundaries, i + 1);
            }
        }
    // Dashes are separate words unless they are part of some useful structure
    // like a compound word, a number, or a URL.
    for (int i = text.indexOf('-'); i != -1; i = text.indexOf('-', i + 1))
        if (!(i + 1 < text.length() && i >= 1 && pDashSeparator.matcher(text.substring(i - 1, i + 2)).find() || (i + 2 < text.length() && (i == 0 && pNegative1.matcher(text.substring(i, i + 3)).find() || i > 0 && pNegative2.matcher(text.substring(i - 1, i + 3)).find())) || partOfURL(i))) {
            if (i >= 1 && pDashMask.matcher(text.substring(i - 1, i)).find()) {
                myAdd(boundaries, i - 1);
                myAdd(boundaries, i);
            }
            if (i + 1 < text.length() && pDashMask.matcher(text.substring(i + 1, i + 2)).find()) {
                myAdd(boundaries, i);
                myAdd(boundaries, i + 1);
            }
        }
    // Dollar signs are separate words unless they are part of a monetary amount or a URL.
    for (int i = text.indexOf('$'); i != -1; i = text.indexOf('$', i + 1))
        if (!(i == 0 && i + 2 < text.length() && pMoney1.matcher(text.substring(i, i + 3)).find() || i > 0 && i + 2 < text.length() && pMoney2.matcher(text.substring(i - 1, i + 3)).find() || partOfURL(i))) {
            if (i >= 1 && pDollarMask.matcher(text.substring(i - 1, i)).find()) {
                myAdd(boundaries, i - 1);
                myAdd(boundaries, i);
            }
            if (i + 1 < text.length() && pDollarMask.matcher(text.substring(i + 1, i + 2)).find()) {
                myAdd(boundaries, i);
                myAdd(boundaries, i + 1);
            }
        }
    // Three or more consecutive periods form their own word.
    for (int i = text.indexOf('.'); i != -1; i = text.indexOf('.', i + 1)) {
        if (i > 0 && i + 2 < text.length() && pBeforeElipsis.matcher(text.substring(i - 1, i + 3)).find()) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i >= 2 && i + 1 < text.length() && pAfterElipsis.matcher(text.substring(i - 2, i + 2)).find()) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // If the last occurrence of a period in the sentence comes after all
    // occurrences of letters and digits, it is an end of sentence marker
    // which constitutes its own word, unless it appears immediately after two
    // other periods.
    int period = text.lastIndexOf('.');
    if (period != -1) {
        boolean endOfSentence = true;
        for (int i = period + 1; i < text.length() && endOfSentence; ++i)
            endOfSentence = !Character.isLetterOrDigit(text.charAt(i));
        if (endOfSentence) {
            if (period >= 1 && (text.charAt(period - 1) != '.' || period == 1 || text.charAt(period - 2) != '.') && pDollarMask.matcher(text.substring(period - 1, period)).find()) {
                myAdd(boundaries, period - 1);
                myAdd(boundaries, period);
            }
            if (period + 1 < text.length() && (period == 0 || text.charAt(period - 1) != '.' || period == 1 || text.charAt(period - 2) != '.') && pDollarMask.matcher(text.substring(period + 1, period + 2)).find()) {
                myAdd(boundaries, period);
                myAdd(boundaries, period + 1);
            }
        } else
            period = -1;
    }
    // All other punctuation marks constitute their own words, unless they
    // appear immediately after themselves (consecutive identical punctuation
    // marks form a single word) or are part of a URL.
    m = pPunctuation.matcher(text);
    while (m.find())
        if (!partOfURL(m.start())) {
            if (m.start() + 1 < text.length() && text.charAt(m.start()) != text.charAt(m.start() + 1) && m.start() + 1 != period && pPunctuation.matcher(text.substring(m.start() + 1, m.start() + 2)).find()) {
                myAdd(boundaries, m.start());
                myAdd(boundaries, m.start() + 1);
            }
        }
    m = pPunctuation2.matcher(text);
    while (m.find())
        if (!partOfURL(m.start())) {
            myAdd(boundaries, m.start());
            myAdd(boundaries, m.start() + 1);
        }
    m = pPunctuation3.matcher(text);
    while (m.find())
        if (!partOfURL(m.start())) {
            myAdd(boundaries, m.start());
            myAdd(boundaries, m.start() + 1);
        }
    // Now we just have to create the LinkedVector.
    Integer[] temp = boundaries.toArray(new Integer[boundaries.size()]);
    int[] I = new int[temp.length];
    for (int i = 0; i < I.length; ++i)
        I[i] = temp[i];
    Arrays.sort(I);
    // Boundary indexes come in (start, end) pairs; build the linked chain of Words.
    Word w = new Word(text.substring(I[0], I[1] + 1), I[0] + start, I[1] + start);
    for (int i = 2; i < I.length; i += 2) {
        w.next = new Word(text.substring(I[i], I[i + 1] + 1), w, I[i] + start, I[i + 1] + start);
        w = (Word) w.next;
    }
    inURL = null;
    return new LinkedVector(w);
}
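A brief usage sketch, assuming LBJava's Sentence(String) constructor and the start/end offsets that Word inherits from LinkedChild; the example text is illustrative:

Sentence s = new Sentence("John's dog can't fetch the $5 ball.");
LinkedVector words = s.wordSplit();
// Each element is a Word carrying its character offsets within the sentence.
for (int i = 0; i < words.size(); i++) {
    Word w = (Word) words.get(i);
    System.out.println(w.form + " [" + w.start + ", " + w.end + "]");
}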
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class StringArraysToWords, method convert:
/**
* Given an array of <code>String</code>s, this method creates a new
* {@link LinkedVector} containing {@link Word}s.
*
* @param a An array of <code>String</code>s.
* @return A {@link LinkedVector} of {@link Word}s corresponding to the
* input <code>String</code>s.
**/
public static LinkedVector convert(String[] a) {
    if (a == null)
        return null;
    if (a.length == 0)
        return new LinkedVector();
    // Build a linked chain of Words, each linked to the previous one,
    // then wrap the chain in a LinkedVector.
    Word w = new Word(a[0]);
    for (int i = 1; i < a.length; ++i) {
        w.next = new Word(a[i], null, w);
        w = (Word) w.next;
    }
    return new LinkedVector(w);
}
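A short usage sketch with an illustrative token array:

String[] tokens = { "The", "dog", "barked", "." };
LinkedVector sentence = StringArraysToWords.convert(tokens);
// Walk the resulting chain of Words.
Word w = (Word) sentence.get(0);
while (w != null) {
    System.out.println(w.form);
    w = (Word) w.next;
}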