Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp: class WordTopicAndLayoutFeatures, method addDatasets.
/*
 * Note: this assumes the data is split by documents. If document boundaries are
 * ignored, the per-document topic classification below is meaningless!
 */
/**
 * Classifies each document in the dataset with the topic classifier and records
 * the predicted topic id for every word of that document in wordToTopicIdMap.
 * A document boundary is detected where the last word of a sentence has no
 * nextIgnoreSentenceBoundary link.
 *
 * @param sentences all sentences of the dataset, in document order
 * @param lowercaseData whether to lowercase the document text before classifying
 * @param confidenceThreshold minimum confidence passed through to the classifier
 * @throws Exception if the topic classifier has not been initialized
 */
public static void addDatasets(Vector<LinkedVector> sentences, boolean lowercaseData, double confidenceThreshold) throws Exception {
    if (nb == null || map == null)
        throw new Exception("Topic classifier not initialized!!!");
    // StringBuilder avoids the O(n^2) cost of repeated String concatenation
    // that the per-word "documentText += ..." pattern incurs over a document.
    StringBuilder documentText = new StringBuilder();
    Vector<NEWord> docWords = new Vector<>();
    for (int sid = 0; sid < sentences.size(); sid++) {
        LinkedVector s = sentences.elementAt(sid);
        for (int i = 0; i < s.size(); i++) {
            documentText.append(' ').append(((NEWord) s.get(i)).originalForm).append(' ');
            docWords.addElement((NEWord) s.get(i));
        }
        // Guard against an empty sentence: s.get(s.size() - 1) would otherwise fail.
        if (s.size() == 0)
            continue;
        if (((NEWord) s.get(s.size() - 1)).nextIgnoreSentenceBoundary == null) {
            // This is the last sentence in the document: classify the accumulated
            // text and tag every collected word with the predicted topic id.
            String text = documentText.toString();
            if (lowercaseData)
                text = text.toLowerCase();
            Document doc = new Document(InFile.tokenize(text, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"), -1);
            int label = nb.classify(doc, confidenceThreshold);
            logger.info("*********************\n" + labelnames[label + 1] + "\n*********************\n" + text.substring(0, Math.min(text.length(), 400)));
            for (int i = 0; i < docWords.size(); i++)
                wordToTopicIdMap.put(docWords.elementAt(i), label);
            // Reset the accumulators for the next document.
            documentText.setLength(0);
            docWords = new Vector<>();
        }
    }
}
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp: class TaggedDataWriter, method toColumnsFormat.
/**
 * Serializes the tagged data into a CoNLL-style column format, one token per
 * line, with a "-DOCSTART-" header line emitted at each document boundary and a
 * blank line between sentences.
 *
 * @param data the tagged documents to serialize
 * @param labelType which label field of each word to write in the first column
 * @return the full column-formatted text
 */
private static String toColumnsFormat(Data data, NEWord.LabelToLookAt labelType) {
    StringBuilder out = new StringBuilder(data.documents.size() * 1000);
    for (int d = 0; d < data.documents.size(); d++) {
        for (int s = 0; s < data.documents.get(d).sentences.size(); s++) {
            LinkedVector sentence = data.documents.get(d).sentences.get(s);
            // A missing backward cross-sentence link on the first token marks
            // the start of a new document.
            if (((NEWord) sentence.get(0)).previousIgnoreSentenceBoundary == null) {
                out.append("O 0 0 O -X- -DOCSTART- x x 0\n\n");
            }
            for (int t = 0; t < sentence.size(); t++) {
                NEWord token = (NEWord) sentence.get(t);
                out.append(token.getPrediction(labelType))
                        .append("\t0\t")
                        .append(t)
                        .append("\tO\tO\t")
                        .append(token.form)
                        .append("\tx\tx\t0\n");
            }
            out.append("\n");
        }
    }
    return out.toString();
}
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp: class TaggedDataReader, method connectSentenceBoundaries.
/**
 * Links every word to its neighbors across sentence boundaries: within each
 * sentence the *IgnoreSentenceBoundary pointers mirror the ordinary
 * previous/next links, and the first/last words of adjacent sentences are then
 * stitched together so the chain spans the whole document.
 *
 * @param sentences the document's sentences, in order
 */
public static void connectSentenceBoundaries(ArrayList<LinkedVector> sentences) {
    for (int s = 0; s < sentences.size(); s++) {
        LinkedVector sentence = sentences.get(s);
        // Within the sentence the cross-boundary links start out identical to
        // the plain previous/next links.
        for (int t = 0; t < sentence.size(); t++) {
            NEWord word = (NEWord) sentence.get(t);
            word.previousIgnoreSentenceBoundary = (NEWord) word.previous;
            word.nextIgnoreSentenceBoundary = (NEWord) word.next;
        }
        if (sentence.size() == 0)
            continue;
        // Stitch this sentence's first word back to the previous sentence's last word.
        if (s > 0) {
            LinkedVector prev = sentences.get(s - 1);
            ((NEWord) sentence.get(0)).previousIgnoreSentenceBoundary =
                    (NEWord) prev.get(prev.size() - 1);
        }
        // ...and this sentence's last word forward to the next sentence's first word.
        if (s < sentences.size() - 1) {
            ((NEWord) sentence.get(sentence.size() - 1)).nextIgnoreSentenceBoundary =
                    (NEWord) sentences.get(s + 1).get(0);
        }
    }
}
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp: class BrownClusters, method printOovData.
/**
 * Computes out-of-vocabulary (OOV) statistics for the given data against each
 * Brown-clusters resource in wordToPathByResource.
 *
 * NOTE(review): despite its name, this method currently produces NO output.
 * The token hashes (tokensHash, tokensHashIC) and the per-resource OOV maps
 * (oovCaseSensitiveHash, oovAfterLowercasingHash) are populated and then
 * discarded at the end of each loop iteration — presumably the reporting/
 * logging code was removed at some point. Either restore the reporting or
 * remove the dead computation; confirm against project history.
 */
public final void printOovData(Data data) {
    // Vocabulary of the dataset: exact forms and lowercased forms.
    HashMap<String, Boolean> tokensHash = new HashMap<>();
    HashMap<String, Boolean> tokensHashIC = new HashMap<>();
    // Flatten all documents into a single list of sentences.
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    for (int docid = 0; docid < data.documents.size(); docid++) for (int sid = 0; sid < data.documents.get(docid).sentences.size(); sid++) sentences.add(data.documents.get(docid).sentences.get(sid));
    for (LinkedVector sentence : sentences) for (int j = 0; j < sentence.size(); j++) {
        String form = ((NEWord) sentence.get(j)).form;
        tokensHash.put(form, true);
        tokensHashIC.put(form.toLowerCase(), true);
    }
    // For each Brown-clusters resource, collect the forms it does not cover.
    for (THashMap<String, String> wordToPath : wordToPathByResource) {
        HashMap<String, Boolean> oovCaseSensitiveHash = new HashMap<>();
        HashMap<String, Boolean> oovAfterLowercasingHash = new HashMap<>();
        for (LinkedVector sentence : sentences) {
            for (int j = 0; j < sentence.size(); j++) {
                String form = ((NEWord) sentence.get(j)).form;
                // OOV under exact (case-sensitive) lookup.
                if (!wordToPath.containsKey(form)) {
                    oovCaseSensitiveHash.put(form, true);
                }
                // Still OOV even after falling back to the lowercased form.
                if ((!wordToPath.containsKey(form)) && (!wordToPath.containsKey(form.toLowerCase()))) {
                    oovAfterLowercasingHash.put(form.toLowerCase(), true);
                }
            }
        }
    }
}
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp: class TitleTextNormalizer, method getNeighborhoodWordStatistics.
/**
 * Collects casing statistics for the words surrounding {@code word}, scanning
 * up to {@code WindowSize} neighbors in each direction, crossing sentence
 * boundaries. The first two parameters are inputs; the last two maps are the
 * out-parameters that receive the results:
 * <ul>
 *   <li>{@code uppercasedFormsInMixedCaseNonSentenceStart} — for each
 *       lowercased key, the top spellings seen with an uppercase first letter
 *       (only for words that are not sentence-initial);</li>
 *   <li>{@code lowecasedForms} — lowercased keys of words seen starting with a
 *       lowercase letter.</li>
 * </ul>
 *
 * @param word the anchor word whose neighborhood is examined
 * @param wordsInMixedCasedSentences only words present in this map are counted
 * @param uppercasedFormsInMixedCaseNonSentenceStart out-parameter (see above)
 * @param lowecasedForms out-parameter (see above)
 */
public static void getNeighborhoodWordStatistics(NEWord word, HashMap<NEWord, Boolean> wordsInMixedCasedSentences, HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart, HashMap<String, Boolean> lowecasedForms) {
    // The backward and forward scans are identical except for the direction of
    // traversal, so both delegate to a single helper.
    scanNeighborhood(word.previousIgnoreSentenceBoundary, false,
            wordsInMixedCasedSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowecasedForms);
    scanNeighborhood(word.nextIgnoreSentenceBoundary, true,
            wordsInMixedCasedSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowecasedForms);
}

/**
 * Walks up to WindowSize words from {@code start} (inclusive) in the given
 * direction, recording casing statistics into the two out-maps.
 *
 * @param start first neighbor to examine; may be null
 * @param forward true to follow nextIgnoreSentenceBoundary, false for previous
 */
private static void scanNeighborhood(NEWord start, boolean forward, HashMap<NEWord, Boolean> wordsInMixedCasedSentences, HashMap<String, CharacteristicWords> uppercasedForms, HashMap<String, Boolean> lowercasedForms) {
    NEWord temp = start;
    int count = 0;
    while (temp != null && count < WindowSize) {
        // Skip sentence-initial words (temp.previous == null): their
        // capitalization carries no information about the word itself.
        if (wordsInMixedCasedSentences.containsKey(temp) && temp.previous != null) {
            String w = temp.form;
            String key = w.toLowerCase();
            if (Character.isUpperCase(w.charAt(0))) {
                CharacteristicWords topSpellings = uppercasedForms.containsKey(key)
                        ? uppercasedForms.get(key)
                        : new CharacteristicWords(5);
                topSpellings.addElement(w, 1);
                uppercasedForms.put(key, topSpellings);
            }
            if (Character.isLowerCase(w.charAt(0)))
                lowercasedForms.put(key, true);
        }
        count++;
        temp = forward ? temp.nextIgnoreSentenceBoundary : temp.previousIgnoreSentenceBoundary;
    }
}
Aggregations