use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class TitleTextNormalizer method normalizeCase.
/**
 * Normalizes, in place, the case of every word that occurs in a sentence for which
 * {@code mixedCase(sentence)} is false.
 *
 * <p>Does nothing when {@code ParametersForLbjCode.currentParameters.normalizeTitleText} is
 * disabled. Calls {@code init()} first if {@code lowercasedToNormalizedTokensMap} has not been
 * populated yet.
 *
 * <p>For each word to normalize, the new form is chosen in this order:
 * <ol>
 * <li>the literal form {@code "A"} always becomes {@code "a"};</li>
 * <li>if the word has a {@code normalizedMostLinkableExpression} that contains the word
 * (case-insensitively), the matching slice of that expression is used, unless the slice starts
 * lowercase and an uppercased spelling was recorded for it, in which case the first recorded
 * spelling wins;</li>
 * <li>otherwise the lowercased form if it was recorded in {@code lowercasedForms}, else the first
 * recorded uppercased spelling, else the entry in {@code lowercasedToNormalizedTokensMap}, else
 * the word is left unchanged.</li>
 * </ol>
 * Finally, a word with no predecessor ({@code w.previous == null}) gets its first character
 * upper-cased (the {@code "A"} special case above is exempt, as before).
 *
 * @param data the documents whose {@code NEWord}s are updated in place ({@code form},
 *        {@code normalizedForm} and {@code isCaseNormalized} are written)
 */
public static void normalizeCase(Data data) {
    if (!ParametersForLbjCode.currentParameters.normalizeTitleText)
        return;
    if (lowercasedToNormalizedTokensMap == null)
        init();
    // Words from sentences that are NOT mixed-case; these are the ones we rewrite.
    // (For CoNLL data the original author notes these are essentially the title words.)
    HashMap<NEWord, Boolean> wordsToNormalize = new HashMap<>();
    // Words from mixed-case sentences; used as evidence for how a token is normally written.
    HashMap<NEWord, Boolean> wordsInMixedCaseSentences = new HashMap<>();
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector sentence : sentences) {
            if (mixedCase(sentence)) {
                // The first word of a sentence is excluded on purpose (index starts at 1).
                for (int j = 1; j < sentence.size(); j++)
                    wordsInMixedCaseSentences.put((NEWord) sentence.get(j), true);
            } else {
                for (int j = 0; j < sentence.size(); j++)
                    wordsToNormalize.put((NEWord) sentence.get(j), true);
            }
        }
    }
    for (NEWord w : wordsToNormalize.keySet()) {
        w.isCaseNormalized = true;
        if (w.form.equals("A")) {
            w.normalizedForm = "a";
            w.form = w.normalizedForm;
            continue;
        }
        // Filled by getNeighborhoodWordStatistics: forms seen lowercased in the document.
        // Per the original author this must exclude sentence-initial words.
        HashMap<String, Boolean> lowercasedForms = new HashMap<>();
        // Filled by getNeighborhoodWordStatistics: for a mixed-case string such as "McLaren",
        // the spellings seen for it (McLaren, MCLAREN, ...), keyed by the lowercased string.
        // The first element of topWords is taken as the preferred spelling.
        HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart = new HashMap<>();
        getNeighborhoodWordStatistics(w, wordsInMixedCaseSentences,
                uppercasedFormsInMixedCaseNonSentenceStart, lowercasedForms);
        // NOTE: w.originalForm is deliberately not overwritten here; the original author
        // warns that doing so causes all sorts of problems.
        String key = w.form.toLowerCase();
        String normalized = null;

        String expression = w.normalizedMostLinkableExpression;
        if (expression != null) {
            int start = expression.toLowerCase().indexOf(key);
            int end = start + w.form.length();
            // indexOf returns -1 when the expression does not contain the word, and lower-casing
            // may change the string length; only slice when the range is valid. Otherwise fall
            // through to the lookup chain below instead of throwing.
            if (start >= 0 && end <= expression.length()) {
                String candidate = expression.substring(start, end);
                String candidateKey = candidate.toLowerCase();
                if (!candidate.isEmpty() && Character.isLowerCase(candidate.charAt(0))
                        && uppercasedFormsInMixedCaseNonSentenceStart.containsKey(candidateKey))
                    normalized = uppercasedFormsInMixedCaseNonSentenceStart.get(candidateKey).topWords.elementAt(0);
                else
                    normalized = candidate;
            }
        }

        if (normalized == null) {
            if (lowercasedForms.containsKey(key))
                normalized = key;
            else if (uppercasedFormsInMixedCaseNonSentenceStart.containsKey(key))
                normalized = uppercasedFormsInMixedCaseNonSentenceStart.get(key).topWords.elementAt(0);
            else if (lowercasedToNormalizedTokensMap.containsKey(key))
                normalized = lowercasedToNormalizedTokensMap.get(key);
            else
                normalized = w.form;
        }
        w.normalizedForm = normalized;

        // A word with no predecessor starts its sentence: capitalize it. The emptiness check
        // avoids charAt(0) on an empty token.
        if (w.previous == null && !w.normalizedForm.isEmpty()
                && Character.isLowerCase(w.normalizedForm.charAt(0)))
            w.normalizedForm = Character.toUpperCase(w.normalizedForm.charAt(0)) + w.normalizedForm.substring(1);
        w.form = w.normalizedForm;
    }
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class NETagPlain method tagData.
/**
 * Tags one plain-text file, or every entry of a directory whose name does not start with
 * {@code "."}, and writes the tagged text to the corresponding output path.
 *
 * <p>When {@code inputPath} is a directory, each entry {@code name} is read from
 * {@code inputPath/name} and written to {@code outputPath/name}; otherwise {@code inputPath} is
 * read and {@code outputPath} is written directly.
 *
 * <p>Tagging uses the static {@code tagger1} and {@code tagger2}; no models are loaded here.
 * NOTE(review): presumably {@link #init()} must have been called beforehand to set them up -
 * confirm against callers.
 *
 * @param inputPath a plain-text file, or a directory of plain-text files
 * @param outputPath the output file, or the output directory when {@code inputPath} is a
 *        directory
 * @throws Exception if the directory cannot be listed, or if reading, annotating, tagging or
 *         writing fails
 */
public static void tagData(String inputPath, String outputPath) throws Exception {
    File f = new File(inputPath);
    Vector<String> inFiles = new Vector<>();
    Vector<String> outFiles = new Vector<>();
    if (f.isDirectory()) {
        String[] files = f.list();
        // File.list() returns null on an I/O error; report it instead of failing with an NPE.
        if (files == null)
            throw new java.io.IOException("Unable to list directory: " + inputPath);
        for (String file : files) {
            if (!file.startsWith(".")) {
                inFiles.addElement(inputPath + File.separator + file);
                outFiles.addElement(outputPath + File.separator + file);
            }
        }
    } else {
        inFiles.addElement(inputPath);
        outFiles.addElement(outputPath);
    }
    for (int fileId = 0; fileId < inFiles.size(); fileId++) {
        logger.debug("Tagging file: " + inFiles.elementAt(fileId));
        ArrayList<LinkedVector> sentences = PlainTextReader.parsePlainTextFile(inFiles.elementAt(fileId));
        NERDocument doc = new NERDocument(sentences, "consoleInput");
        Data data = new Data(doc);
        ExpressiveFeaturesAnnotator.annotate(data);
        // Formerly there was code to load models here. Check that NETagPlain.init() is
        // happening.
        String tagged = tagData(data, tagger1, tagger2);
        OutFile out = new OutFile(outFiles.elementAt(fileId));
        try {
            out.println(tagged);
        } finally {
            // Always release the output file, even if the write fails.
            out.close();
        }
    }
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class PlainTextReader method parseText.
/**
 * Splits {@code text} into sentences and tokens and builds one {@code LinkedVector} per
 * sentence.
 *
 * <p>Each token is appended to its sentence through {@code NEWord.addTokenToSentence} with the
 * label {@code "unlabeled"}. Once all sentences are built they are passed to
 * {@code TaggedDataReader.connectSentenceBoundaries}.
 *
 * @param text the raw text to split and tokenize
 * @return the sentences, in the order produced by {@code sentenceSplitAndTokenizeText}
 */
public static ArrayList<LinkedVector> parseText(String text) {
    Vector<Vector<String>> tokenizedSentences = sentenceSplitAndTokenizeText(text);
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    for (Vector<String> tokens : tokenizedSentences) {
        LinkedVector sentence = new LinkedVector();
        for (String token : tokens) {
            NEWord.addTokenToSentence(sentence, token, "unlabeled");
        }
        sentences.add(sentence);
    }
    TaggedDataReader.connectSentenceBoundaries(sentences);
    return sentences;
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class TaggedDataReader method readFolder.
/**
 * Reads every regular file in {@code path} (except {@code .DS_Store}) into an
 * {@code NERDocument} via {@code readFile}.
 *
 * <p>Files are processed in lexical order when
 * {@code ParametersForLbjCode.currentParameters.sortLexicallyFilesInFolders} is set; otherwise
 * in whatever order {@code File.list()} returns them.
 *
 * <p>When {@code treatAllFilesInFolderAsOneBigDocument} is set, the last word of each document
 * is linked to the first word of the next one through {@code nextIgnoreSentenceBoundary} /
 * {@code previousIgnoreSentenceBoundary}. A pair is skipped when either document has no
 * sentences, or when the boundary sentence on either side is empty.
 *
 * @param path the folder to read
 * @param format the data format, passed through to {@code readFile}
 * @return one document per file read
 * @throws Exception if {@code path} cannot be listed (not a directory, or an I/O error), or if
 *         {@code readFile} fails
 */
public static Vector<NERDocument> readFolder(String path, String format) throws Exception {
    Vector<NERDocument> res = new Vector<>();
    String[] files = (new File(path)).list();
    // File.list() returns null when path is not a directory or cannot be read; fail with a
    // clear message instead of an NPE further down.
    if (files == null)
        throw new java.io.IOException("Cannot list files in folder: " + path);
    // sort the files so we can get deterministic order.
    if (ParametersForLbjCode.currentParameters.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }
    for (String file1 : files) {
        String file = path + "/" + file1;
        if ((new File(file)).isFile() && (!file1.equals(".DS_Store"))) {
            res.addElement(readFile(file, format, file1));
        }
    }
    if (ParametersForLbjCode.currentParameters.treatAllFilesInFolderAsOneBigDocument) {
        // connecting sentence boundaries across consecutive documents
        for (int i = 0; i < res.size() - 1; i++) {
            ArrayList<LinkedVector> ss1 = res.elementAt(i).sentences;
            ArrayList<LinkedVector> ss2 = res.elementAt(i + 1).sentences;
            if (ss1.size() > 0 && ss1.get(ss1.size() - 1).size() > 0 && ss2.size() > 0 && ss2.get(0).size() > 0) {
                LinkedVector lastSentence1 = ss1.get(ss1.size() - 1);
                NEWord lastWord1 = (NEWord) lastSentence1.get(lastSentence1.size() - 1);
                NEWord firstWord2 = (NEWord) ss2.get(0).get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }
    return res;
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class TextChunkRepresentationManager method IOEb2Bio.
/**
 * Rewrites, in place, the predictions selected by {@code labelType} so that chunk starts carry a
 * {@code "B-"} prefix and {@code "E-"} prefixes are replaced by {@code "I-"}.
 *
 * <p>Words are visited from the last word of the last sentence back to the first, so that when a
 * word is examined its predecessor's prediction has not been rewritten yet.
 *
 * <p>For every word whose prediction is neither {@code null} nor {@code "O"} (case-insensitive):
 * <ul>
 * <li>the prediction becomes {@code "B-" + suffix} when the previous word's label suffix differs
 * from this word's, or the previous word's prediction starts with {@code "E-"};</li>
 * <li>otherwise, if the prediction starts with {@code "E-"}, it becomes
 * {@code "I-" + suffix}.</li>
 * </ul>
 * A missing previous word, or a previous word without a prediction, counts as {@code "O"}.
 *
 * @param data the documents whose word predictions are rewritten
 * @param labelType which prediction of each word to read and write
 */
private static void IOEb2Bio(Data data, LabelToLookAt labelType) {
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (int i = sentences.size() - 1; i >= 0; i--) {
            LinkedVector v = sentences.get(i);
            for (int j = v.size() - 1; j >= 0; j--) {
                NEWord w = (NEWord) v.get(j);
                String label = w.getPrediction(labelType);
                if (label != null && !label.equalsIgnoreCase("O")) {
                    // Labels are expected to look like "X-TYPE"; drop the two-character prefix.
                    String labelSuffix = label.substring(2);
                    NEWord prev = (NEWord) w.previous;
                    String prevLabel = "O";
                    if (prev != null) {
                        String prevPrediction = prev.getPrediction(labelType);
                        // The current word's prediction is null-checked above; apply the same
                        // care to the previous word instead of failing with an NPE.
                        if (prevPrediction != null)
                            prevLabel = prevPrediction;
                    }
                    String prevLabelSuffix = prevLabel;
                    if (prevLabel.contains("-"))
                        prevLabelSuffix = prevLabel.substring(2);
                    if ((!prevLabelSuffix.equalsIgnoreCase(labelSuffix)) || prevLabel.startsWith("E-"))
                        w.setPrediction("B-" + labelSuffix, labelType);
                    // Re-read the prediction: if it was just set to "B-..." this does not fire.
                    if (w.getPrediction(labelType).startsWith("E-"))
                        w.setPrediction("I-" + labelSuffix, labelType);
                }
            }
        }
    }
}
Aggregations