Example usage of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in the cogcomp-nlp project by CogComp: the main method of the HashCollisionReport class.
/**
 * Read each test file in the given directory, tokenize it and create the token view, then
 * check the resulting token constituents for hash-code collisions and report any found on
 * standard error.
 * @param args args[0] is the name of a directory containing the files to test against
 * @throws IOException if a test file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length == 0)
        error("Must pass in the name of a directory with files to test against.");
    File dir = new File(args[0]);
    if (!dir.exists()) {
        error("The directory did not exist : " + dir);
    }
    if (!dir.isDirectory()) {
        error("The path was not a directory : " + dir);
    }
    // listFiles() returns null on an I/O error even for an existing directory.
    File[] files = dir.listFiles();
    if (files == null) {
        error("Could not list the contents of : " + dir);
        return;
    }
    for (File file : files) {
        if (file.isFile()) {
            String normal = FileUtils.readFileToString(file);
            TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
            List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
            // Map each token's hash code to the first constituent that produced it; if the
            // code is already used, report the colliding pair.
            HashMap<Integer, Constituent> hashmap = new HashMap<>();
            for (Constituent c : normalToks) {
                int code = c.hashCode();
                if (hashmap.containsKey(code)) {
                    Constituent dup = hashmap.get(code);
                    System.err.println(c + " == " + dup);
                } else {
                    hashmap.put(code, c);
                }
            }
        }
    }
}
Example usage of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in the cogcomp-nlp project by CogComp: the testCharacterOffsetToTokenIndex method of the StatefullTokenizerTest class.
/**
 * Test whether the mapping between character offset and token index is correct: build a
 * second TextAnnotation over the same token stream shifted by a leading-waste prefix, and
 * verify shifted offsets map correctly while offsets outside any token map to -1.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());
    int ignoreUpToOffset = leadingWaste.length();
    // Size from the actual token count: a hard-coded length breaks with an
    // ArrayIndexOutOfBoundsException if the tokenizer's segmentation ever changes.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    // Shift every token's character span past the leading-waste prefix.
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }
    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokOtherStart, (thirdTokNormalStart + leadingWaste.length()));
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokOtherStart, (eighthTokNormalStart + leadingWaste.length()));
    // Offsets inside the waste prefix, past the end, or between tokens all map to -1.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Example usage of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in the cogcomp-nlp project by CogComp: the testInit method of the NerInitTest class.
/**
 * Verify that the NER annotator can be built from a configuration with gazetteer features
 * disabled and no training noise, and that it produces a non-empty NER_CONLL view for a
 * sample text.
 */
@Test
public void testInit() {
    Properties props = new Properties();
    props.setProperty(NerBaseConfigurator.GAZETTEER_FEATURES, "0");
    props.setProperty(NerBaseConfigurator.RANDOM_NOISE_LEVEL, "0.0");
    props.setProperty(NerBaseConfigurator.OMISSION_RATE, "0.0");
    ResourceManager rm = (new NerBaseConfigurator()).getConfig(new ResourceManager(props));
    NERAnnotator ner = NerAnnotatorManager.buildNerAnnotator(rm, ViewNames.NER_CONLL);
    assertNotNull(ner);
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = tab.createTextAnnotation(TESTSTR);
    try {
        ner.getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Use assertTrue, not the 'assert' keyword: JVM assertions are disabled by default
    // (no -ea flag), so a bare 'assert' would silently never check anything.
    assertTrue(ta.hasView(ViewNames.NER_CONLL));
    assertTrue(ta.getView(ViewNames.NER_CONLL).getConstituents().size() >= 1);
}
Example usage of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in the cogcomp-nlp project by CogComp: the main method of the MascReader class.
/**
 * Read sections of corpus into TextAnnotations, write out TextAnnotations in json format.
 * Specify MASC root dir of written files, e.g. /home/mssammon/work/data/masc-ccg/written/
 * Also re-tokenizes each gold document with a StatefulTokenizer and reports token- and
 * sentence-boundary accuracy of the predicted segmentation against the gold spans.
 * @param args args[0] is the MASC corpus directory, args[1] the gold-output directory
 *             (predicted output goes to args[1] + "_PRED")
 */
public static void main(String[] args) {
if (args.length != 2) {
System.err.println("Usage: " + NAME + " mascCorpusDir outDir");
System.exit(-1);
}
String corpusDir = args[0];
String outDirGold = args[1];
// Predicted (re-tokenized) annotations are written next to the gold ones.
String outDirPred = outDirGold + "_PRED";
Properties props = new Properties();
props.setProperty(CorpusReaderConfigurator.CORPUS_DIRECTORY.key, corpusDir);
props.setProperty(CorpusReaderConfigurator.SOURCE_DIRECTORY.key, corpusDir);
IOUtils.mkdir(outDirGold);
IOUtils.mkdir(outDirPred);
ResourceManager rm = new ResourceManager(props);
MascReader reader = null;
try {
reader = new MascReader(rm);
} catch (Exception e) {
e.printStackTrace();
System.exit(-1);
}
TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));
// Accumulators for token- and sentence-span accuracy over the whole corpus.
int numGoldTokCorrect = 0;
int numGoldTokTotal = 0;
int numGoldSentCorrect = 0;
int numGoldSentTotal = 0;
while (reader.hasNext()) {
TextAnnotation goldTa = reader.next();
String text = goldTa.getText();
// Tokenizer.Tokenization tknz = tokenizer.tokenizeTextSpan(text);
// Re-tokenize the raw gold text to produce the predicted segmentation.
TextAnnotation predTa = taBldr.createTextAnnotation(goldTa.getCorpusId() + "_PREDICTED", goldTa.getId(), text);
// Score predicted token spans against the gold character offsets.
IntPair[] goldTokCharOffsets = getCharacterOffsets(goldTa.getView(ViewNames.TOKENS));
numGoldTokTotal += goldTokCharOffsets.length;
numGoldTokCorrect += countCorrectSpans(predTa.getView(ViewNames.TOKENS), goldTokCharOffsets);
// Score predicted sentence spans the same way.
IntPair[] goldSentCharOffsets = getCharacterOffsets(goldTa.getView(ViewNames.SENTENCE));
numGoldSentTotal += goldSentCharOffsets.length;
numGoldSentCorrect += countCorrectSpans(predTa.getView(ViewNames.SENTENCE), goldSentCharOffsets);
// Write the gold annotation as json into the gold output directory.
String taJson = SerializationHelper.serializeToJson(goldTa, true);
String outFile = Paths.get(outDirGold, goldTa.getId() + ".json").toString();
try {
logger.trace("Writing file out to '{}'...", outFile);
LineIO.write(outFile, Collections.singletonList(taJson));
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
// Write the predicted annotation as json into the predicted output directory.
outFile = Paths.get(outDirPred, predTa.getId() + ".json").toString();
String predTaJson = SerializationHelper.serializeToJson(predTa, true);
try {
logger.debug("writing file '{}'...", outFile);
LineIO.write(outFile, Collections.singletonList(predTaJson));
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
logger.debug("## finished processing file '{}'.", goldTa.getId());
}
System.out.println(reader.generateReport());
System.out.print("TOKEN PERFORMANCE:");
computeAndPrintAcc(numGoldTokCorrect, numGoldTokTotal);
System.out.print("SENTENCE PERFORMANCE:");
computeAndPrintAcc(numGoldSentCorrect, numGoldSentTotal);
}
Example usage of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in the cogcomp-nlp project by CogComp: the testChinese method of the MultilingualEreReaderTest class.
/**
 * Build an ERE event reader over the Chinese corpus using a Chinese-specific tokenizer,
 * failing the test if construction throws, then run the shared reader checks against it.
 */
public static void testChinese() {
    EREEventReader eventReader = null;
    try {
        TextAnnotationBuilder zhBuilder = MultiLingualTokenizer.getTokenizer(Language.Chinese.getCode());
        final boolean failOnBadXml = true;
        eventReader = new EREEventReader(EREDocumentReader.EreCorpus.ENR3, zhBuilder, chinesePathB, failOnBadXml);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    testReader(eventReader);
}
Aggregations