Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class HashCollisionReport, method main.
/**
 * Read each test file in the directory, tokenize and create the token view. Then check each
 * token's hash code against all previously seen tokens and report any collisions.
 * @param args args[0] is the name of a directory containing the files to test against.
 * @throws IOException if a test file cannot be read.
 */
public static void main(String[] args) throws IOException {
    if (args.length == 0)
        error("Must pass in the name of a directory with files to test against.");
    File dir = new File(args[0]);
    if (!dir.exists()) {
        error("The directory did not exist : " + dir);
    }
    if (!dir.isDirectory()) {
        error("The path was not a directory : " + dir);
    }
    File[] files = dir.listFiles();
    // listFiles() returns null on an I/O error even for an existing directory;
    // without this check the loop below would throw a NullPointerException.
    if (files == null) {
        error("Could not list the contents of directory : " + dir);
        return;
    }
    for (File file : files) {
        if (file.isFile()) {
            String normal = FileUtils.readFileToString(file);
            TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
            List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
            // Map hash code -> first constituent seen with that code; if a code
            // is already used, report the collision on stderr.
            HashMap<Integer, Constituent> hashmap = new HashMap<>();
            for (Constituent c : normalToks) {
                int code = c.hashCode();
                if (hashmap.containsKey(code)) {
                    Constituent dup = hashmap.get(code);
                    System.err.println(c + " == " + dup);
                } else {
                    hashmap.put(code, c);
                }
            }
        }
    }
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class TestBrownClusterFeatureExtractor, method test.
@Test
public final void test() {
    int[] prefixLengths = new int[] { 4, 6, 10, 20 };
    // Three extractors: the shared default instance plus two loaded from named cluster resources.
    BrownClusterFeatureExtractor defaultExtractor = BrownClusterFeatureExtractor.instance1000;
    BrownClusterFeatureExtractor bllipExtractor = null;
    try {
        bllipExtractor = new BrownClusterFeatureExtractor("bllip", "brownBllipClusters", prefixLengths);
    } catch (EdisonException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    BrownClusterFeatureExtractor wikiExtractor = null;
    try {
        wikiExtractor = new BrownClusterFeatureExtractor("wiki", "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt", prefixLengths);
    } catch (EdisonException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation annotation = builder.createTextAnnotation("test", "test", "This test sentence has Joynt and Lieberknecht and Fibonnaci in it " + "just to exercise possible brown cluster hits in resources used by NER.");
    // Collect the features produced by every extractor for every token in the annotation.
    Set<Feature> features = new HashSet<>();
    for (int tokenIndex = 0; tokenIndex < annotation.size(); ++tokenIndex) {
        try {
            features.addAll(defaultExtractor.getWordFeatures(annotation, tokenIndex));
            features.addAll(bllipExtractor.getWordFeatures(annotation, tokenIndex));
            features.addAll(wikiExtractor.getWordFeatures(annotation, tokenIndex));
        } catch (EdisonException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
    }
    assertTrue(annotation.hasView(ViewNames.BROWN_CLUSTERS + "_wiki"));
    // Sort the string forms so the joined output is deterministic before comparing.
    String[] sortedFeatures = new String[features.size()];
    int next = 0;
    for (Feature feature : features) {
        sortedFeatures[next++] = feature.toString();
    }
    Arrays.sort(sortedFeatures);
    String actualOutput = StringUtils.join(",", sortedFeatures);
    assertEquals(expectedOutput, actualOutput);
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class PipelineFactory, method buildPipeline.
/**
 * create an AnnotatorService with components specified by the ResourceManager (to override
 * defaults in {@link PipelineConfigurator}
 *
 * @param rm non-default config options
 * @return AnnotatorService with specified NLP components
 * @throws IOException if a component's resources cannot be read
 * @throws AnnotatorException if an annotator cannot be constructed
 */
public static BasicAnnotatorService buildPipeline(ResourceManager rm) throws IOException, AnnotatorException {
    // Merges default configuration with the user-specified overrides.
    ResourceManager fullRm = (new PipelineConfigurator()).getConfig(new Stanford331Configurator().getConfig(rm));
    // primitive boolean, consistent with isSentencePipeline below, avoids a needless box/unbox
    boolean splitOnDash = fullRm.getBoolean(PipelineConfigurator.SPLIT_ON_DASH);
    boolean isSentencePipeline = fullRm.getBoolean(PipelineConfigurator.USE_SENTENCE_PIPELINE.key);
    if (isSentencePipeline) {
        // update cache directory to be distinct from regular pipeline
        String cacheDir = fullRm.getString(AnnotatorServiceConfigurator.CACHE_DIR.key);
        cacheDir += "_sentence";
        Properties props = fullRm.getProperties();
        props.setProperty(AnnotatorServiceConfigurator.CACHE_DIR.key, cacheDir);
        fullRm = new ResourceManager(props);
    }
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(splitOnDash, false));
    Map<String, Annotator> annotators = buildAnnotators(fullRm);
    // The sentence pipeline processes one sentence at a time; otherwise use the standard service.
    return isSentencePipeline ? new SentencePipeline(taBldr, annotators, fullRm) : new BasicAnnotatorService(taBldr, annotators, fullRm);
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class NerInitTest, method testInit.
@Test
public void testInit() {
    // Configure NER with gazetteers off and zero training noise so the run is deterministic.
    Properties props = new Properties();
    props.setProperty(NerBaseConfigurator.GAZETTEER_FEATURES, "0");
    // props.setProperty(NerBaseConfigurator.BROWN_CLUSTER_PATHS, "0");
    props.setProperty(NerBaseConfigurator.RANDOM_NOISE_LEVEL, "0.0");
    props.setProperty(NerBaseConfigurator.OMISSION_RATE, "0.0");
    ResourceManager rm = (new NerBaseConfigurator()).getConfig(new ResourceManager(props));
    NERAnnotator ner = NerAnnotatorManager.buildNerAnnotator(rm, ViewNames.NER_CONLL);
    assertNotNull(ner);
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = tab.createTextAnnotation(TESTSTR);
    try {
        ner.getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // assertTrue instead of a bare Java 'assert': JVM assertions are disabled unless the
    // test runs with -ea, so the original check could silently pass without checking anything.
    assertTrue(ta.hasView(ViewNames.NER_CONLL));
    assertTrue(ta.getView(ViewNames.NER_CONLL).getConstituents().size() >= 1);
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class MascReader, method main.
/**
 * Read sections of corpus into TextAnnotations, write out TextAnnotations in json format.
 * Specify MASC root dir of written files, e.g. /home/mssammon/work/data/masc-ccg/written/
 * Also tokenizes each gold text with StatefulTokenizer and reports token/sentence boundary
 * agreement between the predicted and gold views.
 * @param args args[0] is the MASC corpus directory; args[1] is the output directory for gold json
 *             (predicted json goes to args[1] + "_PRED").
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: " + NAME + " mascCorpusDir outDir");
        System.exit(-1);
    }
    String corpusDir = args[0];
    String outDirGold = args[1];
    String outDirPred = outDirGold + "_PRED";
    Properties props = new Properties();
    props.setProperty(CorpusReaderConfigurator.CORPUS_DIRECTORY.key, corpusDir);
    props.setProperty(CorpusReaderConfigurator.SOURCE_DIRECTORY.key, corpusDir);
    IOUtils.mkdir(outDirGold);
    IOUtils.mkdir(outDirPred);
    ResourceManager rm = new ResourceManager(props);
    MascReader reader = null;
    try {
        reader = new MascReader(rm);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));
    // Accumulators for token- and sentence-boundary agreement between gold and predicted views.
    int numGoldTokCorrect = 0;
    int numGoldTokTotal = 0;
    int numGoldSentCorrect = 0;
    int numGoldSentTotal = 0;
    while (reader.hasNext()) {
        TextAnnotation goldTa = reader.next();
        String text = goldTa.getText();
        TextAnnotation predTa = taBldr.createTextAnnotation(goldTa.getCorpusId() + "_PREDICTED", goldTa.getId(), text);
        // Score the predicted tokenization and sentence splits against gold character offsets.
        IntPair[] goldTokCharOffsets = getCharacterOffsets(goldTa.getView(ViewNames.TOKENS));
        numGoldTokTotal += goldTokCharOffsets.length;
        numGoldTokCorrect += countCorrectSpans(predTa.getView(ViewNames.TOKENS), goldTokCharOffsets);
        IntPair[] goldSentCharOffsets = getCharacterOffsets(goldTa.getView(ViewNames.SENTENCE));
        numGoldSentTotal += goldSentCharOffsets.length;
        numGoldSentCorrect += countCorrectSpans(predTa.getView(ViewNames.SENTENCE), goldSentCharOffsets);
        // Gold and predicted annotations are written with the same helper; the original
        // duplicated this serialize-and-write logic inline for each of them.
        writeTaAsJson(goldTa, outDirGold);
        writeTaAsJson(predTa, outDirPred);
        logger.debug("## finished processing file '{}'.", goldTa.getId());
    }
    System.out.println(reader.generateReport());
    System.out.print("TOKEN PERFORMANCE:");
    computeAndPrintAcc(numGoldTokCorrect, numGoldTokTotal);
    System.out.print("SENTENCE PERFORMANCE:");
    computeAndPrintAcc(numGoldSentCorrect, numGoldSentTotal);
}

/**
 * Serialize a TextAnnotation to json and write it to outDir/&lt;ta id&gt;.json,
 * exiting the process on I/O failure (matching the original inline behavior).
 * @param ta the annotation to serialize
 * @param outDir the directory to write the json file into
 */
private static void writeTaAsJson(TextAnnotation ta, String outDir) {
    String taJson = SerializationHelper.serializeToJson(ta, true);
    String outFile = Paths.get(outDir, ta.getId() + ".json").toString();
    try {
        logger.debug("writing file '{}'...", outFile);
        LineIO.write(outFile, Collections.singletonList(taJson));
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
Aggregations