use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class TokenPerLineCorpusReader method hasNextSentence.
/**
 * Reads ahead in the line-based corpus until the next sentence to be returned
 * has been assembled, and reports whether one is available.
 * <p>
 * A sentence is a run of non-empty lines terminated by an empty line (or by
 * the end of the input). Lines matching one of the skip patterns are kept for
 * rule evaluation but produce no corpus line. Sentences filtered out by the
 * cross-validation settings or by the start-sentence setting are counted but
 * not returned. If a sentence is already pending (sentenceLines is non-null),
 * nothing is read.
 *
 * @return true if sentenceLines is non-null on exit, i.e. a sentence is
 *         pending
 * @throws TalismaneException
 *           if no sentence reader supplies the sentence text and the
 *           session's linguistic rules have not been set
 * @throws IOException
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
// we've reached the end, do nothing
} else {
// keep reading until a sentence has been assembled or the input runs out
while (sentenceLines == null) {
// lines accumulated for the sentence currently being read
List<UnprocessedLine> lines = new ArrayList<>();
// how many of the accumulated lines matched a skip pattern
int skippedLineCount = 0;
if (!this.hasNextLine())
break;
// the "lines.size() > 0" clause allows one extra iteration after the last
// input line, so that a final sentence with no trailing empty line is
// still terminated (the line is then the empty string)
while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
String line = "";
if (this.hasNextLine())
line = this.nextLine().replace("\r", "");
// note: also incremented for the synthetic empty line at the end of input
lineNumber++;
if (LOG.isTraceEnabled())
LOG.trace("Line " + lineNumber + ": " + line);
if (line.length() > 0) {
// non-empty line: record it, together with its skip status and every
// sentence rule whose pattern matches the entire line
boolean skip = false;
for (Pattern skipLinePattern : skipLinePatterns) {
if (skipLinePattern.matcher(line).matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
skip = true;
skippedLineCount++;
break;
}
}
// parallel lists: myMatchers.get(i) is the matcher for myRules.get(i)
List<CorpusSentenceRule> myRules = new ArrayList<>();
List<Matcher> myMatchers = new ArrayList<>();
for (CorpusSentenceRule sentenceRule : sentenceRules) {
Matcher matcher = sentenceRule.getPattern().matcher(line);
if (matcher.matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Matched rule: " + sentenceRule);
myRules.add(sentenceRule);
myMatchers.add(matcher);
}
}
UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
lines.add(unprocessedLine);
} else {
// empty line: if nothing, or only skipped lines, has been accumulated,
// there is no sentence to end here, so start over
if (lines.size() == 0 || lines.size() == skippedLineCount) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
// end of sentence
boolean includeMe = true;
// check cross-validation: with an include index, only sentences whose
// count modulo the cross-validation size equals that index are kept;
// otherwise, with an exclude index, exactly those sentences are dropped
if (this.getCrossValidationSize() > 0) {
if (this.getIncludeIndex() >= 0) {
if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
includeMe = false;
}
} else if (this.getExcludeIndex() >= 0) {
if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
includeMe = false;
}
}
}
// sentences before the configured start sentence are dropped
if (this.getStartSentence() > sentenceCount) {
includeMe = false;
}
// excluded sentences are counted as well
sentenceCount++;
LOG.debug("sentenceCount: " + sentenceCount);
if (!includeMe) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
// convert every non-skipped line to a corpus line, attaching a lexical
// entry read from the raw line when a lexical entry reader is set
sentenceLines = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (!unprocessedLine.skip) {
CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
sentenceLines.add(corpusLine);
if (this.lexicalEntryReader != null) {
WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
corpusLine.setLexicalEntry(lexicalEntry);
}
}
}
// apply the sentence rules matched earlier, for all lines (skipped ones
// included); for each line, the first rule returning a non-null action
// ends rule testing for that line, and merge actions are collected
List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (LOG.isTraceEnabled())
LOG.trace("Line " + unprocessedLine);
for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
Matcher matcher = unprocessedLine.matchers.get(i);
if (LOG.isTraceEnabled())
LOG.trace("Testing rule " + sentenceRule);
CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
if (LOG.isTraceEnabled())
LOG.trace("Result: " + action);
if (action != null) {
if (action instanceof MergeAction)
mergeActions.add((MergeAction) action);
break;
}
}
}
if (mergeActions.size() > 0) {
// rebuild the sentence with each merged group replaced by its single
// merged line, then renumber indexes and governors accordingly
List<CorpusLine> newSentenceLines = new ArrayList<>();
// sorted map from the index of each line to merge to its merge action
Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
}
}
// i is the position of the current line, counted from 1 and compared with
// the corpus line indexes above
// NOTE(review): assumes corpus line indexes are 1-based and follow the
// order of sentenceLines - confirm
int i = 1;
Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
int nextIndexToMerge = iIndexToMerge.next();
int linesRemoved = 0;
// maps each old index to its index after merging; 0 maps to itself
// (presumably the root governor - confirm)
Map<Integer, Integer> indexChangeMap = new HashMap<>();
indexChangeMap.put(0, 0);
for (CorpusLine corpusLine : sentenceLines) {
if (i == nextIndexToMerge) {
MergeAction mergeAction = indexesToMerge.get(i);
if (i == mergeAction.getFirstIndex()) {
// first line of a merged group: emit the merged line in its place;
// the decrement cancels the increment below, so this position does
// not count as removed
newSentenceLines.add(mergeAction.getMergedLine());
linesRemoved -= 1;
}
linesRemoved += 1;
if (iIndexToMerge.hasNext())
nextIndexToMerge = iIndexToMerge.next();
else
nextIndexToMerge = -1;
} else {
newSentenceLines.add(corpusLine);
}
indexChangeMap.put(i, i - linesRemoved);
i++;
}
// rewrite the index and, when non-negative, both governor indexes of
// every remaining line using the old-to-new index map
for (CorpusLine corpusLine : newSentenceLines) {
corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
int governorIndex = corpusLine.getGovernorIndex();
if (governorIndex >= 0)
corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
if (nonProjGovernorIndex >= 0)
corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
}
sentenceLines = newSentenceLines;
}
// take the sentence from the sentence reader when one is set and has a
// sentence left; otherwise rebuild the text from the tokens, letting the
// linguistic rules decide where spaces go
Sentence sentence = null;
if (sentenceReader != null && sentenceReader.hasNextSentence()) {
sentence = sentenceReader.nextSentence();
} else {
LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
if (rules == null)
throw new TalismaneException("Linguistic rules have not been set.");
String text = "";
for (CorpusLine corpusLine : sentenceLines) {
String word = corpusLine.getElement(CorpusElement.TOKEN);
if (rules.shouldAddSpace(text, word))
text += " ";
text += word;
}
sentence = new Sentence(text, currentFile, sessionId);
}
for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
sentenceAnnotator.annotate(sentence);
}
// hand the finished sentence and its corpus lines to processSentence
this.processSentence(sentence, sentenceLines);
}
}
}
}
return (sentenceLines != null);
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class TransitionBasedParser method parseSentence.
/**
 * Parses a sentence using a beam search over parse configurations.
 * <p>
 * Configurations are kept in heaps keyed by a comparison index. The heap with
 * the lowest index is processed first: each of its configurations (up to the
 * beam width) is extended by every applicable transition, and the resulting
 * configurations are placed on heaps with a strictly higher index. Terminal
 * configurations all go to a single heap with index Integer.MAX_VALUE. The
 * search ends when the polled heap's top configuration is terminal, when early
 * stop is enabled and the terminal heap holds at least beam-width entries, when
 * the maximum analysis time is exceeded, when free memory falls below the
 * configured minimum, or when no heaps remain.
 *
 * @param input
 *          the pos-tag sequences for the sentence; only the first one is used
 *          unless the pos-tagger beam is propagated
 * @return at most beam-width configurations, polled in order from the final
 *         heap (or from the backup heap if the final heap is empty)
 * @throws TalismaneException
 * @throws IOException
 */
@Override
public List<ParseConfiguration> parseSentence(List<PosTagSequence> input) throws TalismaneException, IOException {
  List<PosTagSequence> posTagSequences = null;
  if (this.propagatePosTaggerBeam) {
    posTagSequences = input;
  } else {
    posTagSequences = new ArrayList<>(1);
    posTagSequences.add(input.get(0));
  }
  long startTime = System.currentTimeMillis();
  // Both limits are computed in 64-bit arithmetic: they are compared against
  // long values below, and a 32-bit product could overflow for large settings
  // (e.g. a minimum free memory of 2 GB or more expressed in kilobytes).
  long maxAnalysisTimeMilliseconds = maxAnalysisTimePerSentence * 1000L;
  long minFreeMemoryBytes = (long) minFreeMemory * KILOBYTE;
  TokenSequence tokenSequence = posTagSequences.get(0).getTokenSequence();
  TreeMap<Integer, PriorityQueue<ParseConfiguration>> heaps = new TreeMap<>();
  PriorityQueue<ParseConfiguration> heap0 = new PriorityQueue<>();
  for (PosTagSequence posTagSequence : posTagSequences) {
    // add an initial ParseConfiguration for each postag sequence
    ParseConfiguration initialConfiguration = new ParseConfiguration(posTagSequence);
    initialConfiguration.setScoringStrategy(decisionMaker.getDefaultScoringStrategy());
    heap0.add(initialConfiguration);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Adding initial posTagSequence: " + posTagSequence);
    }
  }
  heaps.put(0, heap0);
  PriorityQueue<ParseConfiguration> backupHeap = null;
  PriorityQueue<ParseConfiguration> finalHeap = null;
  PriorityQueue<ParseConfiguration> terminalHeap = new PriorityQueue<>();
  while (heaps.size() > 0) {
    Entry<Integer, PriorityQueue<ParseConfiguration>> heapEntry = heaps.pollFirstEntry();
    PriorityQueue<ParseConfiguration> currentHeap = heapEntry.getValue();
    int currentHeapIndex = heapEntry.getKey();
    if (LOG.isTraceEnabled()) {
      LOG.trace("##### Polling next heap: " + heapEntry.getKey() + ", size: " + heapEntry.getValue().size());
    }
    boolean finished = false;
    // systematically set the final heap here, just in case we exit
    // "naturally" with no more heaps
    finalHeap = heapEntry.getValue();
    backupHeap = new PriorityQueue<>();
    // we jump out when either (a) all tokens have been attached or
    // (b) we go over the max allotted time
    ParseConfiguration topConf = currentHeap.peek();
    if (topConf.isTerminal()) {
      LOG.trace("Exiting with terminal heap: " + heapEntry.getKey() + ", size: " + heapEntry.getValue().size());
      finished = true;
    }
    if (earlyStop && terminalHeap.size() >= beamWidth) {
      LOG.debug("Early stop activated and terminal heap contains " + beamWidth + " entries. Exiting.");
      finalHeap = terminalHeap;
      finished = true;
    }
    long analysisTime = System.currentTimeMillis() - startTime;
    if (maxAnalysisTimePerSentence > 0 && analysisTime > maxAnalysisTimeMilliseconds) {
      LOG.info("Parse tree analysis took too long for sentence: " + tokenSequence.getSentence().getText());
      LOG.info("Breaking out after " + maxAnalysisTimePerSentence + " seconds.");
      finished = true;
    }
    if (minFreeMemory > 0) {
      long freeMemory = Runtime.getRuntime().freeMemory();
      if (freeMemory < minFreeMemoryBytes) {
        LOG.info("Not enough memory left to parse sentence: " + tokenSequence.getSentence().getText());
        LOG.info("Min free memory (bytes):" + minFreeMemoryBytes);
        LOG.info("Current free memory (bytes): " + freeMemory);
        finished = true;
      }
    }
    if (finished) {
      break;
    }
    // limit the breadth to K
    int maxSequences = currentHeap.size() > this.beamWidth ? this.beamWidth : currentHeap.size();
    // number of histories from this heap to which at least one transition was applied
    int j = 0;
    while (currentHeap.size() > 0) {
      ParseConfiguration history = currentHeap.poll();
      if (LOG.isTraceEnabled()) {
        LOG.trace("### Next configuration on heap " + heapEntry.getKey() + ":");
        LOG.trace(history.toString());
        LOG.trace("Score: " + df.format(history.getScore()));
        LOG.trace(history.getPosTagSequence().toString());
      }
      List<Decision> decisions = new ArrayList<>();
      // test the positive rules on the current configuration: the first rule
      // whose condition holds dictates the single decision to apply
      boolean ruleApplied = false;
      if (parserPositiveRules != null) {
        for (ParserRule rule : parserPositiveRules) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Checking rule: " + rule.toString());
          }
          RuntimeEnvironment env = new RuntimeEnvironment();
          FeatureResult<Boolean> ruleResult = rule.getCondition().check(history, env);
          if (ruleResult != null && ruleResult.getOutcome()) {
            Decision positiveRuleDecision = new Decision(rule.getTransition().getCode());
            decisions.add(positiveRuleDecision);
            positiveRuleDecision.addAuthority(rule.getCondition().getName());
            ruleApplied = true;
            if (LOG.isTraceEnabled()) {
              LOG.trace("Rule applies. Setting transition to: " + rule.getTransition().getCode());
            }
            break;
          }
        }
      }
      if (!ruleApplied) {
        // test the features on the current configuration
        List<FeatureResult<?>> parseFeatureResults = new ArrayList<>();
        for (ParseConfigurationFeature<?> feature : this.parseFeatures) {
          RuntimeEnvironment env = new RuntimeEnvironment();
          FeatureResult<?> featureResult = feature.check(history, env);
          if (featureResult != null)
            parseFeatureResults.add(featureResult);
        }
        if (LOG_FEATURES.isTraceEnabled()) {
          // log the feature results in sorted order
          SortedSet<String> featureResultSet = parseFeatureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<>()));
          for (String featureResultString : featureResultSet) {
            LOG_FEATURES.trace(featureResultString);
          }
        }
        // evaluate the feature results using the decision maker
        decisions = this.decisionMaker.decide(parseFeatureResults);
        for (ClassificationObserver observer : this.observers) {
          observer.onAnalyse(history, parseFeatureResults, decisions);
        }
        // keep only the decisions whose probability exceeds the storage threshold
        List<Decision> decisionShortList = new ArrayList<>(decisions.size());
        for (Decision decision : decisions) {
          if (decision.getProbability() > MIN_PROB_TO_STORE)
            decisionShortList.add(decision);
        }
        decisions = decisionShortList;
        // apply the negative rules
        Set<String> eliminatedTransitions = new HashSet<>();
        if (parserNegativeRules != null) {
          for (ParserRule rule : parserNegativeRules) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Checking negative rule: " + rule.toString());
            }
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<Boolean> ruleResult = rule.getCondition().check(history, env);
            if (ruleResult != null && ruleResult.getOutcome()) {
              for (Transition transition : rule.getTransitions()) {
                eliminatedTransitions.add(transition.getCode());
                if (LOG.isTraceEnabled())
                  LOG.trace("Rule applies. Eliminating transition: " + transition.getCode());
              }
            }
          }
          if (eliminatedTransitions.size() > 0) {
            decisionShortList = new ArrayList<>();
            for (Decision decision : decisions) {
              if (!eliminatedTransitions.contains(decision.getOutcome())) {
                decisionShortList.add(decision);
              } else {
                LOG.trace("Eliminating decision: " + decision.toString());
              }
            }
            if (decisionShortList.size() > 0) {
              decisions = decisionShortList;
            } else {
              // the negative rules would leave nothing: keep the unfiltered decisions
              LOG.debug("All decisions eliminated! Restoring original decisions.");
            }
          }
        }
      }
      // add a new configuration to the heaps for each decision whose
      // transition meets its pre-conditions on this history
      boolean transitionApplied = false;
      TransitionSystem transitionSystem = TalismaneSession.get(sessionId).getTransitionSystem();
      for (Decision decision : decisions) {
        Transition transition = transitionSystem.getTransitionForCode(decision.getOutcome());
        if (LOG.isTraceEnabled())
          LOG.trace("Outcome: " + transition.getCode() + ", " + decision.getProbability());
        // does transition meet pre-conditions?
        if (transition.checkPreconditions(history)) {
          transitionApplied = true;
          ParseConfiguration configuration = new ParseConfiguration(history);
          if (decision.isStatistical())
            configuration.addDecision(decision);
          transition.apply(configuration);
          int nextHeapIndex = parseComparisonStrategy.getComparisonIndex(configuration) * 1000;
          if (configuration.isTerminal()) {
            nextHeapIndex = Integer.MAX_VALUE;
          } else {
            // always move forward: the target heap index must exceed the current one
            while (nextHeapIndex <= currentHeapIndex) nextHeapIndex++;
          }
          PriorityQueue<ParseConfiguration> nextHeap = heaps.get(nextHeapIndex);
          if (nextHeap == null) {
            if (configuration.isTerminal())
              nextHeap = terminalHeap;
            else
              nextHeap = new PriorityQueue<>();
            heaps.put(nextHeapIndex, nextHeap);
            if (LOG.isTraceEnabled())
              LOG.trace("Created heap with index: " + nextHeapIndex);
          }
          nextHeap.add(configuration);
          if (LOG.isTraceEnabled()) {
            LOG.trace("Added configuration with score " + configuration.getScore() + " to heap: " + nextHeapIndex + ", total size: " + nextHeap.size());
          }
          configuration.clearMemory();
        } else {
          if (LOG.isTraceEnabled())
            LOG.trace("Cannot apply transition: doesn't meet pre-conditions");
          // just in case we run out of both heaps and
          // analyses, we build this backup heap
          // NOTE(review): the same history is added once per rejected decision,
          // so the backup heap may contain duplicates - confirm this is intended
          backupHeap.add(history);
        }
      }
      if (transitionApplied) {
        j++;
      } else {
        LOG.trace("No transitions could be applied: not counting this history as part of the beam");
      }
      // beam width test
      if (j == maxSequences)
        break;
    }
    // next history
  }
  // next atomic index
  // return the best sequences on the heap
  List<ParseConfiguration> bestConfigurations = new ArrayList<>();
  int i = 0;
  if (finalHeap.isEmpty())
    finalHeap = backupHeap;
  while (!finalHeap.isEmpty()) {
    bestConfigurations.add(finalHeap.poll());
    i++;
    if (i >= this.getBeamWidth())
      break;
  }
  if (LOG.isDebugEnabled()) {
    for (ParseConfiguration finalConfiguration : bestConfigurations) {
      LOG.debug(df.format(finalConfiguration.getScore()) + ": " + finalConfiguration.toString());
      LOG.debug("Pos tag sequence: " + finalConfiguration.getPosTagSequence());
      LOG.debug("Transitions: " + finalConfiguration.getTransitions());
      LOG.debug("Decisions: " + finalConfiguration.getDecisions());
      if (LOG.isTraceEnabled()) {
        // trace the individual probabilities behind each score
        StringBuilder sb = new StringBuilder();
        for (Decision decision : finalConfiguration.getDecisions()) {
          sb.append(" * ");
          sb.append(df.format(decision.getProbability()));
        }
        sb.append(" root ");
        sb.append(finalConfiguration.getTransitions().size());
        LOG.trace(sb.toString());
        sb = new StringBuilder();
        sb.append(" * PosTag sequence score ");
        sb.append(df.format(finalConfiguration.getPosTagSequence().getScore()));
        sb.append(" = ");
        for (PosTaggedToken posTaggedToken : finalConfiguration.getPosTagSequence()) {
          sb.append(" * ");
          sb.append(df.format(posTaggedToken.getDecision().getProbability()));
        }
        sb.append(" root ");
        sb.append(finalConfiguration.getPosTagSequence().size());
        LOG.trace(sb.toString());
        sb = new StringBuilder();
        sb.append(" * Token sequence score = ");
        sb.append(df.format(finalConfiguration.getPosTagSequence().getTokenSequence().getScore()));
        LOG.trace(sb.toString());
      }
    }
  }
  return bestConfigurations;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class ParseComparator method evaluate.
/**
 * Walks the reference corpus and the evaluation corpus in parallel, handing
 * each pair of parse configurations to the observers, and notifies the
 * observers once the reference corpus is exhausted.
 * <p>
 * Two sentences are considered mismatched when the ratio of the shorter text
 * length to the longer text length is below 0.9.
 *
 * @throws TalismaneException
 *           if sentences mismatched in the two corpora, including the case
 *           where the evaluation corpus has no sentence left while the
 *           reference corpus still has one
 * @throws IOException
 */
public void evaluate() throws TalismaneException, IOException {
  while (referenceCorpusReader.hasNextSentence()) {
    // fail with the documented exception rather than reading past the end of
    // the evaluation corpus
    if (!evaluationCorpusReader.hasNextSentence()) {
      LOG.info("Mismatched sentences");
      throw new TalismaneException("Mismatched sentences: the evaluation corpus contains fewer sentences than the reference corpus");
    }
    ParseConfiguration realConfiguration = referenceCorpusReader.nextConfiguration();
    ParseConfiguration guessConfiguration = evaluationCorpusReader.nextConfiguration();
    List<ParseConfiguration> guessConfigurations = new ArrayList<ParseConfiguration>();
    guessConfigurations.add(guessConfiguration);

    CharSequence realText = realConfiguration.getPosTagSequence().getTokenSequence().getSentence().getText();
    CharSequence guessedText = guessConfiguration.getPosTagSequence().getTokenSequence().getSentence().getText();
    double realLength = realText.length();
    double guessedLength = guessedText.length();
    // shorter length over longer length; two empty texts give NaN, which is
    // not below the threshold and is therefore accepted
    double ratio = realLength > guessedLength ? guessedLength / realLength : realLength / guessedLength;
    if (ratio < 0.9) {
      LOG.info("Mismatched sentences");
      LOG.info(realText.toString());
      LOG.info(guessedText.toString());
      throw new TalismaneException("Mismatched sentences");
    }
    for (ParseEvaluationObserver observer : this.observers) {
      observer.onParseEnd(realConfiguration, guessConfigurations);
    }
  }
  for (ParseEvaluationObserver observer : this.observers) {
    observer.onEvaluationComplete();
  }
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class LexiconReader method readLexicons.
/**
 * Read the lexicons based on a properties file, as described in the class
 * description. The pos-tag set is the one read from the configuration.
 * <p>
 * The "lexicons" property holds a comma-separated list of lexicon names. Every
 * other property must have the form <code>name.key</code>, where name is a
 * listed lexicon and key is one of file, regex, categories, exclusions,
 * encoding or uniqueKey. File paths are resolved against the directory
 * containing the properties file. All files opened for a lexicon are closed
 * before moving on to the next one, whether or not loading succeeds.
 *
 * @param lexiconPropsFile
 *          the properties file describing the lexicons
 * @return the loaded lexicons, in the order listed in the "lexicons" property
 * @throws IOException
 *           if a file cannot be opened or read
 * @throws TalismaneException
 *           if the config files contained an unknown property, if the
 *           "lexicons" property or a lexicon's file or regex property is
 *           missing, or if a uniqueKey element is not a known lexical
 *           attribute
 */
public List<PosTaggerLexicon> readLexicons(File lexiconPropsFile) throws IOException, TalismaneException {
  LOG.debug("Serializing from " + lexiconPropsFile.getPath());
  List<PosTaggerLexicon> lexicons = new ArrayList<>();
  File lexiconDir = lexiconPropsFile.getParentFile();
  Map<String, String> properties = StringUtils.getArgMap(lexiconPropsFile, "UTF-8");

  String lexiconNames = properties.get("lexicons");
  if (lexiconNames == null)
    throw new TalismaneException("Missing property: lexicons");
  String[] lexiconList = lexiconNames.split(",");

  // validate every property name before opening any file
  Set<String> knownProperties = new HashSet<String>(Arrays.asList("file", "regex", "categories", "exclusions", "encoding", "uniqueKey"));
  for (String property : properties.keySet()) {
    if (property.equals("lexicons"))
      continue;
    boolean foundLexicon = false;
    for (String lexiconName : lexiconList) {
      if (property.startsWith(lexiconName + ".")) {
        foundLexicon = true;
        String remainder = property.substring(lexiconName.length() + 1);
        if (!knownProperties.contains(remainder))
          throw new TalismaneException("Unknown property: " + property);
        break;
      }
    }
    if (!foundLexicon)
      throw new TalismaneException("Unknown lexicon in property: " + property);
  }

  for (String lexiconName : lexiconList) {
    LOG.debug("Lexicon: " + lexiconName);
    String lexiconFilePath = properties.get(lexiconName + ".file");
    String lexiconRegexPath = properties.get(lexiconName + ".regex");
    String lexiconExclusionPath = properties.get(lexiconName + ".exclusions");
    String categoryString = properties.get(lexiconName + ".categories");
    String lexiconEncoding = properties.get(lexiconName + ".encoding");
    String lexiconUniqueKey = properties.get(lexiconName + ".uniqueKey");

    // fail with a meaningful message instead of a NullPointerException
    if (lexiconFilePath == null)
      throw new TalismaneException("Missing property: " + lexiconName + ".file");
    if (lexiconRegexPath == null)
      throw new TalismaneException("Missing property: " + lexiconName + ".regex");

    // the lexicon file is read with the platform default charset unless an
    // encoding is given; the regex and exclusion files are always UTF-8
    Charset lexiconCharset = Charset.defaultCharset();
    if (lexiconEncoding != null)
      lexiconCharset = Charset.forName(lexiconEncoding);

    File lexiconRegexFile = new File(lexiconDir, lexiconRegexPath);
    File lexiconInputFile = new File(lexiconDir, lexiconFilePath);

    try (Scanner regexScanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(lexiconRegexFile), "UTF-8")));
        InputStream inputStream = this.openLexiconStream(lexiconInputFile);
        Scanner lexiconScanner = new Scanner(new BufferedReader(new InputStreamReader(inputStream, lexiconCharset)))) {
      RegexLexicalEntryReader lexicalEntryReader = new RegexLexicalEntryReader(regexScanner);

      Set<String> categories = null;
      if (categoryString != null)
        categories = new HashSet<String>(Arrays.asList(categoryString.split(",")));

      // exclusion file: tab-separated; the first non-empty, non-comment line
      // names the attributes, each further line is one exclusion
      List<String> exclusionAttributes = null;
      List<List<String>> exclusions = null;
      if (lexiconExclusionPath != null) {
        exclusions = new ArrayList<List<String>>();
        File lexiconExclusionFile = new File(lexiconDir, lexiconExclusionPath);
        try (Scanner exclusionScanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(lexiconExclusionFile), "UTF-8")))) {
          while (exclusionScanner.hasNextLine()) {
            String line = exclusionScanner.nextLine();
            if (line.length() == 0 || line.startsWith("#"))
              continue;
            List<String> parts = new ArrayList<String>(Arrays.asList(line.split("\t")));
            if (exclusionAttributes == null)
              exclusionAttributes = parts;
            else
              exclusions.add(parts);
          }
        }
      }

      List<LexicalAttribute> uniqueAttributes = null;
      if (lexiconUniqueKey != null) {
        uniqueAttributes = new ArrayList<LexicalAttribute>();
        for (String uniqueKeyElement : lexiconUniqueKey.split(",")) {
          try {
            uniqueAttributes.add(LexicalAttribute.valueOf(uniqueKeyElement));
          } catch (IllegalArgumentException e) {
            // open resources are released by the enclosing try-with-resources
            throw new TalismaneException("Unknown attribute in " + lexiconName + ".uniqueKey: " + uniqueKeyElement);
          }
        }
      }

      LOG.debug("Serializing: " + lexiconFilePath);
      LexiconFile lexiconFile = new LexiconFile(lexiconName, lexiconScanner, lexicalEntryReader, sessionId);
      if (categories != null)
        lexiconFile.setCategories(categories);
      if (exclusionAttributes != null)
        lexiconFile.setExclusionAttributes(exclusionAttributes);
      if (exclusions != null)
        lexiconFile.setExclusions(exclusions);
      if (uniqueAttributes != null)
        lexiconFile.setUniqueKeyAttributes(uniqueAttributes);
      lexiconFile.load();
      lexicons.add(lexiconFile);
    }
  }
  return lexicons;
}

/**
 * Opens the lexicon file for reading. A file whose name ends with ".zip" is
 * opened as a zip stream positioned on its first entry; any other file is
 * opened as a plain file stream. The caller owns the returned stream.
 */
private InputStream openLexiconStream(File lexiconInputFile) throws IOException {
  InputStream fileStream = new FileInputStream(lexiconInputFile);
  if (!lexiconInputFile.getName().endsWith(".zip"))
    return fileStream;
  ZipInputStream zis = new ZipInputStream(fileStream);
  try {
    zis.getNextEntry();
  } catch (IOException e) {
    // do not leak the file handle if the archive cannot be read
    try {
      zis.close();
    } catch (IOException closeError) {
      e.addSuppressed(closeError);
    }
    throw e;
  }
  return zis;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class RegexLexicalEntryReader method readEntry.
/**
 * Populates the lexical entry from the text. For every standard attribute,
 * each of its patterns is searched for in the text; on a match, the value is
 * the pattern's replacement if it has one, otherwise the configured capture
 * group, and it is stored on the entry (the OtherAttribute values store
 * nothing here). A pattern flagged as "stop" ends the search for that
 * attribute. The named other attributes are then handled the same way, always
 * using the capture group.
 *
 * @throws TalismaneException
 *           if no Word attribute pattern matched the text
 */
@Override
public void readEntry(String text, WritableLexicalEntry lexicalEntry) throws TalismaneException {
  boolean wordSeen = false;

  for (LexicalAttribute attribute : this.attributePatternMap.keySet()) {
    for (LexicalAttributePattern candidate : this.attributePatternMap.get(attribute)) {
      Matcher hit = candidate.getPattern().matcher(text);
      if (!hit.find()) {
        // no match: try the next pattern
        continue;
      }
      // the group is always extracted, even when a replacement overrides it
      String extracted = hit.group(candidate.getGroup());
      String value = candidate.getReplacement() != null ? candidate.getReplacement() : extracted;
      switch (attribute) {
      case Word:
        lexicalEntry.setWord(value);
        wordSeen = true;
        break;
      case Lemma:
        lexicalEntry.setLemma(value);
        break;
      case LemmaComplement:
        lexicalEntry.setLemmaComplement(value);
        break;
      case Morphology:
        lexicalEntry.setMorphology(value);
        break;
      case Category:
        lexicalEntry.setCategory(value);
        break;
      case SubCategory:
        lexicalEntry.setSubCategory(value);
        break;
      case Case:
        lexicalEntry.addCase(value);
        break;
      case Gender:
        lexicalEntry.addGender(value);
        break;
      case Number:
        lexicalEntry.addNumber(value);
        break;
      case Person:
        lexicalEntry.addPerson(value);
        break;
      case PossessorNumber:
        lexicalEntry.addPossessorNumber(value);
        break;
      case Tense:
        lexicalEntry.addTense(value);
        break;
      case Aspect:
        lexicalEntry.addAspect(value);
        break;
      case Mood:
        lexicalEntry.addMood(value);
        break;
      default:
        // OtherAttribute1 to OtherAttribute8 and any other value: nothing to store
        break;
      }
      if (candidate.isStop()) {
        break;
      }
    }
  }

  for (String attributeName : this.otherAttributeMap.keySet()) {
    for (LexicalAttributePattern candidate : this.otherAttributeMap.get(attributeName)) {
      Matcher hit = candidate.getPattern().matcher(text);
      if (!hit.find()) {
        continue;
      }
      lexicalEntry.setAttribute(attributeName, hit.group(candidate.getGroup()));
      if (candidate.isStop()) {
        break;
      }
    }
  }

  if (wordSeen) {
    return;
  }
  throw new TalismaneException("No Word found in lexical entry: " + text);
}
Aggregations