Search in sources :

Example 1 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class TokenPerLineCorpusReader method hasNextSentence.

/**
 * Reads ahead in the line-based input until one sentence has been buffered in
 * {@code sentenceLines}, or until there is nothing left to read.
 * <p>
 * A sentence is a run of non-empty lines terminated by an empty line (or by
 * the end of the input, which is treated like an empty line). Lines matching
 * one of the {@code skipLinePatterns} are kept for rule matching but are not
 * converted into corpus lines. Sentences are dropped when they are excluded
 * by the cross-validation settings or come before the start sentence. If a
 * sentence is already buffered ({@code sentenceLines != null}) no reading
 * takes place.
 *
 * @return {@code true} if {@code sentenceLines} is non-null when the method
 *         returns, {@code false} otherwise
 * @throws TalismaneException
 *           if the sentence text has to be rebuilt from its tokens and the
 *           session has no linguistic rules
 * @throws IOException
 *           presumably propagated from the underlying line reading - confirm
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
        // the maximum number of sentences has already been read: do nothing
    } else {
        // keep reading until a sentence has been buffered or the input runs out
        while (sentenceLines == null) {
            // raw lines collected for the sentence currently being read
            List<UnprocessedLine> lines = new ArrayList<>();
            // how many of the collected lines matched a skip pattern
            int skippedLineCount = 0;
            if (!this.hasNextLine())
                break;
            // the "lines.size() > 0" clause allows one extra pass after the last
            // input line, so that a final sentence with no trailing empty line
            // is still closed
            while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
                // stays empty at end of input, which closes the pending sentence
                String line = "";
                if (this.hasNextLine())
                    line = this.nextLine().replace("\r", "");
                // note: also incremented for the synthetic empty line at end of input
                lineNumber++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Line " + lineNumber + ": " + line);
                if (line.length() > 0) {
                    // non-empty line: it belongs to the current sentence
                    boolean skip = false;
                    for (Pattern skipLinePattern : skipLinePatterns) {
                        if (skipLinePattern.matcher(line).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
                            skip = true;
                            skippedLineCount++;
                            break;
                        }
                    }
                    // remember every sentence rule whose pattern matches this line,
                    // together with its matcher; the rules are applied only once
                    // the whole sentence has been read
                    List<CorpusSentenceRule> myRules = new ArrayList<>();
                    List<Matcher> myMatchers = new ArrayList<>();
                    for (CorpusSentenceRule sentenceRule : sentenceRules) {
                        Matcher matcher = sentenceRule.getPattern().matcher(line);
                        if (matcher.matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Matched rule: " + sentenceRule);
                            myRules.add(sentenceRule);
                            myMatchers.add(matcher);
                        }
                    }
                    // skipped lines are stored as well, flagged with skip = true
                    UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
                    lines.add(unprocessedLine);
                } else {
                    // empty line: nothing collected, or only skipped lines, means
                    // there is no sentence to close yet
                    if (lines.size() == 0 || lines.size() == skippedLineCount) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // end of sentence
                    boolean includeMe = true;
                    // check cross-validation: with an include index only sentences
                    // in that fold are kept, with an exclude index that fold is
                    // dropped
                    if (this.getCrossValidationSize() > 0) {
                        if (this.getIncludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
                                includeMe = false;
                            }
                        } else if (this.getExcludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
                                includeMe = false;
                            }
                        }
                    }
                    // sentences before the configured start sentence are dropped
                    if (this.getStartSentence() > sentenceCount) {
                        includeMe = false;
                    }
                    // counted whether or not the sentence is included
                    sentenceCount++;
                    LOG.debug("sentenceCount: " + sentenceCount);
                    if (!includeMe) {
                        // discard this sentence and start collecting the next one
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // convert every non-skipped raw line into a corpus line
                    sentenceLines = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (!unprocessedLine.skip) {
                            CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
                            sentenceLines.add(corpusLine);
                            // optionally attach a lexical entry read from the same raw line
                            if (this.lexicalEntryReader != null) {
                                WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
                                this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
                                corpusLine.setLexicalEntry(lexicalEntry);
                            }
                        }
                    }
                    // apply the sentence rules matched earlier, over all raw lines
                    // (including skipped ones); for each line only the first rule
                    // returning a non-null action is used, and only merge actions
                    // are collected here
                    List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (LOG.isTraceEnabled())
                            LOG.trace("Line " + unprocessedLine);
                        for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
                            CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
                            Matcher matcher = unprocessedLine.matchers.get(i);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Testing rule " + sentenceRule);
                            CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Result: " + action);
                            if (action != null) {
                                if (action instanceof MergeAction)
                                    mergeActions.add((MergeAction) action);
                                break;
                            }
                        }
                    }
                    if (mergeActions.size() > 0) {
                        // rebuild the sentence with each group of merged lines replaced
                        // by its single merged line, then renumber the indexes
                        List<CorpusLine> newSentenceLines = new ArrayList<>();
                        // line index -> merge action covering it, sorted by index
                        Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
                        for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
                            for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
                                indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
                            }
                        }
                        // i is the 1-based position within sentenceLines; it is compared
                        // directly with corpus line indexes, which presumably run 1..n
                        // in order - TODO confirm
                        int i = 1;
                        Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
                        // NOTE(review): next() throws NoSuchElementException if every
                        // merge action has an empty list of lines to merge - confirm
                        // this cannot happen
                        int nextIndexToMerge = iIndexToMerge.next();
                        int linesRemoved = 0;
                        // old index -> new index after merging
                        Map<Integer, Integer> indexChangeMap = new HashMap<>();
                        // index 0 maps to itself (presumably the root used as a
                        // governor - confirm)
                        indexChangeMap.put(0, 0);
                        for (CorpusLine corpusLine : sentenceLines) {
                            if (i == nextIndexToMerge) {
                                MergeAction mergeAction = indexesToMerge.get(i);
                                if (i == mergeAction.getFirstIndex()) {
                                    // first line of a merged group: emit the merged line;
                                    // the -1 cancels the +1 below, so no line is lost here
                                    newSentenceLines.add(mergeAction.getMergedLine());
                                    linesRemoved -= 1;
                                }
                                // every further line of the group counts as removed
                                linesRemoved += 1;
                                if (iIndexToMerge.hasNext())
                                    nextIndexToMerge = iIndexToMerge.next();
                                else
                                    nextIndexToMerge = -1;
                            } else {
                                newSentenceLines.add(corpusLine);
                            }
                            indexChangeMap.put(i, i - linesRemoved);
                            i++;
                        }
                        // rewrite the index and both governor references of each
                        // remaining line through the map; negative governor values
                        // are left untouched
                        for (CorpusLine corpusLine : newSentenceLines) {
                            corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
                            int governorIndex = corpusLine.getGovernorIndex();
                            if (governorIndex >= 0)
                                corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
                            int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
                            if (nonProjGovernorIndex >= 0)
                                corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
                        }
                        sentenceLines = newSentenceLines;
                    }
                    // take the sentence from the separate sentence reader when one is
                    // available; otherwise rebuild its text from the tokens
                    Sentence sentence = null;
                    if (sentenceReader != null && sentenceReader.hasNextSentence()) {
                        sentence = sentenceReader.nextSentence();
                    } else {
                        LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
                        if (rules == null)
                            throw new TalismaneException("Linguistic rules have not been set.");
                        // concatenate the tokens, letting the linguistic rules decide
                        // where a space goes between the text so far and the next token
                        String text = "";
                        for (CorpusLine corpusLine : sentenceLines) {
                            String word = corpusLine.getElement(CorpusElement.TOKEN);
                            if (rules.shouldAddSpace(text, word))
                                text += " ";
                            text += word;
                        }
                        sentence = new Sentence(text, currentFile, sessionId);
                    }
                    // run every sentence annotator registered in the session
                    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                        sentenceAnnotator.annotate(sentence);
                    }
                    // hand the sentence and its corpus lines over for further
                    // processing (defined outside this method)
                    this.processSentence(sentence, sentenceLines);
                }
            }
        }
    }
    return (sentenceLines != null);
}
Also used : Matcher(java.util.regex.Matcher) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) ArrayList(java.util.ArrayList) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) Sentence(com.joliciel.talismane.rawText.Sentence) Pattern(java.util.regex.Pattern) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) TreeMap(java.util.TreeMap) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)

Example 2 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class TransitionBasedParser method parseSentence.

/**
 * Parses a sentence with a beam search over parse configurations.
 * <p>
 * Configurations live on heaps stored in a {@link TreeMap} keyed by an
 * integer index; heaps are polled in ascending index order. For each
 * configuration taken from the current heap, candidate decisions come either
 * from the first positive rule that applies or, failing that, from the
 * decision maker applied to the feature results (filtered by a minimum
 * probability and by the negative rules). Each decision whose transition
 * meets its pre-conditions yields a new configuration placed on a later heap.
 * <p>
 * The search stops when the top of the current heap is terminal, when early
 * stop is enabled and the terminal heap holds at least {@code beamWidth}
 * entries, when the maximum analysis time is exceeded, when free memory falls
 * below the configured minimum, or when no heaps remain.
 *
 * @param input
 *          the pos-tag sequences to parse; only the first one is used unless
 *          {@code propagatePosTaggerBeam} is set
 * @return at most {@code getBeamWidth()} configurations, in the order they
 *         are polled from the final heap
 * @throws TalismaneException
 *           presumably propagated from rules, features or transitions -
 *           confirm
 * @throws IOException
 *           presumably propagated from the components called here - confirm
 */
@Override
public List<ParseConfiguration> parseSentence(List<PosTagSequence> input) throws TalismaneException, IOException {
    // either keep the whole pos-tagger beam, or only its first sequence
    List<PosTagSequence> posTagSequences = null;
    if (this.propagatePosTaggerBeam) {
        posTagSequences = input;
    } else {
        posTagSequences = new ArrayList<>(1);
        posTagSequences.add(input.get(0));
    }
    long startTime = System.currentTimeMillis();
    // maxAnalysisTimePerSentence is expressed in seconds (see the log message below)
    int maxAnalysisTimeMilliseconds = maxAnalysisTimePerSentence * 1000;
    // NOTE(review): int arithmetic - this may overflow for large minFreeMemory
    // values, depending on KILOBYTE; confirm the expected range
    int minFreeMemoryBytes = minFreeMemory * KILOBYTE;
    TokenSequence tokenSequence = posTagSequences.get(0).getTokenSequence();
    // heaps by index; pollFirstEntry() below always takes the lowest index
    TreeMap<Integer, PriorityQueue<ParseConfiguration>> heaps = new TreeMap<>();
    PriorityQueue<ParseConfiguration> heap0 = new PriorityQueue<>();
    for (PosTagSequence posTagSequence : posTagSequences) {
        // add an initial ParseConfiguration for each postag sequence
        ParseConfiguration initialConfiguration = new ParseConfiguration(posTagSequence);
        initialConfiguration.setScoringStrategy(decisionMaker.getDefaultScoringStrategy());
        heap0.add(initialConfiguration);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Adding initial posTagSequence: " + posTagSequence);
        }
    }
    heaps.put(0, heap0);
    // configurations for which some transition could not be applied; used as a
    // fallback result if the final heap turns out to be empty
    PriorityQueue<ParseConfiguration> backupHeap = null;
    PriorityQueue<ParseConfiguration> finalHeap = null;
    // receives every terminal configuration; stored in heaps under Integer.MAX_VALUE
    PriorityQueue<ParseConfiguration> terminalHeap = new PriorityQueue<>();
    while (heaps.size() > 0) {
        Entry<Integer, PriorityQueue<ParseConfiguration>> heapEntry = heaps.pollFirstEntry();
        PriorityQueue<ParseConfiguration> currentHeap = heapEntry.getValue();
        int currentHeapIndex = heapEntry.getKey();
        if (LOG.isTraceEnabled()) {
            LOG.trace("##### Polling next heap: " + heapEntry.getKey() + ", size: " + heapEntry.getValue().size());
        }
        boolean finished = false;
        // systematically set the final heap here, just in case we exit
        // "naturally" with no more heaps
        finalHeap = heapEntry.getValue();
        // the backup heap is reset for every heap polled
        backupHeap = new PriorityQueue<>();
        // we jump out when either (a) all tokens have been attached or
        // (b) we go over the max alloted time
        // NOTE(review): peek() returns null on an empty heap, which would make
        // the next line throw - confirm a polled heap is never empty
        ParseConfiguration topConf = currentHeap.peek();
        if (topConf.isTerminal()) {
            LOG.trace("Exiting with terminal heap: " + heapEntry.getKey() + ", size: " + heapEntry.getValue().size());
            finished = true;
        }
        // early stop: enough terminal configurations have been collected
        if (earlyStop && terminalHeap.size() >= beamWidth) {
            LOG.debug("Early stop activated and terminal heap contains " + beamWidth + " entries. Exiting.");
            finalHeap = terminalHeap;
            finished = true;
        }
        // time limit, only enforced when maxAnalysisTimePerSentence > 0
        long analysisTime = System.currentTimeMillis() - startTime;
        if (maxAnalysisTimePerSentence > 0 && analysisTime > maxAnalysisTimeMilliseconds) {
            LOG.info("Parse tree analysis took too long for sentence: " + tokenSequence.getSentence().getText());
            LOG.info("Breaking out after " + maxAnalysisTimePerSentence + " seconds.");
            finished = true;
        }
        // memory limit, only enforced when minFreeMemory > 0
        if (minFreeMemory > 0) {
            long freeMemory = Runtime.getRuntime().freeMemory();
            if (freeMemory < minFreeMemoryBytes) {
                LOG.info("Not enough memory left to parse sentence: " + tokenSequence.getSentence().getText());
                LOG.info("Min free memory (bytes):" + minFreeMemoryBytes);
                LOG.info("Current free memory (bytes): " + freeMemory);
                finished = true;
            }
        }
        if (finished) {
            break;
        }
        // limit the breadth to K
        int maxSequences = currentHeap.size() > this.beamWidth ? this.beamWidth : currentHeap.size();
        // number of histories from this heap for which a transition was applied
        int j = 0;
        while (currentHeap.size() > 0) {
            ParseConfiguration history = currentHeap.poll();
            if (LOG.isTraceEnabled()) {
                LOG.trace("### Next configuration on heap " + heapEntry.getKey() + ":");
                LOG.trace(history.toString());
                LOG.trace("Score: " + df.format(history.getScore()));
                LOG.trace(history.getPosTagSequence().toString());
            }
            List<Decision> decisions = new ArrayList<>();
            // test the positive rules on the current configuration: the first
            // rule whose condition holds forces its transition as the only decision
            boolean ruleApplied = false;
            if (parserPositiveRules != null) {
                for (ParserRule rule : parserPositiveRules) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Checking rule: " + rule.toString());
                    }
                    RuntimeEnvironment env = new RuntimeEnvironment();
                    FeatureResult<Boolean> ruleResult = rule.getCondition().check(history, env);
                    if (ruleResult != null && ruleResult.getOutcome()) {
                        Decision positiveRuleDecision = new Decision(rule.getTransition().getCode());
                        decisions.add(positiveRuleDecision);
                        positiveRuleDecision.addAuthority(rule.getCondition().getName());
                        ruleApplied = true;
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Rule applies. Setting transition to: " + rule.getTransition().getCode());
                        }
                        break;
                    }
                }
            }
            if (!ruleApplied) {
                // test the features on the current configuration
                List<FeatureResult<?>> parseFeatureResults = new ArrayList<>();
                for (ParseConfigurationFeature<?> feature : this.parseFeatures) {
                    RuntimeEnvironment env = new RuntimeEnvironment();
                    FeatureResult<?> featureResult = feature.check(history, env);
                    if (featureResult != null)
                        parseFeatureResults.add(featureResult);
                }
                // trace the feature results in sorted order
                if (LOG_FEATURES.isTraceEnabled()) {
                    SortedSet<String> featureResultSet = parseFeatureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<>()));
                    for (String featureResultString : featureResultSet) {
                        LOG_FEATURES.trace(featureResultString);
                    }
                }
                // evaluate the feature results using the decision maker
                decisions = this.decisionMaker.decide(parseFeatureResults);
                // observers see the full decision list, before any filtering
                for (ClassificationObserver observer : this.observers) {
                    observer.onAnalyse(history, parseFeatureResults, decisions);
                }
                // keep only decisions above the minimum probability
                List<Decision> decisionShortList = new ArrayList<>(decisions.size());
                for (Decision decision : decisions) {
                    if (decision.getProbability() > MIN_PROB_TO_STORE)
                        decisionShortList.add(decision);
                }
                decisions = decisionShortList;
                // apply the negative rules: every rule whose condition holds
                // eliminates all of its transitions
                Set<String> eliminatedTransitions = new HashSet<>();
                if (parserNegativeRules != null) {
                    for (ParserRule rule : parserNegativeRules) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Checking negative rule: " + rule.toString());
                        }
                        RuntimeEnvironment env = new RuntimeEnvironment();
                        FeatureResult<Boolean> ruleResult = rule.getCondition().check(history, env);
                        if (ruleResult != null && ruleResult.getOutcome()) {
                            for (Transition transition : rule.getTransitions()) {
                                eliminatedTransitions.add(transition.getCode());
                                if (LOG.isTraceEnabled())
                                    LOG.trace("Rule applies. Eliminating transition: " + transition.getCode());
                            }
                        }
                    }
                    if (eliminatedTransitions.size() > 0) {
                        decisionShortList = new ArrayList<>();
                        for (Decision decision : decisions) {
                            if (!eliminatedTransitions.contains(decision.getOutcome())) {
                                decisionShortList.add(decision);
                            } else {
                                LOG.trace("Eliminating decision: " + decision.toString());
                            }
                        }
                        // if the negative rules removed everything, fall back to
                        // the decisions as they were before elimination
                        if (decisionShortList.size() > 0) {
                            decisions = decisionShortList;
                        } else {
                            LOG.debug("All decisions eliminated! Restoring original decisions.");
                        }
                    }
                }
            }
            // set to true as soon as one decision's transition has been applied
            boolean transitionApplied = false;
            TransitionSystem transitionSystem = TalismaneSession.get(sessionId).getTransitionSystem();
            // try every remaining decision in turn
            for (Decision decision : decisions) {
                Transition transition = transitionSystem.getTransitionForCode(decision.getOutcome());
                if (LOG.isTraceEnabled())
                    LOG.trace("Outcome: " + transition.getCode() + ", " + decision.getProbability());
                // does transition meet pre-conditions?
                if (transition.checkPreconditions(history)) {
                    transitionApplied = true;
                    // copy the history and apply the transition to the copy
                    ParseConfiguration configuration = new ParseConfiguration(history);
                    // only statistical decisions are recorded on the configuration
                    if (decision.isStatistical())
                        configuration.addDecision(decision);
                    transition.apply(configuration);
                    // target heap: comparison index scaled by 1000; terminal
                    // configurations always go to Integer.MAX_VALUE, all others
                    // are pushed strictly beyond the current heap index
                    int nextHeapIndex = parseComparisonStrategy.getComparisonIndex(configuration) * 1000;
                    if (configuration.isTerminal()) {
                        nextHeapIndex = Integer.MAX_VALUE;
                    } else {
                        while (nextHeapIndex <= currentHeapIndex) nextHeapIndex++;
                    }
                    PriorityQueue<ParseConfiguration> nextHeap = heaps.get(nextHeapIndex);
                    if (nextHeap == null) {
                        // no heap at this index yet: reuse the terminal heap for
                        // terminal configurations, otherwise create a new one
                        if (configuration.isTerminal())
                            nextHeap = terminalHeap;
                        else
                            nextHeap = new PriorityQueue<>();
                        heaps.put(nextHeapIndex, nextHeap);
                        if (LOG.isTraceEnabled())
                            LOG.trace("Created heap with index: " + nextHeapIndex);
                    }
                    nextHeap.add(configuration);
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Added configuration with score " + configuration.getScore() + " to heap: " + nextHeapIndex + ", total size: " + nextHeap.size());
                    }
                    configuration.clearMemory();
                } else {
                    if (LOG.isTraceEnabled())
                        LOG.trace("Cannot apply transition: doesn't meet pre-conditions");
                    // just in case the we run out of both heaps and
                    // analyses, we build this backup heap
                    // NOTE(review): the same history is added once per rejected
                    // decision, so it can appear several times - confirm intended
                    backupHeap.add(history);
                }
            }
            if (transitionApplied) {
                j++;
            } else {
                LOG.trace("No transitions could be applied: not counting this history as part of the beam");
            }
            // beam width test
            if (j == maxSequences)
                break;
        }
        // end of loop over histories on the current heap
    }
    // end of loop over heaps
    // return the best sequences on the heap
    List<ParseConfiguration> bestConfigurations = new ArrayList<>();
    int i = 0;
    // nothing left on the final heap: fall back to the backup heap
    if (finalHeap.isEmpty())
        finalHeap = backupHeap;
    // take at most getBeamWidth() configurations, in heap order
    while (!finalHeap.isEmpty()) {
        bestConfigurations.add(finalHeap.poll());
        i++;
        if (i >= this.getBeamWidth())
            break;
    }
    if (LOG.isDebugEnabled()) {
        for (ParseConfiguration finalConfiguration : bestConfigurations) {
            LOG.debug(df.format(finalConfiguration.getScore()) + ": " + finalConfiguration.toString());
            LOG.debug("Pos tag sequence: " + finalConfiguration.getPosTagSequence());
            LOG.debug("Transitions: " + finalConfiguration.getTransitions());
            LOG.debug("Decisions: " + finalConfiguration.getDecisions());
            if (LOG.isTraceEnabled()) {
                // trace the decision probabilities and the transition count
                StringBuilder sb = new StringBuilder();
                for (Decision decision : finalConfiguration.getDecisions()) {
                    sb.append(" * ");
                    sb.append(df.format(decision.getProbability()));
                }
                sb.append(" root ");
                sb.append(finalConfiguration.getTransitions().size());
                LOG.trace(sb.toString());
                // trace the pos-tag sequence score and its token probabilities
                sb = new StringBuilder();
                sb.append(" * PosTag sequence score ");
                sb.append(df.format(finalConfiguration.getPosTagSequence().getScore()));
                sb.append(" = ");
                for (PosTaggedToken posTaggedToken : finalConfiguration.getPosTagSequence()) {
                    sb.append(" * ");
                    sb.append(df.format(posTaggedToken.getDecision().getProbability()));
                }
                sb.append(" root ");
                sb.append(finalConfiguration.getPosTagSequence().size());
                LOG.trace(sb.toString());
                // trace the token sequence score
                sb = new StringBuilder();
                sb.append(" * Token sequence score = ");
                sb.append(df.format(finalConfiguration.getPosTagSequence().getTokenSequence().getScore()));
                LOG.trace(sb.toString());
            }
        }
    }
    return bestConfigurations;
}
Also used : ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) ParserRule(com.joliciel.talismane.parser.features.ParserRule) PriorityQueue(java.util.PriorityQueue) LoggerFactory(org.slf4j.LoggerFactory) Scanner(java.util.Scanner) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) TalismaneSession(com.joliciel.talismane.TalismaneSession) ParseConfigurationFeature(com.joliciel.talismane.parser.features.ParseConfigurationFeature) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ArrayListNoNulls(com.joliciel.talismane.utils.ArrayListNoNulls) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) Logger(org.slf4j.Logger) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) Config(com.typesafe.config.Config) Collection(java.util.Collection) DecimalFormat(java.text.DecimalFormat) Set(java.util.Set) IOException(java.io.IOException) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) TreeMap(java.util.TreeMap) Entry(java.util.Map.Entry) InputStream(java.io.InputStream) 
ParserFeatureParser(com.joliciel.talismane.parser.features.ParserFeatureParser) ParserRule(com.joliciel.talismane.parser.features.ParserRule) ArrayList(java.util.ArrayList) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) PriorityQueue(java.util.PriorityQueue) TreeMap(java.util.TreeMap) Decision(com.joliciel.talismane.machineLearning.Decision) ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 3 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class ParseComparator method evaluate.

/**
 * Compares the evaluation corpus with the reference corpus, one sentence at a
 * time, and reports each pair to the registered observers.
 * <p>
 * For as long as the reference reader has another sentence, the next
 * configuration is taken from both readers. The two sentence texts count as
 * mismatched when the shorter one is less than 90% of the length of the
 * longer one. After the last sentence, every observer is notified that the
 * evaluation is complete.
 *
 * @throws TalismaneException
 *           if sentences mismatched in the two corpora
 * @throws IOException
 *           presumably propagated from the corpus readers
 */
public void evaluate() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        ParseConfiguration reference = referenceCorpusReader.nextConfiguration();
        ParseConfiguration guess = evaluationCorpusReader.nextConfiguration();

        // observers expect a list of guesses, here holding the single guess
        List<ParseConfiguration> guesses = new ArrayList<>();
        guesses.add(guess);

        double referenceLength = reference.getPosTagSequence().getTokenSequence().getSentence().getText().length();
        double guessLength = guess.getPosTagSequence().getTokenSequence().getSentence().getText().length();

        // shorter length divided by longer length
        double ratio;
        if (referenceLength > guessLength) {
            ratio = guessLength / referenceLength;
        } else {
            ratio = referenceLength / guessLength;
        }

        if (ratio < 0.9) {
            LOG.info("Mismatched sentences");
            LOG.info(reference.getPosTagSequence().getTokenSequence().getSentence().getText().toString());
            LOG.info(guess.getPosTagSequence().getTokenSequence().getSentence().getText().toString());
            throw new TalismaneException("Mismatched sentences");
        }

        for (ParseEvaluationObserver listener : this.observers) {
            listener.onParseEnd(reference, guesses);
        }
    }

    for (ParseEvaluationObserver listener : this.observers) {
        listener.onEvaluationComplete();
    }
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration)

Example 4 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class LexiconReader method readLexicons.

/**
 * Read the lexicons based on a properties file, as described in the class
 * description. The pos-tag set is the one read from the configuration.
 *
 * @param lexiconPropsFile
 * @return
 * @throws IOException
 * @throws TalismaneException
 *           if the config files contained an unknown property
 */
public List<PosTaggerLexicon> readLexicons(File lexiconPropsFile) throws IOException, TalismaneException {
    LOG.debug("Serializing from " + lexiconPropsFile.getPath());
    List<PosTaggerLexicon> lexicons = new ArrayList<>();
    File lexiconDir = lexiconPropsFile.getParentFile();
    Map<String, String> properties = StringUtils.getArgMap(lexiconPropsFile, "UTF-8");
    String[] lexiconList = properties.get("lexicons").split(",");
    List<String> knownPropertyList = Arrays.asList("file", "regex", "categories", "exclusions", "encoding", "uniqueKey");
    Set<String> knownProperties = new HashSet<String>(knownPropertyList);
    for (String property : properties.keySet()) {
        if (property.equals("lexicons")) {
        // nothing to do
        } else {
            boolean foundLexicon = false;
            for (String lexiconName : lexiconList) {
                if (property.startsWith(lexiconName + ".")) {
                    foundLexicon = true;
                    String remainder = property.substring(lexiconName.length() + 1);
                    if (!knownProperties.contains(remainder)) {
                        throw new TalismaneException("Unknown property: " + property);
                    }
                }
                if (foundLexicon)
                    break;
            }
            if (!foundLexicon)
                throw new TalismaneException("Unknown lexicon in property: " + property);
        }
    }
    for (String lexiconName : lexiconList) {
        LOG.debug("Lexicon: " + lexiconName);
        String lexiconFilePath = properties.get(lexiconName + ".file");
        String lexiconRegexPath = properties.get(lexiconName + ".regex");
        String lexiconExclusionPath = properties.get(lexiconName + ".exclusions");
        String categoryString = properties.get(lexiconName + ".categories");
        String lexiconEncoding = properties.get(lexiconName + ".encoding");
        String lexiconUniqueKey = properties.get(lexiconName + ".uniqueKey");
        File lexiconRegexFile = new File(lexiconDir, lexiconRegexPath);
        Scanner regexScanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(lexiconRegexFile), "UTF-8")));
        File lexiconInputFile = new File(lexiconDir, lexiconFilePath);
        InputStream inputStream = null;
        if (lexiconInputFile.getName().endsWith(".zip")) {
            InputStream inputStream2 = new FileInputStream(lexiconInputFile);
            @SuppressWarnings("resource") ZipInputStream zis = new ZipInputStream(inputStream2);
            zis.getNextEntry();
            inputStream = zis;
        } else {
            inputStream = new FileInputStream(lexiconInputFile);
        }
        Charset lexiconCharset = Charset.defaultCharset();
        if (lexiconEncoding != null)
            lexiconCharset = Charset.forName(lexiconEncoding);
        Reader reader = new BufferedReader(new InputStreamReader(inputStream, lexiconCharset));
        Scanner lexiconScanner = new Scanner(reader);
        RegexLexicalEntryReader lexicalEntryReader = new RegexLexicalEntryReader(regexScanner);
        Set<String> categories = null;
        if (categoryString != null) {
            categories = new HashSet<String>();
            String[] cats = categoryString.split(",");
            for (String cat : cats) categories.add(cat);
        }
        List<String> exclusionAttributes = null;
        List<List<String>> exclusions = null;
        if (lexiconExclusionPath != null) {
            exclusions = new ArrayList<List<String>>();
            File lexiconExclusionFile = new File(lexiconDir, lexiconExclusionPath);
            Scanner exclusionScanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(lexiconExclusionFile), "UTF-8")));
            while (exclusionScanner.hasNextLine()) {
                String line = exclusionScanner.nextLine();
                if (line.length() == 0 || line.startsWith("#"))
                    continue;
                String[] parts = line.split("\t");
                if (exclusionAttributes == null) {
                    exclusionAttributes = new ArrayList<String>();
                    for (String part : parts) {
                        exclusionAttributes.add(part);
                    }
                } else {
                    List<String> exclusion = new ArrayList<String>();
                    for (String part : parts) {
                        exclusion.add(part);
                    }
                    exclusions.add(exclusion);
                }
            }
            exclusionScanner.close();
        }
        List<LexicalAttribute> uniqueAttributes = null;
        if (lexiconUniqueKey != null) {
            uniqueAttributes = new ArrayList<LexicalAttribute>();
            String[] uniqueKeyElements = lexiconUniqueKey.split(",");
            for (String uniqueKeyElement : uniqueKeyElements) {
                try {
                    LexicalAttribute attribute = LexicalAttribute.valueOf(uniqueKeyElement);
                    uniqueAttributes.add(attribute);
                } catch (IllegalArgumentException e) {
                    lexiconScanner.close();
                    throw new TalismaneException("Unknown attribute in " + lexiconName + ".uniqueKey: " + uniqueKeyElement);
                }
            }
        }
        LOG.debug("Serializing: " + lexiconFilePath);
        LexiconFile lexiconFile = new LexiconFile(lexiconName, lexiconScanner, lexicalEntryReader, sessionId);
        if (categories != null)
            lexiconFile.setCategories(categories);
        if (exclusionAttributes != null)
            lexiconFile.setExclusionAttributes(exclusionAttributes);
        if (exclusions != null)
            lexiconFile.setExclusions(exclusions);
        if (uniqueAttributes != null)
            lexiconFile.setUniqueKeyAttributes(uniqueAttributes);
        lexiconFile.load();
        inputStream.close();
        lexicons.add(lexiconFile);
    }
    return lexicons;
}
Also used : Scanner(java.util.Scanner) TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) InputStreamReader(java.io.InputStreamReader) ZipInputStream(java.util.zip.ZipInputStream) ObjectInputStream(java.io.ObjectInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Charset(java.nio.charset.Charset) FileInputStream(java.io.FileInputStream) ZipInputStream(java.util.zip.ZipInputStream) BufferedReader(java.io.BufferedReader) File(java.io.File)

Example 5 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class RegexLexicalEntryReader method readEntry.

/**
 * Fills a lexical entry from one line of lexicon text by applying the
 * configured attribute patterns.
 * <p>
 * For each standard attribute, the patterns registered for it are tried in
 * turn; each pattern found in the text writes its captured group (or the
 * pattern's fixed replacement, when one is defined) to the entry. A pattern
 * flagged as "stop" ends the search for that attribute. Free-form attributes
 * are handled the same way, except that the captured group is always used.
 *
 * @param text
 *          the raw lexicon line to analyse
 * @param lexicalEntry
 *          the entry receiving the extracted values
 * @throws TalismaneException
 *           if no pattern produced a Word for this text
 */
@Override
public void readEntry(String text, WritableLexicalEntry lexicalEntry) throws TalismaneException {
    boolean wordSeen = false;

    // Standard attributes, each with a dedicated setter or adder on the entry.
    for (LexicalAttribute attribute : this.attributePatternMap.keySet()) {
        for (LexicalAttributePattern candidate : this.attributePatternMap.get(attribute)) {
            Matcher m = candidate.getPattern().matcher(text);
            if (!m.find()) {
                // no match: try the next pattern for this attribute
                continue;
            }

            // the group is always extracted; a fixed replacement overrides it
            String value = m.group(candidate.getGroup());
            String replacement = candidate.getReplacement();
            if (replacement != null) {
                value = replacement;
            }

            switch (attribute) {
                case Word:
                    lexicalEntry.setWord(value);
                    wordSeen = true;
                    break;
                case Lemma:
                    lexicalEntry.setLemma(value);
                    break;
                case LemmaComplement:
                    lexicalEntry.setLemmaComplement(value);
                    break;
                case Morphology:
                    lexicalEntry.setMorphology(value);
                    break;
                case Category:
                    lexicalEntry.setCategory(value);
                    break;
                case SubCategory:
                    lexicalEntry.setSubCategory(value);
                    break;
                case Case:
                    lexicalEntry.addCase(value);
                    break;
                case Gender:
                    lexicalEntry.addGender(value);
                    break;
                case Number:
                    lexicalEntry.addNumber(value);
                    break;
                case Person:
                    lexicalEntry.addPerson(value);
                    break;
                case PossessorNumber:
                    lexicalEntry.addPossessorNumber(value);
                    break;
                case Tense:
                    lexicalEntry.addTense(value);
                    break;
                case Aspect:
                    lexicalEntry.addAspect(value);
                    break;
                case Mood:
                    lexicalEntry.addMood(value);
                    break;
                case OtherAttribute1:
                case OtherAttribute2:
                case OtherAttribute3:
                case OtherAttribute4:
                case OtherAttribute5:
                case OtherAttribute6:
                case OtherAttribute7:
                case OtherAttribute8:
                default:
                    // nothing is written to the entry for these attributes here
                    break;
            }

            if (candidate.isStop()) {
                // a stop pattern ends the search for the current attribute
                break;
            }
        }
    }

    // Free-form attributes, stored on the entry by name.
    for (String attributeName : this.otherAttributeMap.keySet()) {
        for (LexicalAttributePattern candidate : this.otherAttributeMap.get(attributeName)) {
            Matcher m = candidate.getPattern().matcher(text);
            if (!m.find()) {
                continue;
            }
            lexicalEntry.setAttribute(attributeName, m.group(candidate.getGroup()));
            if (candidate.isStop()) {
                break;
            }
        }
    }

    if (!wordSeen) {
        throw new TalismaneException("No Word found in lexical entry: " + text);
    }
}
Also used : Matcher(java.util.regex.Matcher) TalismaneException(com.joliciel.talismane.TalismaneException)

Aggregations

TalismaneException (com.joliciel.talismane.TalismaneException)47 ArrayList (java.util.ArrayList)27 Config (com.typesafe.config.Config)14 File (java.io.File)11 List (java.util.List)10 TreeSet (java.util.TreeSet)10 FeatureResult (com.joliciel.talismane.machineLearning.features.FeatureResult)9 IOException (java.io.IOException)9 HashMap (java.util.HashMap)9 Set (java.util.Set)9 Decision (com.joliciel.talismane.machineLearning.Decision)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)8 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 Token (com.joliciel.talismane.tokeniser.Token)8 Map (java.util.Map)8 SortedSet (java.util.SortedSet)8 Collectors (java.util.stream.Collectors)8 Logger (org.slf4j.Logger)8 LoggerFactory (org.slf4j.LoggerFactory)8 Sentence (com.joliciel.talismane.rawText.Sentence)7