Search in sources :

Example 1 with LinguisticRules

Use of com.joliciel.talismane.LinguisticRules in the project talismane by joliciel-informatique.

The method hasNextSentence of the class TokenPerLineCorpusReader.

/**
 * Reads lines until the next sentence to be returned has been assembled into
 * {@code sentenceLines}, or until there is nothing left to read.
 * <p>
 * A sentence is a run of non-empty lines terminated by an empty line; when
 * the input ends while lines are still pending, an empty line is simulated so
 * that the pending lines are closed off as a sentence. Sentences rejected by
 * the cross-validation settings or lying before the start sentence are
 * counted in {@code sentenceCount} but not returned. Nothing is read if
 * {@code sentenceLines} is already non-null on entry.
 *
 * @return {@code true} if {@code sentenceLines} is non-null when the method
 *         exits, {@code false} otherwise (including when the maximum sentence
 *         count has been reached)
 * @throws TalismaneException
 *             if no linguistic rules are set on the session when the sentence
 *             text has to be rebuilt from the tokens; may also be propagated
 *             from the helpers called here
 * @throws IOException
 *             presumably propagated from the underlying line or sentence
 *             readers - not thrown directly in this method
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
    // we've reached the end, do nothing
    } else {
        // keep reading until one sentence is accepted or the input runs out
        while (sentenceLines == null) {
            List<UnprocessedLine> lines = new ArrayList<>();
            int skippedLineCount = 0;
            if (!this.hasNextLine())
                break;
            // "lines.size() > 0" keeps the loop alive for one extra pass after the
            // last real line, so that a final sentence without a trailing empty
            // line is still closed off
            while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
                // stays empty when the input is exhausted: acts as a simulated
                // end-of-sentence line
                String line = "";
                if (this.hasNextLine())
                    line = this.nextLine().replace("\r", "");
                // note: also incremented for the simulated empty line at end of input
                lineNumber++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Line " + lineNumber + ": " + line);
                if (line.length() > 0) {
                    // a line matching a skip pattern is only flagged: it is still
                    // stored in "lines" and still tested against the sentence rules
                    boolean skip = false;
                    for (Pattern skipLinePattern : skipLinePatterns) {
                        if (skipLinePattern.matcher(line).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
                            skip = true;
                            skippedLineCount++;
                            break;
                        }
                    }
                    // collect every sentence rule whose pattern matches the whole
                    // line, with its matcher at the same position in myMatchers
                    List<CorpusSentenceRule> myRules = new ArrayList<>();
                    List<Matcher> myMatchers = new ArrayList<>();
                    for (CorpusSentenceRule sentenceRule : sentenceRules) {
                        Matcher matcher = sentenceRule.getPattern().matcher(line);
                        if (matcher.matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Matched rule: " + sentenceRule);
                            myRules.add(sentenceRule);
                            myMatchers.add(matcher);
                        }
                    }
                    UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
                    lines.add(unprocessedLine);
                } else {
                    // empty line with nothing pending, or only skipped lines pending:
                    // not a sentence, so reset and keep reading
                    if (lines.size() == 0 || lines.size() == skippedLineCount) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // end of sentence
                    boolean includeMe = true;
                    // check cross-validation
                    // the include index takes precedence: the exclude index is only
                    // consulted when the include index is negative
                    if (this.getCrossValidationSize() > 0) {
                        if (this.getIncludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
                                includeMe = false;
                            }
                        } else if (this.getExcludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
                                includeMe = false;
                            }
                        }
                    }
                    // sentences before the configured start sentence are dropped
                    if (this.getStartSentence() > sentenceCount) {
                        includeMe = false;
                    }
                    // counts every sentence seen, including the ones dropped below
                    sentenceCount++;
                    LOG.debug("sentenceCount: " + sentenceCount);
                    if (!includeMe) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // from here on the sentence is accepted; assigning sentenceLines
                    // is what terminates both enclosing loops
                    sentenceLines = new ArrayList<>();
                    // only non-skipped lines are parsed into corpus lines
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (!unprocessedLine.skip) {
                            CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
                            sentenceLines.add(corpusLine);
                            if (this.lexicalEntryReader != null) {
                                WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
                                this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
                                corpusLine.setLexicalEntry(lexicalEntry);
                            }
                        }
                    }
                    // apply the matched sentence rules to every line (skipped lines
                    // included); for each line, evaluation stops at the first rule
                    // returning a non-null action, and only merge actions are kept
                    List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (LOG.isTraceEnabled())
                            LOG.trace("Line " + unprocessedLine);
                        for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
                            CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
                            Matcher matcher = unprocessedLine.matchers.get(i);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Testing rule " + sentenceRule);
                            CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Result: " + action);
                            if (action != null) {
                                if (action instanceof MergeAction)
                                    mergeActions.add((MergeAction) action);
                                break;
                            }
                        }
                    }
                    if (mergeActions.size() > 0) {
                        List<CorpusLine> newSentenceLines = new ArrayList<>();
                        // line index -> merge action covering that line; the TreeMap
                        // iterates its keys in ascending order
                        Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
                        for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
                            for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
                                indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
                            }
                        }
                        // NOTE(review): i starts at 1, which assumes corpus line indexes
                        // are 1-based and equal to the position in sentenceLines - confirm
                        int i = 1;
                        Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
                        // NOTE(review): next() assumes at least one merge action
                        // returned a line to merge - confirm
                        int nextIndexToMerge = iIndexToMerge.next();
                        int linesRemoved = 0;
                        // old index -> new index after merging; index 0 maps to itself
                        // (presumably the root/no-governor index - confirm)
                        Map<Integer, Integer> indexChangeMap = new HashMap<>();
                        indexChangeMap.put(0, 0);
                        for (CorpusLine corpusLine : sentenceLines) {
                            if (i == nextIndexToMerge) {
                                MergeAction mergeAction = indexesToMerge.get(i);
                                // the first line of a merge group is replaced by the
                                // merged line, so it does not count as removed (-1 here
                                // cancels the +1 below); the other lines of the group
                                // are dropped
                                if (i == mergeAction.getFirstIndex()) {
                                    newSentenceLines.add(mergeAction.getMergedLine());
                                    linesRemoved -= 1;
                                }
                                linesRemoved += 1;
                                if (iIndexToMerge.hasNext())
                                    nextIndexToMerge = iIndexToMerge.next();
                                else
                                    nextIndexToMerge = -1;
                            } else {
                                newSentenceLines.add(corpusLine);
                            }
                            indexChangeMap.put(i, i - linesRemoved);
                            i++;
                        }
                        // renumber the remaining lines and their (non-projective)
                        // governors; negative governor indexes are left untouched.
                        // NOTE(review): an index missing from indexChangeMap would be
                        // written out as the string "null" - confirm this cannot occur
                        for (CorpusLine corpusLine : newSentenceLines) {
                            corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
                            int governorIndex = corpusLine.getGovernorIndex();
                            if (governorIndex >= 0)
                                corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
                            int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
                            if (nonProjGovernorIndex >= 0)
                                corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
                        }
                        sentenceLines = newSentenceLines;
                    }
                    // take the sentence from the separate sentence reader when one is
                    // available; otherwise rebuild its text from the tokens, letting
                    // the linguistic rules decide where spaces go
                    Sentence sentence = null;
                    if (sentenceReader != null && sentenceReader.hasNextSentence()) {
                        sentence = sentenceReader.nextSentence();
                    } else {
                        LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
                        if (rules == null)
                            throw new TalismaneException("Linguistic rules have not been set.");
                        String text = "";
                        for (CorpusLine corpusLine : sentenceLines) {
                            String word = corpusLine.getElement(CorpusElement.TOKEN);
                            if (rules.shouldAddSpace(text, word))
                                text += " ";
                            text += word;
                        }
                        sentence = new Sentence(text, currentFile, sessionId);
                    }
                    // run every sentence annotator of the session, then hand the
                    // sentence and its corpus lines over to processSentence
                    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                        sentenceAnnotator.annotate(sentence);
                    }
                    this.processSentence(sentence, sentenceLines);
                }
            }
        }
    }
    return (sentenceLines != null);
}
Also used : Matcher(java.util.regex.Matcher) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) ArrayList(java.util.ArrayList) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) Sentence(com.joliciel.talismane.rawText.Sentence) Pattern(java.util.regex.Pattern) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) TreeMap(java.util.TreeMap) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)

Example 2 with LinguisticRules

Use of com.joliciel.talismane.LinguisticRules in the project talismane by joliciel-informatique.

The method hasNextSentence of the class StandoffReader.

/**
 * Builds the parse configuration for the next stand-off sentence, unless one
 * is already pending, the maximum sentence count has been reached, or no
 * sentences remain.
 * <p>
 * The sentence text is rebuilt from the token texts (the session's linguistic
 * rules decide where spaces go), annotated, tokenised and pos-tagged from the
 * stand-off tokens. Dependencies are then added from {@code relationMap}.
 * A punctuation token without a relation is attached, when
 * {@code punctuationDepLabel} is set, to the closest preceding token that is
 * not itself punctuation.
 *
 * @return {@code true} if {@code configuration} is non-null when the method
 *         exits
 * @throws TalismaneException
 *             if a relation refers to a head or dependent id for which no
 *             token was found in the sentence
 * @throws IOException
 *             declared by the signature; not thrown directly in this method
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    boolean limitReached = this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount();
    if (limitReached || configuration != null || sentenceIndex >= sentences.size()) {
        // nothing new to build: report whether a configuration is already pending
        return configuration != null;
    }

    List<StandoffToken> standoffTokens = sentences.get(sentenceIndex++);
    LinguisticRules linguisticRules = TalismaneSession.get(sessionId).getLinguisticRules();
    if (linguisticRules == null) {
        // NOTE(review): unchecked here, whereas TokenPerLineCorpusReader throws a
        // TalismaneException for the same condition - confirm which is intended
        throw new RuntimeException("Linguistic rules have not been set.");
    }

    // rebuild the raw sentence text from the individual token texts
    String sentenceText = "";
    for (StandoffToken current : standoffTokens) {
        String word = current.text;
        String separator = linguisticRules.shouldAddSpace(sentenceText, word) ? " " : "";
        sentenceText = sentenceText + separator + word;
    }

    Sentence sentence = new Sentence(sentenceText, sessionId);
    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
        sentenceAnnotator.annotate(sentence);
    }

    PretokenisedSequence tokenSequence = new PretokenisedSequence(sentence, sessionId);
    PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);

    // stand-off token id -> pos-tagged token, used below to resolve relations
    Map<String, PosTaggedToken> tokensById = new HashMap<>();
    for (StandoffToken current : standoffTokens) {
        Token token = tokenSequence.addToken(current.text);
        Decision tagDecision = new Decision(current.posTag.getCode());
        PosTaggedToken taggedToken = new PosTaggedToken(token, tagDecision, sessionId);
        if (LOG.isTraceEnabled()) {
            LOG.trace(taggedToken.toString());
        }
        taggedToken.setComment(current.comment);
        posTagSequence.addPosTaggedToken(taggedToken);
        tokensById.put(current.id, taggedToken);
        LOG.debug("Found token " + current.id + ", " + taggedToken);
    }
    tokenSequence.setWithRoot(true);

    // the field is assigned before the dependencies are added, so it stays set
    // even if one of the lookups below fails
    configuration = new ParseConfiguration(posTagSequence);

    for (StandoffToken current : standoffTokens) {
        StandoffRelation relation = relationMap.get(current.id);
        if (relation != null) {
            // explicit relation: both ends must be tokens of this sentence
            PosTaggedToken head = tokensById.get(relation.fromToken);
            PosTaggedToken dependent = tokensById.get(relation.toToken);
            if (head == null) {
                throw new TalismaneException("No token found for head id: " + relation.fromToken);
            }
            if (dependent == null) {
                throw new TalismaneException("No token found for dependent id: " + relation.toToken);
            }
            DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null);
            arc.setComment(relation.comment);
            continue;
        }

        if (current.posTag.getOpenClassIndicator() != PosTagOpenClassIndicator.PUNCTUATION || punctuationDepLabel == null) {
            continue;
        }

        // unattached punctuation: walk backwards to the nearest token that is
        // not punctuation and attach to it
        PosTaggedToken punctuation = tokensById.get(current.id);
        for (int position = punctuation.getIndex() - 1; position >= 0; position--) {
            PosTaggedToken candidate = posTagSequence.get(position);
            if (candidate.getTag().getOpenClassIndicator() != PosTagOpenClassIndicator.PUNCTUATION) {
                configuration.addDependency(candidate, punctuation, punctuationDepLabel, null);
                break;
            }
        }
    }

    return configuration != null;
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) TalismaneException(com.joliciel.talismane.TalismaneException) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token) Decision(com.joliciel.talismane.machineLearning.Decision) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) PretokenisedSequence(com.joliciel.talismane.tokeniser.PretokenisedSequence) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) DependencyArc(com.joliciel.talismane.parser.DependencyArc) Sentence(com.joliciel.talismane.rawText.Sentence)

Aggregations

LinguisticRules (com.joliciel.talismane.LinguisticRules)2 TalismaneException (com.joliciel.talismane.TalismaneException)2 Sentence (com.joliciel.talismane.rawText.Sentence)2 SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)2 HashMap (java.util.HashMap)2 MergeAction (com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction)1 CompactLexicalEntry (com.joliciel.talismane.lexicon.CompactLexicalEntry)1 WritableLexicalEntry (com.joliciel.talismane.lexicon.WritableLexicalEntry)1 Decision (com.joliciel.talismane.machineLearning.Decision)1 DependencyArc (com.joliciel.talismane.parser.DependencyArc)1 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)1 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)1 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)1 PretokenisedSequence (com.joliciel.talismane.tokeniser.PretokenisedSequence)1 Token (com.joliciel.talismane.tokeniser.Token)1 ArrayList (java.util.ArrayList)1 LinkedHashMap (java.util.LinkedHashMap)1 TreeMap (java.util.TreeMap)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1