Search in sources :

Example 1 with CompactLexicalEntry

use of com.joliciel.talismane.lexicon.CompactLexicalEntry in project talismane by joliciel-informatique.

In class TokenPerLineCorpusReader, method hasNextSentence:

/**
 * Checks whether another sentence is available, reading and assembling it from
 * the underlying line source if one has not been buffered yet.
 * <p>
 * A sentence is a run of non-empty lines terminated by an empty line or by the
 * end of the input. Lines matching one of the skip patterns are dropped, sentences
 * can be filtered out by cross-validation settings and by the start-sentence
 * setting, and sentence rules may merge several corpus lines into one (in which
 * case token and governor indexes are renumbered).
 *
 * @return {@code true} if {@code sentenceLines} is non-null when the method exits,
 *         i.e. a sentence has been assembled; {@code false} if the maximum sentence
 *         count has been reached or no further sentence could be read
 * @throws TalismaneException
 *           if a sentence text has to be built from tokens and no linguistic rules
 *           are set on the session; may also be propagated from the readers and
 *           rules called here
 * @throws IOException
 *           declared by the signature; presumably raised by the underlying line
 *           or sentence readers (not visible in this excerpt)
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
    // we've reached the maximum number of sentences requested: do nothing,
    // the return value then depends only on the current state of sentenceLines
    } else {
        // NOTE(review): sentenceLines is a field; it is only assigned here, so it is
        // presumably reset to null elsewhere once the sentence is consumed - confirm.
        while (sentenceLines == null) {
            // lines accumulated for the sentence currently being read
            List<UnprocessedLine> lines = new ArrayList<>();
            // how many of the accumulated lines were flagged as skipped
            int skippedLineCount = 0;
            if (!this.hasNextLine())
                break;
            // Keep going while there is input left, or while buffered lines still need
            // to be flushed as a final sentence after the input has run out.
            while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
                // If the input is exhausted, an empty line is simulated so that the
                // buffered lines are treated as a complete sentence.
                String line = "";
                if (this.hasNextLine())
                    line = this.nextLine().replace("\r", "");
                // note: incremented for the simulated end-of-input line as well
                lineNumber++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Line " + lineNumber + ": " + line);
                if (line.length() > 0) {
                    // Non-empty line: flag it as skipped if any skip pattern matches
                    // the whole line.
                    boolean skip = false;
                    for (Pattern skipLinePattern : skipLinePatterns) {
                        if (skipLinePattern.matcher(line).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
                            skip = true;
                            skippedLineCount++;
                            break;
                        }
                    }
                    // Record every sentence rule whose pattern matches the whole line,
                    // together with its matcher (parallel lists, same ordering). This is
                    // done for skipped lines too.
                    List<CorpusSentenceRule> myRules = new ArrayList<>();
                    List<Matcher> myMatchers = new ArrayList<>();
                    for (CorpusSentenceRule sentenceRule : sentenceRules) {
                        Matcher matcher = sentenceRule.getPattern().matcher(line);
                        if (matcher.matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Matched rule: " + sentenceRule);
                            myRules.add(sentenceRule);
                            myMatchers.add(matcher);
                        }
                    }
                    // Skipped lines are buffered as well; they are filtered out later
                    // when corpus lines are created.
                    UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
                    lines.add(unprocessedLine);
                } else {
                    // Empty line: a sentence boundary, unless nothing useful was buffered
                    // (no lines at all, or only skipped lines), in which case start over.
                    if (lines.size() == 0 || lines.size() == skippedLineCount) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // end of sentence
                    boolean includeMe = true;
                    // check cross-validation: when an include index is set, only sentences
                    // whose count modulo the cross-validation size equals it are kept;
                    // otherwise, when an exclude index is set, sentences matching it are
                    // dropped. The include index takes priority over the exclude index.
                    if (this.getCrossValidationSize() > 0) {
                        if (this.getIncludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
                                includeMe = false;
                            }
                        } else if (this.getExcludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
                                includeMe = false;
                            }
                        }
                    }
                    // sentences before the configured start sentence are dropped
                    if (this.getStartSentence() > sentenceCount) {
                        includeMe = false;
                    }
                    // the counter advances for excluded sentences as well
                    sentenceCount++;
                    LOG.debug("sentenceCount: " + sentenceCount);
                    if (!includeMe) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // From here on the sentence is kept: assigning sentenceLines also
                    // terminates both enclosing while loops once this iteration ends.
                    sentenceLines = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (!unprocessedLine.skip) {
                            CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
                            sentenceLines.add(corpusLine);
                            // optionally attach a lexical entry read from the raw line
                            if (this.lexicalEntryReader != null) {
                                WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
                                this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
                                corpusLine.setLexicalEntry(lexicalEntry);
                            }
                        }
                    }
                    // Apply the sentence rules recorded for each line. For a given line,
                    // rules are tried in order and the first one returning a non-null
                    // action wins; only merge actions are collected for later processing.
                    List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (LOG.isTraceEnabled())
                            LOG.trace("Line " + unprocessedLine);
                        for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
                            CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
                            Matcher matcher = unprocessedLine.matchers.get(i);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Testing rule " + sentenceRule);
                            CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Result: " + action);
                            if (action != null) {
                                if (action instanceof MergeAction)
                                    mergeActions.add((MergeAction) action);
                                break;
                            }
                        }
                    }
                    if (mergeActions.size() > 0) {
                        // Rebuild the sentence with merged lines collapsed into a single
                        // line, and compute the mapping from old indexes to new indexes.
                        List<CorpusLine> newSentenceLines = new ArrayList<>();
                        // sorted map: index of each line to merge -> the action merging it
                        Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
                        for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
                            for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
                                indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
                            }
                        }
                        // i is the running index of the current line, counted from 1;
                        // assumes corpus line indexes are 1-based and match the position
                        // in sentenceLines - TODO confirm
                        int i = 1;
                        Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
                        // NOTE(review): next() assumes at least one merge action has a
                        // non-empty list of lines to merge - confirm.
                        int nextIndexToMerge = iIndexToMerge.next();
                        int linesRemoved = 0;
                        // old index -> new index; index 0 maps to itself (presumably the
                        // root governor - verify)
                        Map<Integer, Integer> indexChangeMap = new HashMap<>();
                        indexChangeMap.put(0, 0);
                        for (CorpusLine corpusLine : sentenceLines) {
                            if (i == nextIndexToMerge) {
                                MergeAction mergeAction = indexesToMerge.get(i);
                                // The first line of a merge group is replaced by the merged
                                // line, so it does not count as removed (-1 then +1 below);
                                // every other line of the group is removed.
                                if (i == mergeAction.getFirstIndex()) {
                                    newSentenceLines.add(mergeAction.getMergedLine());
                                    linesRemoved -= 1;
                                }
                                linesRemoved += 1;
                                // -1 never equals i (which starts at 1), so no further
                                // line will be treated as merged
                                if (iIndexToMerge.hasNext())
                                    nextIndexToMerge = iIndexToMerge.next();
                                else
                                    nextIndexToMerge = -1;
                            } else {
                                newSentenceLines.add(corpusLine);
                            }
                            indexChangeMap.put(i, i - linesRemoved);
                            i++;
                        }
                        // Renumber the index and, when set (>= 0), the governor and
                        // non-projective governor of every remaining line.
                        for (CorpusLine corpusLine : newSentenceLines) {
                            corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
                            int governorIndex = corpusLine.getGovernorIndex();
                            if (governorIndex >= 0)
                                corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
                            int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
                            if (nonProjGovernorIndex >= 0)
                                corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
                        }
                        sentenceLines = newSentenceLines;
                    }
                    // Obtain the sentence text: from the dedicated sentence reader when
                    // one is available and has a sentence, otherwise by concatenating the
                    // tokens, with the linguistic rules deciding where to insert spaces.
                    Sentence sentence = null;
                    if (sentenceReader != null && sentenceReader.hasNextSentence()) {
                        sentence = sentenceReader.nextSentence();
                    } else {
                        LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
                        if (rules == null)
                            throw new TalismaneException("Linguistic rules have not been set.");
                        String text = "";
                        for (CorpusLine corpusLine : sentenceLines) {
                            String word = corpusLine.getElement(CorpusElement.TOKEN);
                            if (rules.shouldAddSpace(text, word))
                                text += " ";
                            text += word;
                        }
                        sentence = new Sentence(text, currentFile, sessionId);
                    }
                    // run all of the session's sentence annotators on the sentence
                    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                        sentenceAnnotator.annotate(sentence);
                    }
                    // hand the sentence and its corpus lines over for further processing
                    this.processSentence(sentence, sentenceLines);
                }
            }
        }
    }
    return (sentenceLines != null);
}
Also used : Matcher(java.util.regex.Matcher) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) ArrayList(java.util.ArrayList) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) Sentence(com.joliciel.talismane.rawText.Sentence) Pattern(java.util.regex.Pattern) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) TreeMap(java.util.TreeMap) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)

Example 2 with CompactLexicalEntry

use of com.joliciel.talismane.lexicon.CompactLexicalEntry in project talismane by joliciel-informatique.

In class CorpusLineReader, method read:

/**
 * Parses a single line of the corpus into a {@link CorpusLine}.
 * <p>
 * The line is matched against this reader's compiled pattern. For every
 * {@link CorpusElement} that has a placeholder in the pattern, the matching group
 * is extracted: tokens and lemmas are passed through the session's CoNLL
 * formatter, while for all other elements a value of {@code "_"} is replaced by
 * the empty string. If a lexical entry reader is configured, a lexical entry is
 * read from the raw line and attached. Finally all corpus rules are applied, and
 * the values they request are written to the corpus line.
 *
 * @param line
 *          the line to read
 * @param lineNumber
 *          the line number we reached, starting at 1.
 * @return the corpus line built from {@code line}
 * @throws TalismaneException
 *           if the regex wasn't matched on a given line
 */
public CorpusLine read(String line, int lineNumber) throws TalismaneException {
    Matcher lineMatcher = this.pattern.matcher(line);
    if (!lineMatcher.matches()) {
        throw new TalismaneException("Didn't match pattern \"" + regex + "\". Compiled to: \"" + this.pattern.pattern() + "\". On line " + lineNumber + ": " + line);
    }

    CorpusLine result = new CorpusLine(line, lineNumber);

    // Copy each element that has a placeholder in the pattern from its group.
    for (CorpusElement elementType : CorpusElement.values()) {
        if (!placeholderIndexMap.containsKey(elementType)) {
            continue;
        }
        String raw = lineMatcher.group(placeholderIndexMap.get(elementType));
        String cleaned;
        if (elementType == CorpusElement.TOKEN || elementType == CorpusElement.LEMMA) {
            // tokens and lemmas go through the CoNLL formatter
            cleaned = TalismaneSession.get(sessionId).getCoNLLFormatter().fromCoNLL(raw);
        } else if ("_".equals(raw)) {
            // for all other elements, an underscore stands for "no value"
            cleaned = "";
        } else {
            cleaned = raw;
        }
        result.setElement(elementType, cleaned);
    }

    // Attach a lexical entry read from the raw line, when a reader is configured.
    if (this.lexicalEntryReader != null) {
        WritableLexicalEntry entry = new CompactLexicalEntry(lexicalEntrySupport);
        this.lexicalEntryReader.readEntry(line, entry);
        result.setLexicalEntry(entry);
    }

    // Let every corpus rule record the values it wants to change, then apply them.
    Map<CorpusElement, String> pendingUpdates = new HashMap<>();
    for (CorpusRule rule : corpusRules) {
        rule.apply(result, pendingUpdates);
    }
    for (Map.Entry<CorpusElement, String> update : pendingUpdates.entrySet()) {
        CorpusElement element = update.getKey();
        String newValue = update.getValue();
        if (LOG.isTraceEnabled()) {
            LOG.trace("On line " + lineNumber + ", updating " + element.name() + " from '" + result.getElement(element) + "' to '" + newValue + "'");
        }
        result.setElement(element, newValue);
    }
    return result;
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Matcher(java.util.regex.Matcher) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry)

Aggregations

TalismaneException (com.joliciel.talismane.TalismaneException)2 CompactLexicalEntry (com.joliciel.talismane.lexicon.CompactLexicalEntry)2 WritableLexicalEntry (com.joliciel.talismane.lexicon.WritableLexicalEntry)2 HashMap (java.util.HashMap)2 Matcher (java.util.regex.Matcher)2 LinguisticRules (com.joliciel.talismane.LinguisticRules)1 CorpusElement (com.joliciel.talismane.corpus.CorpusLine.CorpusElement)1 MergeAction (com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction)1 Sentence (com.joliciel.talismane.rawText.Sentence)1 SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)1 ArrayList (java.util.ArrayList)1 TreeMap (java.util.TreeMap)1 Pattern (java.util.regex.Pattern)1